In [1]:
import pandas as pd
import sklearn
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
path = "C:/Users/ericluo04/Documents/GitHub/Bots-Project/Code/"

### Data Cleaning

In [3]:
df = pd.read_csv(path + "2. HK Training/training/Data/modeling_1.csv")
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)
stemmer = PorterStemmer()
df['tweet'] = df['tweet'].apply(lambda x: [stemmer.stem(y) for y in x])
df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], train_size=.8)

stops = stopwords.words('english')
cv = CountVectorizer(strip_accents='unicode', stop_words=frozenset(stops + ['hong', 'kong', 'hongkong', 'thi', 'rt', 'thank', 'hk', 'hongkongprotest', 'hkprotest', 'u2026', 'uff0c', 'http', 'co']), min_df=2)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

### Train Multinomial Naive Bayes Model

In [5]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_cv, y_train)
predictions = naive_bayes.predict(X_test_cv)

print('Accuracy score: ', accuracy_score(y_test, predictions))
print('Precision score: ', precision_score(y_test, predictions))
print('Recall score: ', recall_score(y_test, predictions))

Accuracy score:  0.8828736105393166
Precision score:  0.8809826344769166
Recall score:  0.8783783783783784


### Predictive Features

In [6]:
neg_class_prob_sorted = naive_bayes.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted = naive_bayes.feature_log_prob_[1, :].argsort()

print('These are the 10 most predictive features for a pro-Hong Kong tweet:', np.take(cv.get_feature_names(), neg_class_prob_sorted[-10:]))
print('These are the 10 most predictive features for a pro-China tweet:', np.take(cv.get_feature_names(), pos_class_prob_sorted[-10:]))

These are the 10 most predictive features for a pro-Hong Kong tweet: ['support' 'standwithhk' 'freedom' 'china' 'nba' 'polic' 'fightforfreedom'
 'freehongkong' 'chinazi' 'standwithhongkong']
These are the 10 most predictive features for a pro-China tweet: ['u5f92' 'china' 'u66b4' 'rioter' 'hkrioter' 'u9999' 'riot' 'u6e2f'
 'protest' 'polic']


In [10]:
naive_bayes.predict_proba(cv.transform(df['tweet']))

array([[9.99999995e-01, 5.17663849e-09],
       [8.11475382e-01, 1.88524618e-01],
       [9.99937675e-01, 6.23250365e-05],
       ...,
       [1.27444010e-07, 9.99999873e-01],
       [3.53348010e-14, 1.00000000e+00],
       [2.05177027e-12, 1.00000000e+00]])

### Naive Bayes into Master File

In [48]:
master = pd.read_csv(path + "2. HK Training/polarities/master_new.csv")

In [49]:
master.rename(columns = {'polarity':'CNNpolarity'}, 
              inplace = True) 

In [53]:
master['NBpolarity'] = naive_bayes.predict_proba(cv.transform(master['tweet']))[:,1]

In [63]:
# decode ASCII from JSON 
chinese = r" \u9999\u6e2f\u66b4\u4e71\u5c31\u662f\u4f60\u5011\u9019\u7a2e\u4eba\u641e\u4e82\u7684\uff0c\u81ea\u5df1\u5c31\u662f\u500b\u8822"
chinese.encode().decode('unicode-escape')

' 香港暴乱就是你們這種人搞亂的，自己就是個蠢'

#### Pro-HK

In [57]:
pd.options.display.max_colwidth = 200
master[master['NBpolarity'] <= .01].sample(10)[['tweet', 'CNNpolarity', 'NBpolarity']]

Unnamed: 0,tweet,CNNpolarity,NBpolarity
68281,Than you for your support and share the Real to the world \U0001f30e.\n\nAdd oil\nFighting!!\nPound of you all\u2764\nWe are\u2026 https://t.co/zKKnAomdKE,0.998296,0.006386035
355455,Why enterprises kneel down to #chinazi ? Is this restriction necessary? \U0001f1f9\U0001f1fc \U0001f1f9\U0001f1fc \U0001f1f9\U0001f1fc \U0001f1f9\U0001f1fc \U0001f1f9\U0001f1fc \n\,7.8e-05,4.572441e-16
272175,Thank you USA #standwithhongkong #FreedomOfSpeech #HongKong #AntiELAB #antitotalitarianism https://t.co/qbn7ZNQhiO,2e-06,1.072024e-10
316859,@SenRickScott @AmericaNewsroom Thank you for your strongest commitment to #StandWithHongKong. Please help us from h\u2026 https://t.co/i8N5xmQT6M,2.5e-05,1.809178e-06
205359,What\u2019s happening in Canada? \nCredit to #lihkg \n#chinazi #Canada https://t.co/WW8WogpkdS,0.000925,2.171729e-06
296066,@GovMikeHuckabee Stand with NBA #StandWithHongKong,1.8e-05,8.740954e-06
177113,@NBA Let\u2019s talk about Morey before the game started.\n #StandWithHongKong #FightForFreedom,7e-06,2.601366e-08
349307,@Blizzard_Ent BOOOO!!! Blizzard support Chinazi^s. \n\nhttps://t.co/jiHAkRE9VU,0.001321,1.435742e-05
304557,"The Chinese people support 911, support terrorism. They clearly do not understand what freedom really means.\u2026 https://t.co/0lRuFxu12o",0.004273,0.001377597
328773,#FreeHongKong #FreedomOfSpeech https://t.co/KZJLcQtpIR,5.9e-05,2.547101e-06


#### Pro-China

In [62]:
master[master['NBpolarity'] >= .99].sample(10)[['tweet', 'CNNpolarity', 'NBpolarity']]

Unnamed: 0,tweet,CNNpolarity,NBpolarity
243778,@zifeiyu_1003 @PXL2s6pR3sxpetC \u5982\u679c\u662f\u771f\u8bb0\u8005\uff0c \u8bf7\u4ed6\u4eec\u79c9\u6301\u5ba2\u89c2\u516c\u6b63\u7684\u8bb0\u5f55\u539f\u5219\uff0c\u5411\u4e16\u75,0.953211,1.0
168804,\u90ed\u5a92\u4f53 https://t.co/QUA6wrtYHy https://t.co/rPFmYzHbQn\n10\u67081\u65e5\uff0c\u9999\u6e2f\u9ed1\u8b66\u5728\u5168\u4e16\u754c\u7684\u5a92\u4f53\u955c\u5934\u524d\u5f00\,0.689242,1.0
281668,\u542c\u8bf4\u53d1\u63a8\u6587\u80fd\u5feb\u70b9\u5347\u7ea7\u7684\uff0c\u521a\u521a\u5f00\u59cb\u7528\u63a8\u7279\uff0c\u770b\u5230\u8fd9\u4e48\u591a\u624b\u8db3\u5173\u6ce8\u91cf,0.689242,1.0
134737,RT @shouseikan: \u6628\u65e5\u306f\u4eac\u90fd\u306eNHK\u6587\u5316\u30bb\u30f3\u30bf\u30fc\u3067\u8b1b\u5ea7\u3002\u4eca\u56de\u3001\u5927\u5909\u5370\u8c61\u6df1\u304b\u3063\u305,0.869428,0.999761
131822,@amiactuallyok @hoccgoomusic @lihkg_forum \u5514\u901a\u5514\u7cfb\uff1f\u91cc\u4e00\u67aa\u5f00\u5f97\u592a\u8fdf\u5566\uff0c\u4e0d\u7136\u7684\u8bdd\u4f60\u73ed\u6050\u6016\u5206,0.902536,0.999999
392748,@octopus663 \u6bd2\u6587\u5ba3\uff01\n\u66b4\u5f92\u884c\u4e3a\uff0c\u5929\u7406\u96be\u5bb9\u3002\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\x,0.689242,1.0
379362,@LamCK15 @ZhouHua94189188 @bbcchinese \u9999\u6e2f\u5bf9\u4e2d\u56fd\u5d1b\u8d77\u6709\u5de8\u5927\u7684\u8d21\u732e\uff0c\u90fd\u627f\u8ba4\u3002\u4f46\u4e0d\u53ea\u662f\u9999\u6e,0.985986,0.99999
237323,@shhcaa \u9999\u6e2f\u7684\u66b4\u5f92\u4eec\u6253\u7740\u6c11\u4e3b\u81ea\u7531\u7684\u53e3\u53f7\uff0c\u5e72\u7740\u732a\u72d7\u4e0d\u5982\u7684\u52fe\u5f53\uff0c\u4e0e\u75af\u5b,0.91102,1.0
244777,@LifetimeUSCN \u9999\u6e2f\u66b4\u4e71\u5c31\u662f\u4f60\u5011\u9019\u7a2e\u4eba\u641e\u4e82\u7684\uff0c\u81ea\u5df1\u5c31\u662f\u500b\u8822\u6750,0.51668,1.0
371970,@MarineMcgregor5 \u66b4\u529b\u8fd8\u662f\u4e0d\u65ad\u5347\u7ea7\uff0c\u66b4\u5f92\u4eec\u5df2\u7ecf\u4e27\u5fc3\u75c5\u72c2\uff0c\u5373\u4f7f\u662f\u9999\u6e2f\u5927\u591a\u6570\,0.689242,1.0


#### In Between

In [65]:
master[(master['NBpolarity'] >= .495) & (master['NBpolarity'] <= .505)].sample(10)[['tweet', 'CNNpolarity', 'NBpolarity']]

Unnamed: 0,tweet,CNNpolarity,NBpolarity
95590,#HongKong\nhttps://t.co/jW6FVtSYPN,0.689242,0.503139
95809,Politiet i Hongkong efter demonstrationer.,0.78167,0.503139
75050,RT @joshuawongcf: [For Our Home and Our Homeland - My announcement on 2019 District Council Election] \n\n1. Democracy starts with us standin\u2026,0.875221,0.50138
77776,@TotallyAnders Hongkong?,0.968974,0.503139
100978,"Centinaia di studenti davanti alla scuola dello studente colpito dalla polizia a #HongKong, @guardian\u2026 https://t.co/5Cp1VxFL7G",0.779777,0.496392
91698,@globaltimesnews Does the Hongkong^s people knows about the terrorism?,0.953522,0.497549
98550,"#FreeCatalanPoliticalPrisoners \u2705131lights\u2764in #Catalonia to say:""It allows us to maintain the dignity that unites th\u2026 https://t.co/VyIe6z53Ri",0.999445,0.495825
180379,The disguised #hkpolice threatening and pushing reporters. Why? Because freedom of press is prohibited in\u2026 https://t.co/l849vP7LMF,0.097804,0.501659
99762,"@JorisVDBroucke Politiek in t algemeen is een degoutante farce geworden : van links tot rechts ... ""La naus\xe9e"" zoal\u2026 https://t.co/Go01sN87c9",0.999299,0.501289
100592,\u9999\u6e2f\u304b\u3089\u3010\u62e1\u6563\u5e0c\u671b\u3011\uff01\uff01\U0001f64f\n\nhttps://t.co/3ygCEvYMzj\n\n#HongKong #\u9999\u6e2f #\u30c7\u30e2 #\u5409\u91ce\u5bb6 #\u5143\u,0.657302,0.504642


In [1]:
master.to_csv('master_new.csv')

NameError: name 'master' is not defined