In [62]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer


In [63]:
#Import Censored Data
censored_df=pd.read_json('../ExtractedData/input_withheldtweets.json', lines=True)
#import mostly not censored data
uncensored_df=pd.read_json('../Data/Mostly_Not_Censored.json', lines=True)

In [64]:
#merge and shufflem
df = pd.concat([censored_df, uncensored_df], ignore_index=True, sort=False)
df = shuffle(df)

In [65]:
df.withheld_in_countries

In [66]:
print("censored", df["withheld_in_countries"].notnull().sum())
print("uncensored", df["withheld_in_countries"].isnull().sum())

censored 4443
uncensored 0


In [67]:
df.loc[df["withheld_in_countries"].notnull(),"withheld_in_countries",]=0
df.loc[df["withheld_in_countries"].isnull(),"withheld_in_countries",]=1

In [68]:
df_x=df["text"]
df_y=df["withheld_in_countries"]
df_x

4103    RT @Pasnge_: Anjir Pacarnya Siapa Ini Nakal Ba...
2721    RT @souljaboy: Soulja Boy (Big Draco) - Rick &...
2455                  @ya_haaaaaa хммм красные найс точно
3062    RT @luis_zazano: ahora si. con mi hermana http...
2604    RT @Jamil52547185: احلى سهرة مع سحاقيتين نار و...
                              ...                        
3758    ==旧キ連南米人民軍設立記念日==\n\nイングソックに栄光であれ\n1810年4月11日 ...
762     Grum - First Contact https://t.co/5yUqLp9Uqn v...
1118    RT @HentaiTeengirl: Ohayou! 💙 #hentaicommunity...
1836    RT @ICafrinresist: "Iranian security forces ha...
988          RT @offensivemem3s_: https://t.co/QDA2xHnq85
Name: text, Length: 4443, dtype: object

In [69]:
cv = CountVectorizer()

In [70]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [71]:
x_train.head()

607     HAHAHSHAHAHA why is that you @abrmali https://...
3411    @Neelth1 @alibromi I made this for him too. ht...
3408    RT @whitepeonyfarm: ❤️🐾VICTORIA - #ADOPTED! We...
4072             El que anda arrancao’, anda de mal humor
1417    RT @Enes_Kanter: Hellllll Yeeeaaahhhh!!! 👏 htt...
Name: text, dtype: object

In [72]:
cv = CountVectorizer()

In [73]:
x_traincv=cv.fit_transform(x_train)
x_train.iloc[0]

'HAHAHSHAHAHA why is that you @abrmali https://t.co/lMJD0xMOYj'

In [74]:
x_testcv=cv.transform(x_test)
x_testcv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [75]:
mnb = MultinomialNB()

In [76]:

y_train=y_train.astype('int')
y_train

607     0
3411    0
3408    0
4072    0
1417    0
       ..
1560    0
1894    0
2318    0
19      0
1241    0
Name: withheld_in_countries, Length: 3554, dtype: int32

In [77]:
mnb.fit(x_traincv,y_train)

In [78]:
testmessage=x_test.iloc[0]
testmessage

'RT @DavidCanek1: Hay gente que debe entender que no son el centro de universo. Si te ignoran es porque no les interesas y ya. Si te ofendes…'

In [79]:
predictions=mnb.predict(x_testcv)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [80]:
a=np.array(y_test)
a

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [81]:
count=0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1
count

889

In [82]:
len(predictions)

889

In [83]:
count/predictions

  count/predictions


array([inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
       inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, in

In [84]:
testSentence = ["we need genocide"]
testcv=cv.transform(testSentence)
print(testcv)
predictions=mnb.predict_proba(testcv)
predictions

  (0, 4383)	1
  (0, 7466)	1
  (0, 11327)	1


array([[1.]])