In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
#Import Censored Data
censored_df=pd.read_json('../ExtractedData/input_withheldtweets.json', lines=True)

In [3]:
#import mostly not censored data
uncensored_df=pd.read_json('../Data/Mostly_Not_Censored.json', lines=True)

In [4]:
#merge and shufflem
df = pd.concat([censored_df, uncensored_df], ignore_index=True, sort=False)
df = shuffle(df)

In [5]:
df.withheld_in_countries

1022    [DE, FR]
3931         NaN
2452         NaN
3610         NaN
1708        [DE]
          ...   
2488         NaN
3927         NaN
1566        [TR]
2006        [DE]
3008         NaN
Name: withheld_in_countries, Length: 4443, dtype: object

In [6]:
print("censored", df["withheld_in_countries"].notnull().sum())
print("uncensored", df["withheld_in_countries"].isnull().sum())

censored 2062
uncensored 2381


In [7]:
df.loc[df["withheld_in_countries"].notnull(),"withheld_in_countries",]=0
df.loc[df["withheld_in_countries"].isnull(),"withheld_in_countries",]=1

In [8]:
df_x=df["text"]
df_y=df["withheld_in_countries"]
df_x

1022         RT @offensivemem3s_: https://t.co/JUp3Ht4gYJ
3931    RT @deep_dab: 😂🤣🤣🤣 FACTS! https://t.co/9B41EjzDkR
2452    Seeing yt ppl in chargers really do irritate t...
3610    @luvsse @seungeunista qn sos  Dale el user LE ...
1708    @CarmenLuvanaXXX USS   EQUAL  FREE  OR  NOT  G...
                              ...                        
2488    RT @GISELE_MULUMBA: 10k\n♥️♥️♥️20k\n❤❤❤❤30k\n🌺...
3927    RT @plodaek: ‼️‼️‼️‼️‼️ EMERGENCY ALERT ‼️‼️‼️...
1566    RT @osimsek_herkul: Allah, sizlere sabır-ı cem...
2006    RT @dawg_lb: "Support and Vote to re-elect Gov...
3008    @besiktas_sedat @muhaciroglu81 Satın alma opsi...
Name: text, Length: 4443, dtype: object

In [9]:
cv = CountVectorizer()

In [10]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [11]:
x_train.head()

3275                      @MilaniMitchell Beautiful photo
4351    RT @LivrosQueMudam: Parece eu ignorando boa pa...
992     RT @dawg_lb: Maxine, who are you to determine ...
1533    RT @KurdisCat: "Abans que l'estat turc va enva...
2954              @Duraraken مافات الاوان عندك وقت تحذفين
Name: text, dtype: object

In [12]:
cv = CountVectorizer()

In [13]:
x_traincv=cv.fit_transform(x_train)
x_train.iloc[0]



'@MilaniMitchell Beautiful photo'

In [14]:
x_testcv=cv.transform(x_test)

In [15]:
x_testcv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
mnb = MultinomialNB()

In [17]:

y_train=y_train.astype('int')
y_train

3275    1
4351    1
992     0
1533    0
2954    1
       ..
236     0
1895    0
2075    0
3171    1
340     0
Name: withheld_in_countries, Length: 3554, dtype: int32

In [18]:
mnb.fit(x_traincv,y_train)

In [19]:
testmessage=x_test.iloc[0]
testmessage

'@cryforthepinks @BLACKPINK 1x687\nLISA IS COMING\nI vote  #MTVMIAWFANDOMBLINK + #MTVMIAWHITLOVESICKGIRLS at #PremiosMTVMIAW (@BLACKPINK)'

In [20]:
predictions=mnb.predict(x_testcv)
predictions

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,

In [21]:
a=np.array(y_test)
a

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,

In [22]:
count=0
for i in range (len(predictions)):
    if predictions[i]==a[i]:
        count=count+1
count

768

In [23]:
len(predictions)

889

In [24]:
count/predictions

  count/predictions


array([768.,  inf,  inf, 768.,  inf,  inf,  inf,  inf,  inf, 768.,  inf,
        inf, 768.,  inf,  inf,  inf, 768.,  inf,  inf,  inf,  inf,  inf,
        inf,  inf,  inf,  inf, 768.,  inf, 768.,  inf, 768.,  inf,  inf,
        inf,  inf, 768.,  inf, 768., 768., 768.,  inf,  inf,  inf,  inf,
       768.,  inf, 768., 768.,  inf, 768., 768.,  inf, 768.,  inf, 768.,
       768.,  inf,  inf,  inf, 768., 768.,  inf, 768.,  inf,  inf, 768.,
       768.,  inf,  inf,  inf, 768., 768., 768.,  inf, 768.,  inf,  inf,
        inf, 768.,  inf, 768.,  inf,  inf, 768., 768., 768.,  inf, 768.,
       768., 768.,  inf, 768.,  inf,  inf,  inf, 768.,  inf,  inf, 768.,
        inf,  inf,  inf, 768., 768., 768., 768., 768.,  inf,  inf,  inf,
        inf,  inf,  inf,  inf,  inf, 768., 768.,  inf, 768., 768.,  inf,
       768., 768., 768.,  inf,  inf,  inf,  inf, 768., 768.,  inf, 768.,
        inf,  inf,  inf, 768., 768., 768.,  inf,  inf, 768.,  inf,  inf,
        inf,  inf, 768.,  inf, 768.,  inf,  inf,  i

In [25]:
predictions=mnb.predict(["The scouts are playgrounds for priests"])

ValueError: Expected 2D array, got scalar array instead:
array=The scouts are playgrounds for priests.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.