In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

In [2]:
#saving dict function
def save_dict(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : any picklable object
        The object to persist.
    name : str
        Path of the target file without the ``.pkl`` extension.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. Returns ``None``.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Highest protocol = most compact/fastest format this interpreter supports
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name):
    """Return the object pickled in the file ``name + '.pkl'``.

    Parameters
    ----------
    name : str
        Path of the pickle file without the ``.pkl`` extension.

    Notes
    -----
    Unpickling can execute arbitrary code; only use this on files produced
    by a trusted source (e.g. ``save_dict`` in this notebook).
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

## Naïve Bayes Multinomial Event Model

In [11]:
# Load the training set; each row carries the article text, its label and the
# id-encoded token list in `mapped_text`.
train = pd.read_csv('mapped_dataset.csv')

# The CSV stores every mapped_text list as its string repr; parse it back
# into a real Python list so len() and iteration work on tokens, not characters.
train['mapped_text'] = train['mapped_text'].apply(ast.literal_eval)

# Remove the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier to_csv call -- TODO confirm).
train = train.drop(columns=['Unnamed: 0'])

In [12]:
# Preview the first rows of the training frame after the load/parse/drop cell above
train.head()

Unnamed: 0,id,text,label,mapped_text
0,1943,bombshell senate intel committee hear reveal f...,1,"[26288, 86478, 26916, 21374, 71869, 21939, 822..."
1,2557,terrorist group openly laud trump idiot brag s...,1,"[71953, 4753, 17513, 86987, 21812, 25843, 5777..."
2,11200,senator dick durbin need civics lesson susan r...,1,"[87650, 32198, 51043, 80029, 29773, 6081, 6144..."
3,21306,white student union form facebook page organiz...,1,"[33214, 7240, 56509, 48542, 6360, 28184, 37945..."
4,30432,factbox contender pick key job trump administr...,0,"[38968, 47489, 40571, 31586, 72887, 21812, 568..."


In [13]:
# Split the training rows by class; the likelihood cell below notes the
# convention 0 = real, 1 = fake.
train_0 = train[train['label'].eq(0)]
train_1 = train[train['label'].eq(1)]

In [50]:
# Unpickle 'word_index.pkl'. The likelihood cell below iterates it as
# {word id: iterable of tuples}, reading tup[1] as a count and tup[2] as the class label.
word_index = load_obj('word_index')

In [51]:
# Number of entries in word_index; the likelihood cell uses this value as n
len(word_index)

89369

In [52]:
# Class priors estimated from the training labels:
# phi_y_1 = fraction of rows with label 1, phi_y_0 = the complement.
phi_y_1 = train['label'].sum() / train.shape[0]
phi_y_0 = 1.0 - phi_y_1

In [9]:
# Label convention: 0 = real, 1 = fake
# Multinomial event model with add-one (Laplace) smoothing:
#   phi_k[w] = (1 + count of word w in class k) / (d_k + n)
n = len(word_index)

# d_k: total number of tokens across all training documents of class k
d_0 = sum(len(doc) for doc in train_0.mapped_text)
d_1 = sum(len(doc) for doc in train_1.mapped_text)

# Start every word at 1 so that words unseen in a class keep a non-zero probability
phi_0 = np.ones(n)
phi_1 = np.ones(n)

for word_id, postings in word_index.items():
    for posting in postings:
        # posting[1] is added as the occurrence count, posting[2] selects the class
        bucket = phi_0 if posting[2] == 0 else phi_1
        bucket[word_id] += posting[1]

# Normalise the smoothed counts into per-class word probabilities
phi_0 = phi_0 / (d_0 + n)
phi_1 = phi_1 / (d_1 + n)

In [75]:
# Total token counts of the label-0 and label-1 training documents
d_0, d_1

(28062994, 32321536)

In [18]:
# Smoothed word probabilities for class 1 (fake), first 25 word ids
phi_1[0:25]

array([2.15976691e-07, 1.54269065e-07, 6.17076259e-08, 6.17076259e-08,
       8.94760575e-07, 9.25614388e-08, 3.08538129e-08, 6.17076259e-08,
       9.25614388e-08, 3.08538129e-08, 6.17076259e-08, 6.17076259e-08,
       3.08538129e-08, 1.51183683e-06, 6.17076259e-08, 1.85122878e-07,
       3.08538129e-08, 6.17076259e-08, 2.15976691e-07, 9.25614388e-08,
       6.17076259e-08, 9.25614388e-08, 2.77684316e-07, 4.62807194e-07,
       3.08538129e-08])

In [68]:
# Smoothed word probabilities for class 0 (real), first 25 word ids
phi_0[0:25]

array([1.77604985e-07, 3.55209969e-08, 3.55209969e-08, 3.55209969e-08,
       1.88261284e-06, 3.55209969e-08, 7.10419939e-08, 3.55209969e-08,
       3.55209969e-08, 1.42083988e-07, 3.55209969e-08, 3.55209969e-08,
       7.10419939e-08, 9.59066917e-07, 3.55209969e-08, 4.61772960e-07,
       7.10419939e-08, 7.10419939e-08, 1.42083988e-07, 3.55209969e-08,
       3.55209969e-08, 3.55209969e-08, 3.55209969e-08, 3.19688972e-07,
       7.10419939e-08])

## Testing

In [26]:
# Held-out articles to classify
test = pd.read_csv('test.csv')
vocabulary = load_obj('vocabulary')  # token -> id lookup consulted by mapping_2, unpickled from 'vocabulary.pkl'

In [20]:
# Preview the raw test frame as read from test.csv
test.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,3,trump ob even obama name cod website image chr...,1
1,12,bad news trump mitch mcconnell say repeal obam...,1
2,14,heiress disney empire know gop scammed u shred...,1
3,22,meghan mccain tweet amaze response doug jones ...,1
4,25,despicable trump suggest female senator would ...,1


In [32]:
def mapping_2(text):
    """Translate a whitespace-separated document into vocabulary ids.

    Parameters
    ----------
    text : str
        Pre-processed document. ``None`` or a float NaN (what ``pd.read_csv``
        produces for an empty cell) is treated as an empty document.

    Returns
    -------
    list
        One entry per token, in order: the value stored in the global
        ``vocabulary`` for known tokens, or the token string itself for
        tokens that are not in ``vocabulary``.
    """
    # An empty CSV cell comes back from pandas as NaN (a float), which has no
    # .split(); map it to "no tokens" instead of raising AttributeError.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []
    # dict.get with the token as its own default keeps unknown words as strings
    return [vocabulary.get(word, word) for word in text.split()]

In [33]:
# Encode every test document with mapping_2 (ids for known words, raw strings otherwise)
test['mapped_text'] = test['text'].apply(mapping_2)

In [34]:
# Preview the test frame again, now including the mapped_text column
test.head()

Unnamed: 0.1,Unnamed: 0,text,label,mapped_text
0,3,trump ob even obama name cod website image chr...,1,"[21812, 74606, 557, 86148, 17003, 54874, 6461,..."
1,12,bad news trump mitch mcconnell say repeal obam...,1,"[11037, 35297, 21812, 82237, 73835, 31195, 219..."
2,14,heiress disney empire know gop scammed u shred...,1,"[26797, 82428, 59899, 70102, 7190, 88749, 101,..."
3,22,meghan mccain tweet amaze response doug jones ...,1,"[62364, 3322, 39245, 41026, 37822, 19659, 7988..."
4,25,despicable trump suggest female senator would ...,1,"[49304, 21812, 35200, 61615, 87650, 15738, 983..."


In [72]:
def predict(mapped_text, verbose=False):
    """Classify one encoded document with the multinomial Naive Bayes model.

    Reads the notebook globals ``phi_y_0``/``phi_y_1`` (class priors),
    ``phi_0``/``phi_1`` (smoothed per-word probabilities indexed by word id),
    ``d_0``/``d_1`` (token totals per class) and ``n`` (vocabulary size).

    Parameters
    ----------
    mapped_text : iterable
        Tokens of the document: integer word ids for known words, and the
        raw ``str`` token for words that are not in the vocabulary.
    verbose : bool, default False
        When True, print the two class log-scores before returning. Off by
        default so that scoring a whole data set does not flood the output.

    Returns
    -------
    int
        ``0`` (real) if the class-0 log-score is strictly larger, else ``1``
        (fake); ties go to class 1.
    """
    score_0 = np.log(phi_y_0)
    score_1 = np.log(phi_y_1)

    # Loop-invariant: log-probability of a word with zero training count
    # under add-one smoothing, i.e. 1 / (d_k + n).
    # NOTE(review): because d_0 != d_1, every out-of-vocabulary token nudges
    # the score toward the class with fewer training tokens; skipping such
    # tokens is a common alternative -- confirm which behaviour is wanted.
    unseen_0 = np.log(1/(d_0+n))
    unseen_1 = np.log(1/(d_1+n))

    for word in mapped_text:
        if isinstance(word, str):
            # Out-of-vocabulary token (mapping_2 leaves these as strings)
            score_0 += unseen_0
            score_1 += unseen_1
        else:
            score_0 += np.log(phi_0[word])
            score_1 += np.log(phi_1[word])

    if verbose:
        print(score_0, score_1)

    return 0 if score_0 > score_1 else 1

In [62]:
# Score every test document; the loop variable is named `tokens` so it does
# not shadow the `test` DataFrame being assigned to.
mapped_text_list = test['mapped_text'].tolist()
test['predict'] = [predict(tokens) for tokens in mapped_text_list]

In [71]:
# Rows where the predicted class differs from the true label (misclassifications)
test.loc[test['label']!=test['predict']]

Unnamed: 0.1,Unnamed: 0,text,label,mapped_text,predict
129,708,mccain f cked senate republican trumpcare much...,1,"[3322, 77805, 28146, 86478, 27998, 43040, 7383...",0
145,782,republican viciously attack mccain give job cl...,1,"[27998, 42768, 80192, 3322, 81717, 72887, 4916...",0
201,1072,investigator u russia tie turn attention flynn...,1,"[9607, 101, 11112, 23911, 54307, 61083, 772, 6...",0
204,1083,new poll trump make others popular think detai...,1,"[32382, 29577, 21812, 78414, 24279, 87668, 783...",0
214,1129,top republican really blame obama gop baseball...,1,"[20700, 27998, 17583, 35464, 86148, 7190, 2082...",0
...,...,...,...,...,...
8974,44865,south korean practice case north korea attack ...,0,"[40525, 67443, 44592, 86428, 22388, 35899, 801...",1
8975,44871,pakistan reject role scapegoat u failure afgha...,0,"[84890, 82361, 3048, 34086, 101, 62768, 46675,...",1
8976,44879,blunt instrument list ban article say china ce...,0,"[8548, 71658, 60437, 6991, 71549, 31195, 54749...",1
8978,44884,u put pressure pakistan help afghan war washin...,0,"[101, 77842, 41422, 84890, 49149, 43606, 87480...",1
