In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

In [2]:
#saving dict function
def save_dict(obj, name ):
    """Pickle ``obj`` to the file ``<name>.pkl``.

    Parameters
    ----------
    obj : any picklable object
        The object to serialise.
    name : str
        Path without extension; ``'.pkl'`` is appended.

    The file is opened in binary write mode (an existing file is
    overwritten) and the highest pickle protocol available is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name ):
    """Return the object stored in the pickle file ``<name>.pkl``.

    Parameters
    ----------
    name : str
        Path without extension; ``'.pkl'`` is appended.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

## Naïve Bayes Multinomial Event Model

In [3]:
# Read the pre-mapped training set and discard the CSV's 'Unnamed: 0' column.
train = pd.read_csv('mapped_dataset.csv').drop(columns=['Unnamed: 0'])

# mapped_text is stored in the CSV as the string form of a list;
# literal_eval turns each cell back into a Python list.
train['mapped_text'] = train['mapped_text'].map(ast.literal_eval)

In [4]:
# Preview the first rows; mapped_text should now display as lists of word ids.
train.head()

Unnamed: 0,id,text,label,mapped_text
0,1943,bombshell senate intel committee hear reveal f...,1,"[26288, 86478, 26916, 21374, 71869, 21939, 822..."
1,2557,terrorist group openly laud trump idiot brag s...,1,"[71953, 4753, 17513, 86987, 21812, 25843, 5777..."
2,11200,senator dick durbin need civics lesson susan r...,1,"[87650, 32198, 51043, 80029, 29773, 6081, 6144..."
3,21306,white student union form facebook page organiz...,1,"[33214, 7240, 56509, 48542, 6360, 28184, 37945..."
4,30432,factbox contender pick key job trump administr...,0,"[38968, 47489, 40571, 31586, 72887, 21812, 568..."


In [6]:
# Split the training rows by class label so that per-class token totals
# can be computed later.
train_0 = train[train['label'] == 0]
train_1 = train[train['label'] == 1]

In [9]:
# Load the pickled mapping from 'word_index.pkl'. The training cell iterates it
# as {word id: sequence of tuples}, adding tuple element [1] to a per-class
# count and using element [2] == 0 to pick the class.
# NOTE(review): presumably built by an earlier preprocessing notebook — confirm
# the tuple layout and that the keys are integers in range(len(word_index)).
word_index = load_obj('word_index')

In [10]:
# Class priors: phi_y_1 is the sum of the label column divided by the number
# of training rows (i.e. the share of label-1 rows when labels are 0/1), and
# phi_y_0 is its complement.
# NOTE(review): the next cell recomputes these same two values; one copy is
# redundant.
phi_y_1 = train['label'].sum()/len(train)
phi_y_0 = 1 - phi_y_1

In [11]:
# --- Class priors ------------------------------------------------------------
# Share of training rows labelled 1, and its complement for label 0.
phi_y_1 = train['label'].sum() / len(train)
phi_y_0 = 1 - phi_y_1

# --- Per-class word likelihoods (add-one smoothing) --------------------------
# Number of entries in word_index, used as the vocabulary size.
n = len(word_index)

# Total number of tokens across each class's documents.
d_0 = sum(len(tokens) for tokens in train_0.mapped_text)
d_1 = sum(len(tokens) for tokens in train_1.mapped_text)

# Every word starts at 1 (the smoothing term); stored counts are added on top.
phi_0 = np.ones(n)
phi_1 = np.ones(n)

for token_id, postings in word_index.items():
    for posting in postings:
        # posting[1] is accumulated as a count; posting[2] == 0 sends it to
        # class 0, any other value sends it to class 1.
        class_counts = phi_0 if posting[2] == 0 else phi_1
        class_counts[token_id] += posting[1]

# Normalise: (count + 1) / (class token total + n).
phi_0 /= (d_0 + n)
phi_1 /= (d_1 + n)

In [11]:
# First 25 entries of phi_1, the smoothed per-word probabilities for label 1.
phi_1[0:25]

array([1.47340225e-06, 1.05243018e-06, 4.20972071e-07, 4.20972071e-07,
       6.10409503e-06, 6.31458106e-07, 2.10486035e-07, 4.20972071e-07,
       6.31458106e-07, 2.10486035e-07, 4.20972071e-07, 4.20972071e-07,
       2.10486035e-07, 1.03138157e-05, 4.20972071e-07, 1.26291621e-06,
       2.10486035e-07, 4.20972071e-07, 1.47340225e-06, 6.31458106e-07,
       4.20972071e-07, 6.31458106e-07, 1.89437432e-06, 3.15729053e-06,
       2.10486035e-07])

In [12]:
# First 25 entries of phi_0, the smoothed per-word probabilities for label 0.
phi_0[0:25]

array([1.19890938e-06, 2.39781875e-07, 2.39781875e-07, 2.39781875e-07,
       1.27084394e-05, 2.39781875e-07, 4.79563750e-07, 2.39781875e-07,
       2.39781875e-07, 9.59127501e-07, 2.39781875e-07, 2.39781875e-07,
       4.79563750e-07, 6.47411063e-06, 2.39781875e-07, 3.11716438e-06,
       4.79563750e-07, 4.79563750e-07, 9.59127501e-07, 2.39781875e-07,
       2.39781875e-07, 2.39781875e-07, 2.39781875e-07, 2.15803688e-06,
       4.79563750e-07])

## Testing

In [18]:
# Read the held-out set; the CSV's 'Unnamed: 0' column is kept under the
# name 'id'.
test = pd.read_csv('test.csv').rename(columns={'Unnamed: 0': 'id'})

In [19]:
# Preview the test set after renaming 'Unnamed: 0' to 'id'.
test.head()

Unnamed: 0,id,text,label
0,3,trump ob even obama name cod website image chr...,1
1,12,bad news trump mitch mcconnell say repeal obam...,1
2,14,heiress disney empire know gop scammed u shred...,1
3,22,meghan mccain tweet amaze response doug jones ...,1
4,25,despicable trump suggest female senator would ...,1


In [20]:
# Load the pickled mapping from 'vocabulary.pkl'; mapping_2 looks each word of
# a text up in it.
# NOTE(review): presumably {word: integer id} matching the ids used as
# word_index keys — confirm both pickles come from the same preprocessing run.
vocabulary = load_obj('vocabulary')

In [21]:
def mapping_2(text):
    """Translate a whitespace-separated string into vocabulary values.

    Each token that is a key of the module-level ``vocabulary`` mapping is
    replaced by its mapped value; a token that is not a key is kept as the
    original string.

    Parameters
    ----------
    text : str
        The text to split on whitespace.

    Returns
    -------
    list
        One entry per token, in order.
    """
    return [vocabulary.get(token, token) for token in text.split()]

In [22]:
# Map every test text to vocabulary ids; words missing from the vocabulary
# stay as strings.
test['mapped_text'] = test['text'].apply(mapping_2)

In [23]:
# Preview again to check the newly added mapped_text column.
test.head()

Unnamed: 0,id,text,label,mapped_text
0,3,trump ob even obama name cod website image chr...,1,"[21812, 74606, 557, 86148, 17003, 54874, 6461,..."
1,12,bad news trump mitch mcconnell say repeal obam...,1,"[11037, 35297, 21812, 82237, 73835, 31195, 219..."
2,14,heiress disney empire know gop scammed u shred...,1,"[26797, 82428, 59899, 70102, 7190, 88749, 101,..."
3,22,meghan mccain tweet amaze response doug jones ...,1,"[62364, 3322, 39245, 41026, 37822, 19659, 7988..."
4,25,despicable trump suggest female senator would ...,1,"[49304, 21812, 35200, 61615, 87650, 15738, 983..."


In [25]:
def predict(mapped_text):
    """Classify one document with the trained multinomial Naive Bayes model.

    Reads the module-level values ``phi_y_0``/``phi_y_1`` (class priors),
    ``phi_0``/``phi_1`` (per-word probabilities), ``d_0``/``d_1`` (per-class
    token totals) and ``n`` (vocabulary size).

    Parameters
    ----------
    mapped_text : iterable
        Tokens of one document. Non-string entries are used as indices into
        ``phi_0`` and ``phi_1``. String entries (words that were not mapped
        to an id) are scored with the smoothed probability of a word with
        zero count, ``1 / (d_c + n)``.

    Returns
    -------
    int
        0 if class 0 has the strictly larger log-posterior, otherwise 1
        (ties, including an empty document with equal priors, go to 1).
    """
    # Work in log space so long documents do not underflow to zero.
    log_post_0 = np.log(phi_y_0)
    log_post_1 = np.log(phi_y_1)

    for word in mapped_text:
        # isinstance (rather than an exact type comparison) also recognises
        # str subclasses such as numpy.str_, which would otherwise be used as
        # an array index and raise.
        if isinstance(word, str):
            log_post_0 += np.log(1/(d_0+n))
            log_post_1 += np.log(1/(d_1+n))
        else:
            log_post_0 += np.log(phi_0[word])
            log_post_1 += np.log(phi_1[word])

    return 0 if log_post_0 > log_post_1 else 1

In [26]:
# Classify every test document from its mapped tokens.
test['predict'] = test['mapped_text'].apply(predict)

In [27]:
# Rows where the prediction disagrees with the true label; the first element
# of the shape is the number of misclassified test documents.
test[test['label'].ne(test['predict'])].shape

(4105, 5)