In [1]:
import pandas as pd
import re
import numpy as np
import nltk as nl
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

## Import Data

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

In [3]:
fake['label'] = 1
true['label'] = 0

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
df = pd.concat([fake,true],ignore_index=True)

In [6]:
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [7]:
df.shape

(44898, 5)

## First Analysis

In [8]:
fake.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [9]:
true.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [10]:
df.drop('subject', axis = 'columns',inplace=True) 
df.drop('date', axis = 'columns',inplace=True) 

In [11]:
# Modificare? Troppo simile indiano
df['text'] = df['title'] + ' ' + df['text'] 

In [12]:
df.drop('title', axis = 'columns',inplace=True) 

In [13]:
df.head()

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


# Preprocessing

In [14]:
#saving dict function
def save_dict(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [15]:
def preprocess2(data):
    '''
    tagged = pos_tag(data.split())
    names = [t[0] for t in tagged if t[1] == 'NNP']
    print(names)
    '''
    x = re.sub('[^a-zA-Z]', ' ',data) 
    
    #lowering words
    lower=str.lower(x).split() 
    words=set(stopwords.words('english'))
    
    #removing stopwords
    no_stopwords=[w for w in lower if not w in words]  
    lmtzr = WordNetLemmatizer()
    
    #stemming
    cleaned=[lmtzr.lemmatize(w,pos="v") for w in no_stopwords]
    cleaned=[lmtzr.lemmatize(w,pos="n") for w in cleaned]
    
    return (" ".join( cleaned ))

In [67]:
#to run only the first time
df['text'] = df.text.apply(lambda x: preprocess2(x))

In [22]:
df = pd.read_csv('preprocessed.csv')  #to read 
df.drop({'Unnamed: 0'},axis=1,inplace=True)

In [23]:
df.head()

Unnamed: 0,text,label
0,donald trump send embarrass new year eve messa...,1
1,drink brag trump staffer start russian collusi...,1
2,sheriff david clarke become internet joke thre...,1
3,trump ob even obama name cod website image chr...,1
4,pope francis call donald trump christmas speec...,1


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   label   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


# Parameters

### Don't Run every time

In [25]:
train = df.sample(n = math.floor(0.8*len(df)))
train.shape

(35918, 2)

In [26]:
test = df.drop(train.index)

In [27]:
test.shape

(8980, 2)

In [28]:
train.to_csv('train.csv')
test.to_csv('test.csv')

In [29]:
train.head()

Unnamed: 0,text,label
30346,nobel laureate say trump policy may lead budge...,0
33007,bet trump win north korea h bomb prediction si...,0
16912,obama communist environmental arm tell kid red...,1
24392,house tax panel chair corporate tax cut may ta...,0
21239,british tv personality blame trump muslim ban ...,1


In [30]:
train.reset_index(inplace=True)

In [31]:
train = train.rename(columns = {'index':'id'})

In [32]:
train

Unnamed: 0,id,text,label
0,30346,nobel laureate say trump policy may lead budge...,0
1,33007,bet trump win north korea h bomb prediction si...,0
2,16912,obama communist environmental arm tell kid red...,1
3,24392,house tax panel chair corporate tax cut may ta...,0
4,21239,british tv personality blame trump muslim ban ...,1
...,...,...,...
35913,20566,sin socialism doctor pump air infant lung hand...,1
35914,13261,journalist sheryl attkisson famous dc bureau c...,1
35915,21193,syrian muslim man whose family perish trip cou...,1
35916,8860,fox news help domestic terrorist first year pr...,1


### Start from here

In [26]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)
test.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)

In [27]:
#to maintain the list format
train['mapped_text'] = train['mapped_text'].map(ast.literal_eval)
#to maintain the list format
test['mapped_text'] = test['mapped_text'].map(ast.literal_eval)

In [28]:
print(type(train['mapped_text'][0]))
print(type(test['mapped_text'][0]))

<class 'list'>
<class 'list'>


In [42]:
train

Unnamed: 0.1,Unnamed: 0,text,label,mapped_text
0,7882,mitt romney go donald trump exactly like mitt ...,1,"[12437, 50296, 19730, 70224, 48026, 38803, 547..."
1,20642,yikes year hillary lie drive well know democra...,1,"[40611, 7759, 45026, 52731, 3062, 16735, 94715..."
2,2311,internet hero make trump putin facebook friend...,1,"[66433, 57110, 43504, 48026, 46185, 48247, 189..."
3,36117,palestinian snub penny visit jerusalem move ca...,0,"[83741, 63169, 33551, 24177, 53492, 63335, 378..."
4,2923,donald trump even steal cake design obama inau...,1,"[70224, 48026, 94341, 81545, 80357, 90402, 643..."
...,...,...,...,...
35913,12675,democrat operative catch plan bully woman trum...,1,"[51156, 52832, 71488, 41152, 58880, 86096, 480..."
35914,21882,detroit squat squatter take turn want miss,1,"[27965, 18503, 4421, 53845, 9346, 70384, 77819]"
35915,20869,oops donald trump name miss ballot florida oh ...,1,"[46007, 70224, 48026, 72398, 77819, 79811, 802..."
35916,8302,jeb bush bring mom rally crack awkward joke ab...,1,"[5051, 69505, 46784, 55703, 39905, 33789, 7671..."


In [33]:
def create_vocabulary(df):

    #the Dict we wanna build
    vocabulary = defaultdict()

    #set in which i collect all the terms
    term_set = set()
    text = list(df['text'])
    for elem in text:
        try:
            term_set =term_set.union(set(elem.split()))
        except:
            pass

    #convert the set in list to enumerate
    term_list = list(term_set)

    for i, elem in enumerate(term_list):
        vocabulary[elem]= i 

    save_dict(vocabulary,'vocabulary')

In [34]:
create_vocabulary(train)   #to create

In [36]:
vocabulary = load_obj('vocabulary')  #to read

In [37]:
len(vocabulary)

89039

#### USED ONLY THE FIRST TIME TO CREATE IT

In [38]:
def mapping(text):
    text = text.split()
    out = [vocabulary[word] for word in text]
    return out

In [40]:
train['mapped_text'] = train.text.apply(lambda x: mapping(x))

In [42]:
train.head()

Unnamed: 0,id,text,label,mapped_text
0,30346,nobel laureate say trump policy may lead budge...,0,"[79910, 43802, 28875, 54182, 10144, 32671, 215..."
1,33007,bet trump win north korea h bomb prediction si...,0,"[5735, 54182, 28918, 25509, 88797, 43564, 6458..."
2,16912,obama communist environmental arm tell kid red...,1,"[59591, 9270, 413, 20026, 11421, 65944, 38687,..."
3,24392,house tax panel chair corporate tax cut may ta...,0,"[25988, 32797, 53901, 87028, 61368, 32797, 306..."
4,21239,british tv personality blame trump muslim ban ...,1,"[21131, 49545, 22005, 4321, 54182, 47472, 4043..."


In [43]:
train.to_csv('mapped_dataset.csv')

##### TO USE EVERYTIME WE WANT TO LOAD IT BACK

In [23]:
train = pd.read_csv('mapped_dataset.csv')  #to read

#to maintain the list format
train['mapped_text'] = train['mapped_text'].map(ast.literal_eval)

In [45]:
type(train['mapped_text'][0])

list

In [46]:
train.head()

Unnamed: 0,id,text,label,mapped_text
0,30346,nobel laureate say trump policy may lead budge...,0,"[79910, 43802, 28875, 54182, 10144, 32671, 215..."
1,33007,bet trump win north korea h bomb prediction si...,0,"[5735, 54182, 28918, 25509, 88797, 43564, 6458..."
2,16912,obama communist environmental arm tell kid red...,1,"[59591, 9270, 413, 20026, 11421, 65944, 38687,..."
3,24392,house tax panel chair corporate tax cut may ta...,0,"[25988, 32797, 53901, 87028, 61368, 32797, 306..."
4,21239,british tv personality blame trump muslim ban ...,1,"[21131, 49545, 22005, 4321, 54182, 47472, 4043..."


### Separation of the train dataset into fake and real articles

In [47]:
train_0 = train.loc[train['label']==0]
train_1 = train.loc[train['label']==1]

In [48]:
train_0

Unnamed: 0,id,text,label,mapped_text
0,30346,nobel laureate say trump policy may lead budge...,0,"[79910, 43802, 28875, 54182, 10144, 32671, 215..."
1,33007,bet trump win north korea h bomb prediction si...,0,"[5735, 54182, 28918, 25509, 88797, 43564, 6458..."
3,24392,house tax panel chair corporate tax cut may ta...,0,"[25988, 32797, 53901, 87028, 61368, 32797, 306..."
5,39181,jumblatt lebanon weak hariri resignation beiru...,0,"[88607, 10759, 81028, 86774, 23709, 20969, 226..."
6,34565,u government commit help flint michigan fix wa...,0,"[76408, 57784, 10530, 55211, 65517, 76655, 802..."
...,...,...,...,...
35904,33587,trump meet republican leadership party unity d...,0,"[54182, 86831, 84149, 86153, 63326, 43804, 346..."
35906,32127,democrat disarray eve convention nominate clin...,0,"[64917, 34908, 34022, 10526, 44390, 71, 64605,..."
35907,34823,syrian army iranian proxy demand surrender reb...,0,"[36400, 48591, 22806, 27227, 39720, 77391, 271..."
35911,26954,uk farage person interest trump russia investi...,0,"[85998, 11800, 61245, 48109, 54182, 63648, 308..."


In [49]:
train_1

Unnamed: 0,id,text,label,mapped_text
2,16912,obama communist environmental arm tell kid red...,1,"[59591, 9270, 413, 20026, 11421, 65944, 38687,..."
4,21239,british tv personality blame trump muslim ban ...,1,"[21131, 49545, 22005, 4321, 54182, 47472, 4043..."
7,21783,controversy christian flag engulf small town l...,1,"[22064, 40013, 35714, 3704, 62102, 69139, 5762..."
8,21882,detroit squat squatter take turn want miss,1,"[18658, 3137, 36563, 15503, 33146, 64872, 45446]"
9,12611,oops new evidence show hillary email lose mont...,1,"[74353, 5944, 54068, 19311, 20586, 6613, 36053..."
...,...,...,...,...
35913,20566,sin socialism doctor pump air infant lung hand...,1,"[68878, 51534, 23182, 17517, 39564, 70213, 287..."
35914,13261,journalist sheryl attkisson famous dc bureau c...,1,"[32767, 46932, 79745, 48378, 65636, 57805, 304..."
35915,21193,syrian muslim man whose family perish trip cou...,1,"[36400, 47472, 52381, 69218, 75453, 57523, 335..."
35916,8860,fox news help domestic terrorist first year pr...,1,"[60167, 66327, 55211, 68268, 4664, 59091, 7414..."


## Naïve Bayes: Multinomial Event Model

In [50]:
phi_y = train['label'].sum()/len(train)
phi_y

0.5217718135753661

In [52]:
from tqdm import tqdm
tqdm.pandas()

d = defaultdict(list)

def add_value(txt,index,label,d):
    for word in set(txt):
        counts = txt.count(word)
        d[word]+=[(index,counts,label)]

train.progress_apply(lambda x: add_value(x['mapped_text'],x['id'],x['label'],d),axis = 1)

100%|██████████| 35918/35918 [00:35<00:00, 1020.18it/s]


0        None
1        None
2        None
3        None
4        None
         ... 
35913    None
35914    None
35915    None
35916    None
35917    None
Length: 35918, dtype: object

In [53]:
len(d)  ## penso sia minore perchè stiamo applicando la funzione a TRAIN mentre il vocabolario è stato costruito su DF 

89039

In [55]:
save_dict(d,'word_index')

In [56]:
word_index = load_obj('word_index')

In [57]:
len(word_index)

89039

In [61]:
#0 real 1 fake
n = len(word_index)

phi_0 = np.ones(n) 
phi_1 = np.ones(n)

d_0 = sum([len(i) for i in train_0.mapped_text])
d_1 = sum([len(i) for i in train_1.mapped_text])

for word, value in word_index.items():
    for tup in value:
        if tup[2] == 0:
            phi_0[word] += tup[1]
        else:
            phi_1[word] += tup[1]
phi_0 = phi_0/(d_0+n)
phi_1 = phi_1/(d_1+n)

In [62]:
phi_1[0:100]

array([2.11025224e-07, 4.22050448e-07, 4.22050448e-07, 4.22050448e-07,
       2.11025224e-07, 6.33075672e-07, 1.26615134e-06, 2.11025224e-07,
       8.65203418e-06, 2.11025224e-07, 6.33075672e-07, 1.89922701e-06,
       2.11025224e-07, 2.11025224e-07, 4.22050448e-07, 4.22050448e-07,
       2.11025224e-07, 2.11025224e-07, 1.13320545e-04, 4.22050448e-07,
       1.56158666e-05, 8.44100895e-07, 1.28725387e-05, 4.22050448e-07,
       4.22050448e-07, 4.22050448e-07, 2.11025224e-07, 6.33075672e-07,
       4.22050448e-07, 1.54048413e-05, 4.22050448e-07, 2.11025224e-07,
       4.22050448e-07, 1.05512612e-06, 4.22050448e-07, 4.22050448e-07,
       6.33075672e-07, 1.05512612e-06, 6.33075672e-07, 1.89922701e-06,
       1.26615134e-06, 6.33075672e-07, 6.47847437e-05, 6.33075672e-07,
       1.05512612e-06, 2.11025224e-07, 2.11025224e-07, 2.11025224e-07,
       8.44100895e-07, 8.08226607e-05, 8.44100895e-07, 1.26615134e-06,
       4.22050448e-07, 2.11025224e-06, 2.53230269e-06, 2.11025224e-07,
      