In [40]:
import pandas as pd
import re
import numpy as np
import nltk as nl
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

## Import Data

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

In [3]:
fake['label'] = 1
true['label'] = 0

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
df = pd.concat([fake,true],ignore_index=True)

In [6]:
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [7]:
df.shape

(44898, 5)

## First Analysis

In [8]:
fake.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [9]:
true.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [10]:
df.drop('subject', axis = 'columns',inplace=True) 
df.drop('date', axis = 'columns',inplace=True) 

In [11]:
# Modificare? Troppo simile indiano
df['text'] = df['title'] + ' ' + df['text'] 

In [12]:
df.drop('title', axis = 'columns',inplace=True) 

In [13]:
df.head()

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


# Preprocessing

In [19]:
#saving dict function
def save_dict(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [15]:
def preprocess2(data):
    '''
    tagged = pos_tag(data.split())
    names = [t[0] for t in tagged if t[1] == 'NNP']
    print(names)
    '''
    x = re.sub('[^a-zA-Z]', ' ',data) 
    
    #lowering words
    lower=str.lower(x).split() 
    words=set(stopwords.words('english'))
    
    #removing stopwords
    no_stopwords=[w for w in lower if not w in words]  
    lmtzr = WordNetLemmatizer()
    
    #stemming
    cleaned=[lmtzr.lemmatize(w,pos="v") for w in no_stopwords]
    cleaned=[lmtzr.lemmatize(w,pos="n") for w in cleaned]
    
    return (" ".join( cleaned ))

In [67]:
#to run only the first time
df['text'] = df.text.apply(lambda x: preprocess2(x))

In [113]:
df = pd.read_csv('preprocessed.csv')  #to read 

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,donald trump send embarrass new year eve messa...,1
1,1,drink brag trump staffer start russian collusi...,1
2,2,sheriff david clarke become internet joke thre...,1
3,3,trump ob even obama name cod website image chr...,1
4,4,pope francis call donald trump christmas speec...,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  44898 non-null  int64 
 1   text        44898 non-null  object
 2   label       44898 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [20]:
def create_vocabulary(df):

    #the Dict we wanna build
    vocabulary = defaultdict()

    #set in which i collect all the terms
    term_set = set()
    text = list(df['text'])
    for elem in text:
        try:
            term_set =term_set.union(set(elem.split()))
        except:
            pass

    #convert the set in list to enumerate
    term_list = list(term_set)

    for i, elem in enumerate(term_list):
        vocabulary[elem]= i 

    save_dict(vocabulary,'vocabulary')

In [None]:
create_vocabulary(df)   #to create

In [21]:
vocabulary = load_obj('vocabulary')  #to read

In [22]:
len(vocabulary)

98132

#### USED ONLY THE FIRST TIME TO CREATE IT

In [23]:
def mapping(text):
    text = text.split()
    out = [vocabulary[word] for word in text]
    return out

In [11]:
df['mapped_text'] = df.text.apply(lambda x: mapping(x))

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label,mapped_text
0,0,donald trump send embarrass new year eve messa...,1,"[70224, 48026, 71767, 29317, 7203, 7759, 73615..."
1,1,drink brag trump staffer start russian collusi...,1,"[9047, 46709, 48026, 46078, 34043, 20147, 2169..."
2,2,sheriff david clarke become internet joke thre...,1,"[30383, 91062, 77676, 23464, 66433, 8578, 8769..."
3,3,trump ob even obama name cod website image chr...,1,"[48026, 82760, 94341, 64354, 72398, 5206, 6520..."
4,4,pope francis call donald trump christmas speec...,1,"[26459, 24198, 73700, 70224, 48026, 62043, 834..."


In [13]:
df.to_csv('mapped_dataset.csv')

##### TO USE EVERYTIME WE WANT TO LOAD IT BACK

In [24]:
df = pd.read_csv('mapped_dataset.csv')  #to read

#to maintain the list format
df['mapped_text'] = df['mapped_text'].map(ast.literal_eval)

In [43]:
type(df['mapped_text'][0])

list

In [42]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,label,mapped_text
0,0,0,donald trump send embarrass new year eve messa...,1,"[70224, 48026, 71767, 29317, 7203, 7759, 73615..."
1,1,1,drink brag trump staffer start russian collusi...,1,"[9047, 46709, 48026, 46078, 34043, 20147, 2169..."
2,2,2,sheriff david clarke become internet joke thre...,1,"[30383, 91062, 77676, 23464, 66433, 8578, 8769..."
3,3,3,trump ob even obama name cod website image chr...,1,"[48026, 82760, 94341, 64354, 72398, 5206, 6520..."
4,4,4,pope francis call donald trump christmas speec...,1,"[26459, 24198, 73700, 70224, 48026, 62043, 834..."


# Parameters

In [14]:
train = df.sample(n = math.floor(0.8*len(df)))
train.shape

(35918, 4)

In [15]:
test = df.drop(train.index)

In [16]:
test.shape

(8980, 4)

In [None]:
train.to_csv('train.csv')
test.to_csv('test.csv')

In [None]:
train.head()

In [31]:
train.reset_index(inplace=True)

In [20]:
train = train.rename(columns = {'index':'id'})

In [105]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)
test.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)

In [106]:
#to maintain the list format
train['mapped_text'] = train['mapped_text'].map(ast.literal_eval)
#to maintain the list format
test['mapped_text'] = test['mapped_text'].map(ast.literal_eval)

In [109]:
print(type(train['mapped_text'][0]))
print(type(test['mapped_text'][0]))

<class 'list'>
<class 'list'>


In [108]:
train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,text,label,mapped_text
0,7882,7882,7882,mitt romney go donald trump exactly like mitt ...,1,"[12437, 50296, 19730, 70224, 48026, 38803, 547..."
1,20642,20642,20642,yikes year hillary lie drive well know democra...,1,"[40611, 7759, 45026, 52731, 3062, 16735, 94715..."
2,2311,2311,2311,internet hero make trump putin facebook friend...,1,"[66433, 57110, 43504, 48026, 46185, 48247, 189..."
3,36117,36117,36117,palestinian snub penny visit jerusalem move ca...,0,"[83741, 63169, 33551, 24177, 53492, 63335, 378..."
4,2923,2923,2923,donald trump even steal cake design obama inau...,1,"[70224, 48026, 94341, 81545, 80357, 90402, 643..."
...,...,...,...,...,...,...
35913,12675,12675,12675,democrat operative catch plan bully woman trum...,1,"[51156, 52832, 71488, 41152, 58880, 86096, 480..."
35914,21882,21882,21882,detroit squat squatter take turn want miss,1,"[27965, 18503, 4421, 53845, 9346, 70384, 77819]"
35915,20869,20869,20869,oops donald trump name miss ballot florida oh ...,1,"[46007, 70224, 48026, 72398, 77819, 79811, 802..."
35916,8302,8302,8302,jeb bush bring mom rally crack awkward joke ab...,1,"[5051, 69505, 46784, 55703, 39905, 33789, 7671..."


### Separation of the train dataset into fake and real articles

In [52]:
train_0 = train.loc[train['label']==0]
train_1 = train.loc[train['label']==1]

In [53]:
train_0.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)
train_1.drop({'Unnamed: 0.1','Unnamed: 0.1.1'},axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [54]:
train_1

Unnamed: 0.1,index,Unnamed: 0,text,label,mapped_text
0,0,7882,mitt romney go donald trump exactly like mitt ...,1,"[12437, 50296, 19730, 70224, 48026, 38803, 547..."
1,1,20642,yikes year hillary lie drive well know democra...,1,"[40611, 7759, 45026, 52731, 3062, 16735, 94715..."
2,2,2311,internet hero make trump putin facebook friend...,1,"[66433, 57110, 43504, 48026, 46185, 48247, 189..."
4,4,2923,donald trump even steal cake design obama inau...,1,"[70224, 48026, 94341, 81545, 80357, 90402, 643..."
5,5,2759,break trump devastate drop anti lgbt executive...,1,"[32829, 48026, 51480, 60088, 16361, 95392, 261..."
...,...,...,...,...,...
35913,35913,12675,democrat operative catch plan bully woman trum...,1,"[51156, 52832, 71488, 41152, 58880, 86096, 480..."
35914,35914,21882,detroit squat squatter take turn want miss,1,"[27965, 18503, 4421, 53845, 9346, 70384, 77819]"
35915,35915,20869,oops donald trump name miss ballot florida oh ...,1,"[46007, 70224, 48026, 72398, 77819, 79811, 802..."
35916,35916,8302,jeb bush bring mom rally crack awkward joke ab...,1,"[5051, 69505, 46784, 55703, 39905, 33789, 7671..."


## Naïve Bayes: Multinomial Event Model

In [112]:
phi_y = train['label'].sum()/len(train)
phi_y

0.5224400022272955

In [76]:
from tqdm import tqdm
tqdm.pandas()

d = defaultdict(list)

def add_value(txt,index,label,d):
    for word in set(txt):
        counts = txt.count(word)
        d[word]+=[(index,counts,label)]

train.progress_apply(lambda x: add_value(x['mapped_text'],x['Unnamed: 0'],x['label'],d),axis = 1)

100%|██████████| 35918/35918 [00:30<00:00, 1169.92it/s]


0        None
1        None
2        None
3        None
4        None
         ... 
35913    None
35914    None
35915    None
35916    None
35917    None
Length: 35918, dtype: object

In [77]:
len(d)  ## penso sia minore perchè stiamo applicando la funzione a TRAIN mentre il vocabolario è stato costruito su DF 

89439

In [78]:
len(vocabulary)

98132

In [83]:
save_dict(d,'word_index')

In [84]:
word_index = load_obj('word_index')

In [102]:
#0 real 1 fake
d = len(vocabulary)
n = len(word_index)

phi_0 = np.ones(n) 
phi_1 = np.ones(n)

d_0 = sum([len(i) for i in train_0.mapped_text])
d_1 = sum([len(i) for i in train_1.mapped_text])

for word,value in word_index.items():
    print(word)
    print(value)
    break

36869
[(7882, 2, 1), (17462, 1, 1), (43594, 1, 0), (22344, 1, 1), (38069, 1, 0), (42484, 4, 0), (44047, 1, 0), (39144, 2, 0), (28310, 1, 0), (23858, 3, 0), (27631, 2, 0), (42669, 1, 0), (23481, 2, 0), (23897, 2, 0), (42207, 1, 0), (40889, 1, 0), (15783, 2, 1), (42238, 4, 0), (24509, 1, 0), (33963, 1, 0), (44674, 1, 0), (38355, 1, 0), (34914, 3, 0), (15765, 1, 1), (40398, 2, 0), (34124, 2, 0), (32993, 1, 0), (34601, 1, 0), (42436, 1, 0), (37504, 2, 0), (5823, 1, 1), (22458, 1, 1), (9109, 1, 1), (15589, 1, 1), (1062, 1, 1), (21537, 1, 1), (19743, 2, 1), (43428, 3, 0), (23943, 2, 0), (29674, 2, 0), (44724, 1, 0), (37586, 1, 0), (23962, 1, 0), (17518, 2, 1), (29577, 1, 0), (25624, 1, 0), (25208, 2, 0), (11894, 1, 1), (23509, 1, 0), (37102, 1, 0), (33070, 2, 0), (41425, 1, 0), (25139, 3, 0), (37253, 3, 0), (786, 1, 1), (44718, 1, 0), (7614, 1, 1), (25071, 1, 0), (24051, 1, 0), (28795, 1, 0), (42380, 3, 0), (43427, 1, 0), (41277, 1, 0), (18768, 1, 1), (25074, 1, 0), (34667, 1, 0), (36637, 1,