# Fake News Detection 
-----

### Imports

In [3]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

## Data

In [4]:
fake = pd.read_csv('/Users/domenicomattiacinque/Documents/Università/FDS2020/Fake.csv')
true = pd.read_csv('/Users/domenicomattiacinque/Documents/Università/FDS2020/True.csv')

Adding labels and merging into a single dataframe.

In [5]:
fake['label'] = 1
true['label'] = 0

In [6]:
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [7]:
df = pd.concat([fake,true],ignore_index=True)

In [8]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [9]:
df.shape

(44898, 5)

## First Analysis

We dropped the column `subjects` because there is no total matching between fake and true articles so we concluded it would not be relevant for our purpuses. Also the article date will not be used since our approach is based on the text.

In [10]:
fake.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [11]:
true.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [12]:
df.drop('subject', axis = 'columns',inplace=True) 
df.drop('date', axis = 'columns',inplace=True) 

In [13]:
df['text'] = df['title'] + ' ' + df['text'] 

In [14]:
df.drop('title', axis = 'columns',inplace=True) 

In [15]:
df.head()

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


In [18]:
df.to_csv('merged.csv')

## Preprocessing

We normalize and stem the text, remove all the stopwords and punctuation.

In [14]:
#saving dict function
def save_dict(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [15]:
def preprocess(data):
    x = re.sub('[^a-zA-Z]', ' ',data) 
    
    #lowering words
    lower = str.lower(x).split()
    
    #removing stopwords
    words = set(stopwords.words('english'))
    no_stopwords = [w for w in lower if not w in words]  
    lmtzr = WordNetLemmatizer()
    
    #stemming
    cleaned = [lmtzr.lemmatize(w,pos="v") for w in no_stopwords]
    cleaned = [lmtzr.lemmatize(w,pos="n") for w in cleaned]
    
    return (" ".join( cleaned ))

In [16]:
df['text'] = df.text.apply(lambda x: preprocess(x))

In [17]:
df.to_csv('preprocessed.csv')

## Train-test split

Choosing randomly the train and test set rows.

In [21]:
train = df.sample(n = math.floor(0.8*len(df)))
train.shape

(35918, 2)

In [1]:
test = df.drop(train.index)
test.shape

NameError: name 'df' is not defined

In [24]:
train.head()

Unnamed: 0,text,label
39545,turkey erdogan take legal action lawmaker call...,0
23220,video watch jam keefe easily obtain eminem ele...,1
8124,believe conservative actually think scalia dea...,1
39248,philippine warn lone wolf attack marawi victor...,0
26590,u state department question gulf motif qatar b...,0


In [25]:
train.reset_index(inplace=True)

In [26]:
train = train.rename(columns = {'index':'id'})

In [27]:
train.head()

Unnamed: 0,id,text,label
0,39545,turkey erdogan take legal action lawmaker call...,0
1,23220,video watch jam keefe easily obtain eminem ele...,1
2,8124,believe conservative actually think scalia dea...,1
3,39248,philippine warn lone wolf attack marawi victor...,0
4,26590,u state department question gulf motif qatar b...,0


In [28]:
train.to_csv('train.csv')
test.to_csv('test.csv')

## Vocabulary

Mapping every word to a numeric value. 

In [29]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop({'Unnamed: 0'},axis=1,inplace=True)
test.drop({'Unnamed: 0'},axis=1,inplace=True)

In [30]:
train.head()

Unnamed: 0,id,text,label
0,39545,turkey erdogan take legal action lawmaker call...,0
1,23220,video watch jam keefe easily obtain eminem ele...,1
2,8124,believe conservative actually think scalia dea...,1
3,39248,philippine warn lone wolf attack marawi victor...,0
4,26590,u state department question gulf motif qatar b...,0


In [31]:
def create_vocabulary(df):
    
    vocabulary = defaultdict()

    # Set in which we collect all the terms
    term_set = set()
    text = list(df['text'])
    for elem in text:
        try:
            term_set = term_set.union(set(elem.split()))
        except:
            pass

    # Convert the set in list to enumerate
    term_list = list(term_set)

    for i, elem in enumerate(term_list):
        vocabulary[elem] = i 

    save_dict(vocabulary,'vocabulary')

In [32]:
create_vocabulary(train)   #to create

In [34]:
len(vocabulary)

89116

In [33]:
vocabulary = load_obj('vocabulary')  #to read

### Text mapping

In [35]:
def mapping(text):
    text = text.split()
    out = [vocabulary[word] for word in text]
    return out

In [36]:
train['mapped_text'] = train.text.apply(lambda x: mapping(x))

In [37]:
train.head()

Unnamed: 0,id,text,label,mapped_text
0,39545,turkey erdogan take legal action lawmaker call...,0,"[5546, 27252, 19567, 74942, 48637, 47736, 7485..."
1,23220,video watch jam keefe easily obtain eminem ele...,1,"[10203, 47551, 78336, 30152, 50169, 50613, 327..."
2,8124,believe conservative actually think scalia dea...,1,"[84778, 26772, 85709, 17914, 41905, 31481, 208..."
3,39248,philippine warn lone wolf attack marawi victor...,0,"[51781, 20570, 7185, 86395, 68827, 82657, 8395..."
4,26590,u state department question gulf motif qatar b...,0,"[40706, 45526, 10205, 41330, 86057, 47530, 657..."


In [38]:
train.to_csv('mapped_dataset.csv')

### Inverted index

Inverted index with `count` of each word in each document labelled by `index`.

In [42]:
from tqdm import tqdm
tqdm.pandas()

d = defaultdict(list)

def add_value(txt,index,label,d):
    for word in set(txt):
        counts = txt.count(word)
        d[word]+=[(index,counts,label)]

train.progress_apply(lambda x: add_value(x['mapped_text'],x['id'],x['label'],d),axis = 1)

  from pandas import Panel
100%|██████████| 35918/35918 [00:27<00:00, 1290.74it/s]


0        None
1        None
2        None
3        None
4        None
         ... 
35913    None
35914    None
35915    None
35916    None
35917    None
Length: 35918, dtype: object

In [43]:
save_dict(d,'word_index')