In [1]:
import pandas as pd
import re
import numpy as np
import nltk as nl
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
import pickle
from collections import defaultdict
from collections import Counter
import math
import tqdm
import ast

## Import Data

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

In [3]:
fake['label'] = 1
true['label'] = 0

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
df = pd.concat([fake,true],ignore_index=True)

In [6]:
df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [7]:
df.shape

(44898, 5)

## First Analysis

In [8]:
fake.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [9]:
true.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [10]:
df.drop('subject', axis = 'columns',inplace=True) 
df.drop('date', axis = 'columns',inplace=True) 

In [11]:
# Modificare? Troppo simile indiano
df['text'] = df['title'] + ' ' + df['text'] 

In [12]:
df.drop('title', axis = 'columns',inplace=True) 

In [13]:
df.head()

Unnamed: 0,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


# Preprocessing

In [14]:
#saving dict function
def save_dict(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

#load dict function
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [15]:
def preprocess(data):
    x = re.sub('[^a-zA-Z]', ' ',data) 
    
    #lowering words
    lower = str.lower(x).split()
    
    #removing stopwords
    words = set(stopwords.words('english'))
    no_stopwords = [w for w in lower if not w in words]  
    lmtzr = WordNetLemmatizer()
    
    #stemming
    cleaned = [lmtzr.lemmatize(w,pos="v") for w in no_stopwords]
    cleaned = [lmtzr.lemmatize(w,pos="n") for w in cleaned]
    
    return (" ".join( cleaned ))

In [16]:
df['text'] = df.text.apply(lambda x: preprocess(x))

In [17]:
df.to_csv('preprocessed.csv')

In [18]:
df = pd.read_csv('preprocessed.csv')  #to read 
df.drop({'Unnamed: 0'},axis=1,inplace=True)

In [19]:
df.head()

Unnamed: 0,text,label
0,donald trump send embarrass new year eve messa...,1
1,drink brag trump staffer start russian collusi...,1
2,sheriff david clarke become internet joke thre...,1
3,trump ob even obama name cod website image chr...,1
4,pope francis call donald trump christmas speec...,1


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   label   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


# Parameters

### Don't Run every time

In [21]:
train = df.sample(n = math.floor(0.8*len(df)))
train.shape

(35918, 2)

In [22]:
test = df.drop(train.index)

In [23]:
test.shape

(8980, 2)

In [24]:
train.head()

Unnamed: 0,text,label
1943,bombshell senate intel committee hear reveal f...,1
2557,terrorist group openly laud trump idiot brag s...,1
11200,senator dick durbin need civics lesson susan r...,1
21306,white student union form facebook page organiz...,1
30432,factbox contender pick key job trump administr...,0


In [25]:
train.reset_index(inplace=True)

In [26]:
train = train.rename(columns = {'index':'id'})

In [27]:
train.head()

Unnamed: 0,id,text,label
0,1943,bombshell senate intel committee hear reveal f...,1
1,2557,terrorist group openly laud trump idiot brag s...,1
2,11200,senator dick durbin need civics lesson susan r...,1
3,21306,white student union form facebook page organiz...,1
4,30432,factbox contender pick key job trump administr...,0


In [28]:
train.to_csv('train.csv')
test.to_csv('test.csv')

### Vocabulary

In [29]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop({'Unnamed: 0'},axis=1,inplace=True)
test.drop({'Unnamed: 0'},axis=1,inplace=True)

In [30]:
train.head()

Unnamed: 0,id,text,label
0,1943,bombshell senate intel committee hear reveal f...,1
1,2557,terrorist group openly laud trump idiot brag s...,1
2,11200,senator dick durbin need civics lesson susan r...,1
3,21306,white student union form facebook page organiz...,1
4,30432,factbox contender pick key job trump administr...,0


In [31]:
def create_vocabulary(df):
    
    vocabulary = defaultdict()

    # Set in which we collect all the terms
    term_set = set()
    text = list(df['text'])
    for elem in text:
        try:
            term_set = term_set.union(set(elem.split()))
        except:
            pass

    # Convert the set in list to enumerate
    term_list = list(term_set)

    for i, elem in enumerate(term_list):
        vocabulary[elem] = i 

    save_dict(vocabulary,'vocabulary')

In [32]:
create_vocabulary(train)   #to create

In [33]:
vocabulary = load_obj('vocabulary')  #to read

In [34]:
len(vocabulary)

89028

### Text mapping

In [35]:
def mapping(text):
    text = text.split()
    out = [vocabulary[word] for word in text]
    return out

In [37]:
train['mapped_text'] = train.text.apply(lambda x: mapping(x))

In [38]:
train.head()

Unnamed: 0,id,text,label,mapped_text
0,1943,bombshell senate intel committee hear reveal f...,1,"[26288, 86478, 26916, 21374, 71869, 21939, 822..."
1,2557,terrorist group openly laud trump idiot brag s...,1,"[71953, 4753, 17513, 86987, 21812, 25843, 5777..."
2,11200,senator dick durbin need civics lesson susan r...,1,"[87650, 32198, 51043, 80029, 29773, 6081, 6144..."
3,21306,white student union form facebook page organiz...,1,"[33214, 7240, 56509, 48542, 6360, 28184, 37945..."
4,30432,factbox contender pick key job trump administr...,0,"[38968, 47489, 40571, 31586, 72887, 21812, 568..."


In [39]:
train.to_csv('mapped_dataset.csv')

##### TO USE EVERYTIME WE WANT TO LOAD IT BACK

In [40]:
train = pd.read_csv('mapped_dataset.csv')  #to read

#to maintain the list format
train['mapped_text'] = train['mapped_text'].map(ast.literal_eval)

In [41]:
type(train['mapped_text'][0])

list

In [42]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,text,label,mapped_text
0,0,1943,bombshell senate intel committee hear reveal f...,1,"[26288, 86478, 26916, 21374, 71869, 21939, 822..."
1,1,2557,terrorist group openly laud trump idiot brag s...,1,"[71953, 4753, 17513, 86987, 21812, 25843, 5777..."
2,2,11200,senator dick durbin need civics lesson susan r...,1,"[87650, 32198, 51043, 80029, 29773, 6081, 6144..."
3,3,21306,white student union form facebook page organiz...,1,"[33214, 7240, 56509, 48542, 6360, 28184, 37945..."
4,4,30432,factbox contender pick key job trump administr...,0,"[38968, 47489, 40571, 31586, 72887, 21812, 568..."


### Inverted index

In [43]:
from tqdm import tqdm
tqdm.pandas()

d = defaultdict(list)

def add_value(txt,index,label,d):
    for word in set(txt):
        counts = txt.count(word)
        d[word]+=[(index,counts,label)]

train.progress_apply(lambda x: add_value(x['mapped_text'],x['id'],x['label'],d),axis = 1)

  from pandas import Panel
100%|██████████| 35918/35918 [00:55<00:00, 642.54it/s]


0        None
1        None
2        None
3        None
4        None
         ... 
35913    None
35914    None
35915    None
35916    None
35917    None
Length: 35918, dtype: object

In [None]:
save_dict(d,'word_index')