## Libarires

In [595]:
import numpy as np
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

## Load Data

Here, we limit the number of rows to 1000 for faster processing.

In [610]:
path = 'data/fake_news/fake.csv'
data = pd.read_csv(path)

data = data.head(1000)
data.head()

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


## Data Preprocessing

Here, we will perform the following preprocessing steps for each data types:
### Text Data

Row processed = `text`
Preprocessing:
- Cleaning (removing special characters, numbers, punctuations)
- Stemming
- Encoding

### Numerical Data
Row processed = `published`
Preprocessing:
- Extracting year, month, day
- Calculation of sentiment score

### Categorical data
Row processed = `site_url` and `author`
Preprocessing:
- Cleaning (removing special characters, numbers, punctuations)
- Encoding

## Drop Unneeded Columns

In [611]:
columns=['uuid', 'title', 'thread_title', 'spam_score', 'country', 'ord_in_thread', 'language', 'crawled', 'domain_rank', 'replies_count','participants_count', 'likes', 'comments', 'shares',"main_img_url"]
data.drop(columns, axis=1, inplace=True)
data.dropna(axis=0, inplace=False)
data.head()

Unnamed: 0,author,published,text,site_url,type
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Print They should pay all the back all the mon...,100percentfedup.com,bias
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup.com,bias
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,Red State : \nFox News Sunday reported this mo...,100percentfedup.com,bias
3,Fed Up,2016-11-01T05:22:00.000+02:00,Email Kayla Mueller was a prisoner and torture...,100percentfedup.com,bias
4,Fed Up,2016-11-01T21:56:00.000+02:00,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup.com,bias


Here, we perform the following steps:
- Lowercase all text data
- Replaced any special characters, numbers, and punctuations with a space
- Tokenizing the words
- Checking if all the words are alphabets
- Cleaning stop words
- Stemming the words

In [612]:
stop_words = stopwords.words('english')

def clean_text(text):
  text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
  text = word_tokenize(text)
  text = [word for word in text if word.isalpha()]
  text = [word for word in text if word not in stop_words]
  text = [PorterStemmer().stem(word) for word in text]
  text = ' '.join(text)
  return text

data['text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,author,published,text,site_url,type
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,print pay back money plu interest entir famili...,100percentfedup.com,bias
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,attorney gener loretta lynch plead fifth barra...,100percentfedup.com,bias
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,red state fox news sunday report morn anthoni ...,100percentfedup.com,bias
3,Fed Up,2016-11-01T05:22:00.000+02:00,email kayla mueller prison tortur isi chanc re...,100percentfedup.com,bias
4,Fed Up,2016-11-01T21:56:00.000+02:00,email healthcar reform make america great sinc...,100percentfedup.com,bias


Here, we perform the following steps:
- Extracting year, month, and day from the `published` column
- Calculating the sentiment score of the `text` column

In [613]:
data['published'] = pd.to_datetime(data['published'], utc=True)
data['day'] = data['published'].dt.day
data['month'] = data['published'].dt.month
data['year'] = data['published'].dt.year
data.drop('published', axis=1, inplace=True)

data.head()

Unnamed: 0,author,text,site_url,type,day,month,year
0,Barracuda Brigade,print pay back money plu interest entir famili...,100percentfedup.com,bias,26,10,2016
1,reasoning with facts,attorney gener loretta lynch plead fifth barra...,100percentfedup.com,bias,29,10,2016
2,Barracuda Brigade,red state fox news sunday report morn anthoni ...,100percentfedup.com,bias,30,10,2016
3,Fed Up,email kayla mueller prison tortur isi chanc re...,100percentfedup.com,bias,1,11,2016
4,Fed Up,email healthcar reform make america great sinc...,100percentfedup.com,bias,1,11,2016


In [614]:
analyzer = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(text):
  score = analyzer.polarity_scores(text)

  if score['compound'] > 0.05:
    return 'positive'
  elif score['compound'] < -0.05:
    return 'negative'
  else:
    return 'neutral'

data['feeling'] = data['text'].apply(sentiment_analyzer_scores)
data.head()

Unnamed: 0,author,text,site_url,type,day,month,year,feeling
0,Barracuda Brigade,print pay back money plu interest entir famili...,100percentfedup.com,bias,26,10,2016,negative
1,reasoning with facts,attorney gener loretta lynch plead fifth barra...,100percentfedup.com,bias,29,10,2016,negative
2,Barracuda Brigade,red state fox news sunday report morn anthoni ...,100percentfedup.com,bias,30,10,2016,positive
3,Fed Up,email kayla mueller prison tortur isi chanc re...,100percentfedup.com,bias,1,11,2016,positive
4,Fed Up,email healthcar reform make america great sinc...,100percentfedup.com,bias,1,11,2016,positive


Here, we perform TF-IDF encoding for the `text` column.

In [615]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
data['text'] = tfidf.fit_transform(data['text']).toarray()

In [616]:
data.head()

Unnamed: 0,author,text,site_url,type,day,month,year,feeling
0,Barracuda Brigade,0.0,100percentfedup.com,bias,26,10,2016,negative
1,reasoning with facts,0.0,100percentfedup.com,bias,29,10,2016,negative
2,Barracuda Brigade,0.0,100percentfedup.com,bias,30,10,2016,positive
3,Fed Up,0.0,100percentfedup.com,bias,1,11,2016,positive
4,Fed Up,0.0,100percentfedup.com,bias,1,11,2016,positive


We check null values in the `site_url` and `author` columns. We will fill the null values of the `author` column with the 'Unknown'.

In [617]:
data['author'].isnull().sum(), data['site_url'].isnull().sum()

(146, 0)

In [618]:
data['author'] = data['author'].fillna('Unknown')

Here, we lowercase the text on `site_url` and `author` columns and replace any special characters, numbers, and punctuations with a space.

We then perform Label Encoding for the `site_url` column.

In [619]:
data['site_url'] = data['site_url'].str.lower()
data['author'] = data['author'].str.lower().str.replace(r'[^a-z]', '', regex=True)

encoder = LabelEncoder()
data['site_url'] = encoder.fit_transform(data['site_url'])
data.head(100)

Unnamed: 0,author,text,site_url,type,day,month,year,feeling
0,barracudabrigade,0.0,0,bias,26,10,2016,negative
1,reasoningwithfacts,0.0,0,bias,29,10,2016,negative
2,barracudabrigade,0.0,0,bias,30,10,2016,positive
3,fedup,0.0,0,bias,1,11,2016,positive
4,fedup,0.0,0,bias,1,11,2016,positive
...,...,...,...,...,...,...,...,...
95,anonymous,0.0,3,bs,29,10,2016,negative
96,anonymous,0.0,3,bs,29,10,2016,negative
97,anonymous,0.0,3,bs,29,10,2016,negative
98,anonymous,0.0,3,bs,29,10,2016,positive


For the `author` column, we perform the following steps:
- Create a unique encoding based on each author’s source and their index within that source.
- Using the formula provided, where $i_k$ is the author’s index, T is the number of authors for that source, and the author’s index is $i_k = i_{k-1} + T + 1$, where T is the number of authors for that source and $i_k$ is the author’s index.

In [620]:
author_dict = {}
current_index = 0

for source in data['site_url'].unique():
  authors = data[data['site_url'] == source]['author'].unique()
  author_dict[source] = {author: idx + current_index for idx, author in enumerate(authors)}
  current_index += len(authors)

In [621]:
for source in data['site_url'].unique():
  data.loc[data['site_url'] == source, 'author'] = data[data['site_url'] == source]['author'].apply(lambda x: author_dict[source].get(x))

In [622]:
data.rename(columns={'site_url': 'source'}, inplace=True)
# data['type'] = 'fake'
data.head()

Unnamed: 0,author,text,source,type,day,month,year,feeling
0,0,0.0,0,bias,26,10,2016,negative
1,1,0.0,0,bias,29,10,2016,negative
2,0,0.0,0,bias,30,10,2016,positive
3,2,0.0,0,bias,1,11,2016,positive
4,2,0.0,0,bias,1,11,2016,positive
