# Imports

In [1]:
import numpy as np
import pandas as pd
import string
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /Users/jdidio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load data
Here we load the to files, fake and true news, concatenate them togheter and added the label (0 for fake and 1 for non-fake news)

In [2]:
true_df = pd.read_csv('../data/True.csv')
true_df['label'] = 1
true_df = true_df[['label', 'title', 'text']]

fake_df = pd.read_csv('../data/Fake.csv')
fake_df['label'] = 0
fake_df = fake_df[['label', 'title', 'text']]

news_df = pd.concat([true_df, fake_df]).reset_index(drop=True)
news_df.dropna()

news_df

Unnamed: 0,label,title,text
0,1,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...
2,1,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...
3,1,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...
4,1,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...
...,...,...,...
44893,0,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...
44894,0,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...
44895,0,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...
44896,0,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...


# Cleaning dataset

In [3]:
def remove_stopwords(stopwords, text):
    """Remove the stopwords from a given string

    Args:
        stopwords (list(string)): words to remove
        text (string): string from whom to remove the words

    Returns:
        string: the resulting string
    """
    return " ".join([word for word in text.split() if word.lower() not in stopwords])


def remove_punct(text):
    """Remove the ponctuations from a given string "text" and return the result

    Args:
        text (string): text from whom to remove the punctuations

    Returns:
        string: text without punctuations
    """
    text = re.sub('/', ' ', text) ## First change all '/' to ' '
    return text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) # Replace all punctuations by ' '


def add_space(text): # TODO
    """Remove camel cases from text

    Args:
        text (string): text to remove camel cases

    Returns:
        string: text without camel case
    """
    return re.sub('([a-z])([A-Z])', r'\1 \2', text)


def remove_white_space(text):
    """Remove extra white spaces

    Args:
        text (string): text to remove extra white spaces

    Returns:
        string: text without extra white spaces
    """
    return re.sub(' +', ' ', text)


def remove_words_digits(text):
    """Remove words containing digits

    Args:
        text (string): text to remove word containing digits

    Returns:
        string: text without camel case
    """
    return re.sub(r'\w*\d\w*', '', text)


def to_lower(text):
    """Change all words to lower cases

    Args:
        text (string): text to modify

    Returns:
        string: text with only lower case words
    """
    return text.lower()


def lemmatize(text):
    """Lemmatize the text

    Args:
        text (string): text to lemmatize

    Returns:
        string: text lemmatized
    """
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [4]:
news_df['title'] = news_df['title'].apply(lambda x: remove_stopwords(stopwords, x))
news_df['title'] = news_df['title'].apply(lambda x: remove_punct(x))
news_df['title'] = news_df['title'].apply(lambda x: add_space(x))
news_df['title'] = news_df['title'].apply(lambda x: remove_white_space(x))
news_df['title'] = news_df['title'].apply(lambda x: remove_words_digits(x))
news_df['title'] = news_df['title'].apply(lambda x: to_lower(x))
news_df['title'] = news_df['title'].apply(lambda x: lemmatize(x))

news_df['text'] = news_df['text'].apply(lambda x: remove_stopwords(stopwords, x))
news_df['text'] = news_df['text'].apply(lambda x: remove_punct(x))
news_df['text'] = news_df['text'].apply(lambda x: add_space(x))
news_df['text'] = news_df['text'].apply(lambda x: remove_white_space(x))
news_df['text'] = news_df['text'].apply(lambda x: remove_words_digits(x))
news_df['text'] = news_df['text'].apply(lambda x: to_lower(x))
news_df['text'] = news_df['text'].apply(lambda x: lemmatize(x))

In [5]:
X_title = news_df['title']
X_text = news_df['text']
y = news_df['label']

# Fake news detection 

In [6]:
# Title data
x_title_train, x_title_test, y_title_train, y_title_test = train_test_split(X_title, y, test_size=0.3)

x_title_v_train = vectorization.fit_transform(x_title_train)
x_title_v_test = vectorization.transform(x_title_test)

# Text data
x_text_train, x_text_test, y_text_train, y_text_test = train_test_split(X_text, y, test_size=0.3)

x_text_v_train = vectorization.fit_transform(x_text_train)
x_text_v_test = vectorization.transform(x_text_test)

## Decision Tree classifier

### Based on title

In [7]:
DT = DecisionTreeClassifier()
DT.fit(x_title_v_train, y_title_train)
pred_title_dt = DT.predict(x_title_v_test)

In [8]:
print(classification_report(y_title_test, pred_title_dt))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      7016
           1       0.89      0.91      0.90      6454

    accuracy                           0.90     13470
   macro avg       0.90      0.90      0.90     13470
weighted avg       0.90      0.90      0.90     13470



### Based on text

In [9]:
DT = DecisionTreeClassifier()
DT.fit(x_text_v_train, y_text_train)
pred_text_dt = DT.predict(x_text_v_test)

In [10]:
print(classification_report(y_text_test, pred_text_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7021
           1       1.00      1.00      1.00      6449

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Gradient Boosting Classifier

### Based on title

In [11]:
GB = GradientBoostingClassifier()
GB.fit(x_title_v_train, y_title_train)
pred_title_gb = GB.predict(x_title_v_test)

In [12]:
print(classification_report(y_title_test, pred_title_gb))

              precision    recall  f1-score   support

           0       0.95      0.76      0.84      7016
           1       0.78      0.95      0.86      6454

    accuracy                           0.85     13470
   macro avg       0.86      0.85      0.85     13470
weighted avg       0.87      0.85      0.85     13470



### Based on text

In [13]:
GB.fit(x_text_v_train, y_text_train)
pred_text_gb = GB.predict(x_text_v_test)

In [14]:
print(classification_report(y_text_test, pred_text_gb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7021
           1       0.99      1.00      1.00      6449

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Random Forest Classifier

### Based on title

In [15]:
RF = RandomForestClassifier()
RF.fit(x_title_v_train, y_title_train)
pred_title_rf = RF.predict(x_title_v_test)

In [16]:
print(classification_report(y_title_test, pred_title_rf))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      7016
           1       0.92      0.95      0.94      6454

    accuracy                           0.94     13470
   macro avg       0.94      0.94      0.94     13470
weighted avg       0.94      0.94      0.94     13470



### Based on text

In [17]:
RF.fit(x_text_v_train, y_text_train)
pred_text_rf = RF.predict(x_text_v_test)

In [18]:
print(classification_report(y_text_test, pred_text_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7021
           1       0.99      0.99      0.99      6449

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470

