# Fake News Detection 
## CM3070 Prototype


### Python Environment Setup (setup a venv)


In [None]:
# pip install virtualenv ipykernel
# virtualenv cm3070
# source cm3070/bin/activate
# python -m ipykernel install --user --name=cm3070

#Installed kernelspec cm3070 in /Users/lawrence/Library/Jupyter/kernels/cm3070
#Restart VS Code to see new kernel

### Python Library Setup (install required libraries)

In [None]:
#ML, NLP and other classification libraries
# NOTE(review): versions are unpinned, so a fresh environment may resolve to
# different releases than the ones this notebook was developed against.
%pip install scikit-learn
%pip install pandas
%pip install transformers
%pip install ekphrasis
%pip install keras-tuner
%pip install flair
%pip install nltk
%pip install tensorflow

#For Jupyter progress bar widgets
# NOTE(review): confirm `iprogress` is really required in addition to ipywidgets.
%pip install iprogress
%pip install ipywidgets

#Output / plotting libs
%pip install seaborn

### Unzip datasets if necessary

In [15]:
### Unzip datasets if necessary
#Only FAKEDDIT are compressed initially, if from git. 
#This will iterate and unzip if needed on initial run.

import zipfile
import os.path, os

# Show where the relative dataset paths below will be resolved from.
print (f'Working Path is: "{os.getcwd()}"')

# Folder holding the FAKEDDIT archives, and the base names (no extension)
# of the files expected inside it.
prefix = "./datasets/FAKEDDIT/"
fileset = ['all_test_public','all_train','all_validate']

for file in fileset:
    # A .tsv that already exists means this archive was unpacked on an earlier run.
    if os.path.isfile(f'{prefix}{file}.tsv'):
        continue
    with zipfile.ZipFile(f'{prefix}{file}.zip',"r") as zip_ref:
        zip_ref.extractall(prefix)
    print (f'Extracted {file}')



Working Path is: "/Users/lawrence/Documents/CourseWork/Level 6/CM3070 - Final/code"
Extracted all_test_public
Extracted all_train
Extracted all_validate


### Import Libraries

In [None]:
#Import main libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras as keras
import flair
import urllib
import statistics
import math
import pprint
import sklearn

#Import the NLP cleaning pre-processing tools
import nltk
import re
nltk.download('wordnet') #currently used
nltk.download('stopwords') #currently used
#nltk.download('punkt')


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from flair.models import TextClassifier


#Import Notebook bars
from tqdm.notebook import tqdm

#Get our Evaluation metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

#Output
import seaborn as sns
import matplotlib.pyplot as plt



### General Setup

In [None]:
#Register tqdm with pandas so that progress_apply (used by clean_df below)
#is available and shows a progress bar in the notebook
tqdm.pandas()


## Pipeline

### 1.1 - Import and Normalize Benchmark Data
* Import to a named df and concatenate data as needed.
* Add is_fake column.
* Flatten truthiness to true/false class.
* Set appropriate description column from dataset, and move column to before is_fake




In [None]:
def import_isot():
    """Load the ISOT True/Fake CSV files into a single labelled DataFrame.

    Reads ``./datasets/ISOT/True.csv`` and ``./datasets/ISOT/Fake.csv``,
    tags each row with a boolean ``is_fake`` column (False for the True file,
    True for the Fake file) and stacks the two frames, true rows first. The
    per-file row indices are kept as read. The ``text`` column is renamed to
    ``description`` and moved to the second-to-last column position, which is
    directly before ``is_fake`` when both files share the same columns.

    Returns:
        pandas.DataFrame: the combined, labelled frame.
    """
    # (path, is_fake flag) in the order the rows should appear in the result
    sources = (
        ('./datasets/ISOT/True.csv', False),
        ('./datasets/ISOT/Fake.csv', True),
    )

    frames = []
    for path, fake_flag in sources:
        frame = pd.read_csv(path)
        frame['is_fake'] = fake_flag
        frames.append(frame)

    df = pd.concat(frames)
    df = df.rename(columns={ "text": "description" })

    # Take the column out first, then re-insert it one slot before the end.
    description = df.pop('description')
    df.insert(len(df.columns) - 1, 'description', description)
    return df

def import_liar():
    """Load the LIAR test split and derive a boolean ``is_fake`` column.

    Reads ``./datasets/LIAR/test.tsv`` (tab separated, no header row). The
    second column holds the truthfulness rating, which is flattened to a
    boolean so that ``is_fake`` means the same thing here as in the other
    importers:

    * ``pants-fire``, ``false``, ``barely-true``  -> ``is_fake = True``
    * ``half-true``, ``mostly-true``, ``true``    -> ``is_fake = False``

    NOTE(review): placing ``half-true`` on the "not fake" side is a common
    binarisation of this rating scale, but it is a modelling choice; adjust
    the mapping if the project wants a different split.

    The original rating column is left untouched. The third column is renamed
    to ``description`` and moved directly before ``is_fake``.

    Returns:
        pandas.DataFrame: the test split with ``description`` and a boolean
        ``is_fake`` as its last two columns.

    Raises:
        ValueError: if the rating column contains a value outside the six
            ratings listed above (including missing values).
    """
    df = pd.read_csv('./datasets/LIAR/test.tsv',  sep='\t',  header=None)

    fake_by_rating = {
        'pants-fire': True,
        'false': True,
        'barely-true': True,
        'half-true': False,
        'mostly-true': False,
        'true': False,
    }

    # astype(str) also copes with pandas having parsed the column as booleans
    # (which happens when a file only contains true/false ratings).
    ratings = df.iloc[:, 1].astype(str).str.strip().str.lower()
    is_fake = ratings.map(fake_by_rating)

    # Fail loudly rather than silently carrying unlabelled rows forward.
    unknown = sorted(set(ratings[is_fake.isna()]))
    if unknown:
        raise ValueError(f'Unrecognised LIAR ratings: {unknown}')
    df['is_fake'] = is_fake.astype(bool)

    #rename 2 -> description
    df = df.rename(columns={ df.columns[2]: "description" })
    # Take the column out first, then re-insert it one slot before the end,
    # i.e. directly in front of is_fake.
    description = df.pop('description')
    df.insert(len(df.columns) - 1, 'description', description)
    return df

def import_fnn():
    """Load the four FakeNewsNet CSV files into a single labelled DataFrame.

    Reads the gossipcop and politifact fake/real files from
    ``./datasets/FakeNewsNet/dataset/``, tags every row with a boolean
    ``is_fake`` column according to the file it came from, and stacks the
    frames in the order gossipcop fake, gossipcop real, politifact fake,
    politifact real. The per-file row indices are kept as read. The ``title``
    column is renamed to ``description`` and moved to the second-to-last
    column position (directly before ``is_fake`` when all files share the
    same columns).

    Returns:
        pandas.DataFrame: the combined, labelled frame.
    """
    # (path, is_fake flag) in the order the rows should appear in the result
    sources = (
        ('./datasets/FakeNewsNet/dataset/gossipcop_fake.csv', True),
        ('./datasets/FakeNewsNet/dataset/gossipcop_real.csv', False),
        ('./datasets/FakeNewsNet/dataset/politifact_fake.csv', True),
        ('./datasets/FakeNewsNet/dataset/politifact_real.csv', False),
    )

    frames = []
    for path, fake_flag in sources:
        frame = pd.read_csv(path)
        frame['is_fake'] = fake_flag
        frames.append(frame)

    df = pd.concat(frames)
    df = df.rename(columns={"title": "description"})

    # Take the column out first, then re-insert it one slot before the end.
    description = df.pop('description')
    df.insert(len(df.columns) - 1, 'description', description)
    return df

def import_fe():
    """Load the Fakeddit public test split as a labelled DataFrame.

    Reads ``./datasets/FAKEDDIT/all_test_public.tsv`` (tab separated), renames
    ``title`` to ``description`` and derives ``is_fake`` from ``2_way_label``
    by replacing 0 with True and 1 with False. ``description`` is moved
    directly before ``is_fake``, and rows whose description is missing are
    dropped. The surviving rows keep their original index values.

    Returns:
        pandas.DataFrame: the labelled frame without rows lacking a description.
    """
    df = pd.read_csv('./datasets/FAKEDDIT/all_test_public.tsv', sep='\t')
    df = df.rename(columns={"title": "description"})

    # 2_way_label -> boolean is_fake (0 becomes True, 1 becomes False)
    df['is_fake'] = df['2_way_label'].replace({ 0: True,  1:False})

    # Take the column out first, then re-insert it one slot before the end.
    description = df.pop('description')
    df.insert(len(df.columns) - 1, 'description', description)

    # Keep only rows that actually have text to classify.
    has_text = df['description'].notna()
    return df[has_text]



def df_stats(name, df, showdf=False):
    """Print size and label counts for a dataset and draw a pie chart of them.

    Args:
        name: label printed in front of the frame's shape.
        df: DataFrame with an ``is_fake`` column.
        showdf: when truthy, also render the frame itself with ``display``.

    Returns:
        None. Output is the printed text and the pie chart.
    """
    print (f'{name} Size{df.shape}')

    # One count of each is_fake value feeds both the printout and the chart.
    label_counts = df['is_fake'].value_counts()
    print (label_counts)

    if showdf:
        display(df)

    # Share of each label as a percentage with one decimal place.
    label_counts.plot.pie(figsize=(5, 5),legend=False,  autopct='%1.1f%%' )

    

### 1.1.1 - LIAR Benchmark Dataset

In [None]:
#Data Import and Mangling

#LIAR: load the benchmark, then print its shape / is_fake counts and draw the pie chart
liar_df = import_liar()
df_stats("LIAR", liar_df) #author's note: balanced



### 1.1.2 - ISOT Benchmark Dataset

In [None]:

#ISOT: load the benchmark, then print its shape / is_fake counts and draw the pie chart
isot_df = import_isot()
df_stats("ISOT", isot_df) #author's note: balanced


### 1.1.3 - FakeNewsNet Benchmark Dataset

In [None]:

#FakeNewsNet: load the benchmark, then print its shape / is_fake counts and draw the pie chart
fnn_df = import_fnn()
df_stats("FakeNewsNet", fnn_df) #author's note: unbalanced data set


### 1.1.4 - Fakeddit Benchmark Dataset

In [None]:

#FAKEDDIT: load the benchmark, then print its shape / is_fake counts and draw the pie chart
#(showdf=False, so the frame itself is not rendered)
fe_df = import_fe()
df_stats("Fakeddit", fe_df, False) #author's note: balanced


### 1.2 - Pre-processing

In [None]:
# Preprocessing 

# Patterns are compiled once here rather than on every call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Filled on first use so the stop-word list is read and the lemmatizer is
# built once, not once per row of every dataset.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(s):
    """Normalise one piece of text for feature extraction.

    Steps, in order: lowercase, strip HTML-style tags with a basic regex,
    replace every character that is not a-z, 0-9 or whitespace with a space,
    collapse runs of whitespace, drop English stop words, and lemmatize the
    remaining tokens.

    Args:
        s: the text to clean. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        str: the cleaned tokens joined by single spaces. This is the empty
        string for missing or empty input, so every row still yields a string.
    """
    # Missing values would otherwise crash on .lower()
    if s is None or (isinstance(s, float) and s != s):
        return ''

    #1 - lowercase
    s = str(s).lower()

    #2 - remove html using basic regex.
    s = _HTML_TAG_RE.sub('', s)

    #3 - Strip non alphanumeric away to spaces
    s = _NON_ALNUM_RE.sub(' ', s)

    #4 - Remove excess spaces
    s = _WHITESPACE_RE.sub(' ', s).strip()

    #5 + 6 - drop stop words, then lemmatize what is left
    stop_words, lemmatizer = _clean_text_resources()
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in s.split()
        if token not in stop_words
    )





#Clean a passed in dataframe desc column -> clean_text
def clean_df (df):
    """Return a copy of ``df`` with a ``clean_description`` column added.

    The input frame is not modified. ``clean_text`` is applied to every
    ``description`` value (with a tqdm progress bar), the index of the copy is
    reset to 0..n-1, and the new column is placed at position
    ``len(df.columns) - 1``, i.e. in front of the input's last column. The
    first rows of the result are displayed before it is returned.

    Args:
        df: DataFrame containing a ``description`` column.

    Returns:
        pandas.DataFrame: the cleaned copy.
    """
    # Position is derived from the *input* frame, before the new column exists.
    insert_at = len(df.columns) - 1

    clean = df.copy()
    clean['clean_description'] = clean['description'].progress_apply (clean_text)
    clean = clean.reset_index(drop=True)

    # Move the freshly built column from the end to its final slot.
    cleaned_column = clean.pop('clean_description')
    clean.insert(insert_at, 'clean_description', cleaned_column)

    display(clean.head())
    return clean



#### Test cleaning

In [None]:
#Smoke test: the sample mixes upper/lower case, possessives, HTML tags, a hashtag,
#a tab, repeated spaces and trailing punctuation so each cleaning step is exercised
sentence = (clean_text('This IS a Lawrence\'s of Arabia\'s <B>rather</B> #Brilliant \t  <i>set</i> of  lots of SentEnces...'))
print (sentence)

#### Clean all data sets

In [None]:
#Each name is rebound to its cleaned copy (adds clean_description, resets the index).
#NOTE: re-running this cell cleans the already-cleaned frames again.
liar_df = clean_df(liar_df)

isot_df = clean_df(isot_df)

fnn_df = clean_df(fnn_df)

fe_df = clean_df(fe_df)

print ("Processed")

#### Create Combined Dataset

In [None]:

#Keep just the text, cleaned text and label from each benchmark frame,
#then stack them in the order LIAR, ISOT, FakeNewsNet, Fakeddit
combined_columns = ['description', 'clean_description', 'is_fake']
collated = [frame[combined_columns] for frame in (liar_df, isot_df, fnn_df, fe_df)]
combined_df = pd.concat(collated)

df_stats ("Combined",combined_df)

### 1.3 - Feature Extraction

#### Testing Flair

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

#Clean the same sample text used in the cleaning test and show the result
sentence = (clean_text('This IS a Lawrence\'s of Arabia\'s <B>rather</B> #Brilliant \t  <i>set</i> of  lots of SentEnces...'))
print (sentence)
# wrap the cleaned string in a flair Sentence (note: rebinds `sentence` from str to Sentence)
sentence = Sentence(sentence)
# load the NER tagger
tagger = SequenceTagger.load('ner')

# predict the named entities in the sentence (annotates `sentence` in place)
tagger.predict(sentence)

# print the predicted named entities
for entity in sentence.get_spans('ner'):
    print(entity)

#author's observation: a named entity is listed for lawrence (of) arabia -> person

#### Prototype Testing 

In [None]:
#Prototype uses a single test classifier and feature extractor
#Both registries are keyed by a short name and looked up by train()
classifiers=dict()
#Seeded with the same value as the train/test split so that repeated runs
#build the same tree and report the same scores
classifiers['dt'] = DecisionTreeClassifier(random_state=7)

extractors=dict()
#TF-IDF features: English stop words removed, terms appearing in more than
#70% of documents ignored
extractors['tfidf'] = TfidfVectorizer(stop_words='english', max_df=0.7)

### 1.4 - Training

In [None]:

def evaluate (y_true, y_predicted):
    """Show a confusion-matrix heatmap and print a classification report.

    Args:
        y_true: the actual labels.
        y_predicted: the labels predicted by the classifier.

    Returns:
        None. Output is the rendered heatmap and the printed report
        (precision, recall, F1 and accuracy to five decimal places).
    """
    # Confusion matrix, drawn as an annotated heatmap
    matrix = confusion_matrix(y_true, y_predicted)
    ax = sns.heatmap(matrix, annot=True, cmap='rocket_r', fmt= '.5g')
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Actual Values')
    ax.set_xlabel('Predicted Values')
    plt.show()

    # Per-class scores and overall accuracy
    print('')
    display('Accuracy, Precision, Recall and F1-score')
    report = classification_report(y_true, y_predicted, digits=5)
    print(report)


def train (name, x_train, x_test, y_train, y_test):
    """Fit the prototype pipeline on one dataset and report its scores.

    Uses the module-level ``extractors['tfidf']`` to turn the text into
    features (fitted on the training text only, then applied to the test
    text) and ``classifiers['dt']`` to fit and predict. Both objects are
    fitted in place, so a later call replaces the state of an earlier one.

    Args:
        name: dataset name, printed above the evaluation output.
        x_train: training texts.
        x_test: test texts.
        y_train: training labels.
        y_test: test labels, compared against the predictions.

    Returns:
        None. Results are shown through ``evaluate``.
    """
    vectorizer = extractors['tfidf']
    train_features = vectorizer.fit_transform(x_train)
    test_features = vectorizer.transform(x_test)

    model = classifiers['dt']
    model.fit (train_features, y_train)
    predictions = model.predict(test_features)

    print (name)
    evaluate (y_test, predictions)



    

### 1.5 - Classification (Prediction Scoring)
#### Single example shown for video for speed reasons

In [None]:

#isot_df
#liar_df
#fnn_df
#fe_df
#combined_df

#Datasets to train on, keyed by the name printed with the results.
#Uncomment further entries to evaluate them as well.
training = dict()

training['ISOT'] = isot_df #too slow to show more than one training sample in video
#training['LIAR'] = liar_df
#training['FNN'] = fnn_df
#training['FEDDIT'] = fe_df
#training['COMBINED']= combined_df
 

for name, df in tqdm(training.items()):
    labels = df.is_fake
    #Split the text of the dataset being iterated (previously always ISOT's text,
    #which mismatched the labels for every other dataset). 70/30 split, fixed seed.
    x_train, x_test, y_train, y_test=train_test_split(df['clean_description'], labels, test_size=0.3, random_state=7)

    train ( name,x_train, x_test, y_train, y_test)

    

## Analysis

In [None]:
# To be added at a later stage