# Detecting Fake News with Natural Language Processing

In [10]:
# Load packages
import numpy as np
import pandas as pd
import re
import os

We will use the GLoVe pre-trained word embedding data set to convert words into N-dimensional vectors. We will use 50 dimensional vectors for now. These vectors were trained on Wikipedia 2014 + Gigaword 5 and includes a 400,000 word vocabulary of uncased words. The file (glove.6B.50d.txt) can be downloaded here: https://nlp.stanford.edu/projects/glove/ . In order to run an LSTM, we will need every article to have the same number of words. Most of the news articles in the Fake News dataset are under 200 words long, including the headline and body. Most of the news articles in the Celebrity data set are under 750 words long. We will begin by capping the article length at 200 words. Articles that are shorter than this, will be padded with zeros (i.e. a random word) at the end.

In [None]:
# Supply location of GloVe text file, location of data, and max word length of news article
glove_filepath = 'models/embeddings/glove.6B.50d.txt'
datapath = 'data/fakeNewsDatasets_Perez-Rosas2018'
maxSeqLength = 200

## Load GloVe Embedding Matrix

In [3]:
# Function to load GloVe embedding data, and convert it to three useful formats
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile,'r', encoding="utf8")
    model = {}
    wordsList = []
    embeddings = []
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        wordsList.append(word)
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
        embeddings.append(embedding)
    print ("Done.",len(model)," words loaded!")
    f.close()
    return wordsList, embeddings, model

In [4]:
# We can access the position of a word in the embedding file using "wordsList"
# We can access the embedding of a word using "embeddings". The position in this will match "wordlist".
# We can access the embedding of a word using the dictionary "model". We will not actually use this, but useful to have.
wordsList, embeddings, model = loadGloveModel(glove_filepath)

Loading Glove Model
Done. 400000  words loaded!


## Load and Embed News Articles

In [7]:
# Function that removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
def cleanArticle(string):
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

# Function that takes a news article as an input.
# It generates a fixed sequences of integers corresponding to the index of the embedding in the embedding lookup
# It caps the number of embedded words (i.e. article length) at maxSeqLength
# Words that do not exist in GloVe, will be assigned to a random embedding. In this case, the one at position 39999
def getArticleMatrix(article):
    articleMatrix = np.zeros(maxSeqLength, dtype='int32')
    cleanedArticle = cleanArticle(article)
    split = cleanedArticle.split()
    for indexCounter,word in enumerate(split):
        if indexCounter==maxSeqLength:
            break
        try:
            articleMatrix[indexCounter] = wordsList.index(word)
        except ValueError:
            articleMatrix[indexCounter] = 399999 #Vector for unkown words
    return articleMatrix

In [15]:
# Function to load and embed news articles
def tabulate_data(dataset_name):
    """Create a Pandas dataframe out of input Perez-Rosas dataset files
    @param dataset_name: Name of the dataset (fakenews or celebrity)
    @returns Pandas dataframe with columns:
        dataset_name, news_type, news_category, news_headline, news_content
    """
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)

    result_data_list = []
    data_dir = datapath
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        for fname in os.listdir(folder):
            result_data = {}
            result_data['dataset_name'] = dataset_name
            result_data['news_type'] = news_type
            if news_type == 'fake':
                result_data['is_fake'] = 1
            else:
                result_data['is_fake'] = 0
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
                # Some articles don't have a headline, but only article body.
                if len(file_data) > 1:
                    news_content_data = ' '.join(file_data[2:])
                    result_data['news_headline'] = file_data[0]
                else:
                    news_content_data = file_data[0]
                    result_data['news_headline'] = ''
                result_data['news_content'] = news_content_data
                result_data['news_all'] = ' '.join(file_data[0:])
                result_data['news_embed'] = getArticleMatrix(result_data['news_all'])
                result_data['num_embed_words'] = len(result_data['news_embed'])
                result_data_list.append(result_data)
    df = pd.DataFrame(result_data_list)
    return df

In [16]:
# Load fake news data set
# Note: Embedded words used the article title and body. All embeddings should have a length of maxSeqLength.
fakenews_df = tabulate_data('fakeNewsDataset')
fakenews_df.head()

Unnamed: 0,dataset_name,file_name,is_fake,news_all,news_category,news_content,news_embed,news_headline,news_type,num_embed_words
0,fakeNewsDataset,polit19.fake.txt,1,FBI investigates computer link between Trump a...,polit,(CNN)Federal investigators and computer scie...,"[2419, 20095, 951, 2858, 118, 10468, 5, 443, 3...",FBI investigates computer link between Trump a...,fake,200
1,fakeNewsDataset,tech028.fake.txt,1,Google steals user location information with a...,tech,Alphabet Inc's Google announced on Wednesday t...,"[4361, 13753, 4832, 2044, 419, 17, 7, 12726, 1...",Google steals user location information with a...,fake,200
2,fakeNewsDataset,polit34.fake.txt,1,"Biden: Trump was wiretapped, but not by US ...",polit,Joe Biden said President Donald Trump was in...,"[8725, 10468, 15, 62414, 34, 36, 21, 95, 1984,...","Biden: Trump was wiretapped, but not by US",fake,200
3,fakeNewsDataset,edu27.fake.txt,1,"Harvard Law, Moving to Limit Applicant Pool, W...",edu,"Harvard Law School, moving to close its door...","[3332, 264, 1233, 4, 2514, 19067, 3216, 43, 36...","Harvard Law, Moving to Limit Applicant Pool, W...",fake,200
4,fakeNewsDataset,edu14.fake.txt,1,Microsoft Aims to spread liberalism on the suc...,edu,"With the launch of ""Minecraft"" edition crea...","[2058, 4351, 4, 1635, 22142, 13, 0, 1045, 3, 1...",Microsoft Aims to spread liberalism on the suc...,fake,200


In [19]:
# # Confirm embeddings worked properly. The index of the first 5 words should line up!
# print(fakenews_df.loc[0]['news_all'])
# print(fakenews_df.loc[0]['news_embed'])
# print(wordsList.index("FBI".lower()))
# print(wordsList.index("investigates".lower()))
# print(wordsList.index("computer".lower()))
# print(wordsList.index("link".lower()))
# print(wordsList.index("between".lower()))

## Build Long Short Term Memory (LSTM) Model

We will use TensorFlow to build and train a LSTM model which is capable if producing a binary classifier of fake or not fake, for each news artcile.

Internal team note: The Oriole LSTM notebook (in /models/LSTM_Classification) has a great explaination of deep learning, recurrent neural networks, LSTMs, word embeddings etc. We can rely heavily on this if we want to explain things in detail in our paper. I recommend reading through that notebook.