# Detecting Fake News with Natural Language Processing

In [1]:
# Load packages
import numpy as np
import pandas as pd
import re
import os
import datetime
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

We will use the GLoVe pre-trained word embedding data set to convert words into N-dimensional vectors. We will use 50 dimensional vectors for now. These vectors were trained on Wikipedia 2014 + Gigaword 5 and includes a 400,000 word vocabulary of uncased words. The file (glove.6B.50d.txt) can be downloaded here: https://nlp.stanford.edu/projects/glove/ . In order to run an LSTM, we will need every article to have the same number of words. Most of the news articles in the Fake News dataset are under 200 words long, including the headline and body. Most of the news articles in the Celebrity data set are under 750 words long. We will begin by capping the article length at 200 words. Articles that are shorter than this, will be padded with zeros (i.e. a random word) at the end.

In [2]:
# Supply location of GloVe text file, location of data, and max word length of news article
glove_filepath = 'models/embeddings/glove.6B.50d.txt'
datapath = 'data/fakeNewsDatasets_Perez-Rosas2018'
maxSeqLength = 200
numDimensions = 50

## Load GloVe Embedding Matrix

In [5]:
# Function to load GloVe embedding data, and convert it to three useful formats
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile,'r', encoding="utf8")
    model = {}
    wordsList = []
    embeddings = []
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        wordsList.append(word)
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
        embeddings.append(embedding)
    print ("Done.",len(model)," words loaded!")
    f.close()
    return wordsList, embeddings, model

In [6]:
# We can access the position of a word in the embedding file using "wordsList"
# We can access the embedding of a word using "embeddings". The position in this will match "wordlist".
# We can access the embedding of a word using the dictionary "model". We will not actually use this, but useful to have.
wordsList, embeddings, model = loadGloveModel(glove_filepath)

Loading Glove Model
Done. 400000  words loaded!


## Load and Embed News Articles

In [7]:
# Function that removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
def cleanArticle(string):
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

# Function that takes a news article as an input.
# It generates a fixed sequences of integers corresponding to the index of the embedding in the embedding lookup
# It caps the number of embedded words (i.e. article length) at maxSeqLength
# Words that do not exist in GloVe, will be assigned to a random embedding. In this case, the one at position 39999
def getArticleMatrix(article):
    articleMatrix = np.zeros(maxSeqLength, dtype='int32')
    cleanedArticle = cleanArticle(article)
    split = cleanedArticle.split()
    for indexCounter,word in enumerate(split):
        if indexCounter==maxSeqLength:
            break
        try:
            articleMatrix[indexCounter] = wordsList.index(word)
        except ValueError:
            articleMatrix[indexCounter] = 399999 #Vector for unkown words
    return articleMatrix

In [8]:
# Function to load and embed news articles
def tabulate_data(dataset_name):
    """Create a Pandas dataframe out of input Perez-Rosas dataset files
    @param dataset_name: Name of the dataset (fakenews or celebrity)
    @returns Pandas dataframe with columns:
        dataset_name, news_type, news_category, news_headline, news_content
    """
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)

    result_data_list = []
    data_dir = datapath
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        for fname in os.listdir(folder):
            result_data = {}
            result_data['dataset_name'] = dataset_name
            result_data['news_type'] = news_type
            if news_type == 'fake':
                result_data['is_fake'] = 1
            else:
                result_data['is_fake'] = 0
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
                # Some articles don't have a headline, but only article body.
                if len(file_data) > 1:
                    news_content_data = ' '.join(file_data[2:])
                    result_data['news_headline'] = file_data[0]
                else:
                    news_content_data = file_data[0]
                    result_data['news_headline'] = ''
                result_data['news_content'] = news_content_data
                result_data['news_all'] = ' '.join(file_data[0:])
                result_data['news_embed'] = getArticleMatrix(result_data['news_all'])
                result_data['num_embed_words'] = len(result_data['news_embed'])
                result_data_list.append(result_data)
    df = pd.DataFrame(result_data_list)
    return df

In [9]:
# Load fake news data set
# Note: Embedded words used the article title and body. All embeddings should have a length of maxSeqLength.
fakenews_df = tabulate_data('fakeNewsDataset')
fakenews_df.head()

Unnamed: 0,dataset_name,file_name,is_fake,news_all,news_category,news_content,news_embed,news_headline,news_type,num_embed_words
0,fakeNewsDataset,entmt04.fake.txt,1,He is not very familiar with winning awards f...,entmt,He is not very familiar with winning awards f...,"[18, 14, 36, 191, 3478, 17, 877, 1542, 10, 26,...",,fake,200
1,fakeNewsDataset,edu39.fake.txt,1,Companies and Colleges can get the U.S. Workin...,edu,The U.S. Labor Department announced last Fri...,"[337, 5, 4759, 86, 169, 0, 95, 500, 378, 0, 95...",Companies and Colleges can get the U.S. Workin...,fake,200
2,fakeNewsDataset,tech031.fake.txt,1,"Instagram adds futuristic authentication, faci...",tech,Instagram has added on some futuristic feature...,"[109262, 2144, 20746, 30854, 12662, 3275, 5, 3...","Instagram adds futuristic authentication, faci...",fake,200
3,fakeNewsDataset,tech024.fake.txt,1,Toyota sues Microsoft for contract breach ...,tech,Automobile manufacturer Toyota had signed ...,"[3951, 25115, 2058, 10, 953, 7218, 6190, 5023,...",Toyota sues Microsoft for contract breach,fake,200
4,fakeNewsDataset,polit20.fake.txt,1,"In Second Debate, Donald Trump and Hillary Cli...",polit,Hillary Clinton and Donald Trump clashed in ...,"[6, 126, 1422, 3907, 10468, 5, 4539, 443, 3008...","In Second Debate, Donald Trump and Hillary Cli...",fake,200


In [None]:
# # Confirm embeddings worked properly. The index of the first 5 words should line up!
# print(fakenews_df.loc[0]['news_all'])
# print(fakenews_df.loc[0]['news_embed'])
# print(wordsList.index("FBI".lower()))
# print(wordsList.index("investigates".lower()))
# print(wordsList.index("computer".lower()))
# print(wordsList.index("link".lower()))
# print(wordsList.index("between".lower()))

## Build Long Short Term Memory (LSTM) Model

We will use TensorFlow to build and train a LSTM model which is capable if producing a binary classifier of fake or not fake, for each news artcile.

Internal team note: The Oriole LSTM notebook (in /models/LSTM_Classification) has a great explaination of deep learning, recurrent neural networks, LSTMs, word embeddings etc. We can rely heavily on this if we want to explain things in detail in our paper. I recommend reading through that notebook.

In [10]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [11]:
# Indicate news df to use
news_df = fakenews_df

# Embedded word vector lookup. Convert from list to numpy array
wordVectors = np.asarray(embeddings)

# Split news articles and classification into test and train sets
newsVectors, newsVectors_test, classVector, classVector_test = \
    train_test_split(news_df['news_embed'],
                     news_df['is_fake'],
                     test_size = .2,
                     random_state = 1)

newsVectors, newsVectors_test, classVector, classVector_test = \
    np.asarray(newsVectors), \
    np.asarray(newsVectors_test), \
    np.asarray(classVector), \
    np.asarray(classVector_test)

In [13]:
# Helper functions for training model
# The label is converted to 2 dimensions. First column is flagged with 1 if fake. Second column is 1 if real.
# since our output is a 2 way classification

# def getTrainBatchOld2(ids, labels, batch_num):
#     start_idx = batch_num * batchSize
#     end_idx = start_idx + batchSize
#     out_array = np.zeros([batchSize, maxSeqLength])
#     out_labels = []
#     out_array_idx = 0
#     for i in range(start_idx, end_idx):
#         if labels[i] == 0:
#             out_labels.append([0, 1])
#         else:
#             out_labels.append([1, 0])
#         out_array[out_array_idx] = ids[i]
#         out_array_idx += 1
#     return np.asarray(out_array), np.asarray(out_labels)

def getTrainBatch(ids, label):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    # Get indexes of real and fake news and shuffle them
    real = list(np.where(label==0)[0])
    fake = list(np.where(label==1)[0])
    random.shuffle(real)
    random.shuffle(fake)
    
    # If batch size an even number. Split evenly fake and real
    if (batchSize % 2 == 0):
        comb = real[0:int(batchSize/2)] + fake[0:int(batchSize/2)]
        # Repeat the labels
        labels = [[0,1]] * int(batchSize/2) + [[1,0]] * int(batchSize/2)
    # If batch size an odd number. Split evenly fake and real, and add 1 fake
    else:
        comb = real[0:int(batchSize/2)] + fake[0:int(batchSize/2)]
        comb = comb + fake[int(batchSize/2):int(batchSize/2)+1]
        labels = [[0,1]] * int(batchSize/2) + [[1,0]] * int(batchSize/2)
        labels = labels +  [[1,0]]
    
    for i in range(len(comb)):
        arr[i] = ids[comb[i]]
          
    return arr, np.asarray(labels)

# def getTrainBatchOld1(ids, label):
#     labels = []
#     arr = np.zeros([batchSize, maxSeqLength])
#     for i in range(batchSize):
        
#         # Select an even number of fake and real news for every batch
#         if (i % 2 == 0):
#             # Randomly select from real news
#             num = random.choice(list(np.where(label==0)[0]))
#             arr[i] = ids[num]
#             labels.append([0,1])
#         else:
#             # Randomly select from fake news
#             num = random.choice(list(np.where(label==1)[0]))
#             arr[i] = ids[num]
#             labels.append([1,0])
       
#     return arr, np.asarray(labels)

# Use all test data. Make sure batch size = length(test data) because I did not make batch size dynamic
def getTestBatch(ids, label):
    labels = []
    num_test = len(label)
    arr = np.zeros([num_test, maxSeqLength])
    for i in range(num_test):
        arr[i] = ids[i]
        if label[i] == 0:
            labels.append([0,1])
        else:
            labels.append([1,0])
            
    return arr, np.asarray(labels) 

In [14]:
# Hyperparameters - We can turn up the batchsize and iterations when we want to train much more
# Making batchSize equal to length of the test set, so when obtain results it uses entire test set
batchSize = len(classVector_test)
lstmUnits = 64
numClasses = 2
iterations = 20
learning_rate = 0.001

In [15]:
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

In [16]:
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)

In [17]:
# This is slightly different than example workbook.
# I cast the data as a float in order to get it to work: tf.cast(data,tf.float32)
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits, forget_bias=0.0)
# Add dropout or not?
# lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, final_h_ = tf.nn.dynamic_rnn(lstmCell, tf.cast(data,tf.float32), dtype=tf.float32)

In [18]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.0, shape=[numClasses]))
# Transpose rows and columns (0-->1, 1-->0, 2-->2)
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [19]:
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [20]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=prediction)) 
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [21]:
# %pdb

# Takes approximately 1 hour to run. (4 vCPUs, 15 GB memory)
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# Set up Tensorboard
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

# Train model
for i in range(iterations):
    #Next Batch of reviews
#     if len(newsVectors) % int(batchSize) == 0:
#         num_batches = int(len(newsVectors) / batchSize)
#     else:
#         # TODO: fix this to pad
#         num_batches = int(len(newsVectors) // batchSize)
#     for b in range(num_batches):
#         nextBatch, nextBatchLabels = getTrainBatchNew(newsVectors, classVector, b)
#         sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
    nextBatch, nextBatchLabels = getTrainBatch(newsVectors, classVector)
    
    # Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

    #Save the network every 10,000 training iterations, or on last iteration
    if ((i % 10000 == 0 and i != 0) or i == iterations - 1):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()

saved to models/pretrained_lstm.ckpt-19


In [22]:
sess.close()

## Test Model

In [23]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

INFO:tensorflow:Restoring parameters from models/pretrained_lstm.ckpt-19


In [24]:
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch(newsVectors_test, classVector_test);
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375
Accuracy for this batch: 59.375


In [25]:
# View confusion matrix of one batch
nextBatch, nextBatchLabels = getTestBatch(newsVectors_test, classVector_test)
predictions = sess.run(prediction, {input_data: nextBatch})
predictions = [p[0] > p[1] for p in predictions]
classVector_test
print(confusion_matrix(classVector_test, predictions))
print("Accuracy:", sum(classVector_test==predictions) / len(predictions))

[[56  0]
 [39  1]]
Accuracy: 0.59375
