Application of LSTM and GRU Recurrent Neural Networks in Fake NEWS detection

In [1]:
# importing necessary libraries 
import pandas as pd
import tensorflow as tf
import os
import re
import numpy as np
from string import punctuation
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# importing neural network libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D

In [3]:
train = pd.read_csv('../fake-news/all_data_fake_news.csv')
train_data = train.copy()

In [4]:
train_data = train_data.set_index('id', drop = True)

KeyError: "None of ['id'] are in the columns"

In [5]:
print(train_data.shape)
train_data.head()

(76933, 4)


Unnamed: 0,Source,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [6]:
# checking for missing values
train_data.isnull().sum()

Source      1965
Headline    1148
Body         120
Label          0
dtype: int64

In [7]:
# dropping missing values from text columns alone. 
train_data[['Headline', 'Source']] = train_data[['Headline', 'Source']].fillna(value = 'Missing')
train_data = train_data.dropna()
train_data.isnull().sum()

Source      0
Headline    0
Body        0
Label       0
dtype: int64

In [9]:
length = []
[length.append(len(str(text))) for text in train_data['Body']]
train_data['length'] = length
train_data.head()

Unnamed: 0,Source,Headline,Body,Label,length
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,6412
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,3325
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,6580
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,630
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,213


In [10]:
min(train_data['length']), max(train_data['length']), round(sum(train_data['length'])/len(train_data['length']))

(1, 142961, 3270)

we can keep 4500 as max features for training the neural network.

**minimum length is 1 ?? Looks like there are some outliers.**

In [11]:
len(train_data[train_data['length'] < 50])

1054

**There are 1054 outliers in this dataset. Outliers can be removed. It is a good practice to check the outliers before removing them**

In [14]:
train_data['Body'][train_data['length'] < 50]

4019                                                      
4084     \n\nMindblowing Reason Elites Fear Donald Trum...
4085                                            Brilliant!
4108                                                    \n
4114     Cloud Centric (C2) Crime Scene Investigation \n\n
                               ...                        
55079                                      Guest   Guest  
55091         \nOctober 28, 2016 The Mothers by stclair by
55171                                                     
55293                                   Trump all the way!
64376                                                     
Name: Body, Length: 1054, dtype: object

*Mostly empty texts. They can be removed since they will surely guide the neural network in the wrong way*

In [15]:
# dropping the outliers
train_data = train_data.drop(train_data['Body'][train_data['length'] < 50].index, axis = 0)

In [16]:
min(train_data['length']), max(train_data['length']), round(sum(train_data['length'])/len(train_data['length']))

(50, 142961, 3315)

In [17]:
max_features = 4500

Preprocessing the Text before feeding it into the neural networks

In [18]:
# Tokenizing the text - converting the words, letters into counts or numbers. 
# We dont need to explicitly remove the punctuations. we have an inbuilt option in Tokenizer for this purpose
tokenizer = Tokenizer(num_words = max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(texts = train_data['Body'])
X = tokenizer.texts_to_sequences(texts = train_data['Body'])

In [19]:
# now applying padding to make them even shaped.
X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')

In [20]:
print(X.shape)
y = train_data['Label'].values
print(y.shape)

(75759, 4500)
(75759,)


In [21]:
# splitting the data training data for training and validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

We got our training data preprocessed and ready for training the neural network. 

We have to create a neural network now

In [22]:
# LSTM Neural Network
lstm_model = Sequential(name = 'lstm_nn_model')
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 120,  activation = 'relu', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
lstm_model.add(layer = Dense(units = len(set(y)),  activation = 'sigmoid', name = 'output_layer'))
# compiling the model
lstm_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [24]:
lstm_model_fit = lstm_model.fit(X_train, y_train, epochs = 1)

Train on 60607 samples


Constructing GRU Neural Network

In [25]:
# GRU neural Network
gru_model = Sequential(name = 'gru_nn_model')
gru_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
gru_model.add(layer = GRU(units = 120, dropout = 0.2, 
                          recurrent_dropout = 0.2, recurrent_activation = 'relu', 
                          activation = 'relu', name = '2nd_layer'))
gru_model.add(layer = Dropout(rate = 0.4, name = '3rd_layer'))
gru_model.add(layer = Dense(units = 120, activation = 'relu', name = '4th_layer'))
gru_model.add(layer = Dropout(rate = 0.2, name = '5th_layer'))
gru_model.add(layer = Dense(units = len(set(y)), activation = 'softmax', name = 'output_layer'))
# compiling the model
gru_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
gru_model.summary()

In [27]:
gru_model_fit = gru_model.fit(X_train, y_train, epochs = 1)

Train on 60607 samples


## Now preparing the test dataset

In [None]:
print(test.shape)
test_data = test.copy()
print(test_data.shape)

In [None]:
test_data = test_data.set_index('id', drop = True)
test_data.shape

Filling the Missing values

In [None]:
test_data = test_data.fillna(' ')
print(test_data.shape)
test_data.isnull().sum()

In [28]:
tokenizer.fit_on_texts(texts = test_data['text'])
test_text = tokenizer.texts_to_sequences(texts = test_data['text'])

In [29]:
test_text = pad_sequences(sequences = test_text, maxlen = max_features, padding = 'pre')

Prediction:

In [30]:
lstm_prediction = lstm_model.predict_classes(test_text)

The LSTM predictions have more accuracy.

In [31]:
submission = pd.DataFrame({'id':test_data.index, 'label':lstm_prediction})
submission.shape

(5200, 2)

In [32]:
submission.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1


In [33]:
submission.to_csv('submission.csv', index = False)