Application of LSTM and GRU Recurrent Neural Networks in Fake News Detection

In [9]:
# importing necessary libraries 
import pandas as pd
import tensorflow as tf
import os
import re
import numpy as np
from string import punctuation
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [10]:
# importing neural network libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D

In [11]:
# Read the raw fake-news train/test CSVs, then work on copies so the untouched
# frames (`train`, `test`) stay available to later cells.
train, test = (pd.read_csv(csv_path) for csv_path in ('./fake-news/train.csv', './fake-news/test.csv'))
train_data, test_data = train.copy(), test.copy()

In [12]:
# Promote the `id` column to the row index (the column is removed from the frame).
train_data = train_data.set_index(keys = 'id')

In [13]:
# Dimensions first, then a preview of the leading rows.
print(train_data.shape)
train_data.head(n = 5)

(20800, 4)


Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [14]:
# Same sanity check for the test frame: dimensions, then the leading rows.
print(test_data.shape)
test_data.head(n = 5)

(5200, 4)


Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [15]:
# Count the missing entries in every column of the training frame.
train_data.isna().sum(axis = 0)

title      558
author    1957
text        39
label        0
dtype: int64

Out of roughly 20,800 training samples, only 39 are missing the `text` value, so those rows can simply be dropped. Missing `title` and `author` values are filled with a placeholder instead of being dropped.

In [16]:
# Rows without a title/author are kept by substituting a placeholder, so the
# dropna() that follows only discards rows that have no article text.
train_data = train_data.fillna(value = {'title': 'Missing', 'author': 'Missing'})
train_data = train_data.dropna(axis = 0, how = 'any')
# Verify that no missing values remain.
train_data.isna().sum()

title     0
author    0
text      0
label     0
dtype: int64

In [17]:
# Character count of every article body; str() guards against non-string entries.
# Built with a plain list comprehension instead of a comprehension that is run
# only for its `append` side effect (which also created a throwaway list of None).
length = [len(str(text)) for text in train_data['text']]
train_data['length'] = length
train_data.head()

Unnamed: 0_level_0,title,author,text,label,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,4930
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,4160
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,7692
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,3237
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,938


In [18]:
# (shortest, longest, rounded mean) article length in characters.
int(train_data['length'].min()), int(train_data['length'].max()), round(sum(train_data['length']) / train_data.shape[0])

(1, 142961, 4553)

The average text length is roughly 4,500 characters, so we keep 4500 as `max_features` for training the neural network.

**The minimum length is 1? It looks like there are some outliers.**

In [19]:
# Number of articles whose body is shorter than 50 characters.
int((train_data['length'] < 50).sum())

207

**There are 207 outliers in this dataset. They can be removed, but it is good practice to inspect outliers before removing them.**

In [20]:
# Inspect the bodies of the articles shorter than 50 characters before dropping them.
train_data.loc[train_data['length'] < 50, 'text']

id
82                                                   
169                                                  
173                                   Guest   Guest  
196            They got the heater turned up on high.
295                                                  
                             ...                     
20350                         I hope nobody got hurt!
20418                                 Guest   Guest  
20431    \nOctober 28, 2016 The Mothers by stclair by
20513                                                
20636                              Trump all the way!
Name: text, Length: 207, dtype: object

*These are mostly empty texts. They can be removed, since they would only mislead the neural network.*

In [21]:
# Remove every row whose article body is shorter than 50 characters.
train_data = train_data.drop(index = train_data[train_data['length'] < 50].index)

In [22]:
# Re-check (shortest, longest, rounded mean) length after removing the short texts.
int(train_data['length'].min()), int(train_data['length'].max()), round(sum(train_data['length']) / train_data.shape[0])

(50, 142961, 4598)

In [23]:
# Single size constant reused below as the Tokenizer `num_words`, the Embedding
# `input_dim`, and the `maxlen` passed to pad_sequences.
# NOTE(review): 4500 comes from the average *character* length of the articles,
# whereas all of these uses are counted in words/tokens — confirm this is intended.
max_features = 4500

Preprocessing the text before feeding it into the neural networks

In [24]:
# Turn each article into a sequence of integer word ids.
# Punctuation does not need to be stripped by hand: the Tokenizer removes every
# character listed in `filters`, lower-cases the text and splits it on spaces.
# `num_words` bounds how many of the most frequent words are kept when encoding.
tokenizer = Tokenizer(
    num_words = max_features,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower = True,
    split = ' ',
)
tokenizer.fit_on_texts(train_data['text'])
X = tokenizer.texts_to_sequences(train_data['text'])

In [25]:
# Bring every sequence to the same length (`max_features` entries) so the result
# is a rectangular array; padding is added at the front of each sequence.
X = pad_sequences(X, maxlen = max_features, padding = 'pre')

In [26]:
# Extract the target labels as a NumPy array and confirm that the feature
# matrix and the label vector have the same number of rows.
y = train_data['label'].to_numpy()
print(X.shape)
print(y.shape)

(20554, 4500)
(20554,)


In [27]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 101,
)

Our training data is now preprocessed and ready for training the neural network.

Next, we have to create the neural network.

In [28]:
# LSTM neural network: Embedding -> LSTM -> Dropout -> Dense(relu) -> Dropout -> Dense(softmax)
lstm_model = Sequential(name = 'lstm_nn_model')
# Map each word id (vocabulary bounded by max_features) to a 120-dimensional vector.
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 120,  activation = 'relu', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
# One output unit per class. The loss below is sparse categorical cross-entropy,
# which expects a probability distribution over the classes, so the output uses
# softmax (as the GRU model does) rather than independent per-unit sigmoids.
lstm_model.add(layer = Dense(units = len(set(y)),  activation = 'softmax', name = 'output_layer'))
# compiling the model
lstm_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

2022-11-30 17:37:10.313020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-30 17:37:10.313335: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-30 17:37:10.313373: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ankurpes2ug20cs054): /proc/driver/nvidia/version does not exist
2022-11-30 17:37:10.314409: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
# Train for one epoch and evaluate on the held-out split at the end of the epoch,
# so the returned History also carries val_loss / val_accuracy (the split created
# by train_test_split was otherwise never used).
lstm_model_fit = lstm_model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 1,
)

  9/514 [..............................] - ETA: 1:03:49 - loss: 0.6894 - accuracy: 0.5069

KeyboardInterrupt: 

Constructing GRU Neural Network

In [None]:
# GRU neural network: Embedding -> GRU -> Dropout -> Dense(relu) -> Dropout -> Dense(softmax)
gru_model = Sequential(name = 'gru_nn_model')
gru_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
# The update/reset gates must stay within [0, 1], so the recurrent activation is
# sigmoid; an unbounded 'relu' there lets the gates exceed 1 and destabilises the
# state update. The candidate state uses the bounded 'tanh' for the same reason:
# with very long input sequences an unbounded activation can blow up.
gru_model.add(layer = GRU(units = 120, dropout = 0.2,
                          recurrent_dropout = 0.2, recurrent_activation = 'sigmoid',
                          activation = 'tanh', name = '2nd_layer'))
gru_model.add(layer = Dropout(rate = 0.4, name = '3rd_layer'))
gru_model.add(layer = Dense(units = 120, activation = 'relu', name = '4th_layer'))
gru_model.add(layer = Dropout(rate = 0.2, name = '5th_layer'))
# One softmax output unit per class, matching the sparse categorical loss below.
gru_model.add(layer = Dense(units = len(set(y)), activation = 'softmax', name = 'output_layer'))
# compiling the model
gru_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the GRU model.
gru_model.summary()

Model: "gru_nn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_layer (Embedding)        (None, None, 120)         540000    
_________________________________________________________________
2nd_layer (GRU)              (None, 120)               86760     
_________________________________________________________________
3rd_layer (Dropout)          (None, 120)               0         
_________________________________________________________________
4th_layer (Dense)            (None, 120)               14520     
_________________________________________________________________
5th_layer (Dropout)          (None, 120)               0         
_________________________________________________________________
output_layer (Dense)         (None, 2)                 242       
Total params: 641,522
Trainable params: 641,522
Non-trainable params: 0
________________________________________________

In [None]:
# Train for one epoch and evaluate on the held-out split at the end of the epoch,
# so the returned History also carries val_loss / val_accuracy (the split created
# by train_test_split was otherwise never used).
gru_model_fit = gru_model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 1,
)



Now preparing the test dataset

In [None]:
# Start again from a fresh copy of the raw test frame and confirm that the copy
# has the same dimensions as the original.
test_data = test.copy()
print(test.shape)
print(test_data.shape)

(5200, 4)
(5200, 4)


In [None]:
# Promote the `id` column to the row index (the column is removed from the frame),
# then show the resulting dimensions.
test_data = test_data.set_index(keys = 'id')
test_data.shape

(5200, 3)

**Filling the Missing values**

In [None]:
# Test rows cannot be dropped (every id needs a prediction), so missing values
# are replaced by a single space instead.
test_data = test_data.fillna(value = ' ')
print(test_data.shape)
# Verify that no missing values remain.
test_data.isna().sum()

(5200, 3)


title     0
author    0
text      0
dtype: int64

In [None]:
# Encode the test articles with the tokenizer that was fitted on the training text.
# Do NOT call fit_on_texts() again here: refitting updates the word counts and
# rebuilds the word -> index mapping, so test tokens would be mapped to indices
# different from the ones the models were trained on (and it leaks test-set
# vocabulary into the preprocessing).
test_text = tokenizer.texts_to_sequences(texts = test_data['text'])

In [None]:
# Pad the test sequences at the front to `max_features` entries, the same
# settings that were applied to the training sequences.
test_text = pad_sequences(test_text, maxlen = max_features, padding = 'pre')

Prediction:

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6, so calling it raises
# AttributeError on current versions. Take the arg-max over the per-class scores
# returned by `predict` to obtain one integer class label per test article.
lstm_prediction = np.argmax(lstm_model.predict(test_text), axis = -1)

The LSTM predictions are more accurate, so they are used for the submission.

In [None]:
# Pair every test article id with its predicted label, then show the frame's dimensions.
submission = pd.DataFrame(
    data = {
        'id': test_data.index,
        'label': lstm_prediction,
    }
)
submission.shape

(5200, 2)

In [None]:
# Preview the leading rows of the submission frame.
submission.head(n = 5)

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1


In [None]:
# Write the id/label pairs to disk without the positional row index.
submission.to_csv(path_or_buf = 'submission.csv', index = False)