# Fake News Classifier Using LSTM

In [41]:
"""
@Author: Divyansh.Gupta
"""
import pandas as pd
import numpy as np

# Dataset Download it from [CLICK HERE](https://https://www.kaggle.com/c/fake-news/data#)

In [42]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [43]:
# Drop all Nan Values
data = data.dropna()

In [44]:
# Drop label variable to get independent variables
X = data.drop('label',axis=1)

In [45]:
# Get dependent variable
y = data['label']

In [46]:
print("Shape of independent features: {} and dependent features {}".format(X.shape,y.shape))

Shape of independent features: (18285, 4) and dependent features (18285,)


In [47]:
from keras.layers import Embedding, Dense, LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential

In [48]:
# Vocab Size 
voc_size = 5000

In [49]:
# Creating copy of dataset
message = X.copy() # 2 type of copy: Shallow and deep copy(). 
#Shallow copy the values and object while deep copy copies the reference of the object of value
message.reset_index(inplace=True)

In [50]:
import nltk # NLP processsing library
import re # Regular Expression 
from nltk.corpus import stopwords, wordnet # Get stopwords, wordnet from nltk
nltk.download('stopwords') # Download list of stopwords
nltk.download('wordnet') # Download list of wordnet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Text Preprocessing

Along with the preprocessing, i'll try you to show the difference between stemming and lemmetization 

In [51]:
# Preprocessing and Stemming
from nltk.stem.porter import PorterStemmer # Stemming
stemm = PorterStemmer() # create object of PorterStemmer
corpus=[] # List to add words to create corpus of words
for i in range(0,len(message)):
  rev = re.sub('[^a-zA-Z]'," ",message['title'][i]) # Replace all words with space except a-z and A-z
  rev = rev.lower() # Lowercase all the text so that "USA" and "usa" get same index
  rev = rev.split() # Split sentences
  rev = [stemm.stem(word) for word in rev if word not in stopwords.words('english')] # Doing stemming and removing stopwords
  rev = " ".join(rev) # Join all words to get sentences back
  corpus.append(rev) # Appended to corpus

In [52]:
# Preprocessing and Lemmetization
from nltk.stem import WordNetLemmatizer # Lemmetizing
lemm = WordNetLemmatizer() # Create object of WordNetLemmatizer
corpus1=[] # List to add words to create corpus of words
for i in range(0,len(message)):
  rev = re.sub('[^a-zA-Z]'," ",message['title'][i])# Replace all words with space except a-z and A-z
  rev = rev.lower()# Lowercase all the text so that "USA" and "usa" get same index
  rev = rev.split()# Split sentences
  rev = [lemm.lemmatize(word) for word in rev if word not in stopwords.words('english')]# Doing lemmatizing and removing stopwords
  rev = " ".join(rev)# Join all words to get sentences back
  corpus1.append(rev)# Appended to corpus

In [53]:
# Sentence after applying stemming
corpus[1]

'flynn hillari clinton big woman campu breitbart'

In [54]:
# Sentence after applying Lemmatization
corpus1[1]

'flynn hillary clinton big woman campus breitbart'

You can see the differences here.
lemmatization tried to get meaningful words while in stemming it just generate the root word that can be meaningful or meaningless

# One Hot Representation

In [55]:
# Applying One Hot Encoding
one_hot_rep = [one_hot(words, voc_size) for words in corpus]
one_hot_rep1 = [one_hot(words1, voc_size) for words1 in corpus1]

In [56]:
# one hot representation of first word in corpus after applying stemming
one_hot_rep[0]

[4711, 2233, 451, 721, 1711, 2815, 4043, 784, 3624, 1750]

In [57]:
# one hot representation of first word in corpus after applying lemmatizing
one_hot_rep1[0]

[1611, 2233, 3487, 721, 1711, 2815, 4043, 784, 3624, 1387]

# Embedding Representation

In [58]:
sent_len=20 # Max length of sentence
# Applying padding so that all vector would be of same length
# Cause LSTM always need vectors of same length
emb = pad_sequences(one_hot_rep,padding="pre",maxlen=sent_len) 
emb1 = pad_sequences(one_hot_rep1,padding="pre",maxlen=sent_len)

In [59]:
# Embedding representation of first word in corpus after applying stemming
emb[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 4711,
       2233,  451,  721, 1711, 2815, 4043,  784, 3624, 1750], dtype=int32)

In [60]:
# Embedding representation of first word in corpus after applying lemmatizing
emb1[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1611,
       2233, 3487,  721, 1711, 2815, 4043,  784, 3624, 1387], dtype=int32)

# Model Creation

In [61]:
# model
emb_vec_feature=40 # Output Vector size
model = Sequential() # Initializing Sequential Model
model.add(Embedding(voc_size,emb_vec_feature,input_length=sent_len)) # Adding embedding layer of vocab size * output vector size
model.add(LSTM(100)) # Adding LSTM layer with 100 neurons
model.add(Dense(1, activation="sigmoid")) # Output layer with 1 neuron having sigmoid to tell probability of each class
# Compile model with loss funtion Binary crossentropy and adam optimizer to minimize losses
model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# Summary of model
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [62]:
# Converting vectors into array
X_final = np.array(emb)
y_final = np.array(y)

In [63]:
# Splitting dataset into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final,y_final, test_size=0.33, random_state=42)

In [64]:
# Model Training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10,batch_size=64)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fea3c979470>

# Dropout
A single model can be used to simulate having a large number of different network architectures by randomly dropping out nodes during training. This is called dropout and offers a very computationally cheap and remarkably effective regularization method to reduce overfitting and improve generalization error in deep neural networks of all kinds.

Here, I'm just showing how to add dropout layer

In [65]:
# Adding Dropout
from keras.layers import Dropout
# model
emb_vec_feature1=40
model1 = Sequential()
model1.add(Embedding(voc_size,emb_vec_feature1,input_length=sent_len))
model1.add(Dropout(0.3))
model1.add(LSTM(100))
model1.add(Dropout(0.3))
model1.add(Dense(1, activation="sigmoid"))
model1.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 20, 40)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [66]:
X_final1 = np.array(emb1)
y_final1 = np.array(y)

In [67]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_final1,y_final1, test_size=0.33, random_state=42)

In [68]:
model1.fit(X_train1, y_train1, validation_data=(X_test1, y_test1), epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fea3c41b208>

In [69]:
# Predict labels for testing set
y_pred = model.predict_classes(X_test)
y_pred1 = model1.predict_classes(X_test1)



In [70]:
# Drawing Confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(confusion_matrix(y_test1,y_pred1))


[[3057  362]
 [ 195 2421]]
[[3130  289]
 [ 311 2305]]


In [84]:
# Accuracy Score
from sklearn.metrics import accuracy_score
print("Model:",accuracy_score(y_test,y_pred))
print()
print("Model with Dropout",accuracy_score(y_test1,y_pred1))

Model: 0.907705053852527

Model with Dropout 0.9005799502899752
