### Fake News Classifier using LSTM
#### Saurabh Chatterjee

**Dataset**: https://www.kaggle.com/c/fake-news/data#

**Dataset Description:** 

**id**: unique id for a news article \
**title**: the title of a news article \
**author**: author of the news article \
**text**: the text of the article; could be incomplete \
**label**: a label that marks the article as potentially unreliable: 
- 1: unreliable
- 0: reliable

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf     # using version 2.10.1

In [42]:
df = pd.read_csv('fake_news_dataset/train.csv')
df.shape

(20800, 5)

In [43]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [44]:
# Check Null values:
df.isnull() .sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [45]:
# DROP the Null/NaN Values (since it is text data and adding new text data may distort performance)
df = df.dropna()
df.reset_index(inplace=True)         ## RESET INDEX After the ROWS DROPS ***

In [46]:
# Independent Features
X = df.drop('label', axis=1)        # Drop the Label Coluumn 
X.shape

(18285, 5)

In [47]:
# Label / Dependent feature
y = df['label']
y.shape

(18285,)

In [20]:
# Vocabulary Size
voc_size = 5000

In [48]:
messages = X.copy()
''' Going to consider only the 'Title' Column Features of Df messages for Fake News Classifier.'''

" Going to consider only the 'Title' Column Features of Df messages for Fake News Classifier."

#### Cleaning Data: Removing Symbols, Removing Stopwords and Lemmatization

In [22]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [49]:
messages['title']

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
18280    Rapper T.I.: Trump a ’Poster Child For White S...
18281    N.F.L. Playoffs: Schedule, Matchups and Odds -...
18282    Macy’s Is Said to Receive Takeover Approach by...
18283    NATO, Russia To Hold Parallel Exercises In Bal...
18284                            What Keeps the F-35 Alive
Name: title, Length: 18285, dtype: object

In [50]:
lemmatizer = WordNetLemmatizer()        # for Lemmatization
corpus = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])         # Replace Characters "OTHER THAN' (^) a-z and A-Z in the sentence   (Cleaning)
    review = review.lower()         # lower the Case
    review = review.split()         # Get the WORDS as a LIST (Split based on Space)

    # Removing Stop-Words and LEMMATIZATION:
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [51]:
corpus      # title data after cleaning

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified',
 'iranian woman jailed fictional unpublished story woman stoned death adultery',
 'jackie mason hollywood would love trump bombed north korea lack trans bathroom exclusive video breitbart',
 'beno hamon win french socialist party presidential nomination new york time',
 'back channel plan ukraine russia courtesy trump associate new york time',
 'obama organizing action partner soros linked indivisible disrupt trump agenda',
 'bbc comedy sketch real housewife isi cause outrage',
 'russian researcher discover secret nazi military base treasure hunter arctic photo',
 'u official see link trump russia',
 'yes paid government troll social medium blog forum website',
 'major league soccer argentine find home success new york time',
 'well fargo chief abruptly step new york time',
 'anonymous donor pay 

#### Vector Embeddings and Classification Model

In [None]:
from keras.preprocessing.text import one_hot                # One-Hot Encoder (Keras)
from keras_preprocessing.sequence import pad_sequences      # Pre and Post Padding

In [56]:
# One-Hot Representation:
onehot_repr = [one_hot(words, voc_size) for words in corpus]    # returns one-hot vector 1-INDICES AS A LIST of size voc_size (500) for each Sentence
onehot_repr

[[1166, 1183, 1524, 4073, 1011, 3555, 1871, 1907, 4486, 2798],
 [4656, 3600, 599, 4459, 329, 4518, 2028],
 [3534, 4745, 3732, 172],
 [2319, 1720, 1665, 4018, 75, 3023],
 [205, 329, 4190, 2538, 3606, 3838, 329, 1481, 1399, 235],
 [3975,
  123,
  4421,
  1105,
  4292,
  4840,
  121,
  4135,
  1898,
  2873,
  2439,
  4294,
  3650,
  2466,
  2028],
 [3431, 1298, 4539, 1219, 149, 1427, 4944, 2305, 1159, 3561, 1145],
 [1854, 119, 355, 4176, 4750, 220, 4840, 1102, 1159, 3561, 1145],
 [4298, 4876, 1175, 171, 3923, 1548, 3099, 3276, 4840, 4359],
 [132, 2168, 1756, 2629, 3890, 4026, 2466, 478],
 [3271, 4811, 3003, 2293, 978, 2311, 2955, 1945, 728, 3935, 3626],
 [4018, 2820, 1011, 2264, 4840, 4750],
 [1571, 282, 655, 3170, 3626, 1582, 4260, 51, 1798],
 [2803, 4149, 1010, 431, 4990, 1323, 75, 1159, 3561, 1145],
 [2662, 2051, 1644, 608, 3941, 1159, 3561, 1145],
 [2425, 4215, 4239, 2229, 787, 2134, 984, 3751, 1284, 4958],
 [58, 2056, 3600],
 [2500, 1199, 1648, 557, 4840, 1555, 366, 2028],
 [4855, 48

In [57]:
# PADDING: To make Length of all Sentences Equal

sent_length = 20     # set Max Sentence Length
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)       # Pre-Padding
print(embedded_docs)

[[   0    0    0 ... 1907 4486 2798]
 [   0    0    0 ...  329 4518 2028]
 [   0    0    0 ... 4745 3732  172]
 ...
 [   0    0    0 ... 1159 3561 1145]
 [   0    0    0 ... 3408  911  845]
 [   0    0    0 ... 3517 3837 4277]]


In [59]:
# To represent EACH WORD: Feature Vector Size (like Word2Vec)
embedding_dim = 40        # sets the Embedding Layer Size

In [None]:
from keras.layers import Embedding                          # Embedding Layer
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [63]:
model = Sequential()
model.add(Embedding(voc_size, embedding_dim, input_length=sent_length))      ## Creates EMBEDDING Weight Layer of DIMENSION: ** (voc_size, embedding_dim) ** (5000, 20)
model.add(LSTM(100))                            # LSTM
model.add(Dense(10, activation='relu'))
model.add(Dense(1,activation='sigmoid'))        # Classification layer

""" The Embedding Layer will take as input an integer matrix of size (batch, input_length=20), and the largest integer (i.e. word index) in the input 
should be no larger than vocabulary size. Now model.output_shape is (None, input_length=20, embedding_dim=40), where (input_length = sent_length = 20) and `None` is the batch dimension."""

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_3 (LSTM)               (None, 100)               56400     
                                                                 
 dense_4 (Dense)             (None, 10)                1010      
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 257,421
Trainable params: 257,421
Non-trainable params: 0
_________________________________________________________________
None


In [65]:
# For Training and Test
X_final = np.array(embedded_docs)       # shape: (18285, 20)
y_final = np.array(y)

X_final.shape, y_final.shape

((18285, 20), (18285,))

In [66]:
# Train-Test Split:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [67]:
# Model Training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1375b5abee0>

In [None]:
# Try adding DROPOUT
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [68]:
# Test Set Prediction
y_pred = model.predict(X_test)

# Set y=1 where y_pred output (probability) > 0.5 else 0
y_pred = np.where(y_pred > 0.5, 1, 0)



In [69]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)        # Confusion Matrix

array([[3115,  304],
       [ 231, 2385]], dtype=int64)

In [70]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9113504556752279
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3419
           1       0.89      0.91      0.90      2616

    accuracy                           0.91      6035
   macro avg       0.91      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035

