## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#
Download the train.csv and upload to google colab file

In [1]:
import pandas as pd

In [5]:
df=pd.read_csv('train.csv')

In [6]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
df.shape

(20800, 5)

In [8]:
# check if any null value
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
###Drop Nan Values
df=df.dropna()

In [10]:
df.shape

(18285, 5)

In [11]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [12]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [13]:
## Get the Dependent features
y=df['label']

In [14]:
X.shape

(18285, 4)

In [15]:
y.shape

(18285,)

In [16]:
import tensorflow as tf

In [17]:
tf.__version__

'2.15.0'

In [18]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

###  vocabulary size 
 We are saying we have total 5000 vocabulary i.e. unique words if we have more words it will consider top 5000 frquent word and index all word accordinly

In [20]:
### Vocabulary size to index the words
voc_size=5000

### Onehot Representation

In [21]:
messages=X.copy()

In [23]:
messages['title'][1] # checking the first title

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [31]:
messages.reset_index(inplace=True)

In [32]:
messages.head()

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [33]:
import nltk
import re
from nltk.corpus import stopwords

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Dataset Preprocessing


*   Using Stop words
*   Using Stemming

In [35]:
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [36]:
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

### One hot encoding

In [38]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr[:5]

[[961, 3723, 497, 3383, 947, 3843, 621, 3177, 1911, 935],
 [2799, 4414, 1497, 3334, 3799, 3247, 569],
 [3912, 1665, 816, 2762],
 [1356, 4479, 4727, 2723, 551, 3874],
 [1910, 3799, 533, 1306, 1512, 3320, 3799, 2221, 2620, 1858]]

In [39]:
print(corpus[1])
print(onehot_repr[1])

flynn hillari clinton big woman campu breitbart
[2799, 4414, 1497, 3334, 3799, 3247, 569]


### Embedding Representation

In [42]:
sent_length=20 # Fixed the sentence length, Assuming each sentence with length or no of words as 20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[ 961 3723  497 ...    0    0    0]
 [2799 4414 1497 ...    0    0    0]
 [3912 1665  816 ...    0    0    0]
 ...
 [ 661 2081  894 ...    0    0    0]
 [1177 3096  104 ...    0    0    0]
 [3051 2712 3527 ...    0    0    0]]


In [43]:
print(corpus[1])
print(onehot_repr[1])
print(embedded_docs[1])

flynn hillari clinton big woman campu breitbart
[2799, 4414, 1497, 3334, 3799, 3247, 569]
[2799 4414 1497 3334 3799 3247  569    0    0    0    0    0    0    0
    0    0    0    0    0    0]


## Creating Model with Embedding and LSTM Layer

In [44]:
## Creating model
embedding_vector_features=40 ##features representation 40 vector dimension

model=Sequential()

# Adding embedding layer
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))

# Adding LSTM Neuron Network layer with 100 Neurons
model.add(LSTM(100))

# Adding sigmoid as my output is binrary
model.add(Dense(1,activation='sigmoid'))

# Adding Optimiser ADAM
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [45]:
len(embedded_docs),y.shape

(18285, (18285,))

In [46]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [47]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Finally Training the Model

In [49]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x790ad5b0e500>

### Adding Dropout

Improve the performance of the model optional

In [50]:
# from tensorflow.keras.layers import Dropout
# ## Creating model
# embedding_vector_features=40
# model=Sequential()
# model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
# model.add(Dropout(0.3))
# model.add(LSTM(100))
# model.add(Dropout(0.3))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

### Performance Metrics And Accuracy

In [51]:
y_pred=model.predict(X_test)



In [52]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve putting thresold like if  >0.6 then 1 i.e. fake news

In [53]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[3149,  270],
       [ 317, 2299]])

In [54]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9027340513670257

In [55]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      3419
           1       0.89      0.88      0.89      2616

    accuracy                           0.90      6035
   macro avg       0.90      0.90      0.90      6035
weighted avg       0.90      0.90      0.90      6035

