## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd

In [2]:
df_train=pd.read_csv('train.csv')

In [3]:
df_train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df_train.shape

(20800, 5)

In [5]:
df_train.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


In [6]:
df_train=df_train.dropna()

In [7]:
df_train.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [8]:
df_train.shape

(18285, 5)

In [9]:
X_train=df_train["title"]
y_train=df_train["label"]

In [10]:
X_train.shape,y_train.shape

((18285,), (18285,))

In [11]:
df_test=pd.read_csv('test.csv')


In [12]:
df_test.columns

Index(['id', 'title', 'author', 'text'], dtype='object')

In [13]:
df_test.shape

(5200, 4)

In [14]:
df_test.isnull().sum()

Unnamed: 0,0
id,0
title,122
author,503
text,7


In [15]:
df_test=df_test.dropna()

In [16]:
df_test.head()


Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."


In [17]:
X_test=df_test["title"]



In [18]:
X_test.shape


(4575,)

In [19]:
import tensorflow as tf

In [20]:
tf.__version__

'2.17.1'

In [21]:

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense


#Text processing using nltk lib



In [22]:
import nltk
import re
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer  ## stemming purpose
from nltk.corpus import stopwords  # Stopwords
ps = PorterStemmer()
corpus = []
corpus_test=[]
for i in range(0, len(X_train)):
  review = re.sub('[^a-zA-Z]', ' ', X_train.iloc[i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [105]:

corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

In [27]:
corpus_test=[]
for i in range(0, len(X_test)):
  review = re.sub('[^a-zA-Z]', ' ', X_test.iloc[i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus_test.append(review)

In [106]:
corpus_test[:5]

['specter trump loosen tongu purs string silicon valley new york time',
 'nodapl nativ american leader vow stay winter file lawsuit polic',
 'tim tebow attempt anoth comeback time basebal new york time',
 'keiser report meme war e',
 'pelosi call fbi investig find russian donald trump breitbart']

#Onehot Representation

In [107]:
## vocabulary size = 1000
onehot_repr=[one_hot(words,1000) for words in corpus]
onehot_repr[:5]

[[814, 95, 305, 792, 60, 99, 369, 458, 595, 940],
 [615, 713, 783, 850, 763, 556, 184],
 [269, 861, 787, 844],
 [993, 154, 24, 204, 303, 360],
 [456, 763, 745, 486, 794, 850, 763, 836, 152, 370]]

In [31]:
onehot_repr_test=[one_hot(words,1000) for words in corpus_test]

In [32]:
corpus[1]


'flynn hillari clinton big woman campu breitbart'

In [33]:
onehot_repr[1]

[615, 713, 783, 850, 763, 556, 184]

#Embedding Representation

In [34]:
sent_length = 20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)


[[  0   0   0 ... 458 595 940]
 [  0   0   0 ... 763 556 184]
 [  0   0   0 ... 861 787 844]
 ...
 [  0   0   0 ... 706 829 747]
 [  0   0   0 ... 417  75 926]
 [  0   0   0 ... 150 961 849]]


In [35]:
embedded_docs_test=pad_sequences(onehot_repr_test,padding='pre',maxlen=sent_length)

In [36]:
## creating model
embedding_vector_feature=40
model=Sequential()
model.add(Embedding(1000,embedding_vector_feature,input_length=sent_length))
model.add(LSTM(100,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())



None


In [37]:
model.build(input_shape=(None, sent_length))  # None for batch size, sent_length for input length
model.summary()


In [38]:
len(embedded_docs),y_train.shape

(18285, (18285,))

In [39]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y_train)
X_test=np.array(embedded_docs_test)

In [40]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [41]:
X_test.shape

(4575, 20)

In [42]:
from sklearn.model_selection import train_test_split
X_train_split, X_test_split,y_train_split, y_test_split = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [43]:
## fianlly training
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X_train_split, y_train_split,
          validation_data=(X_test_split, y_test_split),
          epochs=50, batch_size=32,
          callbacks=[early_stopping])


Epoch 1/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 46ms/step - accuracy: 0.8188 - loss: 0.3820 - val_accuracy: 0.9032 - val_loss: 0.2219
Epoch 2/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 46ms/step - accuracy: 0.9191 - loss: 0.1967 - val_accuracy: 0.9060 - val_loss: 0.2243
Epoch 3/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 45ms/step - accuracy: 0.9301 - loss: 0.1722 - val_accuracy: 0.9072 - val_loss: 0.2178
Epoch 4/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 47ms/step - accuracy: 0.9389 - loss: 0.1567 - val_accuracy: 0.9087 - val_loss: 0.2228
Epoch 5/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 44ms/step - accuracy: 0.9468 - loss: 0.1423 - val_accuracy: 0.9107 - val_loss: 0.2313
Epoch 6/50
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 45ms/step - accuracy: 0.9542 - loss: 0.1282 - val_accuracy: 0.9094 - val_loss: 0.2279


<keras.src.callbacks.history.History at 0x79735d331450>

In [44]:
y_pred=model.predict(X_test_split)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step


In [112]:
y_pred=np.where(y_pred>0.5,1,0)

In [113]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [114]:
confusion_matrix(y_test_split,y_pred)

array([[3012,  407],
       [ 153, 2463]])

In [115]:

accuracy_score(y_test_split,y_pred)

0.9072079536039768

In [76]:
from sklearn.metrics import classification_report
print(classification_report(y_test_split,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      3419
           1       0.86      0.94      0.90      2616

    accuracy                           0.91      6035
   macro avg       0.90      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035



#Submission on kaggle competition

In [93]:
final_sub = pd.read_csv('test.csv')
final_sub=final_sub['id'].to_frame()

In [94]:
final_sub.columns

Index(['id'], dtype='object')

In [80]:
y_pred0= model.predict(X_test)

[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


In [95]:
df_test['label']=y_pred0
df_test=df_test[['id','label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['label']=y_pred0


In [96]:
df_test.columns

Index(['id', 'label'], dtype='object')

In [97]:
final_sub=final_sub.merge(df_test,on='id',how='left')

In [99]:
final_sub['label']=final_sub['label'].apply(lambda x: 1 if x>0.5 else 0)

In [103]:
final_sub.head()

Unnamed: 0,id,label
0,20800,0
1,20801,0
2,20802,1
3,20803,0
4,20804,1


In [104]:
final_sub.to_csv('submission.csv',index=False)