## Fake News Classifier Using Bidirectional LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd

In [2]:
df=pd.read_json('/content/Sarcasm_Headlines_Dataset_v2.json', lines = True)

In [3]:
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
###Drop Nan Values
df=df.dropna()


In [8]:
## Get the Independent Features

X=df.drop(['article_link','is_sarcastic'],axis=1)

In [9]:
## Get the Dependent features
y=df['is_sarcastic']

In [10]:
y.value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

In [11]:
X.shape

(28619, 1)

In [12]:
y.shape

(28619,)

In [13]:
import tensorflow as tf

In [14]:
tf.__version__

'2.8.2'

In [15]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [16]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [17]:
messages=X.copy()

In [18]:
messages['headline'][1]

'dem rep. totally nails why congress is falling short on gender, racial equality'

In [19]:
messages.reset_index(inplace=True)

In [20]:
import nltk
import re
from nltk.corpus import stopwords

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['headline'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
23619
23620
23621
23622
23623
23624
23625
23626
23627
23628
23629
23630
23631
23632
23633
23634
23635
23636
23637
23638
23639
23640
23641
23642
23643
23644
23645
23646
23647
23648
23649
23650
23651
23652
23653
23654
23655
23656
23657
23658
23659
23660
23661
23662
23663
23664
23665
23666
23667
23668
23669
23670
23671
23672
23673
23674
23675
23676
23677
23678
23679
23680
23681
23682
23683
23684
23685
23686
23687
23688
23689
23690
23691
23692
23693
23694
23695
23696
23697
23698
23699
23700
23701
23702
23703
23704
23705
23706
23707
23708
23709
23710
23711
23712
23713
23714
23715
23716
23717
23718
23719
23720
23721
23722
23723
23724
23725
23726
23727
23728
23729
23730
23731
23732
23733
23734
23735
23736
23737
23738
23739
23740
23741
23742
23743
23744
23745
23746
23747
23748
23749
23750
23751
23752
23753
23754
23755
23756
23757
23758
23759
23760
23761
23762
23763
23764
23765
23766
23767
23768
23769
23770
23771
23772
23773
23774

In [24]:
corpus

['thirtysometh scientist unveil doomsday clock hair loss',
 'dem rep total nail congress fall short gender racial equal',
 'eat veggi delici differ recip',
 'inclement weather prevent liar get work',
 'mother come pretti close use word stream correctli',
 'white inherit',
 'way file tax less stress',
 'richard branson global warm donat nearli much cost fail balloon trip',
 'shadow govern get larg meet marriott confer room b',
 'lot parent know scenario',
 'lesbian consid father indiana amaz one',
 'amanda peet told daughter sex special hug',
 'know regard current treatment ebola',
 'chri christi suggest hillari clinton blame boko haram kidnap hundr schoolgirl',
 'ford develop new suv run pure gasolin',
 'uber ceo travi kalanick step trump econom advisori council',
 'area boy enter jump touch top doorway phase',
 'area man travel gurney',
 'leav person disabl behind',
 'lin manuel miranda would like remind put phone away',
 'journalist kill target intern press rise',
 'guard video game 

In [25]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[3881, 1581, 1802, 4288, 2579, 2582, 1150],
 [2794, 3284, 1588, 1307, 2744, 2608, 1271, 1745, 4512, 837],
 [3384, 3075, 1025, 1956, 724],
 [4442, 2500, 2971, 2842, 701, 393],
 [1788, 4495, 1359, 208, 4435, 3131, 1315, 211],
 [3588, 3435],
 [4693, 3801, 434, 2657, 1437],
 [4917, 4479, 858, 3694, 3164, 1317, 4460, 2331, 4588, 4830, 4214],
 [3854, 1826, 701, 4006, 4748, 1321, 4615, 516, 410],
 [826, 3232, 2339, 2382],
 [3756, 4103, 1264, 4907, 1187, 4935],
 [630, 4949, 3142, 4627, 923, 4809, 1081],
 [2339, 248, 2525, 1071, 1880],
 [4386, 201, 3140, 4774, 1340, 2224, 4797, 1037, 162, 4279, 3622],
 [2348, 2036, 4423, 1672, 2900, 3648, 1276],
 [4630, 1697, 1113, 1001, 2964, 1388, 3829, 3802, 512],
 [1834, 1794, 3823, 4452, 2650, 5, 16, 3549],
 [1834, 2002, 10, 3208],
 [622, 966, 1702, 776],
 [4465, 3538, 2088, 737, 2001, 1159, 682, 3226, 3733],
 [178, 773, 529, 2448, 4452, 2101],
 [3881, 3790, 868, 2766, 3349, 240, 4456, 3159, 316],
 [2855],
 [4478, 578, 3198, 1162, 2672],
 [1995, 1340, 216

### Embedding Representation

In [26]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 2579 2582 1150]
 [   0    0    0 ... 1745 4512  837]
 [   0    0    0 ... 1025 1956  724]
 ...
 [   0    0    0 ...   35 1215 3130]
 [   0    0    0 ...  146 3452 1772]
 [   0    0    0 ...  205 2000 3513]]


In [27]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 3881, 1581, 1802, 4288, 2579, 2582, 1150], dtype=int32)

In [28]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            200000    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 201       
                                                                 
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
len(embedded_docs),y.shape

(28619, (28619,))

In [33]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [34]:
X_final.shape,y_final.shape

((28619, 20), (28619,))

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [36]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffa781c1f10>

### Performance Metrics And Accuracy

In [67]:
y_pred1 = (model.predict(X_test) > 0.5).astype("int32")

In [68]:
from sklearn.metrics import confusion_matrix

In [69]:
confusion_matrix(y_test,y_pred1)

array([[1709, 3207],
       [1754, 2775]])

In [70]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)

0.47474854420328216

In [71]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.49      0.35      0.41      4916
           1       0.46      0.61      0.53      4529

    accuracy                           0.47      9445
   macro avg       0.48      0.48      0.47      9445
weighted avg       0.48      0.47      0.47      9445

