# Fake News Classification using Deep Learning!

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from tensorflow.keras.layers import Embedding,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional,GRU
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /home/dark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Reading data from csv
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [4]:
# Displaying rows and columns in dataset
print("There are {} number of rows and {} number of columns for training.".format(train.shape[0],train.shape[1]))
print("There are {} number of rows and {} number of columns for testing.".format(test.shape[0],test.shape[1]))

There are 20800 number of rows and 5 number of columns for training.
There are 5200 number of rows and 4 number of columns for testing.


In [5]:
# Checking the null values in training data.
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
# Checking the null values in testing data.
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [7]:
# Handling nan values in dataset using empty spaces
def handle_nan(train_data,test_data):
    '''Input: Data to the function containing Nan values.
       Output : Cleaned data containing no Nan values.
       Function: Cleaning Nan values.
     '''
    train = train_data.fillna(" ")
    test  = test_data.fillna(" ")
    return train,test

train,test = handle_nan(train,test)

In [8]:
# Creating a variable "merged" by merging columns "title" and "author"
train["merged"] = train["title"]+" "+train["author"]
test["merged"]  = test["title"]+" "+test["author"]

In [9]:
# Seperating Independent and dependent features
X = train.drop(columns=['label'],axis=1)
y = train['label']

In [10]:
# Creating One-Hot Representations
messages = X.copy()
messages.reset_index(inplace=True)
messages_test = test.copy()
messages_test.reset_index(inplace=True)

In [11]:
# Data Pre-processing
# Performing data preprocessing on column 'title'
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def perform_preprocess(data):
    '''Input: Data to be processed
       Output: Preprocessed data
    '''
    corpus = []
    for i in range(0,len(data)):
        review = re.sub('[^a-zA-Z]',' ',data['merged'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

train_corpus = perform_preprocess(messages)
test_corpus  = perform_preprocess(messages_test)
train_corpus[1]

'flynn hillari clinton big woman campu breitbart daniel j flynn'

In [12]:
test_corpus[1]

'russian warship readi strike terrorist near aleppo'

In [13]:
# Converting to one-hot repr.
vocab_size = 5000
one_hot_train = [one_hot(word,vocab_size) for word in train_corpus]
one_hot_test  = [one_hot(word,vocab_size) for word in test_corpus]

In [14]:
one_hot_test[1]

[2191, 3952, 2485, 3634, 4813, 106, 405]

In [15]:
# Embedding Representation 
sent_length = 20
embedd_docs_train = pad_sequences(one_hot_train,padding='pre',maxlen=sent_length)
embedd_docs_test  = pad_sequences(one_hot_test,padding='pre',maxlen=sent_length)
print(embedd_docs_train)

[[   0    0    0 ... 4687 1286  316]
 [   0    0    0 ... 2367 3594 2152]
 [   0    0    0 ...  672 4866 3157]
 ...
 [   0    0    0 ... 1878 4851 4768]
 [   0    0    0 ... 2094 2891  768]
 [   0    0    0 ...  525  621 2413]]


In [16]:
print(embedd_docs_test)

[[   0    0    0 ... 4352  621   40]
 [   0    0    0 ... 4813  106  405]
 [   0    0    0 ...  423 4107 2940]
 ...
 [   0    0    0 ... 4352 3598 1832]
 [   0    0    0 ... 2191 4791 1950]
 [   0    0    0 ... 4352 3114  842]]


In [17]:
# Converting Embedding repr. to array
x_final = np.array(embedd_docs_train)
y_final = np.array(y)
x_test_final = np.array(embedd_docs_test)
# Dimensions of prev. array repr.
x_final.shape,y_final.shape,x_test_final.shape

((20800, 20), (20800,), (5200, 20))

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.1, random_state=42, stratify = y_final)
X_train, x_valid, Y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42, stratify = y_train)
x_test_final = x_test_final

# Creating Models

### 1.  Logistic Regresssion

In [19]:
model_1 = LogisticRegression(max_iter=900)
model_1.fit(X_train,Y_train)
pred_1 = model_1.predict(x_test)
cr1    = classification_report(y_test,pred_1)
print(cr1)

              precision    recall  f1-score   support

           0       0.71      0.72      0.71      1039
           1       0.71      0.71      0.71      1041

    accuracy                           0.71      2080
   macro avg       0.71      0.71      0.71      2080
weighted avg       0.71      0.71      0.71      2080



### 2. Naive Bayes

In [20]:
model_2 = MultinomialNB()
model_2.fit(X_train,Y_train)
pred_2 = model_2.predict(x_test)
cr2    = classification_report(y_test,pred_2)
print(cr2)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      1039
           1       0.68      0.76      0.72      1041

    accuracy                           0.71      2080
   macro avg       0.71      0.71      0.70      2080
weighted avg       0.71      0.71      0.70      2080



In [21]:
### 3.  Decision Trees

In [22]:
model_3 = DecisionTreeClassifier()
model_3.fit(X_train,Y_train)
pred_3 = model_3.predict(x_test)
cr3    = classification_report(y_test,pred_3)
print(cr3)

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1039
           1       0.93      0.91      0.92      1041

    accuracy                           0.92      2080
   macro avg       0.92      0.92      0.92      2080
weighted avg       0.92      0.92      0.92      2080



In [23]:
### 4. Random Forest

In [24]:
model_4 = RandomForestClassifier()
model_4.fit(X_train,Y_train)
pred_4 = model_4.predict(x_test)
cr4    = classification_report(y_test,pred_4)
print(cr4)

              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1039
           1       0.90      0.98      0.94      1041

    accuracy                           0.93      2080
   macro avg       0.94      0.93      0.93      2080
weighted avg       0.94      0.93      0.93      2080



In [25]:
### 5. XGBOOST

In [26]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

model_5 = XGBClassifier()
model_5.fit(X_train,Y_train)
pred_5 = model_5.predict(x_test)
cr5    = classification_report(y_test,pred_5)
print(cr5)





              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1039
           1       0.98      0.99      0.98      1041

    accuracy                           0.98      2080
   macro avg       0.98      0.98      0.98      2080
weighted avg       0.98      0.98      0.98      2080



In [27]:
### 6.  Catboost

In [28]:
model_6 = CatBoostClassifier(iterations=200)
model_6.fit(X_train,Y_train)
pred_6 = model_5.predict(x_test)
cr6    = classification_report(y_test,pred_5)
print(cr6)

Learning rate set to 0.150531
0:	learn: 0.5559482	total: 65.3ms	remaining: 13s
1:	learn: 0.4853795	total: 87.6ms	remaining: 8.68s
2:	learn: 0.4108214	total: 104ms	remaining: 6.8s
3:	learn: 0.3750198	total: 120ms	remaining: 5.87s
4:	learn: 0.3452680	total: 140ms	remaining: 5.45s
5:	learn: 0.3236156	total: 158ms	remaining: 5.11s
6:	learn: 0.3114605	total: 176ms	remaining: 4.85s
7:	learn: 0.3009136	total: 191ms	remaining: 4.58s
8:	learn: 0.2930274	total: 206ms	remaining: 4.38s
9:	learn: 0.2838117	total: 224ms	remaining: 4.26s
10:	learn: 0.2775243	total: 242ms	remaining: 4.16s
11:	learn: 0.2711070	total: 258ms	remaining: 4.04s
12:	learn: 0.2644242	total: 276ms	remaining: 3.97s
13:	learn: 0.2605761	total: 295ms	remaining: 3.92s
14:	learn: 0.2554890	total: 310ms	remaining: 3.83s
15:	learn: 0.2519081	total: 327ms	remaining: 3.76s
16:	learn: 0.2433714	total: 345ms	remaining: 3.72s
17:	learn: 0.2399497	total: 360ms	remaining: 3.64s
18:	learn: 0.2344950	total: 375ms	remaining: 3.57s
19:	learn: 0

165:	learn: 0.0712161	total: 2.44s	remaining: 500ms
166:	learn: 0.0710403	total: 2.45s	remaining: 485ms
167:	learn: 0.0708152	total: 2.46s	remaining: 470ms
168:	learn: 0.0707109	total: 2.48s	remaining: 454ms
169:	learn: 0.0704926	total: 2.49s	remaining: 439ms
170:	learn: 0.0698135	total: 2.5s	remaining: 424ms
171:	learn: 0.0696624	total: 2.51s	remaining: 409ms
172:	learn: 0.0692215	total: 2.52s	remaining: 393ms
173:	learn: 0.0691334	total: 2.53s	remaining: 378ms
174:	learn: 0.0688046	total: 2.54s	remaining: 363ms
175:	learn: 0.0686114	total: 2.55s	remaining: 348ms
176:	learn: 0.0685079	total: 2.56s	remaining: 333ms
177:	learn: 0.0683479	total: 2.58s	remaining: 319ms
178:	learn: 0.0678201	total: 2.59s	remaining: 304ms
179:	learn: 0.0672779	total: 2.6s	remaining: 289ms
180:	learn: 0.0665560	total: 2.61s	remaining: 274ms
181:	learn: 0.0660302	total: 2.62s	remaining: 259ms
182:	learn: 0.0655134	total: 2.64s	remaining: 245ms
183:	learn: 0.0650483	total: 2.65s	remaining: 231ms
184:	learn: 0.

In [29]:
### 7. LTSM

In [30]:
# Creating the LSTM Model for prediction
embedding_feature_vector = 40
model = Sequential()
model.add(Embedding(vocab_size,embedding_feature_vector,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
# Training the model
model.fit(X_train,Y_train,validation_data=(x_valid,y_valid),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9fc7c6b340>

In [32]:
predictions = model.predict_classes(x_test)
cr = classification_report(y_test,predictions)
print(cr)



              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1039
           1       0.99      0.99      0.99      1041

    accuracy                           0.99      2080
   macro avg       0.99      0.99      0.99      2080
weighted avg       0.99      0.99      0.99      2080



### Evaluation of Models

In [34]:
score_1 = accuracy_score(y_test,pred_1)
score_2 = accuracy_score(y_test,pred_2)
score_3 = accuracy_score(y_test,pred_3)
score_4 = accuracy_score(y_test,pred_4)
score_5 = accuracy_score(y_test,pred_5)
score_6 = accuracy_score(y_test,pred_6)
score_7 = accuracy_score(y_test,predictions)
results = pd.DataFrame([["Logistic Regression",score_1],["Naive Bayes",score_2],["Decision Tree",score_3],
                       ["Random Forest",score_4],["XGBOOST",score_5],["CatBoost",score_6],["LSTM",score_7]],columns=["Model","Accuracy"])
results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.7125
1,Naive Bayes,0.705769
2,Decision Tree,0.921154
3,Random Forest,0.933654
4,XGBOOST,0.984135
5,CatBoost,0.984135
6,LSTM,0.988942


From the above results, it appears that LSTM Model gives the highest accuracy amongst various models. Therefore, it is selected as the final model for making predictions on final testing data.

### Predictions on Testing Data

In [35]:
# Making Predictions on test data
predictions_test = pd.DataFrame(model.predict_classes(x_test_final))
test_id = pd.DataFrame(test["id"])
submission = pd.concat([test_id,predictions_test],axis=1)
submission.columns = ["id","label"]
submission.to_csv("Submission.csv",index=False)



In [36]:
submission.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
