<a href="https://colab.research.google.com/github/gtmray/Fake-News-Classifier/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle 
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list
! kaggle competitions download -c fake-news
!unzip train.csv.zip
!unzip test.csv.zip

Saving kaggle.json to kaggle.json
ref                                                               title                                                 size  lastUpdated          downloadCount  
----------------------------------------------------------------  --------------------------------------------------  ------  -------------------  -------------  
heeraldedhia/groceries-dataset                                    Groceries dataset                                    257KB  2020-09-17 04:36:08           1441  
andrewmvd/trip-advisor-hotel-reviews                              Trip Advisor Hotel Reviews                             5MB  2020-09-30 08:31:20            879  
balraj98/stanford-background-dataset                              Stanford Background Dataset                           17MB  2020-09-26 12:57:59            101  
nehaprabhavalkar/indian-food-101                                  Indian Food 101                                        7KB  2020-09-30 06:23:43      

In [2]:
import numpy as np
import pandas as pd

# Load the training set and discard every row that contains a missing value.
df = pd.read_csv('/content/train.csv').dropna()

# Separate the target column from the remaining feature columns.
y = df['label']
X = df.drop(columns='label')

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK stop-word corpus; the preprocessing cell reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
#Preprocessing

# Work on a re-indexed copy so rows are numbered 0..n-1 again
# (rows were dropped from the frame earlier, so the original index may have gaps).
text = X.reset_index()

ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token, as a per-word membership test would, rebuilds the list and scans it
# linearly each time; a set gives constant-time lookups.
stop_words = set(stopwords.words('english'))


def clean_title(title):
  """Normalise one headline for the model.

  Replaces every non-letter character with a space, lower-cases the text,
  drops English stop words and stems the remaining tokens with the Porter
  stemmer.

  Args:
    title: Raw headline string.

  Returns:
    The cleaned tokens joined by single spaces (may be an empty string).
  """
  tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
  return " ".join(ps.stem(word) for word in tokens if word not in stop_words)


# One cleaned string per row, in row order.
corpus = [clean_title(title) for title in text['title']]

In [5]:
voc_size = 5000 #Number of words for the one hot encoding
sent_length = 20 #Max length for padding
embedding_vector_features = 40 #Number of vector features for embedding

#One hot encoding
# one_hot() hashes words with Python's built-in hash(), which is salted per
# process for strings, so the word -> index mapping can differ after a kernel
# restart and a saved model would receive different ids at inference time.
# hashing_trick() is the function one_hot() wraps; with hash_function='md5'
# the mapping is stable across runs.
from tensorflow.keras.preprocessing.text import hashing_trick
onehot_repr = [hashing_trick(sentence, voc_size, hash_function='md5') for sentence in corpus]

#Padding
# padding='pre' puts the fill values in front of sequences shorter than sent_length.
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

In [6]:
#Model

# Embedding front-end followed by three stacked LSTM stages and a single
# sigmoid output unit.
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.4))

# Each LSTM stage is followed by batch normalisation and dropout. The first two
# stages return the full sequence so the next LSTM can consume it; the last one
# returns only its final output.
for keep_sequence in (True, True, False):
  model.add(LSTM(100, return_sequences=keep_sequence))
  model.add(BatchNormalization())
  model.add(Dropout(0.4))

model.add(Dense(1, activation='sigmoid'))

model.summary()

#Compile model
model.compile(
  optimizer=Adam(learning_rate=0.01),
  loss='binary_crossentropy',
  metrics=['accuracy'],
)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 20, 100)           56400     
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 100)           400       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           80400     
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 100)           4

In [7]:
from sklearn.model_selection import train_test_split

#Converting to numpy array
X_final, y_final = np.array(embedded_docs), np.array(y)

#Splitting dataset to training and testing (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=77,
)

#Model training
# The held-out split is also passed as validation data, so the per-epoch
# validation metrics are computed on the same rows as X_test / y_test.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=100,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f39dcf8a5f8>

In [10]:
#Model performance

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Sequential.predict_classes() is deprecated (it prints the warning shown in
# this cell's output) and is not available in later TensorFlow releases.
# For a single sigmoid output the recommended replacement is to threshold the
# predicted probability at 0.5, which yields the same 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Confusion matrix, overall accuracy, then per-class precision/recall/F1.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[[1902  164]
 [ 141 1450]]
0.9165983046212742
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      2066
           1       0.90      0.91      0.90      1591

    accuracy                           0.92      3657
   macro avg       0.91      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657

