# Import Important Libraries

In [83]:
import re
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding

In [None]:
# Fetch the NLTK stop-word corpus that the title-cleaning step reads via stopwords.words('english')
nltk.download('stopwords')

# Load Datasets

In [84]:
# Read the training CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv('news data/train.csv')

In [85]:
# Preview the first rows of the loaded data
df.head()

(20800, 5)

# Pre-Processing

In [None]:
# Vocabulary size: handed to one_hot as the size of the hashing space and to
# the Embedding layer as its input dimension.
voc_size = 5000

In [86]:
# Discard every row that has at least one missing field
df = df.dropna()

# Separate the target column from the remaining feature columns
yTrain = df['label']
xTrain = df.drop(columns='label')

# Report the resulting dimensions
print('--- Data Shape ---')
for caption, frame in (('xTrain shape: ', xTrain), ('yTrain shape: ', yTrain)):
    print(caption, frame.shape)

--- Data Shape ---
xTrain shape:  (18285, 4)
yTrain shape:  (18285,)


### One-Hot Representation

In [None]:
# Single Porter stemmer instance, reused for every token during title cleaning
ps = PorterStemmer()

In [None]:
# Work on a copy so the text-cleaning steps leave the xTrain frame untouched
messages = xTrain.copy()

In [None]:
# Peek at one raw title; the lookup is by index label, and the index has not been reset yet
messages['title'][1]

In [None]:
# dropna() left gaps in the index; renumber rows 0..n-1 so the cleaning loop can
# address titles by position (the previous index is kept as an 'index' column)
messages = messages.reset_index()

In [None]:
# Build the cleaned-title corpus: one normalised, stemmed string per row.
#
# The stop-word list is materialised once as a set. Calling
# stopwords.words('english') inside the comprehension re-reads the list and
# scans it linearly for every token, which dominates the run time of this cell.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')

corpus = []
for title in messages['title']:
    # Replace everything except ASCII letters with spaces, then lower-case and tokenise
    tokens = non_letters.sub(' ', title).lower().split()

    # Drop stop words and reduce the remaining tokens to their Porter stems
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [None]:
# Display the cleaned titles for a visual sanity check
corpus

In [None]:
# Encode every cleaned title as a list of integer word ids by hashing each word
# into a space of size voc_size.
#
# hash_function='md5' makes the encoding reproducible: the default is Python's
# built-in hash(), which is salted per interpreter process, so ids computed in
# a later session would not match the ones the saved model was trained on.
onehot_repr = [one_hot(title, voc_size, hash_function='md5') for title in corpus]
onehot_repr

### Embedding Representation

In [None]:
# Bring every id sequence to a common length of 20; padding is added at the front
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')

In [None]:
# Inspect the padded id sequence of the first title
embedded_docs[0]

# Model Definition

In [None]:
# Width of each learned word vector
embedding_vector_features = 40

# Embedding -> LSTM binary classifier, with dropout on both sides of the LSTM
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy and track accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the compiled model
model.summary()

In [None]:
# Convert the padded sequences and the labels to NumPy arrays ahead of the
# train/validation split (xTrain now holds the encoded titles, not the raw frame)
xTrain, yTrain = np.array(embedded_docs), np.array(yTrain)

# Model Training

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable
trainX, valX, trainY, valY = train_test_split(
    xTrain,
    yTrain,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train for 10 epochs in mini-batches of 64, scoring the held-out split after each epoch
model.fit(
    trainX,
    trainY,
    batch_size=64,
    epochs=10,
    validation_data=(valX, valY),
)

# Model Validation

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 decision threshold
probabilities = model.predict(valX)
predictions = (probabilities > 0.5).astype("int32")

In [None]:
# Fraction of validation samples whose thresholded prediction matches the true label
accuracy_score(valY, predictions)

# Save Model

In [None]:
# Write the trained model to an .h5 file in the working directory
model.save("Fake News Classifier.h5")

# Save important data for later predictions

In [None]:
# Persist the intermediate artefacts so later prediction code can reload them.
# Each file must receive its own object: the cleaned text goes to 'corpus', the
# hashed word ids to 'onehot', and the padded sequences to 'embedded'.
# Writing `corpus` to all three would leave the latter two files without the
# data their names promise.
for filename, artefact in (
    ('corpus', corpus),
    ('onehot', onehot_repr),
    ('embedded', embedded_docs),
):
    # `with` closes the handle even if pickling raises part-way through
    with open(filename, 'wb') as file:
        pickle.dump(artefact, file)