In [3]:
import re
from collections import Counter

import numpy  as np
import pandas as pd
import nltk   as nl
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, concatenate, Dropout, GRU
from tensorflow.python.keras.optimizers import  RMSprop
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [None]:
# Load the labelled and unlabelled splits. dtype=object keeps every column as
# generic Python objects instead of letting pandas infer numeric types.
train, test = (
    pd.read_csv(csv_name, dtype=object)
    for csv_name in ("Train.csv", "Test.csv")
)

In [None]:
def _join_content_and_title(frame):
    """Return one string per row: the row's content, a space, then its title.

    Missing cells are treated as empty strings so they do not turn into the
    literal token 'nan', and the space keeps the last word of the content
    from fusing with the first word of the title during tokenization.
    """
    pieces = frame[['content', 'title']].fillna('').astype(str)
    return pieces['content'] + ' ' + pieces['title']


concatenated_train = _join_content_and_title(train)
reduced_train = concatenated_train.to_frame(name='text')

concatenated_test = _join_content_and_title(test)
reduced_test = concatenated_test.to_frame(name='text')

In [None]:
# The tokenizer is fitted on the training text only, limited to `num_words` entries.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reduced_train['text'])

x_train_tokens = tokenizer.texts_to_sequences(reduced_train['text'])
x_test_tokens = tokenizer.texts_to_sequences(reduced_test['text'])

# Sequence lengths over train and test together drive the padding length.
num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
print('The mean number of tokens is {}'.format(np.mean(num_tokens)))
print('The max number of tokens is {}'.format(np.max(num_tokens)))

# Pad/truncate to mean + 2 standard deviations rather than to the longest text.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
print('The chosen max tokens is {}'.format(max_tokens))
# Double-quoted so the apostrophe survives: 'don''t' is two adjacent string
# literals in Python and printed "dont".
print("The pourcentage of entries that don't reach the max tokens {}".format(np.sum(num_tokens < max_tokens) / len(num_tokens)))

# 'pre' is used for both padding and truncating: zeros are added at the front
# and over-long sequences lose their leading tokens.
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

print('The new shape of our train data after padding is {}'.format(x_train_pad.shape))

In [None]:
# Drop the intermediates built while tokenizing; none of them is referenced
# by the cells that follow.
del (
    x_train_tokens, x_test_tokens, num_tokens,
    concatenated_train, reduced_train,
    concatenated_test, reduced_test,
)

In [None]:
# `train` was read with dtype=object, so the labels arrive as strings;
# cast them to integers before they reach scikit-learn and Keras.
Y = train['fake'].astype(int)
X = pd.DataFrame(x_train_pad)

# Stratify so the 10% hold-out keeps the same class ratio as the full
# (imbalanced) training set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=Y
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
### Assign different weights to each class because the data is not balanced
from sklearn.utils import class_weight

# Keyword arguments are required here: recent scikit-learn releases reject
# the positional form of compute_class_weight.
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train['fake']),
    y=train['fake'],
)
# Printed for reference only; the hand-picked values below are what gets used.
print(balanced_weights)

# Keras expects a {class_index: weight} mapping for `class_weight`.
class_weights = {0: 0.85, 1: 1.3}

In [None]:
# Length of each learned word vector; passed as `output_dim` to the Embedding layer.
embedding_size = 150

In [None]:
# ref: https://keras.io/getting-started/functional-api-guide/
# `Input` is imported directly from keras.layers; this notebook has no
# `layers` alias, so `layers.Input` raised a NameError.
nlp_input = Input(shape=(max_tokens,))
embedding = Embedding(
    input_dim=num_words,
    output_dim=embedding_size,
    input_length=max_tokens,
    name='layer_embedding1',
)(nlp_input)

# Two stacked GRUs (16 -> 8 units), each followed by dropout. The first one
# returns the full sequence so the second one has a sequence to consume.
# (A wider 32 -> 16 -> 8 stack was tried as an alternative and left disabled.)
gru = GRU(units=16, return_sequences=True)(embedding)
gru = Dropout(0.2)(gru)
gru = GRU(units=8)(gru)
gru = Dropout(0.2)(gru)

# Single sigmoid unit: one probability-like score per input text.
x = Dense(1, activation='sigmoid')(gru)

model = Model(inputs=[nlp_input], outputs=[x])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
epochs = 3
batch_size = 64

# Keras wants `class_weight` as a {class_index: weight} dict; a plain list is
# rejected or silently ignored depending on the Keras version. Convert a
# positional list/array so that index i becomes the weight of class i.
class_weight_map = (
    class_weights
    if isinstance(class_weights, dict)
    else dict(enumerate(class_weights))
)

# NOTE: with patience equal to the number of epochs, this callback cannot stop
# training early; raise `epochs` if early stopping is meant to take effect.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    class_weight=class_weight_map,
    callbacks=[early_stopping],
)

In [None]:
# Hold-out evaluation: evaluate() returns [loss, accuracy] for the compiled metrics.
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

print('The ditribution of our label in the test data is {}'.format(Y_test.value_counts()))

# Older Keras logs accuracy as 'acc'/'val_acc', newer releases as
# 'accuracy'/'val_accuracy'; pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# The second curve in each plot comes from `validation_split`, not from the
# hold-out test set, so it is labelled accordingly.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='validation')
plt.legend()
plt.show();

# Turn the sigmoid scores into hard 0/1 labels in a single comparison.
THRESHHOLD = 0.5
predicted = (pd.DataFrame(model.predict(X_test)) >= THRESHHOLD).astype(int)

This cell only computes and plots the confusion matrix.

In [None]:
from sklearn.metrics import confusion_matrix

# Both sides are coerced to plain ints so they compare equal to `labels`.
predicted_values = [int(value) for value in predicted[0].values]
true_values = [int(value) for value in Y_test.values]

labels = [0, 1]
# `labels` must be passed by keyword: recent scikit-learn releases reject it
# as a third positional argument.
cm = confusion_matrix(true_values, predicted_values, labels=labels)

def plot_confusion_matrix(cm,target_names,title='Confusion matrix',cmap=None,normalize=True):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : 2-D array of counts
        Rows are plotted as the true label and columns as the predicted label.
    target_names : sequence or None
        Tick labels for both axes; pass None to keep the default numeric ticks.
    title : str
        Figure title.
    cmap : matplotlib colormap or None
        Colormap for the heat map; defaults to 'Blues'.
    normalize : bool
        If True, each row is divided by its sum and the cells show fractions;
        otherwise the raw counts are shown.

    Accuracy and misclassification rate are always computed from the raw
    counts and written under the x-axis label.
    """
    cm = np.asarray(cm)

    # Overall accuracy is taken from the raw counts, before any normalization.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize before drawing so cell colours, cell text and the text
        # contrast threshold all refer to the same values. Rows without any
        # sample stay all-zero instead of turning into NaN.
        row_totals = cm.sum(axis=1, keepdims=True).astype('float')
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than this threshold get white text, the rest black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()
    plt.show()
    
    
plot_confusion_matrix(cm, labels, normalize=False)

# Recall for class 1: correctly predicted 1s divided by everything in row 1
# (row 1 of the matrix holds the samples whose true label is 1).
recall = cm[1, 1] / cm[1].sum()
print('The recall equals to {}'.format(recall))

In [None]:
# Score the padded sequences built from Test.csv.
test_data = pd.DataFrame(x_test_pad)

print('Shape of test data tensor:', test_data.shape)

test_prediction = pd.DataFrame(model.predict(test_data), columns=['fake'])

# Binarise in place with the threshold used for the hold-out predictions:
# scores at or above it become 1, the remaining ones become 0.
test_prediction.loc[test_prediction['fake'] >= THRESHHOLD, 'fake'] = 1
test_prediction.loc[test_prediction['fake'] < THRESHHOLD, 'fake'] = 0

# Re-key the rows with the identifier column carried by the test CSV.
test_prediction.index = test['Unnamed: 0']
test_prediction.head()

In [None]:
# How many test rows ended up in each predicted class.
test_prediction.loc[:, "fake"].value_counts()