In [1]:
import re
from collections import Counter

import numpy  as np
import pandas as pd
import nltk   as nl
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers, models, optimizers
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, concatenate, Dropout, GRU
from tensorflow.python.keras.optimizers import  RMSprop
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

ModuleNotFoundError: No module named 'keras'

In [None]:
# Read both competition files with every column typed as `object`, i.e. without
# letting pandas infer per-column dtypes.
TRAIN_CSV, TEST_CSV = "Train.csv", "Test.csv"
train = pd.read_csv(TRAIN_CSV, dtype=object)
test = pd.read_csv(TEST_CSV, dtype=object)

In [4]:
def _combine_text_columns(frame):
    """Join the `content` and `title` columns of `frame` into one text Series.

    Missing values are treated as empty strings (rather than the literal
    string "nan" that `str(NaN)` would produce), and the two parts are joined
    with a single space so the last word of the content and the first word of
    the title stay separate tokens.

    Args:
        frame: DataFrame with `content` and `title` columns.

    Returns:
        A Series of combined text, aligned on the index of `frame`.
    """
    content = frame['content'].fillna('').astype(str)
    title = frame['title'].fillna('').astype(str)
    # strip() removes the lone separator left when one of the parts is empty.
    return (content + ' ' + title).str.strip()


concatenated_train = _combine_text_columns(train)
# Building the frame from a dict names the column explicitly, independent of
# whatever name the Series carries.
reduced_train = pd.DataFrame({'text': concatenated_train})

concatenated_test = _combine_text_columns(test)
reduced_test = pd.DataFrame({'text': concatenated_test})

In [5]:
# List the working directory to check which data and embedding files are present.
%ls

[0m[01;34m285[0m/                       datakontest_4.ipynb  datakontest_9.ipynb  output.csv
banniere_data_kontest.png  datakontest_5.ipynb  datakontest.ipynb    test.csv
datakontest_1.ipynb        datakontest_6.ipynb  [01;34mglove[0m/               Test.csv
datakontest_2.ipynb        datakontest_7.ipynb  glove.840B.300d.txt  Train.csv
datakontest_3.ipynb        datakontest_8.ipynb  glove.840B.300d.zip


In [None]:
# NOTE(review): `!cd` runs in a throwaway subshell, so this line does not change
# the kernel's working directory; the embedding cell below loads
# "./glove.840B.300d.txt" relative to the notebook folder.
!cd glove

In [None]:
import os

# The `glove` folder appears in the working-directory listing above, so it has
# to be addressed with a relative path; '/glove' would look for a folder at the
# filesystem root.
os.listdir('glove')

In [15]:
# --- Vectorisation settings ---
embed_size = 300      # size of each word vector
max_features = 50000  # vocabulary cap handed to the tokenizer (num_words)
maxlen = 1000         # length every sequence is padded to

# Substitute a placeholder for missing texts and pull out the raw arrays.
train_texts = reduced_train["text"].fillna("_na_").values
test_texts = reduced_test["text"].fillna("_na_").values

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_texts))

# Turn each text into a sequence of word indices, then pad to `maxlen`.
train_X = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)


In [16]:
EMBEDDING_FILE = "./glove.840B.300d.txt"


def get_coefs(word, *arr):
    """Turn one split embedding row into a ``(word, float32 vector)`` pair.

    Args:
        word: first field of the row (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        Tuple of the token and a float32 NumPy array of its components.
    """
    return word, np.asarray(arr, dtype='float32')


# Read the file inside a context manager so the handle is closed, and decode
# it explicitly as UTF-8 instead of relying on the platform default encoding.
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().split(" ")) for line in embedding_file
    )

# np.stack needs a real sequence; a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer indices start at 1 (0 is reserved), so a vocabulary of V
# words needs V + 1 rows; the cap at `max_features` is unchanged.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random draw that matches the
# mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Guard on the actual number of rows so the assignment below can never
    # index past the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  


In [17]:
# `train` is read with dtype=object, so the labels arrive as strings; convert
# them to integers so the model receives numeric targets.
Y = pd.to_numeric(train['fake']).astype(int)
X = pd.DataFrame(train_X)
print(X.shape)

# Hold out 10% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(70000, 1000)
(63000, 1000) (63000,)
(7000, 1000) (7000,)


In [21]:
# Keras' `class_weight` argument is a dict mapping class index -> loss weight;
# a plain list is not a valid value for it.
class_weights = {0: 0.85, 1: 1.3}

In [None]:
# Input: one padded sequence of `maxlen` word indices per sample.
nlp_input = layers.Input((maxlen, ))

# Take the embedding layer's dimensions from the weight matrix itself so the
# layer shape always agrees with the pretrained weights; the weights are frozen.
vocab_rows, vector_dim = embedding_matrix.shape
embedding = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix], trainable=False)(nlp_input)

# Two stacked GRUs with dropout; the first returns the full sequence so the
# second can consume it, the second returns only its final state.
gru = GRU(units=16, return_sequences=True)(embedding)
gru = Dropout(0.2)(gru)
gru = GRU(units=4)(gru)
gru = Dropout(0.2)(gru)

# Single sigmoid unit: one score in [0, 1] per sample.
x = Dense(1, activation='sigmoid')(gru)

model = Model(inputs=[nlp_input], outputs=[x])
# The model is trained with Adam; no separate optimizer object is built here
# because nothing other than the 'adam' string is passed to compile().
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 1000, 300)         15000000  
_________________________________________________________________
gru_5 (GRU)                  (None, 1000, 16)          15216     
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000, 16)          0         
_________________________________________________________________
gru_6 (GRU)                  (None, 4)                 252       
_________________________________________________________________
dropout_6 (Dropout)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total para

In [None]:
epochs = 3
batch_size = 64

# Early stopping only has an effect when `patience` is smaller than the number
# of epochs: with patience equal to `epochs` the run always finishes before the
# callback could fire.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, min_delta=0.0001)

# 10% of the training rows are held back by Keras for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    class_weight=class_weights,
    callbacks=[early_stopping],
)

Instructions for updating:
Use tf.cast instead.
Train on 56700 samples, validate on 6300 samples
Epoch 1/3

In [None]:
# Loss and accuracy on the held-out split.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

print('The ditribution of our label in the test data is {}'.format(Y_test.value_counts()))

# Keras stores the accuracy metric under 'acc' or 'accuracy' depending on its
# version; pick whichever key this history actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# The second curve in each plot comes from the validation split used during
# fit(), not from the held-out test set, and is labelled accordingly.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='validation')
plt.legend()
plt.show();

THRESHHOLD = 0.5
# Binarise the scores in a single comparison (1 when score >= threshold, else 0)
# instead of two in-place assignments whose result depends on their order.
predicted = (pd.DataFrame(model.predict(X_test)) >= THRESHHOLD).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix

# Bring predictions and ground truth to plain Python ints so both sides use
# the same label type.
predicted_values = [int(value) for value in predicted[0].values]
true_values = [int(value) for value in Y_test.values]

labels = [0, 1]
# `labels` is passed by keyword: current scikit-learn releases accept it as a
# keyword-only argument and reject a third positional argument.
cm = confusion_matrix(true_values, predicted_values, labels=labels)

def plot_confusion_matrix(cm,target_names,title='Confusion matrix',cmap=None,normalize=True):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: square array of counts; the y axis is labelled "True label" and
            the x axis "Predicted label".
        target_names: tick labels for both axes, or None to leave the default ticks.
        title: figure title.
        cmap: matplotlib colormap; defaults to 'Blues' when None.
        normalize: when True, every row is divided by its sum and cells are
            shown as fractions; when False, raw counts are shown.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import itertools

    cm = np.asarray(cm)

    # Accuracy is derived from the raw counts, before any normalisation.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with no samples stay all-zero instead of turning into NaN.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    # Draw after normalising so the colours, the printed values and the
    # text-colour threshold all refer to the same numbers.
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    cell_format = "{:0.4f}" if normalize else "{:,}"
    # Cells above the threshold get white text so it stays readable on dark cells.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are included in the layout.
    plt.tight_layout()
    plt.show()
    
    
# Show the matrix with raw counts.
plot_confusion_matrix(cm, labels, normalize=False)

# Recall of class 1, taken from row 1 of the matrix: hits / (hits + misses).
hits, misses = cm[1, 1], cm[1, 0]
recall = hits / (hits + misses)
print('The recall equals to {}'.format(recall))

In [None]:
# `test_X` holds the padded test sequences produced by the tokenisation cell;
# no variable named `x_test_pad` is defined anywhere in this notebook.
test_data = pd.DataFrame(test_X)

print('Shape of test data tensor:', test_data.shape)

test_prediction = pd.DataFrame(model.predict(test_data), columns=['fake'])

# Binarise in one vectorised step (1 when score >= threshold, else 0) and keep
# the column's original dtype so the exported values are formatted as before.
score_dtype = test_prediction['fake'].dtype
test_prediction['fake'] = (test_prediction['fake'] >= THRESHHOLD).astype(score_dtype)

# Index the predictions by the identifier column of the test file.
test_prediction.index = test['Unnamed: 0']
test_prediction.head()

In [None]:
# Count how many test rows received each value in the `fake` column.
test_prediction["fake"].value_counts()

In [None]:
# Export the predictions without a header row.
test_prediction.to_csv(path_or_buf='output.csv', header=False)