# The task

Ok, so let's do some fake news detection.

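Given a headline and a body of text, we want to classify the relationship between the two as one of the following stances:

* **unrelated** – the headline and the body are unrelated
* **agree** – the headline and the body agree with each other
* **disagree** – the headline and the body disagree with each other
* **discuss** – the headline and the body discuss each other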

In [2]:
from utils.data import DataSet
from utils.features import *
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Load both dataset splits up front. Per the cell output below, the default
# DataSet() reads train_bodies.csv / train_stances.csv and the
# "competition_test" one reads competition_test_bodies.csv /
# competition_test_stances.csv.
training = DataSet()
test = DataSet(name="competition_test")

Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv


Using TensorFlow backend.


In [125]:
def baseline_model(input_dim=59, n_classes=4):
    """Build and compile the feed-forward stance classifier.

    Architecture: Dense(100, relu) -> Dense(50, relu) -> Dense(n_classes, softmax),
    compiled with categorical cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Args:
        input_dim: Number of input features per sample. Defaults to 59, the
            value that was previously hard-coded.
        n_classes: Number of units in the softmax output layer. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu'))
    # Only the first layer of a Sequential model needs an input size; the
    # `input_dim=100` that used to be passed here had no effect.
    model.add(Dense(50, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [126]:
# Split the training data into 10 cross-validation folds plus a hold-out set.
d = DataSet("train")
folds, hold_out = kfold_split(d, n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

# Load the competition dataset
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.data, "competition")

# `training` and `test` are already created in the imports cell and are not
# used here, so they are no longer re-created in this cell (each re-creation
# re-read the dataset from disk).

# Load/Precompute all features now
X_holdout, y_holdout = generate_features(hold_out_stances, "holdout")
Xs = dict()
ys = dict()
for fold in fold_stances:
    Xs[fold], ys[fold] = generate_features(fold_stances[fold], str(fold))

best_score = 0
best_fold = None  # despite the name, this holds the best-scoring *estimator*

encoder = LabelEncoder()

# Classifier for each fold: train on every other fold, evaluate on this one.
for fold in fold_stances:
    # Indices of all folds except the one held out for evaluation.
    ids = [i for i in range(len(folds)) if i != fold]

    X_train = np.vstack([Xs[i] for i in ids])
    y_train = np.hstack([ys[i] for i in ids])

    X_test = Xs[fold]
    y_test = ys[fold]

    # One-hot encode the training labels for the categorical cross-entropy loss.
    encoder.fit(y_train)
    encoded_Y = encoder.transform(y_train)
    dummy_y = np_utils.to_categorical(encoded_Y)

    estimator = KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=128, verbose=0)
    estimator.fit(X_train, dummy_y)

    # NOTE(review): predictions are used directly as indices into LABELS, which
    # assumes the encoder's class indices coincide with the LABELS indices -
    # confirm against generate_features.
    predicted = [LABELS[int(a)] for a in estimator.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]

    # Normalise by the score obtained when the gold labels are submitted as
    # predictions, so that folds of different sizes are comparable.
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    score = fold_score/float(max_fold_score)

    print("Score for fold "+ str(fold) + " was - " + str(score))
    # Always keep the first estimator so `best_fold` is never left as None,
    # even if no fold scores above the initial best_score of 0.
    if best_fold is None or score > best_score:
        best_score = score
        best_fold = estimator

Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
Score for fold 0 was - 0.7898994851679333
Score for fold 1 was - 0.7886576337477301
Score for fold 2 was - 0.7979063681302704
Score for fold 3 was - 0.7872082040210497
Score for fold 4 was - 0.7796537415939333
Score for fold 5 was - 0.7639175257731958
Score for fold 6 was - 0.7642918701522551
Score for fold 7 was - 0.7600056569084995
Score for fold 8 was - 0.7869206349206349
Score for fold 9 was - 0.7786770862405805
Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   

72.71108250187748

In [127]:
# Evaluate the estimator kept from cross-validation on the hold-out split.
# Class ids (predicted and gold) are turned into label strings via LABELS.
holdout_class_ids = best_fold.predict(X_holdout)
predicted = [LABELS[int(class_id)] for class_id in holdout_class_ids]
actual = [LABELS[int(class_id)] for class_id in y_holdout]

print("Scores on the dev set")
report_score(actual, predicted)
# Two blank lines separate the two reports.
print("")
print("")

# Same evaluation on the competition dataset.
competition_class_ids = best_fold.predict(X_competition)
predicted = [LABELS[int(class_id)] for class_id in competition_class_ids]
actual = [LABELS[int(class_id)] for class_id in y_competition]

print("Scores on the test set")
# Kept as the cell's last bare expression so its return value is still displayed.
report_score(actual, predicted)

Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    173    |    18     |    475    |    96     |
-------------------------------------------------------------
| disagree  |    29     |    19     |    95     |    19     |
-------------------------------------------------------------
|  discuss  |    174    |    52     |   1297    |    277    |
-------------------------------------------------------------
| unrelated |    15     |     0     |    119    |   6764    |
-------------------------------------------------------------
Score: 3390.75 out of 4448.5	(76.2223221310554%)


Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    325    |    28     

72.71108250187748