# The task

Ok, so let's do some fake news detection.

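Given a headline and a body of text, we want to classify the pair as one of:

* unrelated (the body is not about the headline)
* agree (the body agrees with the headline)
* disagree (the body disagrees with the headline)
* discuss (the body discusses the headline without taking a position)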

In [1]:
from utils.data import DataSet
from utils.features import *
from utils.dependencies import *
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Load the training set (default DataSet) and the competition test set once,
# up front; each DataSet(...) construction reads its CSV files from dataset/.
training, test = DataSet(), DataSet(name="competition_test")

Using TensorFlow backend.


Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv


In [2]:
def baseline_model(input_dim=59, n_classes=4):
    """Build and compile the feed-forward stance classifier.

    The network has two ReLU hidden layers (100 and 50 units) followed by a
    softmax output layer. It is compiled with categorical cross-entropy loss,
    the Adam optimizer and accuracy as the reported metric.

    Args:
        input_dim: Number of input features per sample. Defaults to 59, the
            width this notebook was originally hard-coded for.
        n_classes: Number of output classes (softmax units). Defaults to 4,
            matching the four stance labels listed at the top of the notebook.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu'))
    # Only the first layer of a Sequential model needs an input size; later
    # layers infer it from the previous layer, so no input_dim is passed here.
    model.add(Dense(50, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [15]:
# Split the training data into cross-validation folds plus a hold-out set.
d = DataSet("train")
folds, hold_out = kfold_split(d, n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

# Load the competition dataset
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.data, "competition")

# NOTE: `training` and `test` are loaded in the first cell; they are not
# re-created here because nothing in this cell uses them and each DataSet(...)
# construction re-reads the CSV files.
Xs = dict()
ys = dict()

# Load/Precompute all features now
X_holdout, y_holdout = generate_features(hold_out_stances, "holdout")
for fold in fold_stances:
    print(fold)
    Xs[fold], ys[fold] = generate_features(fold_stances[fold], str(fold))

# Number of folds that are actually trained and scored below.
# NOTE(review): only the first 2 folds are used as validation folds, not all
# of them; set this to len(folds) for a full cross-validation run.
N_EVAL_FOLDS = 2

best_score = 0
best_fold = None  # estimator with the best relative score so far

encoder = LabelEncoder()
# Train one classifier per evaluated fold, validating on that fold and
# training on all the others.
for fold in range(N_EVAL_FOLDS):
    train_ids = [i for i in range(len(folds)) if i != fold]

    X_train = np.vstack([Xs[i] for i in train_ids])
    y_train = np.hstack([ys[i] for i in train_ids])

    X_test = Xs[fold]
    y_test = ys[fold]

    # One-hot encode the targets for the categorical cross-entropy loss.
    # NOTE(review): predictions are mapped straight through LABELS below, which
    # assumes the encoder leaves the label ids unchanged — TODO confirm.
    encoded_Y = encoder.fit_transform(y_train)
    dummy_y = np_utils.to_categorical(encoded_Y)

    estimator = KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=128, verbose=0)
    estimator.fit(X_train, dummy_y)

    # Predict once and reuse the result for both the printout and the scoring.
    fold_predictions = estimator.predict(X_test)
    print(fold_predictions)
    predicted = [LABELS[int(a)] for a in fold_predictions]
    actual = [LABELS[int(a)] for a in y_test]

    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    # Score relative to the score a perfect prediction would get on this fold.
    score = fold_score / float(max_fold_score)

    print("Score for fold " + str(fold) + " was - " + str(score))
    if score > best_score:
        best_score = score
        best_fold = estimator

Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
Reading dataset in dataset/
Loading files train_bodies.csv, train_stances.csv
Reading dataset in dataset/
Loading files competition_test_bodies.csv, competition_test_stances.csv
0
1
2
3
4
5
6
7
8
9
[3 3 3 ... 3 3 2]
Score for fold 0 was - 0.796396175533219
[3 3 3 ... 3 2 3]
Score for fold 1 was - 0.784187735717279


In [11]:
# Final evaluation of the best cross-validation estimator.

# 1) Hold-out ("dev") set: translate predicted and true label ids into names.
holdout_label_ids = best_fold.predict(X_holdout)
predicted = [LABELS[int(label_id)] for label_id in holdout_label_ids]
actual = [LABELS[int(label_id)] for label_id in y_holdout]

print("Scores on the dev set")
report_score(actual, predicted)
# Two blank lines separate the two reports.
print("\n")

# 2) Competition test set, same procedure.
competition_label_ids = best_fold.predict(X_competition)
predicted = [LABELS[int(label_id)] for label_id in competition_label_ids]
actual = [LABELS[int(label_id)] for label_id in y_competition]

print("Scores on the test set")
# Kept as the last expression so its return value is shown as the cell output.
report_score(actual, predicted)

Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    151    |    22     |    515    |    74     |
-------------------------------------------------------------
| disagree  |    17     |    20     |    108    |    17     |
-------------------------------------------------------------
|  discuss  |    184    |    49     |   1389    |    178    |
-------------------------------------------------------------
| unrelated |    23     |     3     |    206    |   6666    |
-------------------------------------------------------------
Score: 3450.25 out of 4448.5	(77.55985163538271%)


Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    324    |    48    

74.41690805707542

# Towards real stuff

Let's try to check some real news data!  

To try it, you need a headline and the text of an article body.

For example, pick a story from https://www.bbc.com/

In [30]:
# Hand-made (headline, body, stance) examples for a quick sanity check.
# The body text is assembled from adjacent string literals so that the source
# indentation does not end up inside the article text (a backslash line
# continuation inside a string literal keeps the next line's leading spaces).
# NOTE(review): the 'agree' stance looks like a placeholder so the frame has a
# Stance column; entry2's body is about a different topic than its headline.
entry1 = [
    "Trudeau fires Canada's ambassador to China amid Huawei controversy",
    (
        "It follows controversial comments Mr McCallum made about an extradition case "
        "involving a senior executive from the Chinese telecoms giant Huawei. "
        "Mr Trudeau said in a statement he had asked John McCallum to step down, "
        "but did not offer a reason. "
        "The detention of Meng Wanzhou, at the request of the US, angered China "
        "and soured Canada's relations with Beijing. "
        "Ms Meng, Huawei's chief financial officer, is accused by the US of evading "
        "sanctions on Iran. Both she and Huawei deny those allegations."
    ),
    'agree',
]
entry2 = [
    "Trudeau fires Canada's ambassador to China amid Huawei controversy",
    (
        "No matter how well you think you can park, Stan can do it better. "
        "Don’t feel too disheartened though, he doesn’t even need to open the doors "
        "once stationary – meaning he can stack cars as close as possible to each other. "
        "Stan isn’t some ghostly valet from a budget American horror film though, "
        "but a small forklift-style robot that will literally pick your car up and "
        "drop it into an ultra-tight space without even needing to see your keys. "
        "Created by Stanley Robotics and set to be trialled at Gatwick Airport in "
        "August this year, the little droid will then use your flight numbers to "
        "ensure that your car is ready and waiting back at the drop-off/pick-up "
        "cabin on your return from holiday."
    ),
    'agree',
]

new_data = pd.DataFrame([entry1, entry2], columns=['Headline', 'articleBody', 'Stance'])
print(new_data.head())

# NOTE(review): the second argument looks like a name for this feature set; if
# generate_features caches by that name, change it whenever the entries change
# so stale features are not reused — confirm against utils.
X_new, y_new = generate_features(new_data, "test21es")

# Prints the predicted label ids, one per entry.
print(best_fold.predict(X_new))

                                            Headline  \
0  Trudeau fires Canada's ambassador to China ami...   
1  Trudeau fires Canada's ambassador to China ami...   
2  Michael Jackson doc Leaving Neverland is distu...   

                                         articleBody Stance  
0  It follows controversial comments Mr McCallum ...  agree  
1  No matter how well you think you can park, Sta...  agree  
2  Michael Jackson gave a young boy jewellery in ...  agree  
[2 3 2]
