In [35]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from rewrite.scorer import score_4class

In [36]:
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [37]:
y_train.head()

0    unrelated
1        agree
2    unrelated
3    unrelated
4     disagree
Name: Stance, dtype: object

In [38]:
X_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,0.014085,...,0,0,0,0,0,0,0,0,0,0
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...,"[hundred, of, palestinian, were, evacuated, fr...",10,7,7,0.046083,...,3,0,0,0,0,0,0,0,0,0
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...,"[30, year, old, moscow, resident, wa, hospital...",5,4,1,0.030303,...,1,0,0,0,0,0,0,0,0,0
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...,"[reuters, a, canadian, soldier, wa, shot, at, ...",3,3,0,0.028169,...,1,0,0,0,0,0,0,0,0,0
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...,"[fear, not, arachnophobes, the, story, of, bun...",9,5,4,0.032727,...,4,0,0,0,0,0,0,0,0,0


In [39]:
print(X_train.shape)
print(y_train.shape)

(49972, 49)
(49972,)


In [40]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    del df["articleBody"]
    del df["Headline"]
    for col_name in df.columns:
        if "___" == col_name[0:3]:
            del df[col_name]
#del_str_cols()

In [41]:
Xy_train = pd.concat([X_train, y_train], axis=1)
print(Xy_train.shape)

(49972, 50)


In [42]:
Xy_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,...,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits,Stance
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,0.014085,...,0,0,0,0,0,0,0,0,0,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...,"[hundred, of, palestinian, were, evacuated, fr...",10,7,7,0.046083,...,0,0,0,0,0,0,0,0,0,agree
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...,"[30, year, old, moscow, resident, wa, hospital...",5,4,1,0.030303,...,0,0,0,0,0,0,0,0,0,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...,"[reuters, a, canadian, soldier, wa, shot, at, ...",3,3,0,0.028169,...,0,0,0,0,0,0,0,0,0,unrelated
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...,"[fear, not, arachnophobes, the, story, of, bun...",9,5,4,0.032727,...,0,0,0,0,0,0,0,0,0,disagree


In [43]:
#Xy_train = Xy_train[Xy_train["Stance"] != "unrelated"]

In [44]:
print(Xy_train.shape)

(49972, 50)


In [45]:
y_train = Xy_train["Stance"]
X_train = Xy_train.drop("Stance", axis=1)
print(X_train.shape)
print(y_train.shape)

(49972, 49)
(49972,)


In [46]:
# START SENTIMENT ANALYSIS

In [47]:
tqdm.pandas()

In [48]:
from nltk.sentiment import vader
sid = vader.SentimentIntensityAnalyzer()

In [49]:
def vader_polarity_scores(df, text_col_name, col_name_prefix):
    pol_scores = df[text_col_name].progress_apply(lambda hl: pd.Series(sid.polarity_scores(hl)))
    cols = pol_scores.columns
    new_cols = []
    for col_name in cols:
        new_cols.append("vader_"+col_name_prefix+"_"+col_name)
    pol_scores.columns = new_cols
    return pol_scores

vader_hl_df = vader_polarity_scores(X_train, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_train, "articleBody", "body")
X_train = pd.concat([X_train, vader_hl_df, vader_body_df], axis=1)

100%|██████████| 49972/49972 [00:28<00:00, 1767.72it/s]
100%|██████████| 49972/49972 [04:41<00:00, 177.42it/s] 


In [50]:
# END SENTIMENT ANALYSIS

In [51]:
del_str_cols(X_train)

In [52]:
print(X_train.shape)
print(y_train.shape)

(49972, 51)
(49972,)


In [53]:
### LOAD TEST SET ###

In [54]:
X_test = pd.read_hdf("X_test_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_test_full.h5", key="df")

Xy_test = pd.concat([X_test, y_test], axis=1)

y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)

vader_hl_df = vader_polarity_scores(X_test, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_test, "articleBody", "body")
X_test = pd.concat([X_test, vader_hl_df, vader_body_df], axis=1)

del_str_cols(X_test)

100%|██████████| 25413/25413 [00:17<00:00, 1482.99it/s]
100%|██████████| 25413/25413 [02:21<00:00, 179.89it/s]


In [55]:
X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(X_train, y_train, test_size=0.25, random_state=42, shuffle=True)

In [56]:
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

(37479, 51)
(37479,)
(12493, 51)
(12493,)


In [57]:
X_train.head()

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_16_early_hits,chargram_16_first_hits,vader_hl_compound,vader_hl_neg,vader_hl_neu,vader_hl_pos,vader_body_compound,vader_body_neg,vader_body_neu,vader_body_pos
7231,7,6,5,0.057851,0,0,0,0,0,0,...,0,0,0.34,0.0,0.789,0.211,0.8948,0.03,0.883,0.087
48827,6,5,4,0.083333,0,0,0,0,0,0,...,0,0,0.0,0.0,1.0,0.0,0.763,0.106,0.731,0.163
46621,3,1,2,0.015625,0,0,0,0,0,0,...,0,0,-0.4767,0.256,0.744,0.0,-0.7506,0.116,0.805,0.079
15590,1,1,0,0.006494,0,0,1,0,0,0,...,0,0,-0.9393,0.646,0.354,0.0,0.9618,0.04,0.843,0.117
10917,7,5,3,0.015544,0,0,0,0,0,0,...,0,0,-0.296,0.18,0.82,0.0,0.8225,0.024,0.925,0.051


In [81]:
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [82]:
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       36629.8190           49.68s
         2       32775.6925           46.67s
         3       29664.1562           47.33s
         4       27110.3178           48.91s
         5       25006.9279           46.48s
         6       23246.4086           45.94s
         7       21772.6165           44.82s
         8       20515.7649           45.46s
         9       19472.4843           44.17s
        10       18553.3752           43.37s
        20       14235.1282           38.48s
        30       12889.4275           36.25s
        40       12286.8561           32.89s
        50       11925.5085           29.90s
        60       11587.7477           27.61s
        70       11313.9165           25.43s
        80       11081.2357           23.20s
        90       10875.4810           21.00s
       100       10704.8520           18.84s
       200        9313.2458            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=14128, subsample=1.0,
              verbose=True, warm_start=False)

In [83]:
preds = clf.predict(X_dev)

In [84]:
print(classification_report(y_dev, preds))
# With VADER sentiment features

             precision    recall  f1-score   support

      agree       0.65      0.26      0.37       910
   disagree       0.50      0.06      0.11       230
    discuss       0.69      0.86      0.77      2223
  unrelated       0.96      0.98      0.97      9130

avg / total       0.88      0.89      0.88     12493



In [85]:
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

Weighted accuracy: 0.8196351076078292 (4627.25 out of 5645.5)


In [86]:
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.46      0.17      0.25      1903
   disagree       0.16      0.01      0.01       697
    discuss       0.62      0.77      0.69      4464
  unrelated       0.94      0.98      0.96     18349

avg / total       0.83      0.86      0.83     25413

Weighted accuracy: 0.7553910524621822 (8801.25 out of 11651.25)


In [87]:
preds = clf.predict(X_train)
print(classification_report(y_train, preds))

score, max_score = score_4class(y_train, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.82      0.37      0.51      2768
   disagree       0.92      0.20      0.33       610
    discuss       0.73      0.88      0.80      6686
  unrelated       0.97      0.99      0.98     27415

avg / total       0.91      0.91      0.90     37479

Weighted accuracy: 0.8471575711900223 (14332.0 out of 16917.75)


In [75]:
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, n_jobs=8, random_state=42)
crf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=8,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [76]:
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

             precision    recall  f1-score   support

      agree       0.76      0.33      0.46       910
   disagree       0.84      0.11      0.20       230
    discuss       0.72      0.87      0.79      2223
  unrelated       0.96      0.99      0.97      9130

avg / total       0.90      0.90      0.89     12493



In [77]:
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

Weighted accuracy: 0.8332742892569303 (4704.25 out of 5645.5)


In [78]:
preds = crf.predict(X_test)
print(classification_report(y_test, preds))

score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.44      0.14      0.21      1903
   disagree       0.00      0.00      0.00       697
    discuss       0.62      0.76      0.68      4464
  unrelated       0.93      0.98      0.95     18349

avg / total       0.81      0.85      0.82     25413

Weighted accuracy: 0.7426027250295033 (8652.25 out of 11651.25)


In [79]:
preds = crf.predict(X_train)
print(classification_report(y_train, preds))

score, max_score = score_4class(y_train, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       1.00      1.00      1.00      2768
   disagree       1.00      1.00      1.00       610
    discuss       1.00      1.00      1.00      6686
  unrelated       1.00      1.00      1.00     27415

avg / total       1.00      1.00      1.00     37479

Weighted accuracy: 0.9996896750454405 (16912.5 out of 16917.75)


In [70]:
clf.fit(X_test, y_test)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

preds = clf.predict(X_dev)
print(classification_report(y_dev, preds))

      Iter       Train Loss   Remaining Time 
         1       25260.8826           49.47s
         2       22742.2557            1.01m
         3       20697.6291           55.47s
         4       19029.7974           58.47s
         5       17616.3467           57.94s
         6       16450.4047           57.81s
         7       15480.2821           57.11s
         8       14669.1370           58.96s
         9       13974.5050           56.98s
        10       13398.7380           57.72s
        20       10582.2960           52.74s
        30        9653.6478           52.48s
        40        9170.9300           45.37s
        50        8811.0736           38.89s
        60        8523.0586           34.80s
        70        8267.4833           31.30s
        80        8027.8552           29.27s
        90        7842.8616           27.78s
       100        7633.8265           25.73s
       200        6325.2492            0.00s
             precision    recall  f1-score   support



In [72]:
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

Weighted accuracy: 0.7736692941280666 (4367.75 out of 5645.5)


In [74]:
feat_imp = clf.feature_importances_
indices = np.argsort(feat_imp)[::-1]
for ii in indices:
    print(X_train.columns[ii]+": "+str(feat_imp[ii]))

vader_body_compound: 0.11721836353874318
bin_count_stopless: 0.11698913079580249
word_overlap_features: 0.10004906933014432
vader_body_neg: 0.07261528319554078
vader_hl_compound: 0.07131559825952277
vader_body_neu: 0.06663446208692016
vader_body_pos: 0.05604037760630517
bin_count: 0.04927385882253784
ngram_2_hits: 0.04328924411982605
bin_count_early: 0.042782794859227015
vader_hl_neg: 0.03902429051737139
ngram_2_early_hits: 0.03182416788213388
chargram_2_hits: 0.03181427032901217
vader_hl_pos: 0.030651339602869768
vader_hl_neu: 0.026352595129833744
polar_body: 0.01576324740683835
chargram_2_early_hits: 0.013416901830357202
wrf_hl_fake: 0.011936043545094177
chargram_4_hits: 0.008322823696756528
wrf_hl_not: 0.008305521185698626
chargram_2_first_hits: 0.008104173190856812
polar_hl: 0.006598122764430021
ngram_3_early_hits: 0.004872491778207736
wrf_hl_denies: 0.004185392536388371
ngram_3_hits: 0.004137805049519469
wrf_hl_hoax: 0.0034690016492651093
wrf_hl_false: 0.0031385593689231407
ngram_

In [None]:
-----

In [None]:
y_train.value_counts(normalize=True)

In [None]:
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
crf.fit(X_train, y_train)

In [None]:
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
y_train.value_counts(normalize=True)

In [None]:
# 4-class problem:
# By guessing most common category (unrelated), we would achieve approx. 73% accuracy. The baseline model achieves 88%.

In [88]:
### 3 class problem filter out unrelated

In [92]:
Xy_train = pd.concat([X_train, y_train], axis=1)
print(Xy_train.shape)

(37479, 52)


In [93]:
Xy_train = Xy_train[Xy_train["Stance"] != "unrelated"]
print(Xy_train.shape)

(10064, 52)


In [94]:
y_train = Xy_train["Stance"]
X_train = Xy_train.drop("Stance", axis=1)
print(X_train.shape)
print(y_train.shape)

(10064, 51)
(10064,)


In [96]:
Xy_dev = pd.concat([X_dev, y_dev], axis=1)
Xy_dev = Xy_dev[Xy_dev["Stance"] != "unrelated"]
y_dev = Xy_dev["Stance"]
X_dev = Xy_dev.drop("Stance", axis=1)
print(X_dev.shape)
print(y_dev.shape)

(3363, 51)
(3363,)


In [97]:
Xy_test = pd.concat([X_test, y_test], axis=1)
Xy_test = Xy_test[Xy_test["Stance"] != "unrelated"]
y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)
print(X_test.shape)
print(y_test.shape)

(7064, 51)
(7064,)


In [98]:
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1        9167.3007           11.43s
         2        8898.4178           11.58s
         3        8672.0611           12.95s
         4        8474.9856           12.21s
         5        8306.5268           11.85s
         6        8162.5121           11.28s
         7        8030.3965           10.86s
         8        7916.1301           10.53s
         9        7824.5040           10.37s
        10        7741.8155           10.40s
        20        7246.2035            9.40s
        30        7016.5002           10.19s
        40        6839.6531            9.62s
        50        6693.1649            8.75s
        60        6561.6648            7.72s
        70        6444.5214            6.93s
        80        6327.9966            6.24s
        90        6221.2381            5.58s
       100        6117.6191            5.05s
       200        5349.9633            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=14128, subsample=1.0,
              verbose=True, warm_start=False)

In [99]:
preds = clf.predict(X_train)
print(classification_report(y_train, preds))

score, max_score = score_4class(y_train, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.87      0.43      0.58      2768
   disagree       0.92      0.24      0.38       610
    discuss       0.77      0.98      0.86      6686

avg / total       0.81      0.79      0.75     10064

Weighted accuracy: 0.8393282988871225 (8447.0 out of 10064.0)


In [100]:
preds = clf.predict(X_dev)
print(classification_report(y_dev, preds))

score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.70      0.31      0.43       910
   disagree       0.69      0.12      0.20       230
    discuss       0.73      0.96      0.83      2223

avg / total       0.72      0.73      0.68      3363

Weighted accuracy: 0.7966101694915254 (2679.0 out of 3363.0)


In [101]:
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.42      0.21      0.28      1903
   disagree       0.27      0.03      0.05       697
    discuss       0.66      0.90      0.76      4464

avg / total       0.56      0.62      0.56      7064

Weighted accuracy: 0.7180067950169875 (5072.0 out of 7064.0)


In [102]:
from sklearn.metrics import confusion_matrix

In [104]:
confusion_matrix(y_test, preds, labels=["agree", "disagree", "discuss"], sample_weight=None)

array([[ 391,   18, 1494],
       [ 117,   19,  561],
       [ 432,   34, 3998]])