In [1]:
import os
import re
from functools import partial
from multiprocessing import cpu_count, Pool

import ipywidgets
import nltk
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange, tqdm_notebook

In [2]:
# Read the feature frame and the label series; both files store their payload under the HDF key "df".
X_train, y_train = (
    pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", "df"),
    pd.read_hdf("y_train_full.h5", "df"),
)

In [3]:
# Peek at the first five labels to confirm what was loaded.
y_train.head(n=5)

0    unrelated
1        agree
2    unrelated
3    unrelated
4     disagree
Name: Stance, dtype: object

In [4]:
# Features and labels must have the same number of rows.
print(X_train.shape, y_train.shape, sep="\n")

(49972, 49)
(49972,)


In [5]:
# Remove the two raw-text columns and every column whose name starts with "___".
# A single drop() that rebinds X_train replaces the in-place `del` loop: the frame is
# no longer mutated while its own columns are being iterated, and errors="ignore"
# lets the cell be re-run without a KeyError once the columns are already gone.
X_train = X_train.drop(
    ["articleBody", "Headline"]
    + [col_name for col_name in X_train.columns if col_name.startswith("___")],
    axis=1,
    errors="ignore",
)

In [6]:
# Shapes after the column pruning: only the column count of X_train should have changed.
print(X_train.shape, y_train.shape, sep="\n")

(49972, 43)
(49972,)


In [7]:
# Show the first five rows of the remaining feature columns.
X_train.head(n=5)

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
0,2,0,0,0.014085,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,7,7,0.046083,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
2,5,4,1,0.030303,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3,3,0,0.028169,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,9,5,4,0.032727,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0


In [8]:
# Put the labels next to the features (aligned on the index) so both live in one frame.
Xy_train = pd.concat((X_train, y_train), axis="columns")
print(Xy_train.shape)

(49972, 44)


In [9]:
# First five rows of the combined frame; "Stance" should be the last column.
Xy_train.head(n=5)

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits,Stance
0,2,0,0,0.014085,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unrelated
1,10,7,7,0.046083,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,agree
2,5,4,1,0.030303,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unrelated
3,3,3,0,0.028169,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unrelated
4,9,5,4,0.032727,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,disagree


In [11]:
# Keep only the rows whose stance is something other than "unrelated".
Xy_train = Xy_train.loc[Xy_train["Stance"].ne("unrelated")]

In [12]:
# Report (rows, columns) of the combined frame at this point in the notebook.
print(Xy_train.shape)

(13427, 44)


In [13]:
# Separate the combined frame again: everything except "Stance" is a feature, "Stance" is the label.
X_train = Xy_train.drop(labels="Stance", axis="columns")
y_train = Xy_train.loc[:, "Stance"]
print(X_train.shape, y_train.shape, sep="\n")

(13427, 43)
(13427,)


In [14]:
# Hold out 10% of the rows as a dev set; the fixed random_state makes the shuffle reproducible.
# train_test_split is imported by name at the top of the notebook: reaching it through
# `sklearn.model_selection` after a bare `import sklearn` only works when some other
# import happens to have loaded that submodule already.
# NOTE: this rebinds X_train / y_train, so re-running the cell splits the already-split data.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, shuffle=True
)

In [15]:
# Shapes of the train and dev partitions, features first and then labels for each.
print(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape, sep="\n")

(12084, 43)
(12084,)
(1343, 43)
(1343,)


In [16]:
# First five training rows after the shuffle; the index shows the original row positions.
X_train.head(n=5)

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
21452,9,9,7,0.029762,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
46770,11,5,7,0.115789,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
38655,9,8,5,0.059259,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
39514,6,6,2,0.043165,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43951,5,5,3,0.026882,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0


In [17]:
# Gradient-boosted trees: 200 boosting stages, fixed seed, progress table printed while fitting.
clf = GradientBoostingClassifier(
    verbose=True,
    random_state=14128,
    n_estimators=200,
)

In [18]:
# Train the gradient-boosting model on the training partition (returns the fitted estimator).
clf.fit(X=X_train, y=y_train)

      Iter       Train Loss   Remaining Time 
         1       11032.4689            8.34s
         2       10738.9873            8.30s
         3       10494.7511            7.58s
         4       10291.4398            6.95s
         5       10117.9871            6.89s
         6        9972.1088            6.67s
         7        9846.7383            6.78s
         8        9740.6474            6.65s
         9        9648.7340            6.73s
        10        9571.4335            6.67s
        20        9156.8534            6.16s
        30        8996.7904            6.24s
        40        8893.0221            5.78s
        50        8811.6276            5.38s
        60        8744.3585            4.91s
        70        8682.5639            4.49s
        80        8625.2353            4.11s
        90        8571.0818            3.74s
       100        8517.1276            3.38s
       200        8098.2011            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=14128, subsample=1.0,
              verbose=True, warm_start=False)

In [19]:
# Predicted stance label for every dev-set row.
preds = clf.predict(X=X_dev)

In [20]:
# Alias the dev-set labels; they serve as the ground truth passed to classification_report below.
actual = y_dev

In [21]:
from sklearn.metrics import classification_report

In [22]:
# Per-class precision / recall / F1 of the gradient-boosting predictions on the dev set.
print(classification_report(y_true=actual, y_pred=preds))

             precision    recall  f1-score   support

      agree       0.77      0.19      0.30       384
   disagree       0.50      0.03      0.06        97
    discuss       0.68      0.98      0.81       862

avg / total       0.69      0.69      0.61      1343



In [None]:
# 3-class problem: always guessing the most common class (discuss) would yield approx. 66% accuracy.
# The baseline model instead yields 69% accuracy, i.e. only a small improvement over that majority-class guess.

In [23]:
# Relative frequency of each stance label in y_train, most frequent first;
# the top entry is the accuracy of always predicting the majority class.
y_train.value_counts(normalize=True)

discuss     0.665922
agree       0.272592
disagree    0.061486
Name: Stance, dtype: float64

In [30]:
# Random-forest comparison model: 200 trees, fixed seed for reproducibility.
# n_jobs=-1 uses all available cores instead of a hard-coded count of 8, so the cell
# behaves sensibly on machines with fewer or more cores; the fitted model is unaffected.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

In [31]:
# Train the random forest on the training partition (returns the fitted estimator).
crf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=8,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [32]:
# Score the random forest on the dev set.
# Note: this rebinds `preds`, replacing the gradient-boosting predictions made earlier.
preds = crf.predict(X=X_dev)
print(classification_report(y_true=y_dev, y_pred=preds))

             precision    recall  f1-score   support

      agree       0.56      0.23      0.33       331
   disagree       0.85      0.11      0.19       102
    discuss       0.68      0.84      0.75       876
  unrelated       0.96      0.98      0.97      3689

avg / total       0.88      0.89      0.87      4998



In [34]:
# Relative frequency of each stance label in the current y_train, most frequent first;
# the top entry is the accuracy of always predicting the majority class.
y_train.value_counts(normalize=True)

unrelated    0.730555
discuss      0.178614
agree        0.074421
disagree     0.016409
Name: Stance, dtype: float64

In [35]:
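# 4-class problem (the outputs above include the "unrelated" class, i.e. they come from a run in which the In [11] filter was not applied):
# By always guessing the most common category (unrelated), we would achieve approx. 73% accuracy. The baseline model achieves approx. 89% (the support-weighted recall in the report above).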