In [1]:
import os
import re
from functools import partial
from multiprocessing import cpu_count, Pool

import ipywidgets
import nltk
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange, tqdm_notebook

In [2]:
# Load the feature table and the stance labels from HDF5; both files store their data under key "df".
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [3]:
y_train.head()

0    unrelated
1        agree
2    unrelated
3    unrelated
4     disagree
Name: Stance, dtype: object

In [4]:
X_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,...,chargram_2_first_hits,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,0.014085,...,0,0,0,0,0,0,0,0,0,0
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...,"[hundred, of, palestinian, were, evacuated, fr...",10,7,7,0.046083,...,3,0,0,0,0,0,0,0,0,0
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...,"[30, year, old, moscow, resident, wa, hospital...",5,4,1,0.030303,...,1,0,0,0,0,0,0,0,0,0
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...,"[reuters, a, canadian, soldier, wa, shot, at, ...",3,3,0,0.028169,...,1,0,0,0,0,0,0,0,0,0
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...,"[fear, not, arachnophobes, the, story, of, bun...",9,5,4,0.032727,...,4,0,0,0,0,0,0,0,0,0


In [5]:
print(X_train.shape)
print(y_train.shape)

(49972, 49)
(49972,)


In [6]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    """Remove the raw-text columns from ``df`` in place.

    Deletes "articleBody", "Headline" and every column whose name starts
    with "___"; all other columns are left untouched.

    Columns that are not present are skipped, so calling this again on an
    already-cleaned frame (e.g. when a notebook cell is re-run) is a no-op
    instead of a KeyError. Non-string column labels are never treated as
    text columns.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    # Collect the labels first (as a set, so a duplicated label is deleted
    # only once) rather than deleting while walking df.columns.
    doomed = {
        col_name
        for col_name in df.columns
        if isinstance(col_name, str)
        and (col_name in ("articleBody", "Headline") or col_name.startswith("___"))
    }
    for col_name in doomed:
        del df[col_name]
#del_str_cols()

In [7]:
# Put the labels next to the features (column-wise concat, aligned on the index)
# so that rows can be selected by their "Stance" value.
Xy_train = pd.concat([X_train, y_train], axis=1)
print(Xy_train.shape)

(49972, 50)


In [8]:
Xy_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,...,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits,Stance
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,0.014085,...,0,0,0,0,0,0,0,0,0,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,hundreds of palestinians flee floods in gaza a...,"[hundred, of, palestinian, flee, flood, in, ga...",hundreds of palestinians were evacuated from t...,"[hundred, of, palestinian, were, evacuated, fr...",10,7,7,0.046083,...,0,0,0,0,0,0,0,0,0,agree
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,christian bale passes on role of steve jobs ac...,"[christian, bale, pass, on, role, of, steve, j...",30 year old moscow resident was hospitalized w...,"[30, year, old, moscow, resident, wa, hospital...",5,4,1,0.030303,...,0,0,0,0,0,0,0,0,0,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,hbo and apple in talks for 15 month apple tv s...,"[hbo, and, apple, in, talk, for, 15, month, ap...",reuters a canadian soldier was shot at the can...,"[reuters, a, canadian, soldier, wa, shot, at, ...",3,3,0,0.028169,...,0,0,0,0,0,0,0,0,0,unrelated
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",spider burrowed through tourist s stomach and ...,"[spider, burrowed, through, tourist, s, stomac...",fear not arachnophobes the story of bunbury s ...,"[fear, not, arachnophobes, the, story, of, bun...",9,5,4,0.032727,...,0,0,0,0,0,0,0,0,0,disagree


In [9]:
# Drop every row labelled "unrelated"; only the remaining stance classes are modelled below.
Xy_train = Xy_train[Xy_train["Stance"] != "unrelated"]

In [10]:
print(Xy_train.shape)

(13427, 50)


In [11]:
# Split the filtered frame back into the label series (y) and the feature frame (X).
y_train = Xy_train["Stance"]
X_train = Xy_train.drop("Stance", axis=1)
print(X_train.shape)
print(y_train.shape)

(13427, 49)
(13427,)


In [12]:
# START SENTIMENT ANALYSIS

In [13]:
tqdm.pandas()

In [14]:
# NOTE(review): SentimentIntensityAnalyzer presumably needs the NLTK "vader_lexicon"
# resource to be installed -- confirm for fresh environments.
from nltk.sentiment import vader
# Module-level analyzer instance; vader_polarity_scores() reads it as a global.
sid = vader.SentimentIntensityAnalyzer()



In [15]:
def vader_polarity_scores(df, text_col_name, col_name_prefix):
    """Compute VADER polarity scores for one text column of ``df``.

    Every value of ``df[text_col_name]`` is passed to the module-level
    analyzer ``sid`` (through ``progress_apply``, so a tqdm progress bar is
    shown). The scores come back as a new frame with one column per score,
    each named "vader_<col_name_prefix>_<score name>". ``df`` itself is not
    modified.
    """
    def score_text(text):
        # One Series per row, so the apply result expands into one column per score.
        return pd.Series(sid.polarity_scores(text))

    scored = df[text_col_name].progress_apply(score_text)
    label_stem = "vader_" + col_name_prefix + "_"
    scored.columns = [label_stem + score_name for score_name in scored.columns]
    return scored

# Score headlines and article bodies separately, then append both score tables as new feature columns.
# NOTE: X_train is reassigned to the widened frame, so re-running this cell on the
# already-extended frame appends the vader_* columns a second time.
vader_hl_df = vader_polarity_scores(X_train, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_train, "articleBody", "body")
X_train = pd.concat([X_train, vader_hl_df, vader_body_df], axis=1)

100%|██████████| 13427/13427 [00:08<00:00, 1564.68it/s]
100%|██████████| 13427/13427 [01:18<00:00, 171.45it/s]


In [16]:
# END SENTIMENT ANALYSIS

In [17]:
del_str_cols(X_train)

In [18]:
print(X_train.shape)
print(y_train.shape)

(13427, 51)
(13427,)


In [19]:
# Hold out 10% of the rows as a dev set (shuffled, fixed seed for reproducibility).
# train_test_split is imported by name: `import sklearn` alone does not guarantee that
# the `sklearn.model_selection` submodule is reachable as an attribute.
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

In [20]:
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

(12084, 51)
(12084,)
(1343, 51)
(1343,)


In [21]:
X_train.head()

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_16_early_hits,chargram_16_first_hits,vader_hl_compound,vader_hl_neg,vader_hl_neu,vader_hl_pos,vader_body_compound,vader_body_neg,vader_body_neu,vader_body_pos
21452,9,9,7,0.029762,0,0,0,0,0,0,...,0,0,0.0,0.0,1.0,0.0,-0.994,0.1,0.863,0.037
46770,11,5,7,0.115789,0,0,0,0,0,0,...,0,0,0.5267,0.0,0.793,0.207,0.9147,0.02,0.864,0.116
38655,9,8,5,0.059259,0,0,0,0,0,0,...,0,0,0.4215,0.0,0.797,0.203,0.765,0.032,0.901,0.067
39514,6,6,2,0.043165,0,0,0,0,0,0,...,0,0,0.2732,0.186,0.552,0.262,-0.3195,0.058,0.91,0.032
43951,5,5,3,0.026882,0,0,0,0,0,0,...,0,0,0.0,0.0,1.0,0.0,0.8979,0.053,0.858,0.089


In [22]:
# Gradient-boosted trees with 200 boosting stages and a fixed seed; verbose=True prints the
# per-iteration training loss while fitting.
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [23]:
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       11003.9865           10.15s
         2       10683.7007           11.25s
         3       10416.2505           10.66s
         4       10183.9998           10.37s
         5        9986.4438           10.15s
         6        9816.8080           10.16s
         7        9671.1904           10.02s
         8        9539.2330            9.98s
         9        9424.3425           10.41s
        10        9322.9845           10.63s
        20        8744.6089           10.75s
        30        8467.6826            9.77s
        40        8262.3553            8.71s
        50        8085.7963            7.81s
        60        7921.1398            7.03s
        70        7777.6926            6.36s
        80        7650.9153            5.75s
        90        7536.6099            5.18s
       100        7413.0026            4.71s
       200        6484.8665            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=14128, subsample=1.0,
              verbose=True, warm_start=False)

In [24]:
preds = clf.predict(X_dev)

In [25]:
print(classification_report(y_dev, preds))
# With VADER sentiment features

             precision    recall  f1-score   support

      agree       0.83      0.37      0.51       384
   disagree       0.71      0.12      0.21        97
    discuss       0.73      0.98      0.84       862

avg / total       0.76      0.74      0.70      1343



In [26]:
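# 3-class problem: always guessing the most common class (discuss) would yield approx. 66% accuracy.
# The baseline model instead yields 69% accuracy, i.e. only a small improvement over that majority-class guess.
# For comparison, the run with VADER features reported above reaches about 74% (weighted recall) on the dev split.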

In [27]:
-----

SyntaxError: invalid syntax (<ipython-input-27-0855cc4c4ee0>, line 1)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
# Random forest with 200 trees and a fixed seed; n_jobs=-1 uses every available core
# instead of assuming an 8-core machine.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

In [None]:
crf.fit(X_train, y_train)

In [None]:
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
y_train.value_counts(normalize=True)

In [None]:
# 4-class problem (all rows, i.e. with the "unrelated" pairs kept rather than filtered out as above):
# By guessing the most common category (unrelated), we would achieve approx. 73% accuracy. The baseline model achieves 88%.