In [None]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from rewrite.scorer import score_4class
import utils # utils from CS224U
from scipy.spatial import distance
import random
tqdm.pandas()

In [None]:
# Read the training feature frame and the training label frame from their
# HDF5 files; both are stored under the key "df".
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [None]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    """Remove the raw-text and helper columns from a feature frame, in place.

    Drops "articleBody", "Headline", and every column whose label is a string
    starting with "___". Columns that are already absent are skipped, so the
    function can be called again on the same frame (e.g. when a notebook cell
    is re-run) without raising KeyError. Non-string column labels are left
    untouched instead of raising TypeError on slicing.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to clean; it is mutated.

    Returns
    -------
    None
    """
    to_drop = [
        col for col in df.columns
        if col in ("articleBody", "Headline")
        or (isinstance(col, str) and col.startswith("___"))
    ]
    # One in-place drop keeps the original contract: callers rely on `df`
    # itself being modified rather than on a return value.
    df.drop(columns=to_drop, inplace=True)
#del_str_cols()

In [None]:
# Put the label frame beside the feature frame so both share one row index.
Xy_train = pd.concat((X_train, y_train), axis="columns")
print(Xy_train.shape)

In [5]:
# Split the combined frame back apart: every column except "Stance" is a
# feature, and "Stance" alone is the label.
X_train = Xy_train.drop(columns="Stance")
y_train = Xy_train["Stance"]
print(X_train.shape)
print(y_train.shape)

(49972, 49)
(49972,)


In [6]:
### START GLOVE CODE BLOCK
    # ---> Running this adds glove feature
# The GloVe file reader (utils.glove2dict) comes from the CS224U course utilities;
# the helper functions passed to .apply() below were written by us.
#def randvec(w, n=50, lower=-1.0, upper=1.0):
#    """Returns a random vector of length `n`. `w` is ignored."""
#    return utils.randvec(n=n, lower=lower, upper=upper)

# Dimensionality of the GloVe vectors; it selects the embedding file below and
# is also embedded in the name of the derived feature column.
glove_dim = 200
# Resolves to GloVe/glove.6B.<glove_dim>d.txt relative to the working directory.
glove_src = os.path.join("GloVe", 'glove.6B.'+str(glove_dim)+'d.txt')
# Embedding lookup built by the CS224U helper. The code below uses it via
# `word in GLOVE` and `GLOVE[word]`, i.e. as a token -> vector mapping.
GLOVE = utils.glove2dict(glove_src)
def text_to_mean_vec_ignore_unk(text, w2v=GLOVE, dim=glove_dim):
    """Return the mean embedding of the tokens in `text` that `w2v` contains.

    Parameters
    ----------
    text : iterable
        Iterated element by element; each element is looked up in `w2v` and
        elements that are not present are skipped.
    w2v : mapping of token -> vector, default GLOVE
    dim : int, default glove_dim
        Length of the vectors in `w2v` and of the returned vector.

    Returns
    -------
    numpy.ndarray of length `dim`
        Element-wise mean of the matched vectors. When no element of `text`
        is found in `w2v`, a random vector with components drawn uniformly
        from [-0.5, 0.5] is returned instead (presumably so the caller never
        receives an all-zero vector -- confirm intent).

    Notes
    -----
    The fallback draws from the `random` module, which is not seeded in the
    cells shown in this notebook, so rows that hit the fallback are not
    reproducible between runs.
    """
    vec = np.zeros(dim)
    num_added = 0
    for word in text:
        if word in w2v:
            vec += w2v[word]
            num_added += 1
    if num_added > 0:
        return vec / num_added
    # Size the fallback by the `dim` argument rather than the global
    # `glove_dim`, so a caller passing a different `w2v`/`dim` pair gets a
    # vector of the same length as the non-fallback branch.
    return np.array([random.uniform(-0.5, 0.5) for _ in range(dim)])
def get_glove_cos_dist_hl_body(row):
    """Cosine distance between the mean GloVe vectors of a headline and its body.

    `row` must provide the fields "___clean_headline_tokenized_lemmas" and
    "___clean_body_tokenized_lemmas"; each value is handed to
    text_to_mean_vec_ignore_unk, which iterates over it element by element.
    """
    headline_tokens = row["___clean_headline_tokenized_lemmas"]
    body_tokens = row["___clean_body_tokenized_lemmas"]
    # scipy's cosine() returns a distance, not a similarity.
    return distance.cosine(
        text_to_mean_vec_ignore_unk(headline_tokens),
        text_to_mean_vec_ignore_unk(body_tokens),
    )
### END GLOVE CODE BLOCK
# Add the headline/body GloVe cosine distance as a new feature column, computed
# row by row (with a tqdm progress bar) from the two "___" token columns.
X_train["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_train[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)

100%|██████████| 49972/49972 [00:32<00:00, 1550.66it/s]


In [None]:
### START SENTIMENT ANALYSIS
    # ---> Running this block adds VADER sentiment features
# NOTE: this import lives outside the notebook's top import cell, so the VADER
# dependency is only declared here.
from nltk.sentiment import vader
# Single analyzer instance, read as a global by vader_polarity_scores.
sid = vader.SentimentIntensityAnalyzer()
def vader_polarity_scores(df, text_col_name, col_name_prefix):
    """Run VADER over df[text_col_name] and return the scores as a new frame.

    Each value of the text column is passed to sid.polarity_scores and the
    result is expanded into one column per returned key. The columns are then
    renamed to "vader_<col_name_prefix>_<key>". `df` itself is not modified.
    """
    scores = df[text_col_name].progress_apply(
        lambda text: pd.Series(sid.polarity_scores(text))
    )
    scores.columns = [
        "vader_" + col_name_prefix + "_" + key for key in scores.columns
    ]
    return scores

# Score headlines and bodies separately, then append both score frames to
# X_train column-wise. This has to run before del_str_cols(X_train), because
# that call deletes the "Headline" and "articleBody" columns read here.
# NOTE: X_train is rebound to the widened frame, so re-running this cell would
# append a second copy of the same columns.
vader_hl_df = vader_polarity_scores(X_train, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_train, "articleBody", "body")
X_train = pd.concat([X_train, vader_hl_df, vader_body_df], axis=1)
### END SENTIMENT ANALYSIS

In [None]:
# END SENTIMENT ANALYSIS

In [8]:
# Strip "articleBody", "Headline" and the "___"-prefixed helper columns from
# X_train in place, leaving only the remaining feature columns for the models.
del_str_cols(X_train)

In [9]:
# Sanity check after dropping the text columns: the column count shrinks while
# the row counts of features and labels should still agree.
print(X_train.shape)
print(y_train.shape)

(49972, 44)
(49972,)


In [10]:
# Preview the first rows of the feature frame that will be fed to the models.
X_train.head()

Unnamed: 0,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,wrf_hl_fake,wrf_hl_fraud,wrf_hl_hoax,wrf_hl_false,wrf_hl_deny,wrf_hl_denies,...,chargram_8_hits,chargram_8_early_hits,chargram_8_first_hits,chargram_4_hits,chargram_4_early_hits,chargram_4_first_hits,chargram_16_hits,chargram_16_early_hits,chargram_16_first_hits,hl_body_glove_200_cos_dist
0,2,0,0,0.014085,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.157001
1,10,7,7,0.046083,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.089189
2,5,4,1,0.030303,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.104475
3,3,3,0,0.028169,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.152586
4,9,5,4,0.032727,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.160243


In [11]:
# Hold out 10% of the rows as a dev set, shuffled with a fixed seed.
# NOTE: X_train / y_train are rebound to the 90% portion, so this cell is not
# idempotent -- re-running it splits the already-reduced training set again.
# NOTE(review): `sklearn.model_selection` is reached as an attribute of the
# top-level `sklearn` import; no explicit import of that submodule is visible
# in the import cell, so this presumably relies on another sklearn import
# having loaded it -- confirm.
X_train, X_dev, y_train, y_dev = sklearn.model_selection.train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

In [12]:
# Confirm the split sizes: train and dev row counts for features and labels.
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

(44974, 44)
(44974,)
(4998, 44)
(4998,)


In [13]:
# Gradient-boosting classifier with 200 estimators and a fixed seed;
# verbose=True makes fit() print its per-iteration training loss.
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [14]:
# Fit on the training portion only; X_dev / y_dev stay held out for evaluation.
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1       44003.6782           51.34s
         2       39386.2805           47.72s
         3       35644.9415           46.27s
         4       32588.8785           45.25s
         5       30070.6445           44.92s
         6       27965.6642           44.73s
         7       26208.2776           43.82s
         8       24717.7259           43.14s
         9       23471.6261           42.73s
        10       22380.8199           42.45s
        20       17270.0571           40.80s
        30       15812.0082           38.36s
        40       15159.4344           35.65s
        50       14787.8910           32.55s
        60       14534.0643           29.67s
        70       14339.2110           27.21s
        80       14181.4277           24.97s
        90       14052.3365           22.65s
       100       13929.0381           20.46s
       200       13051.6725            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=14128, subsample=1.0,
              verbose=True, warm_start=False)

In [15]:
# Evaluate the gradient-boosting model on the dev split: per-class
# precision/recall first, then the project's score_4class metric.
preds = clf.predict(X_dev)
print(classification_report(y_dev, preds))
# score_4class returns the achieved score and the maximum attainable score;
# their ratio is reported as the weighted accuracy.
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.60      0.16      0.26       331
   disagree       0.29      0.02      0.04       102
    discuss       0.66      0.85      0.75       876
  unrelated       0.96      0.98      0.97      3689

avg / total       0.87      0.89      0.86      4998

Weighted accuracy: 0.8058263305322129 (1798.0 out of 2231.25)


In [18]:
# Evaluate the gradient-boosting model on the test set.
# NOTE: X_test / y_test are created by the "### LOAD TEST SET ###" cell, which
# sits further down in this file; that cell must be run before this one.
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
# Ratio of achieved to maximum score_4class score, reported as weighted accuracy.
score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

             precision    recall  f1-score   support

      agree       0.35      0.08      0.14      1903
   disagree       0.09      0.00      0.01       697
    discuss       0.61      0.80      0.70      4464
  unrelated       0.94      0.98      0.96     18349

avg / total       0.81      0.85      0.82     25413

Weighted accuracy: 0.7525372814075743 (8768.0 out of 11651.25)


In [16]:
# List the gradient-boosting feature importances, most important first.
feat_imp = clf.feature_importances_
# argsort is ascending, so reverse it to get a descending ranking.
indices = np.argsort(feat_imp)[::-1]
for ii in indices:
    # X_train still has the column order the model was fitted on, so position
    # ii in feat_imp corresponds to X_train.columns[ii].
    print(X_train.columns[ii]+": "+str(feat_imp[ii]))

hl_body_glove_200_cos_dist: 0.1902913508323937
word_overlap_features: 0.16351842500262193
bin_count_stopless: 0.1486098884502007
bin_count: 0.08389369075687826
chargram_2_hits: 0.06642816497813628
bin_count_early: 0.05931518680579846
ngram_2_hits: 0.05281044531087529
ngram_2_early_hits: 0.04216553357922573
chargram_2_early_hits: 0.033335825859544754
ngram_4_hits: 0.017039733089217805
chargram_2_first_hits: 0.016998518563192462
ngram_3_hits: 0.015279521011858558
polar_body: 0.013265917234754216
polar_hl: 0.011756816832107457
wrf_hl_hoax: 0.011040965955630118
ngram_6_hits: 0.011004930758880517
wrf_hl_doubt: 0.009921725583108599
ngram_4_early_hits: 0.007714985927264708
wrf_hl_false: 0.006872908325971317
ngram_5_hits: 0.0060373454428725125
ngram_3_early_hits: 0.00498573075764935
wrf_hl_fake: 0.004143759207083079
wrf_hl_not: 0.004111099024616867
wrf_hl_despite: 0.003371031615300037
wrf_hl_denies: 0.0032366957515727064
wrf_hl_bogus: 0.003057185337776991
ngram_5_early_hits: 0.0026724358373359

In [None]:
# Alternative model: random forest with 400 trees, fitted with 8 parallel jobs
# and a fixed seed, trained on the same training portion as `clf`.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, n_jobs=8, random_state=42)
crf.fit(X_train, y_train)

In [None]:
# Per-class report for the random forest on the dev split. `preds` is rebound
# here, so it no longer holds the gradient-boosting predictions.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
# score_4class metric for whatever `preds` currently holds; this cell expects
# the dev-set predictions produced by the cell directly above.
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# Evaluate the random forest (`crf`) on the test set. This cell follows the
# random-forest fit and dev evaluation, but it previously called clf.predict,
# which silently re-scored the gradient-boosting model instead.
# Requires X_test / y_test from the "### LOAD TEST SET ###" cell.
preds = crf.predict(X_test)
print(classification_report(y_test, preds))

# Ratio of achieved to maximum score_4class score, reported as weighted accuracy.
score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
-----

In [None]:
# Rebinds `crf` to a fresh, unfitted 200-tree random forest (same seed and job
# count), replacing the 400-tree model created earlier.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
# Fit the random forest currently bound to `crf` on the training portion.
crf.fit(X_train, y_train)

In [None]:
# Per-class report on the dev split for the random forest currently bound to `crf`.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [7]:
### LOAD TEST SET ###
# Mirrors the training-set preparation: read features and labels, combine and
# re-split them, add the GloVe feature, then drop the text columns.
# NOTE: this cell depends on glove_dim, get_glove_cos_dist_hl_body and
# del_str_cols from earlier cells, and the test-evaluation cells that appear
# above it in the file depend on the X_test / y_test it creates.
X_test = pd.read_hdf("X_TEST_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_TEST_full.h5", key="df")

# Put labels beside features so both share one row index.
Xy_test = pd.concat([X_test, y_test], axis=1)

# "Stance" is the label column; everything else is a feature.
y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)

# VADER features are disabled for the test set. NOTE(review): only enable them
# if the training frame was built with the VADER block as well, otherwise the
# train and test frames will not have the same columns.
#vader_hl_df = vader_polarity_scores(X_test, "Headline", "hl")
#vader_body_df = vader_polarity_scores(X_test, "articleBody", "body")
#X_test = pd.concat([X_test, vader_hl_df, vader_body_df], axis=1)
# Same GloVe cosine-distance feature, under the same column name, as X_train.
X_test["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_test[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)

# Drop the raw-text and "___" helper columns in place, as was done for X_train.
del_str_cols(X_test)

100%|██████████| 25413/25413 [00:13<00:00, 1825.55it/s]
