In [None]:
import os
import random
import re
from functools import partial
from multiprocessing import cpu_count, Pool

import ipywidgets
import nltk
import numpy as np
import pandas as pd
import sklearn
from scipy.spatial import distance
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange, tqdm_notebook

import utils # utils from CS224U
from rewrite.scorer import score_4class
# Register tqdm's pandas integration; this is what provides the
# `.progress_apply(...)` calls used in the feature cells.
tqdm.pandas()

In [None]:
# Read the training feature table and the training label table (both stored
# under key "df") from HDF5 files in the current working directory.
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [None]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    """Drop the raw-text and helper columns from ``df`` in place.

    Removes ``articleBody``, ``Headline`` and every column whose name starts
    with ``___``; all other columns are kept.

    Safe to call more than once on the same frame: columns that are already
    gone are simply skipped instead of raising ``KeyError``, and non-string
    column labels are left untouched instead of breaking the prefix check.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        None.
    """
    doomed = [
        col for col in df.columns
        if col in ("articleBody", "Headline")
        or (isinstance(col, str) and col.startswith("___"))
    ]
    # Build the label list first, then drop once, so nothing is removed while
    # the column index is being iterated.
    df.drop(columns=doomed, inplace=True)
#del_str_cols()

In [None]:
# Put features and labels side by side (column-wise concat) in a single frame.
Xy_train = pd.concat([X_train, y_train], axis=1)
print(Xy_train.shape)

In [5]:
# Split the joined frame back apart: "Stance" is the label column, everything
# else is the feature matrix.
y_train = Xy_train["Stance"]
X_train = Xy_train.drop("Stance", axis=1)
print(X_train.shape)
print(y_train.shape)

(49972, 49)
(49972,)


In [6]:
### START GLOVE CODE BLOCK
    # ---> Running this adds glove feature
# GloVe reading code from CS224U instructors
# functions for .apply() by us
#def randvec(w, n=50, lower=-1.0, upper=1.0):
#    """Returns a random vector of length `n`. `w` is ignored."""
#    return utils.randvec(n=n, lower=lower, upper=upper)

# Vector size of the GloVe embeddings; it also selects which glove.6B.<dim>d.txt
# file under ./GloVe is read.
glove_dim = 200
glove_src = os.path.join("GloVe", 'glove.6B.'+str(glove_dim)+'d.txt')
# Lookup used below via `word in w2v` / `w2v[word]`.
# NOTE(review): presumably a dict of word -> numpy vector; confirm in utils.glove2dict.
GLOVE = utils.glove2dict(glove_src)
def text_to_mean_vec_ignore_unk(text, w2v=GLOVE, dim=glove_dim):
    """Average the embedding vectors of the tokens in ``text`` found in ``w2v``.

    Tokens that are not in ``w2v`` are ignored.

    Args:
        text: Iterable of tokens. A plain string would be iterated character
            by character, so pass a token sequence.
        w2v: Mapping from token to embedding vector.
        dim: Length of the vectors stored in ``w2v``.

    Returns:
        A numpy array of length ``dim``: the mean of the vectors of all tokens
        found in ``w2v``. If no token is found, a random vector with entries
        drawn uniformly from [-0.5, 0.5] is returned instead. The ``random``
        module is not seeded here, so that fallback differs between runs
        unless the caller seeds it.
    """
    vec = np.zeros(dim)
    num_added = 0
    for word in text:
        if word in w2v:
            vec += w2v[word]
            num_added += 1
    if num_added > 0:
        return vec/num_added
    # Size the fallback with `dim` (not the global glove_dim) so that it always
    # has the same length as the averaged vectors for the given `w2v`.
    return np.array([random.uniform(-0.5, 0.5) for _ in range(dim)])
def get_glove_cos_dist_hl_body(row):
    """Cosine distance between the mean GloVe vectors of a headline and its body.

    Args:
        row: Record providing the token sequences under the keys
            ``___clean_headline_tokenized_lemmas`` and
            ``___clean_body_tokenized_lemmas``.

    Returns:
        The value of ``scipy.spatial.distance.cosine`` for the two mean vectors.
    """
    token_lists = [
        row["___clean_headline_tokenized_lemmas"],
        row["___clean_body_tokenized_lemmas"],
    ]
    # Embed the headline first, then the body, using the default GloVe lookup.
    headline_vec, article_vec = map(text_to_mean_vec_ignore_unk, token_lists)
    return distance.cosine(headline_vec, article_vec)
### END GLOVE CODE BLOCK
# Add the headline/body GloVe cosine distance as a feature column, computed
# row by row (axis=1) with a tqdm progress bar.
X_train["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_train[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)

100%|██████████| 49972/49972 [00:32<00:00, 1550.66it/s]


In [None]:
### START SENTIMENT ANALYSIS
    # ---> Running this block adds VADER sentiment features
from nltk.sentiment import vader
# One shared analyzer instance; vader_polarity_scores reads it as a global.
sid = vader.SentimentIntensityAnalyzer()
def vader_polarity_scores(df, text_col_name, col_name_prefix):
    """Score one text column with the global VADER analyzer ``sid``.

    Args:
        df: DataFrame holding the text column.
        text_col_name: Name of the column whose values are scored.
        col_name_prefix: Tag inserted into the output column names.

    Returns:
        A DataFrame with one column per entry returned by
        ``sid.polarity_scores``, each renamed to
        ``vader_<col_name_prefix>_<entry name>``.
    """
    def _score(text):
        # Wrapping the result in a Series makes the apply expand it into columns.
        return pd.Series(sid.polarity_scores(text))

    scores = df[text_col_name].progress_apply(_score)
    scores.columns = ["vader_"+col_name_prefix+"_"+name for name in scores.columns]
    return scores

# Score headlines and article bodies, then attach the vader_* columns.
vader_hl_df = vader_polarity_scores(X_train, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_train, "articleBody", "body")
# Drop vader_* columns left over from an earlier run of this cell first, so
# re-running it replaces them instead of appending duplicate column names.
vader_cols = list(vader_hl_df.columns) + list(vader_body_df.columns)
X_train = pd.concat(
    [X_train.drop(columns=vader_cols, errors="ignore"), vader_hl_df, vader_body_df],
    axis=1,
)
### END SENTIMENT ANALYSIS

In [None]:
# END SENTIMENT ANALYSIS

In [None]:
# Remove the raw-text and ___-prefixed columns in place. Run this only after
# the feature cells above, which still read those columns.
del_str_cols(X_train)

In [None]:
# Shapes of the feature matrix and label vector after the text columns were removed.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Preview the first rows of the feature matrix.
X_train.head()

In [None]:
# Hold out 10% of the rows as a dev set (shuffled, fixed seed).
# NOTE: this rebinds X_train / y_train to the remaining 90%, so re-running this
# cell without re-running the cells above splits the already-split data again.
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1, random_state=42, shuffle=True)

In [None]:
# Shapes of the train and dev portions after the split.
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

In [None]:
# Gradient boosting classifier with 200 estimators and a fixed seed;
# verbose=True makes fit() report its progress.
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [None]:
# Train the gradient boosting model on the training portion.
clf.fit(X_train, y_train)

In [None]:
# Evaluate the gradient boosting model on the dev set: per-class report plus
# the score from score_4class, printed as a fraction of its maximum.
preds = clf.predict(X_dev)
print(classification_report(y_dev, preds))
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# List every feature with its importance in the fitted gradient boosting
# model, most important first.
feat_imp = clf.feature_importances_
indices = np.argsort(feat_imp)[::-1]
for name, weight in zip(X_train.columns[indices], feat_imp[indices]):
    print(name+": "+str(weight))

In [None]:
# Random forest baseline: 400 trees, 8 parallel jobs, fixed seed.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, n_jobs=8, random_state=42)
crf.fit(X_train, y_train)

In [None]:
# Dev-set predictions of the random forest; note that this overwrites `preds`.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
# Score whatever `preds` currently holds against the dev labels (the random
# forest predictions when the cells are run in order).
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# Evaluate the gradient boosting model on the held-out test set.
# NOTE: X_test / y_test are only created in the "LOAD TEST SET" cell further
# down, so on a fresh kernel that cell has to be run before this one;
# running top to bottom raises NameError here.
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

score, max_score = score_4class(y_test, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# -----

In [None]:
# Rebind `crf` to a fresh, unfitted random forest with 200 trees (replaces the
# 400-tree forest created earlier).
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
# Train the 200-tree random forest on the training portion.
crf.fit(X_train, y_train)

In [None]:
# Dev-set report for the current random forest; overwrites `preds`.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
### LOAD TEST SET ###
X_test = pd.read_hdf("X_TEST_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_TEST_full.h5", key="df")

Xy_test = pd.concat([X_test, y_test], axis=1)

y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)

#vader_hl_df = vader_polarity_scores(X_test, "Headline", "hl")
#vader_body_df = vader_polarity_scores(X_test, "articleBody", "body")
#X_test = pd.concat([X_test, vader_hl_df, vader_body_df], axis=1)
X_test["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_test[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)

del_str_cols(X_test)

 17%|█▋        | 4216/25413 [00:02<00:11, 1871.36it/s]