In [None]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from rewrite.scorer import score_4class
import utils # utils from CS224U
from scipy.spatial import distance
import random
tqdm.pandas()

In [None]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    """Remove the raw-text columns from ``df`` in place.

    Deletes "articleBody", "Headline" and every column whose name starts
    with "___". Columns that are already absent are skipped, so calling
    this twice on the same frame (e.g. when a cell is re-run) is a no-op
    instead of a ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to strip. It is modified in place.

    Returns
    -------
    None
    """
    # Collect the labels first so the frame is not modified while its
    # column index is being walked; non-string labels are left alone.
    doomed = [
        col_name
        for col_name in df.columns
        if isinstance(col_name, str)
        and (col_name in ("articleBody", "Headline") or col_name.startswith("___"))
    ]
    for col_name in doomed:
        del df[col_name]
            
def print_reports(preds, actual):
    """Print the classification report and the weighted score for ``preds``.

    Parameters
    ----------
    preds : predicted labels.
    actual : true labels, passed first to both scoring functions.
    """
    report = classification_report(actual, preds)
    print(report)

    earned, possible = score_4class(actual, preds)
    summary = "Weighted accuracy: {} ({} out of {})".format(
        str(earned / possible), str(earned), str(possible)
    )
    print(summary)

In [None]:
class TwoModel:
    """Two-stage stance classifier.

    Stage 1 separates "unrelated" from "related" pairs; stage 2 is trained
    only on the rows whose stance is not "unrelated" and assigns the
    remaining stance labels. ``predict`` combines them: rows stage 1 calls
    "related" take the stage-2 label, all others become "unrelated".

    Typical use: ``TwoModel(X, y)`` -> ``make_other_sets()`` -> ``fit()``
    -> ``predict(X_dev)``. ``fit`` reads attributes that only
    ``make_other_sets`` creates, so it must be called after it.
    """

    # Raw-text columns that must never reach the estimators.
    _TEXT_COLUMNS = ("articleBody", "Headline")
    # Prefix of the helper columns that are dropped as well.
    _HELPER_PREFIX = "___"

    def __init__(self, X, y):
        """Store the full feature frame ``X`` and label data ``y``; nothing is split or fitted yet."""
        self.X = X
        self.y = y
        #self.make_other_sets()

    @classmethod
    def _strip_text_columns(cls, df):
        """Delete the raw-text / "___"-prefixed columns from ``df`` in place, skipping absent ones."""
        doomed = [
            col_name
            for col_name in df.columns
            if isinstance(col_name, str)
            and (col_name in cls._TEXT_COLUMNS or col_name.startswith(cls._HELPER_PREFIX))
        ]
        for col_name in doomed:
            del df[col_name]

    @staticmethod
    def _binarise_stance(Xy):
        """Return a copy of ``Xy`` where every stance other than "unrelated" is relabelled "related"."""
        binary = Xy.copy()
        binary.loc[binary["Stance"] != "unrelated", 'Stance'] = "related"
        return binary

    def make_other_sets(self):
        """Split the stored data 80/20 and build the per-stage train/dev sets.

        Sets ``X_train_1``/``y_train_1``/``X_dev_1``/``y_dev_1`` (binary
        related/unrelated labels, all rows) and ``X_train_2``/``y_train_2``/
        ``X_dev_2``/``y_dev_2`` (rows whose stance is not "unrelated"), with
        the text columns removed from every ``X_*`` frame.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_dev, y_dev)`` for the plain split, with
            the text columns still present in the ``X`` frames.
        """
        # Imported here so the submodule is guaranteed to be loaded;
        # a bare ``import sklearn`` does not promise that.
        from sklearn.model_selection import train_test_split

        # Use the data given to the constructor rather than whatever the
        # notebook globals X_train / y_train happen to hold at call time.
        Xy_all = pd.concat([self.X, self.y], axis=1)
        Xy_train, Xy_dev = train_test_split(Xy_all, test_size=0.2, random_state=42, shuffle=True)

        # Stage 1: every row, binary labels.
        Xy_train_1 = self._binarise_stance(Xy_train)
        self.y_train_1 = Xy_train_1["Stance"]
        self.X_train_1 = Xy_train_1.drop("Stance", axis=1)

        Xy_dev_1 = self._binarise_stance(Xy_dev)
        self.y_dev_1 = Xy_dev_1["Stance"]
        self.X_dev_1 = Xy_dev_1.drop("Stance", axis=1)

        # Stage 2: only the rows that are not "unrelated", original labels.
        Xy_train_2 = Xy_train[Xy_train["Stance"] != "unrelated"]
        self.y_train_2 = Xy_train_2["Stance"]
        self.X_train_2 = Xy_train_2.drop("Stance", axis=1)

        Xy_dev_2 = Xy_dev[Xy_dev["Stance"] != "unrelated"]
        self.y_dev_2 = Xy_dev_2["Stance"]
        self.X_dev_2 = Xy_dev_2.drop("Stance", axis=1)

        for frame in (self.X_train_1, self.X_dev_1, self.X_train_2, self.X_dev_2):
            self._strip_text_columns(frame)

        return Xy_train.drop("Stance", axis=1), Xy_train["Stance"], Xy_dev.drop("Stance", axis=1), Xy_dev["Stance"]

    def fit(self):
        """Fit both stages on the sets built by ``make_other_sets`` (call that first)."""
        ## CLASSIFIER 1 - RELATED/UNRELATED
        self.mod1 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod1.fit(self.X_train_1, self.y_train_1)

        ## CLASSIFIER 2 - 3-class
        self.mod2 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod2.fit(self.X_train_2, self.y_train_2)

    def predict(self, X):
        """Return the combined two-stage label for every row of ``X``.

        NOTE: the raw-text columns are removed from ``X`` in place, so the
        caller's frame is modified. Passing an already-stripped frame (for
        instance when the same frame is predicted twice) is fine.

        Returns
        -------
        numpy.ndarray
            One label per row: the stage-2 label where stage 1 predicts
            "related", otherwise "unrelated".
        """
        self._strip_text_columns(X)
        preds_1 = self.mod1.predict(X)
        # Stage 2 scores every row; only its answers for "related" rows are used.
        preds_2 = self.mod2.predict(X)

        new_preds = preds_1.copy()
        is_related = preds_1 == "related"
        new_preds[is_related] = preds_2[is_related]
        new_preds[~is_related] = "unrelated"
        return new_preds

        

In [None]:
# Load the training feature table and its stance labels from HDF5 (both stored under key "df").
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [None]:
# Hand the full training data to the two-stage model; the constructor only stores it.
tsm = TwoModel(X_train, y_train)

In [None]:
# Build the 80/20 train/dev split (the per-stage subsets are kept on tsm).
# NOTE: X_train / y_train are rebound to the training portion here, so the full
# data loaded above is no longer reachable under these names afterwards.
X_train, y_train, X_dev, y_dev = tsm.make_other_sets()
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

In [7]:
# Fit both stages; verbose=True makes each classifier print its per-iteration training loss.
tsm.fit()

      Iter       Train Loss   Remaining Time 
         1           1.0244           15.78s
         2           0.9193           14.77s
         3           0.8355           14.34s
         4           0.7672           14.33s
         5           0.7104           14.16s
         6           0.6616           13.99s
         7           0.6199           13.90s
         8           0.5840           13.80s
         9           0.5530           13.68s
        10           0.5263           13.55s
        20           0.3716           13.19s
        30           0.3187           12.19s
        40           0.2952           11.21s
        50           0.2808           10.43s
        60           0.2717            9.67s
        70           0.2659            8.87s
        80           0.2615            8.16s
        90           0.2580            7.40s
       100           0.2552            6.72s
       200           0.2374            0.00s
      Iter       Train Loss   Remaining Time 
        

In [8]:
# Two-stage predictions for the dev split.
# NOTE: predict() removes the raw-text columns from X_dev in place.
preds = tsm.predict(X_dev)

In [9]:
# Display the predicted labels.
preds

array(['unrelated', 'unrelated', 'unrelated', ..., 'unrelated',
       'unrelated', 'unrelated'], dtype=object)

In [11]:
# Per-class report plus the weighted score for the two-stage predictions on the dev split.
print_reports(preds, y_dev)

             precision    recall  f1-score   support

      agree       0.63      0.17      0.26       703
   disagree       0.80      0.02      0.04       180
    discuss       0.65      0.88      0.75      1779
  unrelated       0.97      0.98      0.97      7333

avg / total       0.88      0.89      0.87      9995

Weighted accuracy: 0.8135253879094599 (3657.0 out of 4495.25)


In [None]:
# START SENTIMENT ANALYSIS

In [None]:
# Enable pandas' progress_apply (tqdm integration); the import cell already does this once.
tqdm.pandas()

In [None]:
# NOTE(review): this import lives outside the top import cell.
from nltk.sentiment import vader
# Shared analyzer used by vader_polarity_scores.
# presumably requires the NLTK "vader_lexicon" resource to be installed — TODO confirm
sid = vader.SentimentIntensityAnalyzer()

In [None]:
def vader_polarity_scores(df, text_col_name, col_name_prefix):
    """Compute VADER polarity scores for one text column.

    Parameters
    ----------
    df : pandas.DataFrame holding the text.
    text_col_name : name of the column passed to ``sid.polarity_scores``.
    col_name_prefix : tag inserted into the output column names.

    Returns
    -------
    pandas.DataFrame
        One column per polarity score, named
        ``"vader_" + col_name_prefix + "_" + <score name>``.
    """
    def _score_one(hl):
        # One Series per text, so the apply result expands into a frame.
        return pd.Series(sid.polarity_scores(hl))

    scores = df[text_col_name].progress_apply(_score_one)
    scores.columns = [
        "vader_"+col_name_prefix+"_"+score_name for score_name in scores.columns
    ]
    return scores

# Score headlines and bodies, then append the new columns to X_train.
# NOTE: re-running this cell appends the same columns a second time.
vader_hl_df = vader_polarity_scores(X_train, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_train, "articleBody", "body")
X_train = pd.concat([X_train, vader_hl_df, vader_body_df], axis=1)

In [None]:
# END SENTIMENT ANALYSIS

In [None]:
# START GLOVE AND POS TAGGING

In [None]:
### GLOVE ####
# Vector size; also selects which glove.6B.<dim>d.txt file is read.
glove_dim = 200
glove_src = os.path.join("GloVe", 'glove.6B.'+str(glove_dim)+'d.txt')
# Lookup used below via `word in GLOVE` / `GLOVE[word]`;
# presumably a dict mapping word -> vector — confirm against utils.glove2dict.
GLOVE = utils.glove2dict(glove_src)
def text_to_mean_vec_ignore_unk(text, w2v=GLOVE, dim=glove_dim):
    """Average the vectors of the words in ``text`` that ``w2v`` knows.

    Parameters
    ----------
    text : iterable of words. A plain string is iterated character by
        character, so pass a token sequence.
    w2v : mapping from word to vector (defaults to ``GLOVE``).
    dim : length of the vectors in ``w2v`` (defaults to ``glove_dim``).

    Returns
    -------
    numpy.ndarray of length ``dim``
        The mean of the known words' vectors. When no word is known, a
        random vector with entries drawn uniformly from [-0.5, 0.5].
    """
    vec = np.zeros(dim)
    num_added = 0
    for word in text:
        if word in w2v:
            vec += w2v[word]
            num_added += 1
    if num_added > 0:
        return vec/num_added
    # Size the fallback by ``dim`` (not the global glove_dim) so it always
    # matches the vectors returned on the normal path.
    # NOTE(review): no seed for `random` is set in the visible cells, so
    # this fallback differs from run to run.
    return np.array([random.uniform(-0.5, 0.5) for _ in range(dim)])
def get_glove_cos_dist_hl_body(row):
    """Cosine distance between the mean GloVe vectors of a row's headline and body tokens.

    ``row`` must provide "___clean_headline_tokenized_lemmas" and
    "___clean_body_tokenized_lemmas".
    """
    token_columns = (
        "___clean_headline_tokenized_lemmas",
        "___clean_body_tokenized_lemmas",
    )
    # Read both cells first, then embed headline before body.
    tokens_by_part = [row[column] for column in token_columns]
    headline_vec, body_vec = map(text_to_mean_vec_ignore_unk, tokens_by_part)
    return distance.cosine(headline_vec, body_vec) # cosine() from scipy
### END GLOVE CODE BLOCK
# Headline/body GloVe cosine distance, computed row by row with a progress bar.
X_train["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_train[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)


# Tagger model needed by nltk.pos_tag, which get_verbs calls.
nltk.download('averaged_perceptron_tagger')

def get_verbs(text):
    """Return the verbs found in ``text`` as one space-separated string.

    Parameters
    ----------
    text : str or sequence of str
        A raw string is tokenized with ``nltk.tokenize.word_tokenize``;
        an already-tokenized sequence of words is tagged as it is.

    Returns
    -------
    str
        The tokens whose ``nltk.pos_tag`` tag starts with ``'VB'``, in
        their original order, joined by single spaces ("" if none).
    """
    if isinstance(text, str):
        tokens = nltk.tokenize.word_tokenize(text)
    else:
        # Token lists (such as the "..._tokenized_lemmas" columns appear
        # to hold) go to the tagger directly.
        tokens = list(text)
    verbs = [token for token, pos in nltk.pos_tag(tokens) if pos.startswith('VB')]
    # Join the whole tokens; indexing each one with [0] would keep only
    # its first character.
    verbs_sentence = ' '.join(verbs)
    return verbs_sentence
 
def get_verb_glove_cos_dist_hl_body(row):
    """Cosine distance between the mean GloVe vectors of the verbs in a row's headline and body.

    ``row`` must provide "___clean_headline_tokenized_lemmas" and
    "___clean_body_tokenized_lemmas".
    """
    hl = row["___clean_headline_tokenized_lemmas"]
    body = row["___clean_body_tokenized_lemmas"]

    hl_verbs = get_verbs(hl)
    body_verbs = get_verbs(body)

    # get_verbs returns one space-separated string; split it back into
    # words so the average is taken per word rather than per character.
    hl_vec = text_to_mean_vec_ignore_unk(hl_verbs.split())
    body_vec = text_to_mean_vec_ignore_unk(body_verbs.split())
    cosine_dist = distance.cosine(hl_vec, body_vec) # cosine() from scipy
    return cosine_dist

# Verb-only variant of the headline/body GloVe cosine distance, computed row by row.
X_train["hl_body_verb_glove_"+str(glove_dim)+"_cos_dist"] = X_train[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_verb_glove_cos_dist_hl_body, axis=1)


In [None]:
# END GLOVE AND  POS TAGGING

In [None]:
# Remove "articleBody", "Headline" and the "___"-prefixed columns from X_train in place.
del_str_cols(X_train)

In [None]:
# Sanity check: feature and label row counts after feature engineering.
print(X_train.shape)
print(y_train.shape)

In [None]:
### LOAD TEST SET ###

In [None]:
# Load the test feature table and labels from HDF5 (both stored under key "df").
X_test = pd.read_hdf("X_test_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_test_full.h5", key="df")

# Join features and labels on the index, then split them apart again.
Xy_test = pd.concat([X_test, y_test], axis=1)

y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)

# Same VADER features as were added to X_train.
vader_hl_df = vader_polarity_scores(X_test, "Headline", "hl")
vader_body_df = vader_polarity_scores(X_test, "articleBody", "body")
X_test = pd.concat([X_test, vader_hl_df, vader_body_df], axis=1)

# Same two GloVe distance features as were added to X_train.
# NOTE(review): these steps repeat the training cells line for line; a shared helper would keep them in sync.
X_test["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = X_test[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_glove_cos_dist_hl_body, axis=1)
X_test["hl_body_verb_glove_"+str(glove_dim)+"_cos_dist"] = X_test[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].progress_apply(get_verb_glove_cos_dist_hl_body, axis=1)


# Drop the raw-text and "___"-prefixed columns in place.
del_str_cols(X_test)

In [None]:
# Preview the final training features.
X_train.head()

In [None]:
# Single gradient-boosting model with the same hyper-parameters as the two TwoModel stages.
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

In [None]:
# Fit on the training split, now including the VADER and GloVe features.
clf.fit(X_train, y_train)

In [None]:
# Predict on the dev split.
# NOTE(review): X_dev was split off before the VADER/GloVe columns were added to X_train
# and is not augmented in any visible cell, so its columns likely do not match what clf
# was fit on — confirm.
preds = clf.predict(X_dev)

In [None]:
# Per-class precision/recall/F1 on the dev split.
print(classification_report(y_dev, preds))
# With VADER sentiment features

In [None]:
# Weighted score on the dev split, shown as a fraction of the maximum reported by score_4class.
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# Evaluate the gradient-boosting model on the test set.
preds = clf.predict(X_test)
# print_reports prints the classification report and the weighted score in one call.
print_reports(preds, y_test)

In [None]:
# Evaluate the gradient-boosting model on its own training data.
preds = clf.predict(X_train)
# print_reports prints the classification report and the weighted score in one call.
print_reports(preds, y_train)

In [None]:
# Random-forest comparison model: 400 trees, seeded, trained with 8 parallel jobs.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, n_jobs=8, random_state=42)
crf.fit(X_train, y_train)

In [None]:
# Random-forest predictions and per-class report on the dev split.
# NOTE(review): X_dev is not given the VADER/GloVe columns in any visible cell — confirm it matches X_train.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
# Weighted score of the random forest on the dev split.
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# Evaluate the random forest on the test set.
preds = crf.predict(X_test)
# print_reports prints the classification report and the weighted score in one call.
print_reports(preds, y_test)

In [None]:
# Evaluate the random forest on its own training data.
preds = crf.predict(X_train)
# print_reports prints the classification report and the weighted score in one call.
print_reports(preds, y_train)

In [None]:
# NOTE(review): this re-fits clf on the TEST set, replacing the model trained on X_train.
# The test-set report below is therefore measured on the data the model was fit on,
# and every later use of clf sees this model until it is fit again — confirm this is intended.
clf.fit(X_test, y_test)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

# Dev-split report for the test-fitted model.
preds = clf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
# Weighted score on the dev split for the predictions made in the previous cell.
score, max_score = score_4class(y_dev, preds)
print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

In [None]:
# List the features from most to least important.
# NOTE(review): in cell order clf was last fit on X_test, while the names are taken
# from X_train.columns — confirm the two frames share the same column order.
feat_imp = clf.feature_importances_
indices = np.argsort(feat_imp)[::-1]
for ii in indices:
    print(X_train.columns[ii]+": "+str(feat_imp[ii]))

In [None]:
# -----

In [None]:
# Relative frequency of each stance label in the training split.
y_train.value_counts(normalize=True)

In [None]:
# Smaller random forest (200 trees); rebinds crf, replacing the 400-tree model.
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
# Fit the 200-tree forest on the training split.
crf.fit(X_train, y_train)

In [None]:
# Dev-split predictions and per-class report for the 200-tree forest.
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
# Label distribution again (same expression as a few cells above), for the baseline note below.
y_train.value_counts(normalize=True)

In [None]:
# 4-class problem:
# By guessing most common category (unrelated), we would achieve approx. 73% accuracy. The baseline model achieves 88%.

In [None]:
### 3 class problem filter out unrelated

In [None]:
# Re-attach the labels so rows can be filtered by stance.
Xy_train = pd.concat([X_train, y_train], axis=1)
print(Xy_train.shape)

In [None]:
# Keep only the rows whose stance is not "unrelated".
Xy_train = Xy_train[Xy_train["Stance"] != "unrelated"]
print(Xy_train.shape)

In [None]:
# Split back into labels and features.
# NOTE: X_train / y_train now hold only the filtered rows; the 4-class versions are overwritten.
y_train = Xy_train["Stance"]
X_train = Xy_train.drop("Stance", axis=1)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Same "unrelated" filter for the dev split; X_dev / y_dev are overwritten.
Xy_dev = pd.concat([X_dev, y_dev], axis=1)
Xy_dev = Xy_dev[Xy_dev["Stance"] != "unrelated"]
y_dev = Xy_dev["Stance"]
X_dev = Xy_dev.drop("Stance", axis=1)
print(X_dev.shape)
print(y_dev.shape)

In [None]:
# Same "unrelated" filter for the test set; X_test / y_test are overwritten.
Xy_test = pd.concat([X_test, y_test], axis=1)
Xy_test = Xy_test[Xy_test["Stance"] != "unrelated"]
y_test = Xy_test["Stance"]
X_test = Xy_test.drop("Stance", axis=1)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Re-fit the gradient-boosting model on the filtered (no "unrelated") training rows.
clf.fit(X_train, y_train)

In [None]:
# Evaluate the 3-class model on its own training data.
preds = clf.predict(X_train)
# print_reports prints the classification report and the score_4class weighted score in one call.
print_reports(preds, y_train)

In [None]:
# Evaluate the 3-class model on the filtered dev split.
preds = clf.predict(X_dev)
# print_reports prints the classification report and the score_4class weighted score in one call.
print_reports(preds, y_dev)

In [None]:
# Evaluate the 3-class model on the filtered test set.
# `preds` from this cell is reused by the confusion matrix further down.
preds = clf.predict(X_test)
# print_reports prints the classification report and the score_4class weighted score in one call.
print_reports(preds, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix for the most recent `preds` against y_test, with rows/columns in the order given by `labels`.
confusion_matrix(y_test, preds, labels=["agree", "disagree", "discuss"], sample_weight=None)