In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from rewrite.scorer import score_4class
import utils # utils from CS224U
from scipy.spatial import distance
import random
tqdm.pandas()
nltk.download('averaged_perceptron_tagger')
from nltk.sentiment import vader

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!




In [19]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    del df["articleBody"]
    del df["Headline"]
    del df["Body ID"]
    for col_name in df.columns:
        if "___" == col_name[0:3]:
            del df[col_name]
            
def print_reports(preds, actual):
    print(classification_report(actual, preds))
    score, max_score = score_4class(actual, preds)
    print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

def print_feature_importances(model, df):
    feat_imp = model.feature_importances_
    indices = np.argsort(feat_imp)[::-1]
    for ii in indices:
        print(df.columns[ii]+": "+str(feat_imp[ii]))
        
# Usage: pass Xy_train (or a df with Body ID col., and y col. as well)
# returns the same format of df, but spplit so that no body ID is shared between sets
proportion_for_holdout_dev_set = 0.2
def disjoint_train_test_split(Xy, frac_for_test_set=proportion_for_holdout_dev_set, random_state=42):
    r = random.Random()
    r.seed(random_state)
    Xy_IDs = list(set(Xy["Body ID"]))
    print("Total unique IDs: "+str(len(Xy_IDs)))
    r.shuffle(Xy_IDs)
    num_IDs_for_train = int((1-frac_for_test_set)*len(Xy_IDs))
    train_IDs = Xy_IDs[:num_IDs_for_train]
    test_IDs = Xy_IDs[num_IDs_for_train:]
    train_df = Xy[Xy["Body ID"].isin(train_IDs)]
    test_df = Xy[Xy["Body ID"].isin(test_IDs)]
    print("# instances in train: "+str(train_df.shape[0]))
    print("# instances in test: "+str(test_df.shape[0]))
    assert len(set(train_df["Body ID"].unique()) & set(test_df["Body ID"].unique())) == 0 # totally disjoint
    return train_df, test_df

In [20]:
class TwoModel:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        #self.make_other_sets()
        
    def make_other_sets(self):
        #Xy_train = pd.concat([X_train, y_train], axis=1)
        Xy_train = self.X.merge(self.y, on=["Body ID"], how="left")
        #Xy_train, Xy_dev = sklearn.model_selection.train_test_split(Xy_train, test_size=0.2, random_state=42, shuffle=True)
        Xy_train, Xy_dev = disjoint_train_test_split(Xy_train, proportion_for_holdout_dev_set, random_state=43)
        
        Xy_train_1 = Xy_train.copy()
        Xy_train_1.loc[Xy_train_1["Stance"] != "unrelated", 'Stance'] = "related"
        self.y_train_1 = Xy_train_1["Stance"]
        self.X_train_1 = Xy_train_1.drop("Stance", axis=1)
        
        Xy_dev_1 = Xy_dev.copy()
        Xy_dev_1.loc[Xy_dev_1["Stance"] != "unrelated", 'Stance'] = "related"
        self.y_dev_1 = Xy_dev_1["Stance"]
        self.X_dev_1 = Xy_dev_1.drop("Stance", axis=1)
        
        Xy_train_2 = Xy_train[Xy_train["Stance"] != "unrelated"]
        self.y_train_2 = Xy_train_2["Stance"]
        self.X_train_2 = Xy_train_2.drop("Stance", axis=1)

        Xy_dev_2 = Xy_dev[Xy_dev["Stance"] != "unrelated"]
        self.y_dev_2 = Xy_dev_2["Stance"]
        self.X_dev_2 = Xy_dev_2.drop("Stance", axis=1)
        
        del_str_cols(self.X_train_1)
        del_str_cols(self.X_dev_1)
        del_str_cols(self.X_train_2)
        del_str_cols(self.X_dev_2)
        
        return Xy_train.drop("Stance", axis=1), Xy_train["Stance"], Xy_dev.drop("Stance", axis=1), Xy_dev["Stance"]
        
    def fit(self):
        ## CLASSIFIER 1 - RELATED/UNRELATED
        self.mod1 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod1.fit(self.X_train_1, self.y_train_1)
        
        ## CLASSIFIER 2 - 3-class
        self.mod2 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod2.fit(self.X_train_2, self.y_train_2)
        
        
    def predict(self, X):
        #if X is None:
        #    X = self.X_dev_1
        del_str_cols(X)
        preds_1 = self.mod1.predict(X)
        preds_2 = self.mod2.predict(X) # note X_dev_1
        
        new_preds = preds_1.copy()
        for ii in range(preds_1.shape[0]):
            if preds_1[ii] == "related":
                new_preds[ii] = preds_2[ii]
            else:
                new_preds[ii] = "unrelated"
        return new_preds

        

In [4]:
from multiprocessing import cpu_count, Pool
cores = cpu_count() 
partitions = cores
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [5]:
##### ADD OUR FEATURES
# START SENTIMENT ANALYSIS
def vader_pol_helper(df):
    return df.apply(lambda hl: pd.Series(sid.polarity_scores(hl)))
sid = vader.SentimentIntensityAnalyzer() # global scope for parallelization
def add_vader_sent(X_df):
    def vader_polarity_scores(df, text_col_name, col_name_prefix):
        pol_scores = parallelize(df[text_col_name], vader_pol_helper)
        #pol_scores = df[text_col_name].progress_apply(lambda hl: pd.Series(sid.polarity_scores(hl)))
        cols = pol_scores.columns
        new_cols = []
        for col_name in cols:
            new_cols.append("vader_"+col_name_prefix+"_"+col_name)
        pol_scores.columns = new_cols
        return pol_scores

    vader_hl_df = vader_polarity_scores(X_df, "Headline", "hl")
    vader_body_df = vader_polarity_scores(X_df, "articleBody", "body")
    X_df = pd.concat([X_df, vader_hl_df, vader_body_df], axis=1)
    return X_df

# END SENTIMENT ANALYSIS
### GLOVE ####
glove_dim = 200
glove_src = os.path.join("GloVe", 'glove.6B.'+str(glove_dim)+'d.txt')
GLOVE = utils.glove2dict(glove_src)
def text_to_mean_vec_ignore_unk(text, w2v=GLOVE, dim=glove_dim):
    vec = np.zeros(dim)
    num_added = 0
    for word in text:
        if word in w2v:
            vec += w2v[word]
            num_added += 1
    if num_added > 0:
        return vec/num_added
    else:
        return np.array([random.uniform(-0.5, 0.5) for i in range(glove_dim)])
def get_glove_cos_dist_hl_body(row):
    hl = row["___clean_headline_tokenized_lemmas"]
    body = row["___clean_body_tokenized_lemmas"]
    hl_vec = text_to_mean_vec_ignore_unk(hl)
    body_vec = text_to_mean_vec_ignore_unk(body)
    cosine_dist = distance.cosine(hl_vec, body_vec) # cosine() from scipy
    return cosine_dist

def get_verbs(text):
    verbs = [token for token, pos in nltk.pos_tag(text) if pos.startswith('VB')]
    verbs_sentence = ' '.join(word[0] for word in verbs)
    return verbs_sentence
 
def get_verb_glove_cos_dist_hl_body(row):
    hl = row["___clean_headline_tokenized_lemmas"]
    body = row["___clean_body_tokenized_lemmas"]
    
    hl_verbs = get_verbs(hl)
    body_verbs = get_verbs(body)
    
    hl_vec = text_to_mean_vec_ignore_unk(hl_verbs)
    body_vec = text_to_mean_vec_ignore_unk(body_verbs)
    cosine_dist = distance.cosine(hl_vec, body_vec) # cosine() from scipy
    return cosine_dist

def hl_body_glove_cos_dist_helper(X_df):
    return X_df[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].apply(get_glove_cos_dist_hl_body, axis=1)

def hl_body_glove_verb_helper(X_df):
    return X_df[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].apply(get_verb_glove_cos_dist_hl_body, axis=1)

### END GLOVE CODE BLOCK
def add_all_features(X_df, parallel=True):
    print("Adding glove feature #1...", flush=True)
    X_df["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = parallelize(X_df, hl_body_glove_cos_dist_helper)
    print("Adding glove feature #2...", flush=True)
    X_df["hl_body_verb_glove_"+str(glove_dim)+"_cos_dist"] = parallelize(X_df, hl_body_glove_verb_helper)
    print("Adding VADER sentiment...", flush=True)
    X_df = add_vader_sent(X_df)
    return X_df

In [6]:
X_train = pd.read_hdf("X_train_disjoint-allfeatures.h5", key="df")
y_train = pd.read_hdf("y_train_disjoint.h5", key="df")

In [7]:
%time X_train = add_all_features(X_train)

Adding glove feature #1...
Adding glove feature #2...
Adding VADER sentiment...
CPU times: user 9.84 s, sys: 9.88 s, total: 19.7 s
Wall time: 5min 37s


In [8]:
print(X_train.shape)

(39437, 60)


In [21]:
print(y_train.shape)

(39437, 2)


In [22]:
tsm = TwoModel(X_train, y_train)

In [23]:
X_train, y_train, X_dev, y_dev = tsm.make_other_sets()
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

Total unique IDs: 1346
# instances in train: 1752859
# instances in test: 481302
(1752859, 60)
(1752859,)
(481302, 60)
(481302,)


In [24]:
X_train.head()

Unnamed: 0,Headline,articleBody,Body ID,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,...,hl_body_glove_200_cos_dist,hl_body_verb_glove_200_cos_dist,vader_hl_neg,vader_hl_neu,vader_hl_pos,vader_hl_compound,vader_body_neg,vader_body_neu,vader_body_pos,vader_body_compound
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0.157001,0.221717,0.194,0.806,0.0,-0.4767,0.008,0.907,0.085,0.9409
1,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0.157001,0.221717,0.194,0.806,0.0,-0.4767,0.008,0.907,0.085,0.9409
2,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0.157001,0.221717,0.194,0.806,0.0,-0.4767,0.008,0.907,0.085,0.9409
3,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0.157001,0.221717,0.194,0.806,0.0,-0.4767,0.008,0.907,0.085,0.9409
4,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,712,police find mass graves with at least 15 bodie...,"[police, find, mass, graf, with, at, least, 15...",danny boyle is directing the untitled film set...,"[danny, boyle, is, directing, the, untitled, f...",2,0,0,...,0.157001,0.221717,0.194,0.806,0.0,-0.4767,0.008,0.907,0.085,0.9409


In [None]:
tsm.fit()

In [None]:
preds = tsm.predict(X_dev)
print_reports(preds, y_dev)

In [None]:
X_test = pd.read_hdf("X_TEST_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_TEST_full.h5", key="df")

In [None]:
X_test = add_all_features(X_test)

In [None]:
preds = tsm.predict(X_test)
print_reports(preds, y_test)

In [None]:
----

In [None]:
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
crf.fit(X_train, y_train)

In [None]:
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
y_train.value_counts(normalize=True)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test, preds, labels=["agree", "disagree", "discuss"], sample_weight=None)