In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
from tqdm import tqdm, trange, tqdm_notebook
import re
import nltk
from multiprocessing import cpu_count, Pool
from functools import partial
import ipywidgets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from rewrite.scorer import score_4class
import utils # utils from CS224U
from scipy.spatial import distance
import random
tqdm.pandas()
nltk.download('averaged_perceptron_tagger')
from nltk.sentiment import vader

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!




In [2]:
def del_str_cols(df): # df should be X, e.g. X_train or X_dev
    del df["articleBody"]
    del df["Headline"]
    for col_name in df.columns:
        if "___" == col_name[0:3]:
            del df[col_name]
            
def print_reports(preds, actual):
    print(classification_report(actual, preds))
    score, max_score = score_4class(actual, preds)
    print("Weighted accuracy: "+str(score/max_score)+" ("+str(score)+" out of "+str(max_score)+")")

def print_feature_importances(model, df):
    feat_imp = model.feature_importances_
    indices = np.argsort(feat_imp)[::-1]
    for ii in indices:
        print(df.columns[ii]+": "+str(feat_imp[ii]))

In [3]:
class TwoModel:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        #self.make_other_sets()
        
    def make_other_sets(self):
        Xy_train = pd.concat([X_train, y_train], axis=1)
        Xy_train, Xy_dev = sklearn.model_selection.train_test_split(Xy_train, test_size=0.2, random_state=42, shuffle=True)
        
        Xy_train_1 = Xy_train.copy()
        Xy_train_1.loc[Xy_train_1["Stance"] != "unrelated", 'Stance'] = "related"
        self.y_train_1 = Xy_train_1["Stance"]
        self.X_train_1 = Xy_train_1.drop("Stance", axis=1)
        
        Xy_dev_1 = Xy_dev.copy()
        Xy_dev_1.loc[Xy_dev_1["Stance"] != "unrelated", 'Stance'] = "related"
        self.y_dev_1 = Xy_dev_1["Stance"]
        self.X_dev_1 = Xy_dev_1.drop("Stance", axis=1)
        
        Xy_train_2 = Xy_train[Xy_train["Stance"] != "unrelated"]
        self.y_train_2 = Xy_train_2["Stance"]
        self.X_train_2 = Xy_train_2.drop("Stance", axis=1)

        Xy_dev_2 = Xy_dev[Xy_dev["Stance"] != "unrelated"]
        self.y_dev_2 = Xy_dev_2["Stance"]
        self.X_dev_2 = Xy_dev_2.drop("Stance", axis=1)
        
        del_str_cols(self.X_train_1)
        del_str_cols(self.X_dev_1)
        del_str_cols(self.X_train_2)
        del_str_cols(self.X_dev_2)
        
        return Xy_train.drop("Stance", axis=1), Xy_train["Stance"], Xy_dev.drop("Stance", axis=1), Xy_dev["Stance"]
        
    def fit(self):
        ## CLASSIFIER 1 - RELATED/UNRELATED
        self.mod1 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod1.fit(self.X_train_1, self.y_train_1)
        
        ## CLASSIFIER 2 - 3-class
        self.mod2 = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        self.mod2.fit(self.X_train_2, self.y_train_2)
        
        
    def predict(self, X):
        #if X is None:
        #    X = self.X_dev_1
        del_str_cols(X)
        preds_1 = self.mod1.predict(X)
        preds_2 = self.mod2.predict(X) # note X_dev_1
        
        new_preds = preds_1.copy()
        for ii in range(preds_1.shape[0]):
            if preds_1[ii] == "related":
                new_preds[ii] = preds_2[ii]
            else:
                new_preds[ii] = "unrelated"
        return new_preds

        

In [4]:
from multiprocessing import cpu_count, Pool
cores = cpu_count() 
partitions = cores
def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [5]:
##### ADD OUR FEATURES
# START SENTIMENT ANALYSIS
def vader_pol_helper(df):
    return df.apply(lambda hl: pd.Series(sid.polarity_scores(hl)))
sid = vader.SentimentIntensityAnalyzer() # global scope for parallelization
def add_vader_sent(X_df):
    def vader_polarity_scores(df, text_col_name, col_name_prefix):
        pol_scores = parallelize(df[text_col_name], vader_pol_helper)
        #pol_scores = df[text_col_name].progress_apply(lambda hl: pd.Series(sid.polarity_scores(hl)))
        cols = pol_scores.columns
        new_cols = []
        for col_name in cols:
            new_cols.append("vader_"+col_name_prefix+"_"+col_name)
        pol_scores.columns = new_cols
        return pol_scores

    vader_hl_df = vader_polarity_scores(X_df, "Headline", "hl")
    vader_body_df = vader_polarity_scores(X_df, "articleBody", "body")
    X_df = pd.concat([X_df, vader_hl_df, vader_body_df], axis=1)
    return X_df

# END SENTIMENT ANALYSIS
### GLOVE ####
glove_dim = 200
glove_src = os.path.join("GloVe", 'glove.6B.'+str(glove_dim)+'d.txt')
GLOVE = utils.glove2dict(glove_src)
def text_to_mean_vec_ignore_unk(text, w2v=GLOVE, dim=glove_dim):
    vec = np.zeros(dim)
    num_added = 0
    for word in text:
        if word in w2v:
            vec += w2v[word]
            num_added += 1
    if num_added > 0:
        return vec/num_added
    else:
        return np.array([random.uniform(-0.5, 0.5) for i in range(glove_dim)])
def get_glove_cos_dist_hl_body(row):
    hl = row["___clean_headline_tokenized_lemmas"]
    body = row["___clean_body_tokenized_lemmas"]
    hl_vec = text_to_mean_vec_ignore_unk(hl)
    body_vec = text_to_mean_vec_ignore_unk(body)
    cosine_dist = distance.cosine(hl_vec, body_vec) # cosine() from scipy
    return cosine_dist

def get_verbs(text):
    verbs = [token for token, pos in nltk.pos_tag(text) if pos.startswith('VB')]
    verbs_sentence = ' '.join(word[0] for word in verbs)
    return verbs_sentence
 
def get_verb_glove_cos_dist_hl_body(row):
    hl = row["___clean_headline_tokenized_lemmas"]
    body = row["___clean_body_tokenized_lemmas"]
    
    hl_verbs = get_verbs(hl)
    body_verbs = get_verbs(body)
    
    hl_vec = text_to_mean_vec_ignore_unk(hl_verbs)
    body_vec = text_to_mean_vec_ignore_unk(body_verbs)
    cosine_dist = distance.cosine(hl_vec, body_vec) # cosine() from scipy
    return cosine_dist

def hl_body_glove_cos_dist_helper(X_df):
    return X_df[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].apply(get_glove_cos_dist_hl_body, axis=1)

def hl_body_glove_verb_helper(X_df):
    return X_df[["___clean_headline_tokenized_lemmas", "___clean_body_tokenized_lemmas"]].apply(get_verb_glove_cos_dist_hl_body, axis=1)

### END GLOVE CODE BLOCK
def add_all_features(X_df, parallel=True):
    print("Adding glove feature #1...", flush=True)
    X_df["hl_body_glove_"+str(glove_dim)+"_cos_dist"] = parallelize(X_df, hl_body_glove_cos_dist_helper)
    print("Adding glove feature #2...", flush=True)
    X_df["hl_body_verb_glove_"+str(glove_dim)+"_cos_dist"] = parallelize(X_df, hl_body_glove_verb_helper)
    print("Adding VADER sentiment...", flush=True)
    X_df = add_vader_sent(X_df)
    return X_df

In [6]:
X_train = pd.read_hdf("X_train_full_allfeatures-NOLABEL.h5", key="df")
y_train = pd.read_hdf("y_train_full.h5", key="df")

In [7]:
%time X_train = add_all_features(X_train)

Adding glove feature #1...
Adding glove feature #2...
Adding VADER sentiment...
CPU times: user 11.1 s, sys: 10.9 s, total: 22 s
Wall time: 7min 22s


In [8]:
print(X_train.shape)

(49972, 59)


In [9]:
tsm = TwoModel(X_train, y_train)

In [10]:
X_train, y_train, X_dev, y_dev = tsm.make_other_sets()
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

(39977, 59)
(39977,)
(9995, 59)
(9995,)


In [11]:
X_train.head()

Unnamed: 0,Headline,articleBody,___clean_headline,___clean_headline_tokenized_lemmas,___clean_body,___clean_body_tokenized_lemmas,bin_count,bin_count_early,bin_count_stopless,word_overlap_features,...,hl_body_glove_200_cos_dist,hl_body_verb_glove_200_cos_dist,vader_hl_neg,vader_hl_neu,vader_hl_pos,vader_hl_compound,vader_body_neg,vader_body_neu,vader_body_pos,vader_body_compound
45131,Insurgents killed in Nigeria despite alleged t...,"Anna Wintour, editor-in-chief on Vogue magazin...",insurgents killed in nigeria despite alleged t...,"[insurgent, killed, in, nigeria, despite, alle...",anna wintour editor in chief on vogue magazine...,"[anna, wintour, editor, in, chief, on, vogue, ...",3,3,1,0.009756,...,0.349139,0.32218,0.333,0.667,0.0,-0.6705,0.053,0.875,0.073,0.8214
48011,New Audio Reveals Pause in Gunfire When Michae...,"SEVEN girls, aged 13 to 15, have fallen pregna...",new audio reveals pause in gunfire when michae...,"[new, audio, reveals, pause, in, gunfire, when...",seven girls aged 13 to 15 have fallen pregnant...,"[seven, girl, aged, 13, to, 15, have, fallen, ...",2,1,0,0.021739,...,0.180693,0.178216,0.0,1.0,0.0,0.0,0.12,0.88,0.0,-0.9393
26530,Iraqi Official Dismisses ‘Unfounded’ Reports T...,Everyone's been waiting years and years for a ...,iraqi official dismisses unfounded reports tha...,"[iraqi, official, dismisses, unfounded, report...",everyone s been waiting years and years for a ...,"[everyone, s, been, waiting, year, and, year, ...",2,1,0,0.018634,...,0.29279,0.25449,0.118,0.882,0.0,-0.0516,0.009,0.929,0.063,0.9289
30200,Nigeria says it has deal with Boko Haram to re...,WASHINGTON — A U.S. Republican senator and fre...,nigeria says it has deal with boko haram to re...,"[nigeria, say, it, ha, deal, with, boko, haram...",washington a u s republican senator and freque...,"[washington, a, u, s, republican, senator, and...",7,4,3,0.025,...,0.174129,0.09708,0.0,1.0,0.0,0.0,0.09,0.823,0.088,-0.6236
7195,ISIL allegedly kills US journalist in video,Absolutely awful news. Media are reporting tha...,isil allegedly kills us journalist in video,"[isil, allegedly, kill, u, journalist, in, video]",absolutely awful news media are reporting that...,"[absolutely, awful, news, medium, are, reporti...",2,2,1,0.086957,...,0.214252,1.008478,0.368,0.632,0.0,-0.5423,0.162,0.838,0.0,-0.5095


In [12]:
tsm.fit()

      Iter       Train Loss   Remaining Time 
         1           1.0244           11.69s
         2           0.9193           11.26s
         3           0.8355           10.85s
         4           0.7672           10.98s
         5           0.7104           10.89s
         6           0.6616           10.79s
         7           0.6199           10.62s
         8           0.5840           10.53s
         9           0.5530           10.44s
        10           0.5263           10.29s
        20           0.3712            9.69s
        30           0.3183            8.96s
        40           0.2869            8.48s
        50           0.2705            8.04s
        60           0.2579            7.42s
        70           0.2480            6.89s
        80           0.2414            6.40s
        90           0.2350            5.89s
       100           0.2300            5.38s
       200           0.1958            0.00s
      Iter       Train Loss   Remaining Time 
        

In [13]:
preds = tsm.predict(X_dev)
print_reports(preds, y_dev)

             precision    recall  f1-score   support

      agree       0.69      0.33      0.44       703
   disagree       0.70      0.12      0.20       180
    discuss       0.71      0.89      0.79      1779
  unrelated       0.97      0.98      0.98      7333

avg / total       0.90      0.90      0.89      9995

Weighted accuracy: 0.8445025304488071 (3796.25 out of 4495.25)


In [14]:
X_test = pd.read_hdf("X_TEST_full_allfeatures-NOLABEL.h5", key="df")
y_test = pd.read_hdf("y_TEST_full.h5", key="df")

In [15]:
X_test = add_all_features(X_test)

Adding glove feature #1...
Adding glove feature #2...
Adding VADER sentiment...


In [16]:
preds = tsm.predict(X_test)
print_reports(preds, y_test)

             precision    recall  f1-score   support

      agree       0.45      0.20      0.28      1903
   disagree       0.39      0.01      0.02       697
    discuss       0.62      0.79      0.69      4464
  unrelated       0.95      0.97      0.96     18349

avg / total       0.84      0.86      0.84     25413

Weighted accuracy: 0.7661838858491579 (8927.0 out of 11651.25)


In [None]:
----

In [None]:
crf = sklearn.ensemble.RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=42)

In [None]:
crf.fit(X_train, y_train)

In [None]:
preds = crf.predict(X_dev)
print(classification_report(y_dev, preds))

In [None]:
y_train.value_counts(normalize=True)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test, preds, labels=["agree", "disagree", "discuss"], sample_weight=None)