# L665 ML for NLPSpring 2018 

## Assignment 1 - Task 4, notebook 3 of 3: doc2vec exploration with gensim

Author: Carlos Sathler

In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline  

## Read toxic comments dataset and create train and test partitions

Source: Kaggle Toxic Comment Classification Challenge (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)

In [2]:
df_all = pd.read_csv('input/train.csv')
drop_cols = ['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_all = df_all.drop(drop_cols, axis=1)
#df_all = df_all.sample(frac=0.2)
df_all.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
print('Percentage of toxic comments: {}'.format(df_all['toxic'].sum() / df_all['toxic'].count()))

Percentage of toxic comments: 0.09584448302009764


In [4]:
# normalize text
import re
pat = re.compile(u'[^a-zA-Z0-9]')
def normalize(txt):
    return pat.sub(' ',txt)
    
df_all['comment_text'] = df_all['comment_text'].apply(lambda x: normalize(x)) 

In [5]:
# create train, test partitions
X_all = df_all.comment_text.values
X_train, X_test, y_train, y_test = train_test_split(X_all, df_all.toxic.values, test_size=0.3, random_state=42)

## Create benchmark using BOW (and GradientBoostingClassifier)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [7]:
%%time

# extract BOW as tfidf sparce matrix
vectorizer = TfidfVectorizer(\
                             ngram_range=(1,3),
                             stop_words='english',
                             min_df=0.001,
                             max_df=0.99,
                             sublinear_tf=True
                            )
vectorizer.fit(X_all)
X_train_csr = vectorizer.transform(X_train)
X_test_csr = vectorizer.transform(X_test)
print(X_train_csr.shape)
print(X_test_csr.shape)

(111699, 4873)
(47872, 4873)
CPU times: user 1min 20s, sys: 2.42 s, total: 1min 22s
Wall time: 1min 22s


In [8]:
%%time

from sklearn.linear_model import LogisticRegression

# train and predict
clf = LogisticRegression()
clf.fit(X_train_csr, y_train)
y_hat = clf.predict(X_test_csr)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9548379010695187
CPU times: user 1.64 s, sys: 107 ms, total: 1.75 s
Wall time: 1.2 s


## Create doc2vec using gensim

In [9]:
%%time

def print_msg(msg):
    print('{} => {}'.format(strftime("%H:%M:%S", gmtime()), msg))
    
NROWS=5000000

# gensim modules
import os
from time import gmtime, strftime
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# random
from random import shuffle

def get_doc2vec_model(df, fieldname, epochs=5, size=100, window=5, negative=3):
    
    print_msg('Doc2Vec for feature "{}"'.format(fieldname))
    
    # need to create a list in the following format:
    # [['word1', 'word2', 'word3', 'lastword'], ['label1']]

    feature_array = df[fieldname].values
    idx = np.array(range(0,feature_array.shape[0]))

    print_msg("Extracting sentences...")

    sentences = []
    for i, sentence in zip(idx[:NROWS], feature_array[:NROWS]):
        sentences.append(LabeledSentence(utils.to_unicode(sentence.lower()).split(), ['SENT_{}'.format(str(i))]))

    print_msg("Building vocabulary...")

    model = Doc2Vec(min_count=1, window=window, size=size, sample=1e-4, negative=negative, workers=8)
    model.build_vocab(sentences)
    
    print_msg("Training doc2vec model...")

    def get_shuffled(sentences):
        shuffle(sentences)
        return sentences

    model.train(get_shuffled(sentences), epochs=epochs, total_examples=model.corpus_count)

    del feature_array, idx, sentences
        
    return model

df_train = pd.DataFrame(X_train, columns=(['comment_text']))
df_test = pd.DataFrame(X_test, columns=(['comment_text']))
model_train = get_doc2vec_model(df_train, 'comment_text', epochs=200, size=1000, window=10, negative=5)  
model_test = get_doc2vec_model(df_test, 'comment_text', epochs=200, size=1000, window=10, negative=5)  

X_train_doc2vec = np.array([ dv for dv in model_train.docvecs ])
X_test_doc2vec = np.array([ dv for dv in model_test.docvecs ])

del model_train, model_test
gc.collect()

print(X_train_doc2vec.shape)
print(X_test_doc2vec.shape)

01:28:02 => Doc2Vec for feature "comment_text"
01:28:02 => Extracting sentences...
01:28:06 => Building vocabulary...
01:28:18 => Training doc2vec model...
02:15:47 => Doc2Vec for feature "comment_text"
02:15:47 => Extracting sentences...
02:15:49 => Building vocabulary...
02:15:56 => Training doc2vec model...
(111699, 1000)
(47872, 1000)
CPU times: user 3h 49min 38s, sys: 25min 32s, total: 4h 15min 10s
Wall time: 1h 12min 39s


## Append POS tag vector to BOW one-hot-encoded vector

In [10]:
%%time 

from scipy.sparse import hstack
X_train_csr_2 = hstack((X_train_csr, X_train_doc2vec)).tocsr()
X_test_csr_2 = hstack((X_test_csr, X_test_doc2vec)).tocsr()
print(X_train_csr_2.shape)
print(X_test_csr_2.shape)

(111699, 5873)
(47872, 5873)
CPU times: user 11 s, sys: 4.04 s, total: 15 s
Wall time: 15.4 s


## Compare AdaBoostClassifier on enhanced data against benchmark

In [11]:
%%time

# BOW + NLP features

from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
clf.fit(X_train_csr_2, y_train)
y_hat = clf.predict(X_test_csr_2)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9433489304812834
CPU times: user 33min 16s, sys: 59.1 s, total: 34min 15s
Wall time: 34min 22s


In [12]:
%%time

# BOW dataset

clf.fit(X_train_csr, y_train)
y_hat = clf.predict(X_test_csr)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9490307486631016
CPU times: user 38.5 s, sys: 691 ms, total: 39.2 s
Wall time: 39.2 s


In [13]:
%%time

# doc2vec alone

clf.fit(X_train_doc2vec, y_train)
y_hat = clf.predict(X_test_doc2vec)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.8833765040106952
CPU times: user 29min 34s, sys: 2.36 s, total: 29min 36s
Wall time: 29min 37s
