# L665 ML for NLPSpring 2018 

## Assignment 1 - Task 1 

Author: Carlos Sathler

In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline  

## Read toxic comments dataset and create train and test partitions

Source: Kaggle Toxic Comment Classification Challenge (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)

In [2]:
df_all = pd.read_csv('input/train.csv')
drop_cols = ['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_all = df_all.drop(drop_cols, axis=1)
#df_all = df_all.sample(frac=0.2)
df_all.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
print('Percentage of toxic comments: {}'.format(df_all['toxic'].sum() / df_all['toxic'].count()))

Percentage of toxic comments: 0.09584448302009764


In [4]:
# normalize text
import re
pat = re.compile(u'[^a-zA-Z0-9]')
def normalize(txt):
    return pat.sub(' ',txt)
    
df_all['comment_text'] = df_all['comment_text'].apply(lambda x: normalize(x)) 

In [5]:
# create train, test partitions
X_all = df_all.comment_text.values
X_train, X_test, y_train, y_test = train_test_split(X_all, df_all.toxic.values, test_size=0.3, random_state=42)

## Create benchmark using BOW (and GradientBoostingClassifier)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [7]:
%%time

# extract BOW as tfidf sparce matrix
vectorizer = TfidfVectorizer(\
                             ngram_range=(1,3),
                             stop_words='english',
                             min_df=0.001,
                             max_df=0.99,
                             sublinear_tf=True
                            )
vectorizer.fit(X_all)
X_train_csr = vectorizer.transform(X_train)
X_test_csr = vectorizer.transform(X_test)
print(X_train_csr.shape)
print(X_test_csr.shape)

(111699, 4873)
(47872, 4873)
CPU times: user 1min 34s, sys: 5.18 s, total: 1min 40s
Wall time: 1min 42s


In [8]:
%%time

from sklearn.linear_model import LogisticRegression

# train and predict
clf = LogisticRegression()
clf.fit(X_train_csr, y_train)
y_hat = clf.predict(X_test_csr)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9548379010695187
CPU times: user 1.7 s, sys: 159 ms, total: 1.86 s
Wall time: 2.38 s


## Vectorize POS tag 

In [9]:
%%time

# create dictionary with pos tags - will use 1000 rows of data

import spacy
from spacy import displacy

nlp = spacy.load('en')

SIZE=1000

def doc_generator():
    for txt in X_all[:SIZE]:
        yield nlp(txt)
 
doc_generator = doc_generator()
tag_voc = dict()
for doc in doc_generator:
    for token in doc:
        tag_voc[token.tag_] = 1

idx = 0
for k, v in tag_voc.items():
    idx += 1
    tag_voc[k] = idx
    
print(len(tag_voc))
print(tag_voc)

KeyboardInterrupt: 

In [10]:
%%time

# create list of vectors for train partition

def train_vec_generator():
    for txt in X_train:
        doc = nlp(txt)
        yield [tag_voc.get(token.tag_,999) for token in doc]

vec_generator = train_vec_generator()
max_len = 0
train_vec_list = list()
for vec in vec_generator:
    train_vec_list.append(vec)
    vec_len = len(vec)
    if vec_len > max_len:
        max_len = vec_len

CPU times: user 1h 52min 13s, sys: 5min 16s, total: 1h 57min 29s
Wall time: 29min 53s


In [11]:
%%time

# create list of vectors for test partition with proper length (pad with zeros to match comment with longest length)

def test_vec_generator():
    for txt in X_test:
        doc = nlp(txt)
        yield [tag_voc.get(token.tag_,999) for token in doc]

vec_generator = test_vec_generator()
test_vec_list = list()
for vec in vec_generator:
    test_vec_list.append(vec)
    vec_len = len(vec)
    if vec_len > max_len:
        max_len = vec_len

CPU times: user 47min 34s, sys: 2min 9s, total: 49min 43s
Wall time: 12min 39s


In [12]:
%%time

# create train pos tag vector with proper length (pad with zeros to match comment with longest length)
i=0
for vec in train_vec_list:
    train_vec_list[i] = [0] * (max_len - len(vec)) + vec
    i += 1
    
X_train_pos_tag = np.array(train_vec_list)
del train_vec_list
gc.collect()

# create train pos tag vector with proper length
i=0
for vec in test_vec_list:
    test_vec_list[i] = [0] * (max_len - len(vec)) + vec
    i += 1
    
X_test_pos_tag = np.array(test_vec_list)
del test_vec_list
gc.collect()

CPU times: user 25.1 s, sys: 2.29 s, total: 27.4 s
Wall time: 27.6 s


## AdaBoostClassifier on BOW + POS tag (vectorized)

In [13]:
%%time 

from scipy.sparse import hstack
X_train_csr_2 = hstack((X_train_csr, X_train_pos_tag.reshape(X_train_pos_tag.shape[0], max_len))).tocsr()
X_test_csr_2 = hstack((X_test_csr, X_test_pos_tag.reshape(X_test_pos_tag.shape[0], max_len))).tocsr()
print(X_train_csr_2.shape)
print(X_test_csr_2.shape)

(111699, 6958)
(47872, 6958)
CPU times: user 3.35 s, sys: 1.21 s, total: 4.56 s
Wall time: 4.74 s


In [14]:
%%time

# BOW + NLP features

from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
clf.fit(X_train_csr_2, y_train)
y_hat = clf.predict(X_test_csr_2)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9504512032085561
CPU times: user 1min 45s, sys: 2.75 s, total: 1min 48s
Wall time: 1min 49s


In [23]:
%%time

# BOW

clf = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
clf.fit(X_train_csr, y_train)
y_hat = clf.predict(X_test_csr)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.9490307486631016
CPU times: user 42.2 s, sys: 795 ms, total: 43 s
Wall time: 43.1 s


In [22]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='squared_epsilon_insensitive', max_iter=200) 
# loss=squared_hinge 0.9552
clf.fit(X_train_csr, y_train)
y_hat = clf.predict(X_test_csr)
acc = accuracy_score(y_test, y_hat)
print("Accuracy = {}".format(acc))

Accuracy = 0.4920621657754011
