# w266: Final Project
### Christopher Danicic, Robert Deng, Chandan Gope
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [7]:
#Load libraries
import os, sys, re, json, time, gc, warnings
import itertools, collections
from importlib import reload
import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib.pyplot as plt
import matplotlib_venn as venn
import matplotlib.gridspec as gridspec 
import seaborn as sns
import string
from PIL import Image
color = sns.color_palette()

#SciKit
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import *
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#NLTK
import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer  

%matplotlib inline

### Load data

In [8]:
#Load data
train = pd.read_csv("~/w266-data/train.csv")
test = pd.read_csv("~/w266-data/test.csv")

In [9]:
nrow_train=train.shape[0]
nrow_test=test.shape[0]
sum=nrow_train+nrow_test
print("       : train : test")
print("rows   :",nrow_train,":",nrow_test)
print("perc   :",round(nrow_train*100/sum),"   :",round(nrow_test*100/sum))

       : train : test
rows   : 159571 : 153164
perc   : 51    : 49


There's roughly a 50/50 split between test and train. Later, we will further segment 15% of the train data as dev data

# Modelling

In [10]:
def nltk_preprocess(data):
    '''This function preprocesses a data frame, specifing a text_column, 
    and strips down the document to cleaned, individualized word tokens without
    stop words and other excessive parts of speech and eventually rejoins the remaining words.
    '''
    #Initializes stop words and new column creation
    stop = stopwords.words('english')
    
    
    #Initialize Lemmatizer object and final list of lemmatized words
    lemmatizer = WordNetLemmatizer()
    
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None # for easy if-statement
        
    def lemmatized(word, tag):
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            lemma = str(lemmatizer.lemmatize(word))
        else:
            lemma = str(lemmatizer.lemmatize(word, pos=wntag))
        return lemma

    #Remove numeric data, alpha nonnumeric symbols, ip address, new lines, and usernames
    data = data.apply(lambda z: re.sub(r'\d+', r' ', z))
    data = data.apply(lambda z: re.sub(r'\W+', r' ', z))
    data = data.apply(lambda z: re.sub(r"_+",r" ", z))
    data = data.apply(lambda z: re.sub("\\n","", z))
    data = data.apply(lambda z: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","", z))
    data = data.apply(lambda z: re.sub("\[\[.*\]","", z))
    
    #Decapitalize, NLTK: tokenize, remove stop words, keep relevant pos tags, and lemmatize
    data = data.str.lower()
    data = data.apply(word_tokenize)
    data = data.apply(lambda x: [item for item in x if item not in stop])
    data = data.apply(pos_tag)
    data = data.apply(lambda x: [lemmatized(word, tag) for (word, tag) in x])
    data = data.apply(lambda x: ' '.join(x))
    return data

In [11]:
#Modelling
#Seeding
np.random.seed(6)
in_dev = np.random.choice([True, False], len(train), p=[0.15, 0.85])

#Train
train_data = train.comment_text[np.logical_not(in_dev)]
train_labels = train.iloc[np.logical_not(in_dev), 2:8]

#Dev Set
dev_data, dev_labels = train.comment_text[in_dev], train.iloc[in_dev, 2:8]

#Target Names
target_names = train.columns[2:8]

#Print some train data
print("train_text", train_data.shape, "\ntrain_labels", train_labels.shape)
print("\n\ndev_text", dev_data.shape, "\ndev_labels", dev_labels.shape)
train.head()

train_text (135526,) 
train_labels (135526, 6)


dev_text (24045,) 
dev_labels (24045, 6)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [13]:
preprocessed_train_data = nltk_preprocess(train_data)
preprocessed_dev_data = nltk_preprocess(dev_data)

In [15]:
toxic_pipeline = Pipeline([('cv', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('dtc', DecisionTreeClassifier())])

toxic_pipeline_2 = Pipeline([('cv', CountVectorizer()), ('tfidf', TfidfTransformer()), ('svc', OneVsRestClassifier(LinearSVC(multi_class="ovr")))])


toxic_pipeline.fit(preprocessed_train_data, train_labels)
toxic_pipeline_2.fit(preprocessed_train_data, train_labels)

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [16]:
toxic_pipeline_2 = Pipeline([('cv', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('svc', OneVsRestClassifier(LinearSVC(multi_class="ovr")))])
toxic_pipeline_2.fit(preprocessed_train_data, train_labels)
pred_dev_2 = toxic_pipeline_2.predict(dev_data)
print(classification_report(pred_dev_2, dev_labels, target_names = target_names))

               precision    recall  f1-score   support

        toxic       0.57      0.61      0.59      2162
 severe_toxic       0.22      0.65      0.33        79
      obscene       0.64      0.59      0.61      1342
       threat       0.08      0.75      0.14         8
       insult       0.43      0.80      0.56       619
identity_hate       0.14      0.74      0.23        38

  avg / total       0.56      0.63      0.58      4248



In [17]:
pred_dev = toxic_pipeline.predict(dev_data)
pred_dev_2 = toxic_pipeline_2.predict(dev_data)
print(classification_report(pred_dev, dev_labels, target_names = target_names))
print(classification_report(pred_dev_2, dev_labels, target_names = target_names))

               precision    recall  f1-score   support

        toxic       0.59      0.24      0.34      5655
 severe_toxic       0.12      0.22      0.16       129
      obscene       0.66      0.26      0.37      3089
       threat       0.13      0.32      0.19        31
       insult       0.50      0.21      0.30      2735
identity_hate       0.17      0.17      0.17       200

  avg / total       0.57      0.24      0.33     11839

               precision    recall  f1-score   support

        toxic       0.57      0.61      0.59      2162
 severe_toxic       0.22      0.65      0.33        79
      obscene       0.64      0.59      0.61      1342
       threat       0.08      0.75      0.14         8
       insult       0.43      0.80      0.56       619
identity_hate       0.14      0.74      0.23        38

  avg / total       0.56      0.63      0.58      4248



In [18]:
pred_train = toxic_pipeline.predict(train_data)
pred_train_2 = toxic_pipeline_2.predict(train_data)
print(classification_report(pred_train, train_labels, target_names = target_names))
print(classification_report(pred_train_2, train_labels, target_names = target_names))

               precision    recall  f1-score   support

        toxic       0.71      0.28      0.40     32480
 severe_toxic       0.46      0.63      0.53      1010
      obscene       0.74      0.31      0.44     17069
       threat       0.34      0.61      0.44       226
       insult       0.64      0.28      0.39     15231
identity_hate       0.45      0.41      0.43      1320

  avg / total       0.69      0.30      0.41     67336

               precision    recall  f1-score   support

        toxic       0.69      0.70      0.70     12842
 severe_toxic       0.35      0.86      0.50       562
      obscene       0.75      0.67      0.71      8063
       threat       0.20      0.94      0.33        86
       insult       0.59      0.92      0.72      4332
identity_hate       0.31      0.95      0.46       388

  avg / total       0.68      0.73      0.69     26273



In [19]:
from sklearn.metrics import confusion_matrix
print("pred_dev confusion matrix", confusion_matrix(pred_dev, dev_labels)
print("pred_dev_2 confusion matrix", confusion_matrix(pred_dev_2, dev_labels)

SyntaxError: invalid syntax (<ipython-input-19-036e1ee6cc1c>, line 3)

In [20]:
len(cv.get_feature_names())

146228

In [16]:
cv = CountVectorizer(min_df = 1)
ctv = cv.fit_transform(preprocessed_train_data)
ctv_freq = pd.DataFrame({'term': cv.get_feature_names(), 'occurrences':np.asarray(ctv.sum(axis=0)).ravel().tolist()})
ctv_freq['frequency'] = ctv_freq['occurrences']/ctv_freq.shape[0]
print (ctv_freq.sort_values('frequency', ascending = False).head(100))

        occurrences       term  frequency
7931          63087    article   0.431429
92724         48729       page   0.333240
139676        41448  wikipedia   0.283448
124123        34331       talk   0.234777
133963        27537        use   0.188316
37999         26265       edit   0.179617
90535         26127        one   0.178673
76057         25537       make   0.174638
97164         25355     please   0.173394
141125        25020      would   0.171103
72979         24432       like   0.167082
111195        21752        say   0.148754
112540        21663        see   0.148145
126754        20929      think   0.143126
69132         20205       know   0.138175
118089        20174     source   0.137963
49758         19180        get   0.131165
50796         18377         go   0.125674
4214          17458       also   0.119389
127509        16465       time   0.112598
94889         16105     people   0.110136
1389          16097        add   0.110082
133995        15739       user   0

In [None]:
def plot_coefficients(classifier, feature_names, top_features=20):
    coef = classifier.coef_.ravel()
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = [‘red’ if c < 0 else ‘blue’ for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha=’right’)
    plt.show()

cv = CountVectorizer()
train = cv.fit_transform(preprocessed_train_data)


svm = OneVsRestClassifier(LinearSVC(multi_class="ovr")
svm.fit(X_train, target)
plot_coefficients(svm, cv.get_feature_names())