# w266: Final Project
### Christopher Danicic, Robert Deng, Chandan Gope
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [32]:
#Load libraries
import os, sys, re, json, time
import itertools, collections
from importlib import reload
import numpy as np
import pandas as pd

#SciKit
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import *
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


#NLTK
import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

### Load data

In [40]:
#Load data
train = pd.read_csv("~/w266-finalproject/data/train.csv")
test = pd.read_csv("~/w266-finalproject/data/test.csv")

#Seeding
np.random.seed(6)
in_dev = np.random.choice([True, False], len(train), p=[0.15, 0.85])

#Train
train_data = train.comment_text[np.logical_not(in_dev)]
train_labels = train.iloc[np.logical_not(in_dev), 2:8]

#Dev Set
dev_data, dev_labels = train.comment_text[in_dev], train.iloc[in_dev, 2:8]

#Target Names
target_names = train.columns[2:8]

#Print some train data
print("train_text", train_data.shape, "\ntrain_labels", train_labels.shape)
print("\n\ndev_text", dev_data.shape, "\ndev_labels", dev_labels.shape)
train.head(n=20)

train_text (135526,) 
train_labels (135526, 6)


dev_text (24045,) 
dev_labels (24045, 6)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


### Basic data exploration

In [4]:
# Bokeh for plotting.
import bokeh.plotting as bp
from bokeh.models import HoverTool
bp.output_notebook()

# Helper code for plotting histograms
def plot_length_histogram(lengths, x_range=[0,100], bins=40, normed=True):
    hist, bin_edges = np.histogram(a=lengths, bins=bins, normed=normed, range=x_range)
    bin_centers = (bin_edges[1:] + bin_edges[:-1])/2
    bin_widths =  (bin_edges[1:] - bin_edges[:-1])

    hover = HoverTool(tooltips=[("bucket", "@x"), ("count", "@top")], mode="vline")
    fig = bp.figure(plot_width=800, plot_height=400, tools=[hover])
    fig.vbar(x=bin_centers, width=bin_widths, top=hist, hover_fill_color="firebrick")
    fig.y_range.start = 0
    fig.x_range.start = 0
    fig.xaxis.axis_label = "Number of tokens"
    fig.yaxis.axis_label = "Frequency"
    bp.show(fig)

In [5]:
train_comment_text = train['comment_text']
sentences_lengths = train_comment_text.str.split().apply(len)
plot_length_histogram(sentences_lengths, x_range=[0,300])
print("95% percentile length: {:.0f}".format(sentences_lengths.quantile(.95)))

95% percentile length: 230


**Vectorizing Request Titles & Text**

We believe that the titles and request body should be treated separately. We will do so by building a preprocessor that includes NLTK capabilities along with regular expression cleaning.
1. Regular Expressions: cleaning out sequences of numbers, non-letter characters, strings with underscores, and lower-casing
2. Tokenizing: splits raw text into sentences, words and punctuation
3. Stop Words: removing filler words i.e. the, me, then
4. Pos Tag: use tokenized words to identify the part of speech WordNet tag from Penn Treebank https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
5. Lemmatizing: Convert the part of speech WordNet tag to identify a noun, verb, adverb, or adjective (based on starting with 'N’, ‘V’, ‘R’, or ‘J’). Then we use the tag to translate to the lemma, or the root word in the english lexicon.

In [None]:
#download all nltk corpora and data
nltk.download()

In [15]:
def nltk_preprocess(data):
    '''This function preprocesses a data frame, specifing a text_column, 
    and strips down the document to cleaned, individualized word tokens without
    stop words and other excessive parts of speech and eventually rejoins the remaining words.
    '''
    #Initializes stop words and new column creation
    stop = stopwords.words('english')
    
    
    #Initialize Lemmatizer object and final list of lemmatized words
    lemmatizer = WordNetLemmatizer()
    
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None # for easy if-statement
        
    def lemmatized(word, tag):
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            lemma = str(lemmatizer.lemmatize(word))
        else:
            lemma = str(lemmatizer.lemmatize(word, pos=wntag))
        return lemma

    data = data.apply(lambda x: re.sub(r'\d+', r' ', x)).apply(lambda y: re.sub(r'\W+', r' ', y)).apply(lambda z: re.sub(r"_+",r" ",z))
    data = data.str.lower()
    data = data.apply(word_tokenize)
    data = data.apply(lambda x: [item for item in x if item not in stop])
    data = data.apply(pos_tag)
    data = data.apply(lambda x: [lemmatized(word, tag) for (word, tag) in x])
    data = data.apply(lambda x: ' '.join(x))
    return data

In [54]:
toxic_pipeline = Pipeline([('cv', CountVectorizer(min_df=.003, max_df=.3)),
                           ('tfidf', TfidfTransformer()),
                           ('dtc', DecisionTreeClassifier(criterion = "entropy"))])

toxic_pipeline_2 = Pipeline([('cv', CountVectorizer(min_df=.003, max_df=.3)),
                           ('tfidf', TfidfTransformer()),
                           ('svc', OneVsRestClassifier(LinearSVC()))])

#Fit w/ NLTK Preprocessor
toxic_pipeline.fit(nltk_preprocess(train_data.iloc[0:2000]), train_labels[0:2000])
toxic_pipeline_2.fit(nltk_preprocess(train_data.iloc[0:2000]), train_labels[0:2000])

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.003,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        str...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [55]:
pred_dev = toxic_pipeline.predict(dev_data)
pred_dev_2 = toxic_pipeline_2.predict(dev_data)
print(classification_report(pred_dev, dev_labels, target_names = target_names))
print(classification_report(pred_dev_2, dev_labels, target_names = target_names))

               precision    recall  f1-score   support

        toxic       0.51      0.21      0.30      5569
 severe_toxic       0.14      0.34      0.20        95
      obscene       0.50      0.14      0.22      4351
       threat       0.15      0.22      0.18        50
       insult       0.41      0.24      0.30      1985
identity_hate       0.08      0.17      0.11        99

  avg / total       0.48      0.19      0.27     12149

               precision    recall  f1-score   support

        toxic       0.40      0.53      0.46      1749
 severe_toxic       0.12      0.51      0.19        53
      obscene       0.43      0.32      0.36      1658
       threat       0.01      0.50      0.03         2
       insult       0.37      0.54      0.44       791
identity_hate       0.04      0.36      0.08        25

  avg / total       0.40      0.45      0.41      4278



In [36]:
cv = CountVectorizer(min_df = 1)
ctv = cv.fit_transform(nltk_preprocess(train_text[0:1000]))
ctv_freq = pd.DataFrame({'term': cv.get_feature_names(), 'occurrences':np.asarray(ctv.sum(axis=0)).ravel().tolist()})
ctv_freq['frequency'] = ctv_freq['occurrences']/ctv_freq.shape[0]
print (ctv_freq.sort_values('frequency', ascending = False).head(100))

      occurrences        term  frequency
420           442     article   0.058918
4714          323        page   0.043055
7296          312   wikipedia   0.041589
6489          252        talk   0.033591
4601          217         one   0.028926
2013          210        edit   0.027993
3938          209        make   0.027859
7003          203         use   0.027059
4951          183      please   0.024393
5772          177         say   0.023594
7380          174       would   0.023194
3783          170        like   0.022661
5849          169         see   0.022527
6138          161      source   0.021461
6616          156       think   0.020794
3613          147        know   0.019595
7008          145        user   0.019328
2730          143          go   0.019062
218           141        also   0.018795
2686          138         get   0.018395
6664          133        time   0.017729
4825          114      people   0.015196
86            113         add   0.015063
3857          10