In [93]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import string
import datetime
import re
%load_ext autoreload
%autoreload 2

#modeling imports
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
plt.style.use('fivethirtyeight')

#tf imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score


#multinomial nb
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from scipy.sparse.linalg import svds



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
subm = pd.read_csv('./data/sample_submission.csv')
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[labels].max(axis=1) #make an indicator for when there is no 
                                            #value for any of the labels 
#cant have any of the unknown values 
train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

In [95]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


In [96]:
def load_data():

    data = os.path.join("data", "train.csv")

    df = pd.read_csv(data)
    X_train = df[['comment_text']]
    y_train = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

    X_test = pd.read_csv(os.path.join("data", "test.csv"))
    y_test = pd.read_csv(os.path.join("data", "test_labels.csv"))
    test = X_test.merge(y_test, on='id')
    test = test[ (test['toxic']!=-1) | (test['severe_toxic']!=-1) | 
                (test['obscene']!=-1) | (test['threat']!=-1) | (test['insult']!=-1) 
                | (test['identity_hate']!=-1) ]
    test = test.reset_index(drop=True) 
    
    X_test = test[['comment_text']]
    y_test = test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

    return X_train, y_train, X_test, y_test 

In [97]:
X_train, y_train, X_test, y_test = load_data()

# PREPROCESSING

In [98]:
import re
from nltk.corpus import movie_reviews, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
#lemmatizing
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

BLACKLIST_STOPWORDS = ['over','only','very','not','no']
ENGLISH_STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS)
import re
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Isabella_GC/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [99]:
#convert text to lowercase
X_train = X_train.apply(lambda x: x.astype(str).str.lower())
X_test = X_test.apply(lambda x: x.astype(str).str.lower())

#column now has the expanded contractions
X_train['expanded'] = X_train.comment_text.apply(expandContractions)
X_test['expanded'] = X_test.comment_text.apply(expandContractions)

#remove numbers, punctuation
#https://medium.com/@chaimgluck1/have-messy-text-data-clean-it-with-simple-lambda-functions-645918fcc2fc
X_train['expanded'] = X_train.expanded.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
X_test['expanded'] = X_test.expanded.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
X_train['expanded'] = X_train.expanded.apply(lambda x: x.translate(str.maketrans('','','1234567890')))
X_test['expanded'] = X_test.expanded.apply(lambda x: x.translate(str.maketrans('','','1234567890')))

#take away the \n vals
X_train['expanded'] = X_train.expanded.apply(lambda x: x.translate(str.maketrans('\n',' ')))
X_test['expanded'] = X_test.expanded.apply(lambda x: x.translate(str.maketrans('\n',' ')))

#remove english stop words

#strip white space

#lemmatization

In [110]:

X_train = X_train.drop(['new_lines'], axis=1)



In [111]:
X_train

Unnamed: 0,comment_text,expanded
0,explanation\nwhy the edits made under my usern...,explanation why the edits made under my userna...
1,d'aww! he matches this background colour i'm s...,daww he matches this background colour im seem...
2,"hey man, i'm really not trying to edit war. it...",hey man im really not trying to edit war it is...
3,"""\nmore\ni can't make any real suggestions on ...",more i cannot make any real suggestions on im...
4,"you, sir, are my hero. any chance you remember...",you sir are my hero any chance you remember wh...
...,...,...
159566,""":::::and for the second time of asking, when ...",and for the second time of asking when your vi...
159567,you should be ashamed of yourself \n\nthat is ...,you should be ashamed of yourself that is a ...
159568,"spitzer \n\numm, theres no actual article for ...",spitzer umm theres no actual article for pro...
159569,and it looks like it was actually you who put ...,and it looks like it was actually you who put ...


# Modeling

In [13]:
from keras import Sequential
import keras
from keras.layers import Embedding, LSTM, Dense, Dropout
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences #padding

In [20]:
#TODO: should eventually take out the stop words here
#train tokenizer, then encode documents (comment_text)
tokenizer = Tokenizer(oov_token=True)
tokenizer.fit_on_texts(X_train['comment_text']) #fit the tokenizer to training set, make the test unknown words be an UNK value.
#trying to find the length of the vocabulary, might shrink
num_words = len(tokenizer.word_index) + 1
num_words
#looks like there is about 210K words... maybe we should pass in a maxwords 
#argument to the ttokenizer, but my guess is words that are really making 
#toxic comments are less frequent, so don't necessarily want to do that...
sequences = tokenizer.texts_to_sequences(X_train['comment_text'])
tokens = tokenizer.sequences_to_texts(sequences)


In [118]:
data = pad_sequences(sequences, maxlen=150)
test_sequences = tokenizer.texts_to_sequences(X_test['comment_text'])
test_data = pad_sequences(test_sequences, maxlen=150)

In [124]:
## Network architecture
# inspired at https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e and https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
model = Sequential()
#first layer is embedding, takes in size of vocab, 100 dim embedding, and 150 which is length of the comment 
model.add(Embedding(num_words, 100, input_length=150)) 
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(8, activation='sigmoid'))#change to 8 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [125]:
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [126]:
#first run through didn't specify a batch size, probably do that
#on the next try. 
model.fit(data, np.array(y_train['toxic']), validation_split=.3, epochs=3)

Instructions for updating:
Use tf.cast instead.
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c2f46a400>

In [129]:
model_toxic_json = model.to_json()
with open("model_toxic.json", "w") as json_file:
    json_file.write(model_toxic_json)
# serialize weights to HDF5
model.save_weights("model_toxic.h5")
print("Saved model_toxic to disk")

Saved model_toxic to disk


In [136]:
#score on the test set! 
score = model.evaluate(test_data, y_test['toxic'])



In [139]:
print("toxic label %s: %.2f%%" % (model.metrics_names[1], score[1]*100))

toxic label acc: 91.06%


In [148]:
#train models for all the other labels 
y_train
labels = y_train.columns
labels


In [149]:
for label in labels:
    print("====================================")
    print("====================================")
    print("starting fit on", label)
    print("====================================")
    print("====================================")
    if label == 'toxic':
        continue #already trained toxic
    
    #just reset the model, bc idk how to make sure it deletes old fit
    model = Sequential()
    #first layer is embedding, takes in size of vocab, 100 dim embedding, and 150 which is length of the comment 
    model.add(Embedding(num_words, 100, input_length=150)) 
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    
    #fit the data to the next label 
    model.fit(data, np.array(y_train[label]), validation_split=.3, epochs=3)
    model_json = model.to_json()
    
    name_json = label + '.json'
    name_h5 = label + '.h5'
    
    #save into json file
    with open(name_json, "w") as json_file:
        json_file.write(model_json)
    
    # serialize weights to HDF5
    model.save_weights(name_h5)
    print("Saved", label,"to disk")
    
    #lastly, evaluate on test 
    score = model.evaluate(test_data, y_test[label])
    print(label,"%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
    print("====================================")
    print("====================================")
    print("done with", label,"starting next fit")
    print("====================================")
    print("====================================")
    

starting fit on toxic
starting fit on severe_toxic
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saved severe_toxic to disk
severe_toxic acc: 98.44%
done with severe_toxic starting next fit
starting fit on obscene
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saved obscene to disk
obscene acc: 94.77%
done with obscene starting next fit
starting fit on threat
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saved threat to disk
threat acc: 99.58%
done with threat starting next fit
starting fit on insult
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saved insult to disk
insult acc: 94.29%
done with insult starting next fit
starting fit on identity_hate
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saved identity_hate to disk
identity_hate acc: 97.70%
done with identity_hate starting next fit


In [None]:

#try # 2 dont use 
X_train['seq'] = tokenizer.texts_to_sequences(X_train['comment_text'])
X_test['seq'] = tokenizer.texts_to_sequences(X_test['comment_text'])

In [102]:
def pad_shorter(arr, max_len):
    for n in range(len(arr), max_len):
        arr.append(None) 
    return arr

max_len = 100
print("min length before:", X_train.seq.map(lambda x: len(x)).min())
X_train['seq'] = X_train['seq'].apply(lambda x: pad_shorter(x, max_len))

min length before: 1


In [103]:
print("min length after:", X_train.seq.map(lambda x: len(x)).min())
X_train

min length after: 100


Unnamed: 0,comment_text,seq
0,Explanation\nWhy the edits made under my usern...,"[689, 76, 2, 127, 131, 178, 30, 673, 4512, 120..."
1,D'aww! He matches this background colour I'm s...,"[96146, 53, 2636, 14, 556, 3810, 74, 4557, 270..."
2,"Hey man, I'm really not trying to edit war. It...","[413, 438, 74, 135, 15, 250, 3, 72, 315, 79, 5..."
3,"""\nMore\nI can't make any real suggestions on ...","[58, 8, 229, 98, 55, 329, 1437, 16, 2134, 8, 6..."
4,"You, sir, are my hero. Any chance you remember...","[7, 1678, 20, 30, 3517, 55, 1070, 7, 580, 40, ..."
...,...,...
159566,""":::::And for the second time of asking, when ...","[5, 13, 2, 428, 85, 4, 903, 83, 21, 314, 563, ..."
159567,You should be ashamed of yourself \n\nThat is ...,"[7, 57, 17, 4653, 4, 207, 10, 9, 6, 3328, 232,..."
159568,"Spitzer \n\nUmm, theres no actual article for ...","[34279, 7331, 5209, 47, 738, 24, 13, 8168, 351..."
159569,And it looks like it was actually you who put ...,"[5, 12, 575, 50, 12, 25, 211, 7, 63, 202, 16, ..."


In [105]:
def trunc_longer(arr, max_len):
    if len(arr) > max_len:
        arr = arr[:max_len]
    return arr

max_len = 100
print("max length before:", X_train.seq.map(lambda x: len(x)).max())
X_train['seq'] = X_train['seq'].apply(lambda x: trunc_longer(x, max_len))
print("max length after:", X_train.seq.map(lambda x: len(x)).max())
X_train.head()

#here all the comments should be the same size. go back and check what the mean/median 
#length of comment (by word) is, 100 might be a hyperparameter we should tune. 
#could also be checkign what the lengths are of toxic comments vs. normal comments. 

max length before: 1403
max length after: 100


Unnamed: 0,comment_text,seq
0,Explanation\nWhy the edits made under my usern...,"[689, 76, 2, 127, 131, 178, 30, 673, 4512, 120..."
1,D'aww! He matches this background colour I'm s...,"[96146, 53, 2636, 14, 556, 3810, 74, 4557, 270..."
2,"Hey man, I'm really not trying to edit war. It...","[413, 438, 74, 135, 15, 250, 3, 72, 315, 79, 5..."
3,"""\nMore\nI can't make any real suggestions on ...","[58, 8, 229, 98, 55, 329, 1437, 16, 2134, 8, 6..."
4,"You, sir, are my hero. Any chance you remember...","[7, 1678, 20, 30, 3517, 55, 1070, 7, 580, 40, ..."


In [None]:
#now, start the RNN model somehow 
sequences = tokenizer.texts_to_sequences

In [2]:
sequences

NameError: name 'sequences' is not defined

In [17]:
#load the pandas datasets into a tensor
# train = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
# test = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))

general notes:

i feel like we should train the length of comment as a hyperparameter. need to figure out how long most comments are and set to that, not arbitrarily 150. find most common length, medial length, etc. worried that the current model is cutting off any hashtags at the end, etc. 

In [1]:
!which python 

/Library/Frameworks/Python.framework/Versions/2.7/bin/python
