In [106]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import string
import datetime
import re
%load_ext autoreload
%autoreload 2

#modeling imports
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
plt.style.use('fivethirtyeight')

#tf imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score


#multinomial nb
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from scipy.sparse.linalg import svds



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Load the raw competition files: labelled training comments, unlabelled test
# comments, and the sample submission template.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
subm = pd.read_csv('./data/sample_submission.csv')
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Indicator column: 1 when none of the labels is set for a row, 0 otherwise.
train['none'] = 1 - train[labels].max(axis=1)

# Replace missing comment text with a placeholder string. The result is
# assigned back to the column instead of calling fillna(inplace=True) on the
# selected column, which is chained assignment and does not reliably update
# the parent frame.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

In [108]:
def load_data(data_dir="data"):
    """Load the training set and the labelled part of the test set.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train.csv``, ``test.csv`` and
        ``test_labels.csv``. Defaults to ``"data"``.

    Returns
    -------
    X_train, y_train, X_test, y_test : pandas.DataFrame
        ``X_*`` hold the single ``comment_text`` column and ``y_*`` hold the
        six label columns. Test rows whose labels are all ``-1`` are dropped
        and the remaining test rows are re-indexed from 0.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    df = pd.read_csv(os.path.join(data_dir, "train.csv"))
    # Copy so that callers can add columns to the feature frame without
    # pandas treating it as a slice of `df` (SettingWithCopyWarning).
    X_train = df[['comment_text']].copy()
    y_train = df[label_cols]

    test_text = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test_labels = pd.read_csv(os.path.join(data_dir, "test_labels.csv"))
    test = test_text.merge(test_labels, on='id')

    # Keep a row when at least one of its labels differs from -1.
    has_label = (test[label_cols] != -1).any(axis=1)
    test = test[has_label].reset_index(drop=True)

    X_test = test[['comment_text']].copy()
    y_test = test[label_cols]

    return X_train, y_train, X_test, y_test

In [109]:
# Unpack the train/test feature and label frames returned by load_data().
X_train, y_train, X_test, y_test = load_data()

In [110]:
from keras import Sequential
import keras
from keras.layers import Embedding, LSTM, Dense, Dropout
import tensorflow as tf
from keras.preprocessing.text import Tokenizer

In [111]:
#TODO: should eventually take out the stop words here
# Train the tokenizer on the training comments only, so that words seen only
# in the test set are encoded as the out-of-vocabulary token.
# oov_token is given as a string placeholder rather than the boolean True, so
# that every entry in the tokenizer's vocabulary is a string.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(X_train['comment_text'])


In [122]:
# Vocabulary size handed to the embedding layer: number of entries in the
# tokenizer's word index plus one.
# NOTE(review): the +1 presumably reserves index 0 for padding — confirm.
num_words = len(tokenizer.word_index) + 1
num_words
# Looks like there are about 210K words... maybe we should pass a max-words
# argument to the tokenizer, but my guess is that the words that really make
# comments toxic are less frequent, so we don't necessarily want to do that...

210339

In [113]:
from keras.preprocessing.sequence import pad_sequences

In [118]:
# Encode every comment as a list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(X_train['comment_text'])
test_sequences = tokenizer.texts_to_sequences(X_test['comment_text'])

# Bring every encoded comment to a fixed length of 150 ids.
# NOTE(review): pad_sequences is called with its default padding/truncating
# side — confirm which end of long comments is being dropped.
data = pad_sequences(sequences, maxlen=150)
test_data = pad_sequences(test_sequences, maxlen=150)

In [124]:
## Network architecture
# inspired at https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e and https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
model = Sequential([
    # embedding: vocabulary size, 100-dim vectors, comments of 150 tokens
    Embedding(num_words, 100, input_length=150),
    # single recurrent layer with dropout on inputs and recurrent state
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # one sigmoid unit: binary decision for a single label
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [125]:
# Display the training label frame (one column per toxicity label).
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [126]:
# First run-through did not specify a batch size; probably do that on the
# next try.
# Fit on the 'toxic' label only, holding out 30% of the rows for validation.
model.fit(
    x=data,
    y=np.array(y_train['toxic']),
    epochs=3,
    validation_split=0.3,
)

Instructions for updating:
Use tf.cast instead.
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c2f46a400>

In [129]:
# Persist the 'toxic' model in two parts: the architecture as JSON text ...
model_toxic_json = model.to_json()
with open("model_toxic.json", "w") as fh:
    fh.write(model_toxic_json)

# ... and the trained weights as an HDF5 file.
model.save_weights("model_toxic.h5")
print("Saved model_toxic to disk")

Saved model_toxic to disk


In [136]:
# Score the 'toxic' model on the labelled test set.
# `score` is indexed in parallel with model.metrics_names when it is printed;
# presumably [loss, accuracy] given the compile() settings — confirm.
score = model.evaluate(test_data, y_test['toxic'])



In [139]:
# Report the second metric (index 1 of metrics_names / score) as a percentage.
print("toxic label %s: %.2f%%" % (model.metrics_names[1], score[1]*100))

toxic label acc: 91.06%


In [148]:
# Train models for all the other labels: take the label names straight from
# the columns of the training label frame and display them.
labels = y_train.columns
labels


In [None]:
def _build_label_model():
    """Return a freshly compiled, untrained single-label LSTM classifier.

    Same architecture as the 'toxic' model: embedding over the tokenizer
    vocabulary (100-dim, comments of 150 tokens), one LSTM layer with
    dropout, and a single sigmoid output unit.
    """
    net = Sequential()
    net.add(Embedding(num_words, 100, input_length=150))
    net.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


banner = "===================================="

for label in labels:
    # 'toxic' was already trained, saved and evaluated above; skip it before
    # printing anything so the log does not claim a fit that never happens.
    if label == 'toxic':
        continue

    print(banner)
    print(banner)
    print("starting fit on", label)
    print(banner)
    print(banner)

    # Drop the state left behind by the previous model before building a new
    # one, so models do not pile up in the backend session across iterations.
    keras.backend.clear_session()
    model = _build_label_model()

    # Fit on this label, holding out 30% of the rows for validation.
    model.fit(data, np.array(y_train[label]), validation_split=.3, epochs=3)
    model_json = model.to_json()

    name_json = label + '.json'
    name_h5 = label + '.h5'

    # Save the architecture as JSON ...
    with open(name_json, "w") as json_file:
        json_file.write(model_json)

    # ... and the trained weights as HDF5.
    model.save_weights(name_h5)
    print("Saved", label, "to disk")

    # Lastly, evaluate on the labelled test set (labels passed as an ndarray,
    # matching how the training labels are passed to fit()).
    score = model.evaluate(test_data, np.array(y_test[label]))
    print(label, "%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
    print(banner)
    print(banner)
    print("done with", label, "starting next fit")
    print(banner)
    print(banner)
    

starting fit on toxic
starting fit on severe_toxic
Train on 111699 samples, validate on 47872 samples
Epoch 1/3
 11904/111699 [==>...........................] - ETA: 27:02 - loss: 0.0697 - acc: 0.9892

In [None]:

# Try #2 — don't use.
# Stores the tokenizer's integer encoding of each comment as a new 'seq'
# column on the train and test feature frames.
X_train['seq'] = tokenizer.texts_to_sequences(X_train['comment_text'])
X_test['seq'] = tokenizer.texts_to_sequences(X_test['comment_text'])

In [102]:
def pad_shorter(arr, max_len, pad_value=None):
    """Return ``arr`` extended at the end to at least ``max_len`` items.

    Parameters
    ----------
    arr : list
        Sequence to pad. It is never modified.
    max_len : int
        Minimum length of the returned sequence.
    pad_value : object, optional
        Filler appended to short sequences. Defaults to ``None``.

    Returns
    -------
    list
        ``arr`` itself when it already has ``max_len`` or more items,
        otherwise a new list made of ``arr`` followed by the filler.
    """
    missing = max_len - len(arr)
    if missing <= 0:
        return arr
    # Build a new list rather than appending to the caller's list, so that
    # re-running a cell does not see already-padded input.
    return list(arr) + [pad_value] * missing

# Pad every encoded comment up to max_len entries, reporting the shortest
# length seen before padding.
max_len = 100
print("min length before:", X_train['seq'].map(len).min())
X_train['seq'] = X_train['seq'].apply(pad_shorter, args=(max_len,))

min length before: 1


In [103]:
# Shortest encoded comment after padding, followed by the frame itself.
print("min length after:", X_train['seq'].map(len).min())
X_train

min length after: 100


Unnamed: 0,comment_text,seq
0,Explanation\nWhy the edits made under my usern...,"[689, 76, 2, 127, 131, 178, 30, 673, 4512, 120..."
1,D'aww! He matches this background colour I'm s...,"[96146, 53, 2636, 14, 556, 3810, 74, 4557, 270..."
2,"Hey man, I'm really not trying to edit war. It...","[413, 438, 74, 135, 15, 250, 3, 72, 315, 79, 5..."
3,"""\nMore\nI can't make any real suggestions on ...","[58, 8, 229, 98, 55, 329, 1437, 16, 2134, 8, 6..."
4,"You, sir, are my hero. Any chance you remember...","[7, 1678, 20, 30, 3517, 55, 1070, 7, 580, 40, ..."
...,...,...
159566,""":::::And for the second time of asking, when ...","[5, 13, 2, 428, 85, 4, 903, 83, 21, 314, 563, ..."
159567,You should be ashamed of yourself \n\nThat is ...,"[7, 57, 17, 4653, 4, 207, 10, 9, 6, 3328, 232,..."
159568,"Spitzer \n\nUmm, theres no actual article for ...","[34279, 7331, 5209, 47, 738, 24, 13, 8168, 351..."
159569,And it looks like it was actually you who put ...,"[5, 12, 575, 50, 12, 25, 211, 7, 63, 202, 16, ..."


In [105]:
def trunc_longer(arr, max_len):
    """Return ``arr`` cut down to its first ``max_len`` items when it is longer.

    Sequences that already fit are handed back unchanged (the same object).
    """
    return arr[:max_len] if len(arr) > max_len else arr

# Cut every encoded comment down to max_len entries, reporting the longest
# length before and after.
max_len = 100
print("max length before:", X_train['seq'].map(len).max())
X_train['seq'] = X_train['seq'].apply(trunc_longer, args=(max_len,))
print("max length after:", X_train['seq'].map(len).max())
X_train.head()

# Here all the comments should be the same size. Go back and check what the
# mean/median length of a comment (by word) is; 100 might be a hyperparameter
# we should tune. Could also check what the lengths of toxic comments are
# vs. normal comments.

max length before: 1403
max length after: 100


Unnamed: 0,comment_text,seq
0,Explanation\nWhy the edits made under my usern...,"[689, 76, 2, 127, 131, 178, 30, 673, 4512, 120..."
1,D'aww! He matches this background colour I'm s...,"[96146, 53, 2636, 14, 556, 3810, 74, 4557, 270..."
2,"Hey man, I'm really not trying to edit war. It...","[413, 438, 74, 135, 15, 250, 3, 72, 315, 79, 5..."
3,"""\nMore\nI can't make any real suggestions on ...","[58, 8, 229, 98, 55, 329, 1437, 16, 2134, 8, 6..."
4,"You, sir, are my hero. Any chance you remember...","[7, 1678, 20, 30, 3517, 55, 1070, 7, 580, 40, ..."


In [None]:
#now, start the RNN model somehow
# texts_to_sequences has to be called (not just referenced) so that
# `sequences` holds the encoded training comments rather than the bound method.
sequences = tokenizer.texts_to_sequences(X_train['comment_text'])

In [17]:
#load the pandas datasets into a tensor
# train = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
# test = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))


general notes:

I feel like we should tune the comment length as a hyperparameter. We need to figure out how long most comments are and set the length accordingly, not arbitrarily to 150: find the most common length, the median length, etc. I'm worried that the current model is cutting off any hashtags at the end, etc.