In [2]:
import pandas as pd
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn import model_selection
import numpy as np
import tensorflow  as tf
import keras
import h5py
from keras.models import load_model


from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.metrics import binary_accuracy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Get full training data set

In [3]:
#train_all is entire training set, used for tokenization
train_all=pd.read_csv('train.csv')

In [41]:
train_all.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


### get smaller sets for training and testing

In [37]:
#toxic_yes_train
toxic_yes_train=pd.read_csv('toxic_yes_train.txt',sep='\n',header=-1)
toxic_yes_train['toxic']=np.ones(toxic_yes_train.shape[0],)
toxic_yes_train['toxic']=toxic_yes_train['toxic'].apply(lambda x: int(x))
toxic_yes_train.columns=['comment','toxic']

#toxic_no_train
toxic_no_train=pd.read_csv('toxic_no_train.txt',sep='\n',header=-1)
toxic_no_train['toxic']=np.zeros(toxic_no_train.shape[0],)
toxic_no_train['toxic']=toxic_no_train['toxic'].apply(lambda x: int(x))
toxic_no_train.columns=['comment','toxic']


#toxic_yes_dev
toxic_yes_dev=pd.read_csv('toxic_yes_dev.txt',sep='\n',header=-1)
toxic_yes_dev['toxic']=np.ones(toxic_yes_dev.shape[0],)
toxic_yes_dev['toxic']=toxic_yes_dev['toxic'].apply(lambda x: int(x))
toxic_yes_dev.columns=['comment','toxic']

#toxic_no_dev
toxic_no_dev=pd.read_csv('toxic_no_dev.txt',sep='\n',header=-1)
toxic_no_dev['toxic']=np.zeros(toxic_no_dev.shape[0],)
toxic_no_dev['toxic']=toxic_no_dev['toxic'].apply(lambda x: int(x))
toxic_no_dev.columns=['comment','toxic']

### Mix up the yes & no in each train and dev

In [38]:
toxic_dev=pd.concat([toxic_no_dev,toxic_yes_dev])
toxic_dev = toxic_dev.sample(frac=1).reset_index(drop=True)

toxic_train=pd.concat([toxic_no_train,toxic_yes_train])
toxic_train = toxic_train.sample(frac=1).reset_index(drop=True)

In [39]:
#preprocessing

print('Splitting data...')

#training data
x_train = toxic_train['comment'].fillna("nada").values
y_train = toxic_train[['toxic']].values

#testing validation data (not for training model, just validation for )

x_test = toxic_dev['comment'].fillna("nada").values
y_test = toxic_dev[['toxic']].values

# check lengths

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Splitting data...
127658 train sequences
31913 test sequences


### Model parameters

In [40]:
max_feature=20000
EPOCHS=4
maxlen=250 
dropout=0.1

### Tokenize

In [42]:
tokenizer = text.Tokenizer(num_words=max_feature)

tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text

#create tokenized comments
list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

### Create Model function

In [47]:
def get_model(maxlen,max_features,dropout,embed_size=128):
    
    embed_size=embed_size #default to 128
    maxlen=maxlen # max length of sequence input
    max_features=max_features  # max vocab
    dropout=dropout
    
    inp = Input(shape=(maxlen, ))
    
    x = Embedding(max_features, embed_size)(inp)
    
    x = Bidirectional(LSTM(50, return_sequences=False))(x)
    
    x = Dense(50, activation="relu")(x)
    
    x = Dropout(dropout)(x)
    
    x = Dense(1, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[binary_accuracy])

    return model


In [48]:
print('padding maxlen=',maxlen)
x_train_sequence = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
x_test_sequence = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print('building model')
model=get_model(maxlen=maxlen,max_features=max_feature,dropout=dropout,embed_size=128)

batch_size = 32

weight_file_path="weights_base_{}_{}_{}_std_data.hdf5".format(max_feature,maxlen,dropout)
model_file_path='bidirectional_lstm_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}_std_data'.format(max_feature,maxlen,dropout)

checkpoint = ModelCheckpoint(weight_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early] #early

#fit model
print('fitting model with max_features={}, maxlen = {}, and dropout = {}'.format(max_feature,maxlen,dropout))
model.fit(x_train_sequence, y_train, batch_size=batch_size, epochs=EPOCHS, validation_split=0.1, callbacks=callbacks_list)

#save model
model.save(model_file_path)

test_model=load_model(model_file_path)

print('compiling model')
test_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[binary_accuracy])

print('evaluating model')
score = test_model.evaluate(test_sequence, y_test, verbose=1)

print("%s: %.2f%%" % (test_model.metrics_names[1], score[1]*100))

print('creating prediction')
predict=test_model.predict(test_sequence, verbose=1)

predict_df=pd.DataFrame(predict)
predict_df.to_csv('bidirectional_lstm_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}_std_data.csv'.format(max_feature,maxlen,dropout))



padding maxlen= 250
building model
fitting model with max_features=20000, maxlen = 250, and dropout = 0.1
Train on 114892 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.09981, saving model to weights_base_20000_250_0.1_std_data.hdf5
Epoch 2/4

Epoch 00002: val_loss did not improve
Epoch 3/4

Epoch 00003: val_loss did not improve
Epoch 4/4

Epoch 00004: val_loss did not improve
compiling model
evaluating model


NameError: name 'test_sequence' is not defined

In [50]:
print('evaluating model')
score = test_model.evaluate(x_test_sequence, y_test, verbose=1)

print("%s: %.2f%%" % (test_model.metrics_names[1], score[1]*100))

print('creating prediction')
predict=test_model.predict(x_test_sequence, verbose=1)

predict_df=pd.DataFrame(predict)
predict_df.to_csv('bidirectional_lstm_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}_std_data.csv'.format(max_feature,maxlen,dropout))



evaluating model
binary_accuracy: 96.00%
creating prediction


In [51]:
predict_df.head()

Unnamed: 0,0
0,1.293409e-05
1,5.358924e-07
2,0.001002165
3,0.000260028
4,1.615242e-05


In [56]:
print('test predictions on "toxic" category')
predict_df=pd.DataFrame(predict)
predict_df.columns=['toxic']

predict_df.to_csv('prediction_2000_250_std_data.csv')

for c in predict_df.columns:
     predict_df[c]=predict_df[c].map(lambda x: 1 if x >=0.5 else 0)

gold=pd.DataFrame(y_test)
gold.columns=['toxic']
        
err_df=gold['toxic']-predict_df['toxic']  #val of -1 is false positive Type I, value of +1 is false negative Type II

#performance_dict[key]['total_toxic']=sum(gold['toxic'])
#performance_dict[key]['toxic_falsepos']=err_df.value_counts()[-1]
#performance_dict[key]['toxic_falseneg']=err_df.value_counts()[1]



print('total true toxic',sum(gold['toxic']))
#false positives
print('total false positive (incorrectly flagged as toxic) Type I',err_df.value_counts()[-1])
print('false positive rate', '{:02.3f}'.format(100*err_df.value_counts()[-1]/sum(gold['toxic'])),'%')
#false negatives
print('total false negative (omitted flagging as toxic) Type II',err_df.value_counts()[1])
print('false negative rate', '{:02.3f}'.format(100*err_df.value_counts()[1]/sum(gold['toxic'])),'%')


print('-'*50,'\n')

test predictions on "toxic" category
total true toxic 3058
total false positive (incorrectly flagged as toxic) Type I 472
false positive rate 15.435 %
total false negative (omitted flagging as toxic) Type II 803
false negative rate 26.259 %
-------------------------------------------------- 



In [73]:
pd.concat([pd.DataFrame(x_test),pd.DataFrame(err_df)], axis=1).to_csv('LSTM_std_data_error.csv',index=False)

In [75]:
dummy=pd.read_csv('LSTM_std_data_error.csv')
dummy[dummy['toxic']==-1].to_csv('LSTM_std_error_falsePOS.csv',index=False)
dummy[dummy['toxic']==1].to_csv('LSTM_std_error_falseNEG.csv',index=False)

In [77]:
pd.read_csv('LSTM_std_error_falseNEG.csv').head()

Unnamed: 0,0,toxic
0,you simply display your ignorance fatuorum,1
1,ha do n't be silly you said that you were goin...,1
2,and also admit being dick that time,1
3,"boring ! ! ! ! ! ! stay out of it sister , go ...",1
4,"if a group of christians kill a hindu , do you...",1
