In [21]:
import numpy as np 
import pandas as pd 

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 20000
maxlen = 100


train = pd.read_csv("train.csv")
#test = pd.read_csv("test.csv")
#train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("nada").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
#list_sentences_test = test["comment_text"].fillna("nada").values


tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
#list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
#X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model


model = get_model()
batch_size = 32
epochs = 2


file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)


callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

#model.load_weights(file_path)

#y_predict = model.predict(X_te)



#sample_submission = pd.read_csv("sample_submission.csv")

#sample_submission[list_classes] = y_predict



#sample_submission.to_csv("baseline.csv", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.27913, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.27913 to 0.27637, saving model to weights_base.best.hdf5


<keras.callbacks.History at 0x128f59c50>

In [None]:


#maxlens=[100,200,300]
#batch_sizes=[32]

In [7]:
train = pd.read_csv("train.csv")

In [16]:
comment_len_bin=dict()

comment_len_bin['total']=train.shape[0]
comment_len_bin['100']=0
comment_len_bin['150']=0
comment_len_bin['200']=0
comment_len_bin['250']=0
comment_len_bin['300']=0

for index,row in train.iterrows():
    
    if len(row['comment_text'].split(' '))<=100:
        comment_len_bin['100']+=(1)
        
    if len(row['comment_text'].split(' '))<=150:
        comment_len_bin['150']+=(1)

    if len(row['comment_text'].split(' '))<=200:
        comment_len_bin['200']+=(1)

    if len(row['comment_text'].split(' '))<=250:
        comment_len_bin['250']+=(1)

    if len(row['comment_text'].split(' '))<=300:
        comment_len_bin['300']+=(1)
        
    

In [14]:
comment_len_bin

{'100': 130994, '150': 0, '200': 0, '250': 0, '300': 0, 'total': 159571}

In [19]:
for i in ['100','150','200','250','300']:
    print('{:6.2f}'.format(100*comment_len_bin[i]/comment_len_bin['total']),'% of comments have length equal or less than', i, 'words')

 82.09 % of comments have length equal or less than 100 words
 89.78 % of comments have length equal or less than 150 words
 93.61 % of comments have length equal or less than 200 words
 95.75 % of comments have length equal or less than 250 words
 96.95 % of comments have length equal or less than 300 words


# Lets Automate

In [None]:
!wget('https://s3.amazonaws.com/danicic-w266-final/train.csv')

In [156]:
import pandas as pd
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn import model_selection
import numpy as np
import tensorflow  as tf
import keras
import h5py
from keras.models import load_model


from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.metrics import categorical_accuracy

In [163]:
# import data
print('Importing data...')
train_all=pd.read_csv('train.csv')

# 80/20 split
train_80, test_20 = model_selection.train_test_split(train_all,test_size=0.2)

#preprocessing

print('Splitting data...')

#training data
x_train = train_80['comment_text'].fillna("nada").values
y_train = train_80[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

#testing validation data (not for training model, just validation for )

x_test = test_20['comment_text'].fillna("nada").values
y_test = test_20[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

# check lengths

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Importing data...
Splitting data...
127656 train sequences
31915 test sequences


## Model Parameters:

In [4]:
max_feature_list=[20000,10000]
EPOCHS=4
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]

## tokenize

In [164]:
tokenizer = text.Tokenizer(num_words=max_feature_list[0])

tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text

#create tokenized comments
list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

## create LSTM model maker function

In [140]:
def get_model(maxlen,max_features,dropout,embed_size=128):
    
    embed_size=embed_size #default to 128
    maxlen=maxlen # max length of sequence input
    max_features=max_features  # max vocab
    dropout=dropout
    
    inp = Input(shape=(maxlen, ))
    
    x = Embedding(max_features, embed_size)(inp)
    
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    
    x = GlobalMaxPool1D()(x)
    
    x = Dropout(dropout)(x)
    
    x = Dense(50, activation="relu")(x)
    
    x = Dropout(dropout)(x)
    
    x = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[categorical_accuracy])

    return model


## Tokenize and fit model

In [8]:
max_feature_list=[20000,10000]
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]
EPOCHS=4

for max_feature in [max_feature_list[0]]:
    
    for maxlen in maxlen_list:
        
        for dropout in dropout_list:
            
            print('padding maxlen=',maxlen)
            x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
            x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
            
            print('building model')
            model=get_model(maxlen=maxlen,max_features=max_feature,dropout=dropout,embed_size=128)
            
            batch_size = 32


            weight_file_path="weights_base_{}_{}_{}.hdf5".format(max_feature,maxlen,dropout)
            model_file_path='bidirectional_lstm_globMP_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}'.format(max_feature,maxlen,dropout)

            checkpoint = ModelCheckpoint(weight_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

            early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

            callbacks_list = [checkpoint, early] #early
            
            #fit model
            print('fitting model with max_features={}, maxlen = {}, and dropout = {}'.format(max_feature,maxlen,dropout))
            model.fit(x_train_pad, y_train, batch_size=batch_size, epochs=EPOCHS, validation_split=0.1, callbacks=callbacks_list)
            
            #save model
            model.save(model_file_path)


padding maxlen= 150
building model
fitting model with max_features=20000, maxlen = 150, and dropout = 0.1
Train on 114890 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.04986, saving model to weights_base_20000_150_0.1.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.04986 to 0.04824, saving model to weights_base_20000_150_0.1.hdf5
Epoch 3/4

Epoch 00003: val_loss did not improve
Epoch 4/4

Epoch 00004: val_loss did not improve
padding maxlen= 150
building model
fitting model with max_features=20000, maxlen = 150, and dropout = 0.2
Train on 114890 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.05144, saving model to weights_base_20000_150_0.2.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.05144 to 0.05099, saving model to weights_base_20000_150_0.2.hdf5
Epoch 3/4

Epoch 00003: val_loss did not improve
Epoch 4/4

Epoch 00004: val_loss did not improve
padding maxlen= 250
building model
fit

In [9]:
model.weights

[<tf.Variable 'embedding_4/embeddings:0' shape=(20000, 128) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/forward_lstm_4/kernel:0' shape=(128, 200) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/forward_lstm_4/recurrent_kernel:0' shape=(50, 200) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/forward_lstm_4/bias:0' shape=(200,) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/backward_lstm_4/kernel:0' shape=(128, 200) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/backward_lstm_4/recurrent_kernel:0' shape=(50, 200) dtype=float32_ref>,
 <tf.Variable 'bidirectional_4/backward_lstm_4/bias:0' shape=(200,) dtype=float32_ref>,
 <tf.Variable 'dense_7/kernel:0' shape=(100, 50) dtype=float32_ref>,
 <tf.Variable 'dense_7/bias:0' shape=(50,) dtype=float32_ref>,
 <tf.Variable 'dense_8/kernel:0' shape=(50, 6) dtype=float32_ref>,
 <tf.Variable 'dense_8/bias:0' shape=(6,) dtype=float32_ref>]

In [10]:
#model=get_model(maxlen=100,max_features=20000,dropout=0.1,embed_size=128)

In [14]:
from keras.models import load_model 

# Returns a compiled model identical to the previous one
#model = load_model("model_name.hdf5")


In [10]:
y_predict = model.predict(x_test_pad)

In [14]:
from keras import metrics

In [22]:
print(y_test.shape)
print(y_predict.shape)

(31915, 6)
(31915, 6)


In [31]:
loaded_model=model

In [34]:
#binary_accuracy(y_true, y_pred)
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[categorical_accuracy])
score = loaded_model.evaluate(x_test_pad, y_test, verbose=1)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

categorical_accuracy: 99.24%


In [47]:
model_dict={'bidirectional_lstm_globMP_relu_sigmoid_maxfeat20000_maxlen150_dropout0.1':[150,0.1],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat20000_maxlen150_dropout0.2':[150,0.2],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat20000_maxlen250_dropout0.1':[250,0.1],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat20000_maxlen250_dropout0.2':[250,0.2]}

In [49]:
from keras.models import load_model 

# Returns a compiled model identical to the previous one
#model = load_model("model_name.hdf5")

tokenizer = text.Tokenizer(num_words=20000)
#tokenize based on comment database
tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text
#create tokenized comments
#list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

for file in model_dict.keys():
    
    maxlen=model_dict[file][0]
    dropout=model_dict[file][1]
    
    print('-'*50,'\nModel with maxlen of {} and dropout of {}'.format(maxlen,dropout))
    
    print('padding maxlen=',maxlen)
    #x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=model_dict[file])
    x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    print('loading model')
    loaded_model=load_model(file)
    
    print('compiling model')
    loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[categorical_accuracy])
    print('evaluating model')
    score = loaded_model.evaluate(x_test_pad, y_test, verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
    print('-'*50,'\n')
    

-------------------------------------------------- 
Model with maxlen of 150 and dropout of 0.1
padding maxlen= 150
loading model
compiling model
evaluating model
categorical_accuracy: 99.11%
-------------------------------------------------- 

-------------------------------------------------- 
Model with maxlen of 150 and dropout of 0.2
padding maxlen= 150
loading model
compiling model
evaluating model
categorical_accuracy: 99.26%
-------------------------------------------------- 

-------------------------------------------------- 
Model with maxlen of 250 and dropout of 0.1
padding maxlen= 250
loading model
compiling model
evaluating model
categorical_accuracy: 97.69%
-------------------------------------------------- 

-------------------------------------------------- 
Model with maxlen of 250 and dropout of 0.2
padding maxlen= 250
loading model
compiling model
evaluating model
categorical_accuracy: 99.24%
-------------------------------------------------- 



In [53]:
# import data
print('Importing data...')
train_all=pd.read_csv('train.csv')

# 80/20 split
train_80, test_20 = model_selection.train_test_split(train_all,test_size=0.2)

#preprocessing

print('Splitting data...')

#training data
x_train = train_80['comment_text'].fillna("nada").values
y_train = train_80[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

#testing validation data (not for training model, just validation for )

x_test = test_20['comment_text'].fillna("nada").values
y_test = test_20[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

# check lengths

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')



# MODEL PARAMETERS

max_feature_list=[20000,10000]
EPOCHS=4
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]

#TOKENIZE 

tokenizer = text.Tokenizer(num_words=max_feature_list[1])
tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text

#create tokenized comments
list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)



for max_feature in [max_feature_list[1]]:
    
    for maxlen in [maxlen_list[0]]:
        
        for dropout in [dropout_list[0]]:
            
            print('padding maxlen=',maxlen)
            x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
            x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
            
            print('building model')
            model=get_model(maxlen=maxlen,max_features=max_feature,dropout=dropout,embed_size=128)
            
            batch_size = 32


            weight_file_path="weights_base_{}_{}_{}.hdf5".format(max_feature,maxlen,dropout)
            model_file_path='bidirectional_lstm_globMP_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}'.format(max_feature,maxlen,dropout)

            checkpoint = ModelCheckpoint(weight_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

            early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

            callbacks_list = [checkpoint, early] #early
            
            #fit model
            print('fitting model with max_features={}, maxlen = {}, and dropout = {}'.format(max_feature,maxlen,dropout))
            model.fit(x_train_pad, y_train, batch_size=batch_size, epochs=EPOCHS, validation_split=0.1, callbacks=callbacks_list)
            
            #save model
            model.save(model_file_path)


Importing data...
Splitting data...
127656 train sequences
31915 test sequences
padding maxlen= 150
building model
fitting model with max_features=10000, maxlen = 150, and dropout = 0.1
Train on 114890 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.04875, saving model to weights_base_10000_150_0.1.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.04875 to 0.04698, saving model to weights_base_10000_150_0.1.hdf5
Epoch 3/4

Epoch 00003: val_loss improved from 0.04698 to 0.04615, saving model to weights_base_10000_150_0.1.hdf5
Epoch 4/4

Epoch 00004: val_loss did not improve


## Evaluate model and predictions

In [54]:
loaded_model=model

x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=150)

print('compiling model')
loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[categorical_accuracy])
print('evaluating model')
score = loaded_model.evaluate(x_test_pad, y_test, verbose=1)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
print('-'*50,'\n')

compiling model
evaluating model
categorical_accuracy: 99.32%
-------------------------------------------------- 



In [97]:
y_predict=model.predict(x_test_pad)

In [98]:
y_predict=pd.DataFrame(y_predict)
y_test=pd.DataFrame(y_test)

y_predict.columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y_test.columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate']


# get predictions with decision boundary = 0.5
for c in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]:

    y_predict[c]=y_predict[c].map(lambda x: 1 if x >=0.5 else 0)
    
# subract predictions from true labels to get type I (= -1) and type II (=1)
y_error=y_test-y_predict



In [99]:
y_error.head(20)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [100]:
test_20['comment_text'].iloc[18]

'"\n\nMay Allah (swt) either give you punishment or hidiyaat for spreading falsehood and keeping other Muslim editors from making the article neutral - Insh\'Allah.  —Preceding unsigned comment added by 89.108.24.87     —Preceding unsigned comment added by 213.146.172.146   \n\nYour comment on your edit proves you are ignorant of Islam.  —Preceding unsigned comment added by 213.146.172.146   "'

In [130]:
tot=y_error.shape[0]
for c in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]:
    print(c)
    print(y_error[c].value_counts())
    type1=0
    type2=0
    
    try:
        type1=y_error[c].value_counts()[-1]
    except:
        pass
    
    try:
        type2=y_error[c].value_counts()[1]
    except:
        pass
    print('\n Type I (flagged when no flag exists):','{:02.2f}%'.format(type1*100/tot))
    print('Type II (not flagged when flag should exist):',"{:02.2f}%".format(type2*100/tot))
    print('\n \n')

toxic
 0    30690
 1      992
-1      233
Name: toxic, dtype: int64

 Type I (flagged when no flag exists): 0.73%
Type II (not flagged when flag should exist): 3.11%

 

severe_toxic
 0    31611
 1      187
-1      117
Name: severe_toxic, dtype: int64

 Type I (flagged when no flag exists): 0.37%
Type II (not flagged when flag should exist): 0.59%

 

obscene
 0    31292
 1      384
-1      239
Name: obscene, dtype: int64

 Type I (flagged when no flag exists): 0.75%
Type II (not flagged when flag should exist): 1.20%

 

threat
0    31829
1       86
Name: threat, dtype: int64

 Type I (flagged when no flag exists): 0.00%
Type II (not flagged when flag should exist): 0.27%

 

insult
 0    31060
 1      532
-1      323
Name: insult, dtype: int64

 Type I (flagged when no flag exists): 1.01%
Type II (not flagged when flag should exist): 1.67%

 

identity_hate
 0    31675
 1      161
-1       79
Name: identity_hate, dtype: int64

 Type I (flagged when no flag exists): 0.25%
Type II (not

In [138]:
test_20.shape

(31915, 8)

In [145]:
model.weights[7]

<tf.Variable 'dense_11/kernel:0' shape=(100, 50) dtype=float32_ref>


In [152]:

wrong=0
for index, row in y_error.iterrows():
    if y_error.iloc[index].value_counts()[0]!=6:
        wrong+=1
        
print(wrong/y_error.shape[0])

0.07773774087419709


In [151]:
y_error.iloc[1].value_counts()[0]

6

In [153]:
sum([992,233,187,117,384,239,86,532,323,161,79])

3333

In [155]:
0.07773774087419709/6

0.012956290145699515

## NOW LETS AUTOMATE MAX FEATURES 10000

In [158]:
max_feature_list=[20000,10000]
EPOCHS=4
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]

In [166]:
# tokenize

tokenizer = text.Tokenizer(num_words=max_feature_list[1])

tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text

#create tokenized comments
list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

In [167]:
max_feature_list=[20000,10000]
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]
EPOCHS=4

for max_feature in [max_feature_list[1]]:
    
    for maxlen in maxlen_list:
        
        for dropout in dropout_list:
            
            if (maxlen,dropout) !=(150,0.1):  # already did 150,0.1
                print('padding maxlen=',maxlen)
                x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
                x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

                print('building model')
                model=get_model(maxlen=maxlen,max_features=max_feature,dropout=dropout,embed_size=128)

                batch_size = 32


                weight_file_path="weights_base_{}_{}_{}.hdf5".format(max_feature,maxlen,dropout)
                model_file_path='bidirectional_lstm_globMP_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}'.format(max_feature,maxlen,dropout)

                checkpoint = ModelCheckpoint(weight_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

                early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

                callbacks_list = [checkpoint, early] #early

                #fit model
                print('fitting model with max_features={}, maxlen = {}, and dropout = {}'.format(max_feature,maxlen,dropout))
                model.fit(x_train_pad, y_train, batch_size=batch_size, epochs=EPOCHS, validation_split=0.1, callbacks=callbacks_list)

                #save model
                model.save(model_file_path)

padding maxlen= 150
building model
fitting model with max_features=10000, maxlen = 150, and dropout = 0.2
Train on 114890 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.04887, saving model to weights_base_10000_150_0.2.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.04887 to 0.04755, saving model to weights_base_10000_150_0.2.hdf5
Epoch 3/4

Epoch 00003: val_loss improved from 0.04755 to 0.04693, saving model to weights_base_10000_150_0.2.hdf5
Epoch 4/4

Epoch 00004: val_loss did not improve
padding maxlen= 250
building model
fitting model with max_features=10000, maxlen = 250, and dropout = 0.1
Train on 114890 samples, validate on 12766 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.04929, saving model to weights_base_10000_250_0.1.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.04929 to 0.04713, saving model to weights_base_10000_250_0.1.hdf5
Epoch 3/4

Epoch 00003: val_loss did not improve
Epoch 4/4

Epoch 0000

In [161]:
max_feature_list=[20000,10000]
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]
EPOCHS=4

for max_feature in [max_feature_list[1]]:
    
    for maxlen in maxlen_list:
        
        for dropout in dropout_list:
            
            if (maxlen,dropout) !=(150,0.1):
                
                print(max_feature,maxlen,dropout)

10000 150 0.2
10000 250 0.1
10000 250 0.2


## Evaluate on test data

In [168]:
model_dict={'bidirectional_lstm_globMP_relu_sigmoid_maxfeat10000_maxlen150_dropout0.1':[150,0.1],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat10000_maxlen150_dropout0.2':[150,0.2],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat10000_maxlen250_dropout0.1':[250,0.1],\
'bidirectional_lstm_globMP_relu_sigmoid_maxfeat10000_maxlen250_dropout0.2':[250,0.2]}

In [169]:
from keras.models import load_model 

# Returns a compiled model identical to the previous one
#model = load_model("model_name.hdf5")

tokenizer = text.Tokenizer(num_words=10000)
#tokenize based on comment database
tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text
#create tokenized comments
#list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

for file in model_dict.keys():
    
    maxlen=model_dict[file][0]
    dropout=model_dict[file][1]
    
    print('-'*50,'\nModel with max_features 10000, maxlen of {}, and dropout of {}'.format(maxlen,dropout))
    
    print('padding maxlen=',maxlen)
    #x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=model_dict[file])
    x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    print('loading model')
    loaded_model=load_model(file)
    
    print('compiling model')
    loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[categorical_accuracy])
    print('evaluating model')
    score = loaded_model.evaluate(x_test_pad, y_test, verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
    print('-'*50,'\n')

-------------------------------------------------- 
Model with max_features 10000, maxlen of 150, and dropout of 0.1
padding maxlen= 150
loading model
compiling model
evaluating model
categorical_accuracy: 99.27%
-------------------------------------------------- 

-------------------------------------------------- 
Model with max_features 10000, maxlen of 150, and dropout of 0.2
padding maxlen= 150
loading model
compiling model
evaluating model
categorical_accuracy: 99.35%
-------------------------------------------------- 

-------------------------------------------------- 
Model with max_features 10000, maxlen of 250, and dropout of 0.1
padding maxlen= 250
loading model
compiling model
evaluating model
categorical_accuracy: 99.35%
-------------------------------------------------- 

-------------------------------------------------- 
Model with max_features 10000, maxlen of 250, and dropout of 0.2
padding maxlen= 250
loading model
compiling model
evaluating model
categorical_accura

In [170]:
model.layers

[<keras.engine.topology.InputLayer at 0x16d47ebe0>,
 <keras.layers.embeddings.Embedding at 0x168198b38>,
 <keras.layers.wrappers.Bidirectional at 0x174c4cbe0>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0x174c2bfd0>,
 <keras.layers.core.Dropout at 0x174c2bf98>,
 <keras.layers.core.Dense at 0x174c2bf60>,
 <keras.layers.core.Dropout at 0x16d08dd68>,
 <keras.layers.core.Dense at 0x174cc8da0>]

In [173]:
print('loading model')
loaded_model=load_model('bidirectional_lstm_globMP_relu_sigmoid_maxfeat10000_maxlen250_dropout0.2')

print('compiling model')
loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['binary_accuracy','categorical_accuracy'])
print('evaluating model')
score = loaded_model.evaluate(x_test_pad, y_test, verbose=1)
print("%s: %.2f%%" % (loaded_model.metrics_names[:], score[1]*100))
print('-'*50,'\n')

loading model
compiling model
evaluating model
binary_accuracy: 98.31%
-------------------------------------------------- 



In [181]:
print("%s: %.2f%%" % (loaded_model.metrics_names[0], score[0]*100))
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
print("%s: %.2f%%" % (loaded_model.metrics_names[2], score[2]*100))
print("%s: %.2f%%" % (loaded_model.metrics_names[3], score[3]*100))

loss: 4.92%
binary_accuracy: 98.31%
categorical_accuracy: 99.15%
sparse_categorical_accuracy: 89.87%
