In [21]:
import numpy as np 
import pandas as pd 

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 20000
maxlen = 100


train = pd.read_csv("train.csv")
#test = pd.read_csv("test.csv")
#train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("nada").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
#list_sentences_test = test["comment_text"].fillna("nada").values


tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
#list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
#X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model


model = get_model()
batch_size = 32
epochs = 2


file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)


callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

#model.load_weights(file_path)

#y_predict = model.predict(X_te)



#sample_submission = pd.read_csv("sample_submission.csv")

#sample_submission[list_classes] = y_predict



#sample_submission.to_csv("baseline.csv", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.27913, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.27913 to 0.27637, saving model to weights_base.best.hdf5


<keras.callbacks.History at 0x128f59c50>

In [None]:


#maxlens=[100,200,300]
#batch_sizes=[32]

In [7]:
train = pd.read_csv("train.csv")

In [16]:
comment_len_bin=dict()

comment_len_bin['total']=train.shape[0]
comment_len_bin['100']=0
comment_len_bin['150']=0
comment_len_bin['200']=0
comment_len_bin['250']=0
comment_len_bin['300']=0

for index,row in train.iterrows():
    
    if len(row['comment_text'].split(' '))<=100:
        comment_len_bin['100']+=(1)
        
    if len(row['comment_text'].split(' '))<=150:
        comment_len_bin['150']+=(1)

    if len(row['comment_text'].split(' '))<=200:
        comment_len_bin['200']+=(1)

    if len(row['comment_text'].split(' '))<=250:
        comment_len_bin['250']+=(1)

    if len(row['comment_text'].split(' '))<=300:
        comment_len_bin['300']+=(1)
        
    

In [14]:
comment_len_bin

{'100': 130994, '150': 0, '200': 0, '250': 0, '300': 0, 'total': 159571}

In [19]:
for i in ['100','150','200','250','300']:
    print('{:6.2f}'.format(100*comment_len_bin[i]/comment_len_bin['total']),'% of comments have length equal or less than', i, 'words')

 82.09 % of comments have length equal or less than 100 words
 89.78 % of comments have length equal or less than 150 words
 93.61 % of comments have length equal or less than 200 words
 95.75 % of comments have length equal or less than 250 words
 96.95 % of comments have length equal or less than 300 words


# Lets Automate

In [None]:
!wget('https://s3.amazonaws.com/danicic-w266-final/train.csv')

In [1]:
import pandas as pd
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn import model_selection
import numpy as np
import tensorflow  as tf
import keras
import h5py
from keras.models import load_model


from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.metrics import categorical_accuracy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# import data
print('Importing data...')
train_all=pd.read_csv('train.csv')

# 80/20 split
train_80, test_20 = model_selection.train_test_split(train_all,test_size=0.2)

#preprocessing

print('Splitting data...')

#training data
x_train = train_80['comment_text'].fillna("nada").values
y_train = train_80[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

#testing validation data (not for training model, just validation for )

x_test = test_20['comment_text'].fillna("nada").values
y_test = test_20[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

# check lengths

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Importing data...
Splitting data...
127656 train sequences
31915 test sequences


## Model Parameters:

In [3]:
max_feature_list=[20000,10000]
EPOCHS=4
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]

## tokenize

In [4]:
tokenizer = text.Tokenizer(num_words=max_feature_list[0])

tokenizer.fit_on_texts(list(train_all["comment_text"].fillna("nada").values)) #fit on all comment_text

#create tokenized comments
list_tokenized_train = tokenizer.texts_to_sequences(x_train)  #training (80% of train.csv)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)  #testing (20% of train.csv)

## create LSTM model maker function

In [5]:
def get_model(maxlen,max_features,dropout,embed_size=128):
    
    embed_size=embed_size #default to 128
    maxlen=maxlen # max length of sequence input
    max_features=max_features  # max vocab
    dropout=dropout
    
    inp = Input(shape=(maxlen, ))
    
    x = Embedding(max_features, embed_size)(inp)
    
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    
    x = GlobalMaxPool1D()(x)
    
    x = Dropout(0.1)(x)
    
    x = Dense(50, activation="relu")(x)
    
    x = Dropout(0.1)(x)
    
    x = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=[categorical_accuracy])

    return model


## Tokenize and fit model

In [6]:
max_feature_list=[20000,10000]
maxlen_list=[150,250] 
dropout_list=[0.1,0.2]
EPOCHS=4

for max_feature in max_feature_list:
    
    for maxlen in maxlen_list:
        
        for dropout in dropout_list:
            
            print('padding maxlen=',maxlen)
            x_train_pad = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
            x_test_pad = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
            
            print('building model')
            model=get_model(maxlen=maxlen,max_features=max_feature,dropout=dropout,embed_size=128)
            
            batch_size = 32


            weight_file_path="weights_base_{}_{}_{}.hdf5".format(max_feature,maxlen,dropout)
            model_file_path='bidirectional_lstm_globMP_relu_sigmoid_maxfeat{}_maxlen{}_dropout{}'.format(max_feature,maxlen,dropout)

            checkpoint = ModelCheckpoint(weight_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

            early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

            callbacks_list = [checkpoint, early] #early
            
            #fit model
            print('fitting model with max_features={}, maxlen = {}, and dropout = {}'.format(max_feature,maxlen,dropout))
            model.fit(x_train_pad, y_train, batch_size=batch_size, epochs=EPOCHS, validation_split=0.1, callbacks=callbacks_list)
            
            #save model
            model.save(model_file_path)


padding maxlen= 150
building model
fitting model with max_features=20000, maxlen = 150, and dropout = 0.1
Train on 114890 samples, validate on 12766 samples
Epoch 1/4
  3648/114890 [..............................] - ETA: 20:07 - loss: 0.3210 - categorical_accuracy: 0.9498

KeyboardInterrupt: 

In [10]:
model=get_model(maxlen=100,max_features=20000,dropout=0.1,embed_size=128)

In [14]:
from keras.models import load_model 

# Returns a compiled model identical to the previous one
model = load_model("weights_base.best.hdf5")


In [15]:
y_predict = model.predict(x_test_pad)

In [20]:
y_predict

array([[9.9992514e-01, 5.7464128e-04, 1.4827535e-01, 3.5196834e-04,
        7.3631547e-02, 3.4083657e-03],
       [9.9791819e-01, 3.6885649e-01, 9.9339539e-01, 7.6247734e-04,
        9.1969764e-01, 3.7648093e-02],
       [9.9992228e-01, 6.7343569e-04, 9.3939133e-02, 3.6026756e-04,
        5.0159719e-02, 6.4799786e-03],
       ...,
       [9.9997091e-01, 3.7670071e-04, 1.2260568e-01, 2.7406376e-04,
        4.6401832e-02, 1.3076887e-03],
       [9.9977499e-01, 1.2707798e-03, 1.5876716e-01, 8.3691807e-04,
        7.5429894e-02, 7.6980758e-03],
       [9.9976450e-01, 1.8819682e-03, 2.3662409e-01, 5.6276756e-04,
        1.0626555e-01, 7.2038313e-03]], dtype=float32)

In [18]:
type(list_tokenized_train)

list