In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the training and test CSVs from the working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# For each frame print its shape and then its first rows; a blank gap separates the two previews.
print(df_train.shape, df_train.head(), sep='\n')
print('\n')
print(df_test.shape, df_test.head(), sep='\n')

(159571, 8)
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


(153164, 2)
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll.

In [3]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [3]:
# Raw comment strings of the training set, as a plain Python list.
training_comments = list(df_train["comment_text"])

# Fit the word -> integer-index vocabulary on the training comments.
tk = Tokenizer()
tk.fit_on_texts(training_comments)

# Encode every comment as a list of word indices, then preview the first seven.
text2idx = tk.texts_to_sequences(training_comments)
text2idx[:7]

[[688,
  75,
  1,
  126,
  130,
  177,
  29,
  672,
  4511,
  12052,
  1116,
  86,
  331,
  51,
  2278,
  11448,
  50,
  6864,
  15,
  60,
  2756,
  148,
  7,
  2937,
  34,
  117,
  1221,
  15190,
  2825,
  4,
  45,
  59,
  244,
  1,
  365,
  31,
  1,
  38,
  27,
  143,
  73,
  3462,
  89,
  3085,
  4583,
  2273,
  985],
 [96145,
  52,
  2635,
  13,
  555,
  3809,
  73,
  4556,
  2706,
  21,
  94,
  38,
  803,
  2679,
  992,
  589,
  8377,
  182],
 [412,
  437,
  73,
  134,
  14,
  249,
  2,
  71,
  314,
  78,
  50,
  9,
  13,
  626,
  8,
  2284,
  492,
  502,
  102,
  4,
  611,
  2,
  35,
  325,
  126,
  363,
  3,
  29,
  38,
  27,
  52,
  208,
  2,
  434,
  57,
  36,
  1,
  2394,
  93,
  1,
  737,
  468],
 [57,
  7,
  228,
  97,
  54,
  328,
  1436,
  15,
  2133,
  7,
  6024,
  22,
  1,
  123,
  2502,
  56,
  16,
  513,
  15,
  25,
  5,
  4236,
  3,
  1327,
  3,
  9762,
  7,
  67,
  1,
  277,
  85,
  122,
  13503,
  37,
  9,
  51,
  19,
  42,
  10,
  1,
  1460,
  138,
  1257,
  2153,

In [4]:
# Number of rows the Embedding layer needs: one per word known to the tokenizer, plus one
# extra because index 0 is used for padding by pad_sequences and is not assigned to any word.
vocab_size = len(tk.word_index) + 1 
print("Vocab size: ", vocab_size)

Vocab size:  210338


In [5]:
# Fixed sequence length fed to the network; also used as the Embedding input_length.
seq_len = 100
# Bring every encoded comment to exactly seq_len indices. Short comments are filled with
# zeros at the front, as the displayed rows show.
features = pad_sequences(text2idx, maxlen=seq_len, dtype='int32')
# Preview the first ten padded rows.
features[:10, :100]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,   688,
           75,     1,   126,   130,   177,    29,   672,  4511, 12052,
         1116,    86,   331,    51,  2278, 11448,    50,  6864,    15,
           60,  2756,   148,     7,  2937,    34,   117,  1221, 15190,
         2825,     4,    45,    59,   244,     1,   365,    31,     1,
           38,    27,   143,    73,  3462,    89,  3085,  4583,  2273,
          985],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

In [6]:
# Label matrix: the last six columns of the training frame (toxic ... identity_hate).
# `.values` is used instead of `DataFrame.as_matrix()`, which is deprecated and was removed
# in pandas 1.0; it returns the same NumPy array.
labels = df_train.iloc[:, -6:].values
# Sanity check: features and labels must have the same number of rows.
print(features.shape)
print(labels.shape)

(159571, 100)
(159571, 6)


#### Model Formation

In [7]:
# Hold out 25% of the labelled comments for evaluation; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [9]:
def model_creation(x_train, y_train, x_test, y_test, embedding_size = 150, learning_rate=0.05, batch_size = 64,
                   third_layer=False):
    """Build, train and evaluate a stacked-LSTM multi-label classifier.

    The network is Embedding -> Dropout -> LSTM(512) -> Dropout -> LSTM(512) -> Dropout
    [-> LSTM(1024) -> Dropout] -> Dense(6, sigmoid). It relies on the notebook-level
    names ``vocab_size`` and ``seq_len`` for the embedding input dimensions.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``model.fit``.
    x_test, y_test : held-out features and labels passed to ``model.evaluate``.
    embedding_size : output dimension of the Embedding layer.
    learning_rate : accepted for backward compatibility but NOT applied; the model is
        compiled with ``optimizer="adam"``, i.e. Adam with its default settings.
    batch_size : mini-batch size used during training.
    third_layer : when True, append a third LSTM(1024) layer (plus dropout).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, embeddings_initializer='uniform',
                        input_length=seq_len))
    model.add(Dropout(0.44))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.04))
    # The second LSTM only needs to emit the full sequence when another LSTM follows it.
    # Without a third layer it must return just its final state, otherwise Dense(6) would
    # produce a (batch, seq_len, 6) tensor that cannot be fitted against (batch, 6) labels.
    model.add(LSTM(512, return_sequences=third_layer))
    model.add(Dropout(0.11))

    if third_layer:
        model.add(LSTM(1024, return_sequences=False))
        model.add(Dropout(0.32))

    # One independent sigmoid per label, paired with binary cross-entropy.
    model.add(Dense(6, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer="adam")
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    model.fit(x_train, y_train, epochs=30, batch_size=batch_size)
    score, acc = model.evaluate(x_test, y_test, verbose=0)
    print('Test accuracy: ', acc)
    return model

In [None]:
# Train the variant with the extra third LSTM layer; all other hyper-parameters keep their defaults.
seq_model = model_creation(x_train, y_train, x_test, y_test, third_layer=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 150)          31550700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 150)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 512)          1357824   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 512)          2099200   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 1024)              6295552   
__________

In [8]:
from keras.models import load_model
# Replace seq_model with a model restored from disk.
# NOTE(review): no cell visible in this notebook writes "lstm_model_02.hd5" - presumably it was
# saved by an earlier training run; confirm it matches the preprocessing used here.
seq_model = load_model("lstm_model_02.hd5")

In [None]:
# Loss ("score") and accuracy of seq_model on the x_test / y_test split.
score, acc = seq_model.evaluate(x_test, y_test, verbose=0)
# Fixed: the original used "." instead of ",", which raised AttributeError on the str.
print("score: ", score)
print('Test accuracy: ', acc)

#### Generating predictions for the (unlabelled) test set

In [29]:
# Raw comment strings of the test set, as a plain Python list.
test_comments = df_test["comment_text"].tolist()

In [41]:
# Turn each test comment into a list of tokens (the displayed output shows lower-cased words).
# NOTE(review): text_processing is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel - restore its definition or import it before running.
test_tokens = text_processing(test_comments)
# Preview the tokens of the first seven comments.
test_tokens[:7]

[['yo',
  'bitch',
  'ja',
  'rule',
  'is',
  'more',
  'succesful',
  'then',
  'you',
  'll',
  'ever',
  'be',
  'whats',
  'up',
  'with',
  'you',
  'and',
  'hating',
  'you',
  'sad',
  'mofuckas',
  'i',
  'should',
  'bitch',
  'slap',
  'ur',
  'pethedic',
  'white',
  'faces',
  'and',
  'get',
  'you',
  'to',
  'kiss',
  'my',
  'ass',
  'you',
  'guys',
  'sicken',
  'me',
  'ja',
  'rule',
  'is',
  'about',
  'pride',
  'in',
  'da',
  'music',
  'man',
  'dont',
  'diss',
  'that',
  'shit',
  'on',
  'him',
  'and',
  'nothin',
  'is',
  'wrong',
  'bein',
  'like',
  'tupac',
  'he',
  'was',
  'a',
  'brother',
  'too',
  'fuckin',
  'white',
  'boys',
  'get',
  'things',
  'right',
  'next',
  'time'],
 ['from', 'rfc', 'the', 'title', 'is', 'fine', 'as', 'it', 'is', 'imo'],
 ['sources', 'zawe', 'ashton', 'on', 'lapland'],
 ['if',
  'you',
  'have',
  'a',
  'look',
  'back',
  'at',
  'the',
  'source',
  'the',
  'information',
  'i',
  'updated',
  'was',
  'th

In [45]:
from keras.preprocessing.sequence import pad_sequences

# Look up every token's integer index; tokens absent from vocab2idx are encoded as 0.
# NOTE(review): vocab2idx is not defined in any cell visible in this notebook - confirm it is
# the same word -> index mapping the loaded model was trained with.
test_text_int = [[vocab2idx.get(word, 0) for word in tokens] for tokens in test_tokens]

# Pad / cut each encoded comment to seq_len so it matches the model's input length.
test_features = pad_sequences(test_text_int, maxlen=seq_len, dtype='int32')
test_features[:10,:100]

array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0, 162494,  17016, 114303, 140792,  66990, 120373, 185523,
         97087,  93796, 106081, 171838, 185474,  79966, 184432,  52097,
         93796, 174113,  26824,  93796,  51626,      0, 140155,  46705,
         17016, 149598, 123091,      0,  13991,  89789, 174113, 164852,
         93796, 161337, 158700, 124101, 125466,  93796,  56036,  28082,
        155892, 114303, 140792,  66990, 120623, 141908,  22087, 138332,
         32040,  17934, 112956, 129066,  37291,  52246, 105800, 156603,
        174113,  50599,  66990,  82219,  77413, 136263, 129566, 162054,
        166902,  80433, 132407, 103194,  64773,  13991, 108441, 164852,
         43630,  71938,  92401,  21584],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,

In [52]:
# Per-label probabilities for every test comment (one sigmoid output per label).
# `predict` is used instead of `Sequential.predict_proba`, which has been removed from newer
# Keras / TensorFlow releases; for this sigmoid-output model both return the same array.
test_probs = seq_model.predict(test_features, batch_size=64)

In [53]:
# Sanity check: expect one row per test comment and one column per label.
print(test_probs.shape)

(153164, 6)


In [63]:
import pandas as pd

# Wrap the probability matrix in a frame whose columns are named after the training
# frame's columns from the third one onwards (i.e. skipping "id" and "comment_text").
df_output = pd.DataFrame(test_probs, columns=list(df_train.columns[2:]))

In [64]:
# Put the test ids in front of the probability columns.
# * axis / join are passed by keyword: pandas >= 2.0 no longer accepts them positionally.
# * any existing "id" column is dropped first, so re-running this cell does not duplicate it.
df_output = pd.concat(
    [df_test["id"], df_output.drop("id", axis=1, errors="ignore")],
    axis=1,
    join='inner',
)
print(df_output.shape)
print(df_output.head())

(153164, 7)
                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.998251  2.150678e-01  0.984470  0.064757  0.924891   
1  0000247867823ef7  0.000851  2.402557e-07  0.000057  0.000024  0.000047   
2  00013b17ad220c46  0.002242  5.820791e-07  0.000145  0.000065  0.000106   
3  00017563c3f7919a  0.001023  2.817372e-07  0.000069  0.000029  0.000053   
4  00017695ad8997eb  0.000999  2.751268e-07  0.000067  0.000029  0.000052   

   identity_hate  
0       0.188065  
1       0.000019  
2       0.000057  
3       0.000024  
4       0.000023  


In [66]:
# Write the id + per-label probabilities to the submission file, without the row index.
df_output.to_csv("submission1.csv", index=False)