In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read both CSV splits, then print the shape and the first rows of each one.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
print(df_train.shape, df_train.head(), sep='\n')
print('\n')
print(df_test.shape, df_test.head(), sep='\n')

(159571, 8)
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


(153164, 2)
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll.

In [3]:
# Print per-column dtypes and non-null counts to check the training frame for missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [3]:
# Fit a tokenizer on the training comments, then encode each comment as a list of integer word ids.
tk = Tokenizer()
training_comments = df_train.loc[:, "comment_text"].tolist()
tk.fit_on_texts(training_comments)
text2idx = tk.texts_to_sequences(training_comments)
# Preview the first few encoded comments.
text2idx[:7]

[[688,
  75,
  1,
  126,
  130,
  177,
  29,
  672,
  4511,
  12052,
  1116,
  86,
  331,
  51,
  2278,
  11448,
  50,
  6864,
  15,
  60,
  2756,
  148,
  7,
  2937,
  34,
  117,
  1221,
  15190,
  2825,
  4,
  45,
  59,
  244,
  1,
  365,
  31,
  1,
  38,
  27,
  143,
  73,
  3462,
  89,
  3085,
  4583,
  2273,
  985],
 [96145,
  52,
  2635,
  13,
  555,
  3809,
  73,
  4556,
  2706,
  21,
  94,
  38,
  803,
  2679,
  992,
  589,
  8377,
  182],
 [412,
  437,
  73,
  134,
  14,
  249,
  2,
  71,
  314,
  78,
  50,
  9,
  13,
  626,
  8,
  2284,
  492,
  502,
  102,
  4,
  611,
  2,
  35,
  325,
  126,
  363,
  3,
  29,
  38,
  27,
  52,
  208,
  2,
  434,
  57,
  36,
  1,
  2394,
  93,
  1,
  737,
  468],
 [57,
  7,
  228,
  97,
  54,
  328,
  1436,
  15,
  2133,
  7,
  6024,
  22,
  1,
  123,
  2502,
  56,
  16,
  513,
  15,
  25,
  5,
  4236,
  3,
  1327,
  3,
  9762,
  7,
  67,
  1,
  277,
  85,
  122,
  13503,
  37,
  9,
  51,
  19,
  42,
  10,
  1,
  1460,
  138,
  1257,
  2153,

In [5]:
# One slot per word known to the tokenizer plus one extra
# (presumably because id 0 is reserved for padding — confirm against the Tokenizer docs).
vocab_size = 1 + len(tk.word_index)
print("Vocab size: ", vocab_size)

Vocab size:  210338


In [6]:
# Bring every encoded comment to a fixed length of `seq_len` ids (shorter ones are zero-padded on the left,
# as the preview below shows).
seq_len = 100
features = pad_sequences(sequences=text2idx, maxlen=seq_len, dtype='int32')
# Preview the first ten padded rows.
features[:10, :seq_len]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,   688,
           75,     1,   126,   130,   177,    29,   672,  4511, 12052,
         1116,    86,   331,    51,  2278, 11448,    50,  6864,    15,
           60,  2756,   148,     7,  2937,    34,   117,  1221, 15190,
         2825,     4,    45,    59,   244,     1,   365,    31,     1,
           38,    27,   143,    73,  3462,    89,  3085,  4583,  2273,
          985],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

In [7]:
# The last six columns of the training frame are the binary label columns
# (toxic, severe_toxic, obscene, threat, insult, identity_hate).
# `.values` replaces the deprecated `DataFrame.as_matrix()`, which no longer exists in pandas >= 1.0.
labels = df_train.iloc[:, -6:].values
print(features.shape)
print(labels.shape)

(159571, 100)
(159571, 6)


In [9]:
# Cache the padded features and the label matrix on disk.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open("features.pkl", "wb") as features_file:
    pickle.dump(features, features_file)
with open("labels.pkl", "wb") as labels_file:
    pickle.dump(labels, labels_file)

#### Model Definition and Training

In [7]:
# Hold out a quarter of the labelled data for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [9]:
def model_creation(x_train, y_train, x_test, y_test, embedding_size = 150, learning_rate=0.05, batch_size = 64,
                   third_layer=False, epochs=30):
    """Build, train and evaluate a stacked-LSTM classifier with six sigmoid outputs.

    The network is Embedding -> LSTM(512) -> LSTM(512) [-> LSTM(1024)] -> Dense(6, sigmoid),
    with a Dropout layer after the embedding and after every LSTM. It is compiled with
    binary cross-entropy loss and the "adam" optimizer, fitted on the training split and
    then evaluated on the test split.

    Relies on the notebook-level globals `vocab_size` (embedding input dimension) and
    `seq_len` (length of each padded input sequence).

    Parameters
    ----------
    x_train, y_train : training inputs and their label rows, passed to `model.fit`.
    x_test, y_test : held-out inputs and labels, passed to `model.evaluate`.
    embedding_size : output dimension of the embedding layer.
    learning_rate : accepted for backward compatibility but NOT used; the model is compiled
        with the string "adam", so the optimizer runs with its own default settings.
        NOTE(review): wiring this in with the current default of 0.05 would change training
        behaviour, so it is deliberately left untouched here.
    batch_size : mini-batch size used for fitting.
    third_layer : when True, append a third LSTM(1024) plus Dropout before the output layer.
    epochs : number of training epochs (default 30, the previously hard-coded value).

    Returns
    -------
    The fitted Keras `Sequential` model. The test accuracy is printed, not returned.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size,
                        embeddings_initializer='uniform', input_length=seq_len))
    model.add(Dropout(0.44))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.04))
    # Only the LAST recurrent layer may collapse the time axis. When no third layer follows,
    # this LSTM must return just its final output; otherwise Dense would emit one prediction
    # per time step, (batch, seq_len, 6), which cannot be matched against (batch, 6) labels.
    model.add(LSTM(512, return_sequences=third_layer))
    model.add(Dropout(0.11))

    if third_layer:
        model.add(LSTM(1024, return_sequences=False))
        model.add(Dropout(0.32))

    # One independent sigmoid per label column (multi-label setup, hence binary cross-entropy).
    model.add(Dense(6, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer="adam")
    # summary() prints the table itself and returns None, so it must not be wrapped in print().
    model.summary()

    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
    # evaluate() returns [loss, accuracy] because 'accuracy' is the only compiled metric.
    _, acc = model.evaluate(x_test, y_test, verbose=0)
    print('Test accuracy: ', acc)
    return model

In [None]:
# Train the deeper variant of the network (with the extra third LSTM layer).
seq_model = model_creation(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    third_layer=True,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 150)          31550700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 150)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 512)          1357824   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 512)          2099200   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 1024)              6295552   
__________

In [10]:
# `score` and `acc` computed inside model_creation are local to that function, so they are not
# available at notebook scope; evaluate here so this cell also works on a fresh kernel.
score, acc = seq_model.evaluate(x_test, y_test, verbose=0)
print("score: ", score)
print('Test accuracy: ', acc)

score:  0.11397717887838202
Test accuracy:  0.9651943787043772


#### Testing the Model on the test set 

In [4]:
# Load a trained model from disk instead of retraining.
# NOTE(review): no visible cell writes "lstm_model_03.hd5" — presumably it was saved in a
# separate run; confirm where this file comes from before relying on Restart & Run All.
from keras.models import load_model
seq_model = load_model("lstm_model_03.hd5")

In [5]:
# Pull the raw comment strings of the test set into a plain Python list for the tokenizer.
test_comments = df_test["comment_text"].tolist()

In [6]:
# Encode the test comments with `tk`, the tokenizer fitted on the training comments, so word ids
# match the ones the model was trained on.
# NOTE(review): words not seen during fitting are presumably dropped by the tokenizer — confirm.
test_tokens = tk.texts_to_sequences(test_comments)
# Preview the first few encoded test comments.
test_tokens[:7]

[[2665,
  655,
  8849,
  656,
  8,
  57,
  16388,
  83,
  884,
  356,
  16,
  3222,
  76,
  21,
  6,
  4,
  6865,
  6,
  1521,
  7,
  56,
  655,
  4942,
  1898,
  682,
  6908,
  4,
  96,
  6,
  2,
  5104,
  29,
  417,
  6,
  726,
  39769,
  35,
  8849,
  656,
  8,
  36,
  4122,
  10,
  2818,
  660,
  437,
  454,
  19612,
  9,
  333,
  15,
  153,
  4,
  28513,
  8,
  240,
  30685,
  49,
  22636,
  52,
  24,
  5,
  2045,
  162,
  3132,
  682,
  2880,
  96,
  219,
  145,
  493,
  84],
 [31, 1185, 1, 348, 8, 676, 17, 11, 8, 2826],
 [109, 34550, 15, 62877, 355],
 [22,
  6,
  18,
  5,
  151,
  157,
  34,
  1,
  119,
  1,
  102,
  7,
  1501,
  24,
  1,
  364,
  640,
  7,
  40,
  77,
  645,
  1,
  119,
  3098,
  1501,
  7,
  1002,
  1400,
  1,
  102,
  396,
  125,
  26,
  127,
  6,
  12,
  20,
  349],
 [7, 59, 7516, 71, 80, 34, 42],
 [127, 6, 12, 1085, 7, 67, 101, 1092, 3, 6, 4, 47, 14, 360, 175, 137],
 [45,
  33,
  14,
  149,
  777,
  2,
  28,
  108,
  126,
  19,
  408,
  199,
  4,
  1688,
  

In [7]:
# Pad the encoded test comments to the same fixed length that was used for the training features.
seq_len = 100
test_features = pad_sequences(sequences=test_tokens, maxlen=seq_len, dtype='int32')
# Preview the first ten padded rows.
test_features[:10, :seq_len]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,  2665,   655,  8849,   656,     8,    57, 16388,    83,
          884,   356,    16,  3222,    76,    21,     6,     4,  6865,
            6,  1521,     7,    56,   655,  4942,  1898,   682,  6908,
            4,    96,     6,     2,  5104,    29,   417,     6,   726,
        39769,    35,  8849,   656,     8,    36,  4122,    10,  2818,
          660,   437,   454, 19612,     9,   333,    15,   153,     4,
        28513,     8,   240, 30685,    49, 22636,    52,    24,     5,
         2045,   162,  3132,   682,  2880,    96,   219,   145,   493,
           84],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

In [8]:
# The output layer is sigmoid, so predict() already yields per-label probabilities.
# `Sequential.predict_proba` was only a thin wrapper around predict() and has been removed
# from recent Keras/TensorFlow releases, so call predict() directly.
test_probs = seq_model.predict(test_features, batch_size=64)

In [9]:
# Sanity check: expect one row per test comment and one column per label.
print(test_probs.shape)

(153164, 6)


In [10]:
import pandas as pd
# Wrap the predicted probabilities in a frame whose columns are the training frame's column names
# from the third one onwards (everything after `id` and `comment_text`).
df_output = pd.DataFrame(data=test_probs, columns=list(df_train.columns)[2:])

In [11]:
# Put the test ids in front of the predicted probabilities, aligning rows on the index.
# `axis` and `join` are passed by keyword: positional use is deprecated and raises a
# TypeError in pandas >= 2.0, where every concat argument except `objs` is keyword-only.
df_output = pd.concat([df_test["id"], df_output], axis=1, join='inner')
print(df_output.shape)
print(df_output.head())

(153164, 7)
                 id         toxic  severe_toxic       obscene        threat  \
0  00001cee341fdb12  9.991732e-01  4.743376e-01  9.973660e-01  1.476861e-02   
1  0000247867823ef7  1.241595e-06  2.669021e-09  2.245563e-07  1.747874e-07   
2  00013b17ad220c46  3.606170e-06  3.456704e-09  1.469271e-07  7.315201e-07   
3  00017563c3f7919a  7.339078e-07  3.242501e-09  4.806292e-07  1.918826e-07   
4  00017695ad8997eb  4.854329e-06  1.279536e-08  5.490770e-07  7.622195e-07   

         insult  identity_hate  
0  9.894505e-01   3.391772e-01  
1  2.419364e-07   6.136602e-09  
2  2.446558e-07   2.252773e-08  
3  3.854764e-07   5.271759e-09  
4  1.567009e-06   4.444089e-08  


In [12]:
# Write the id + per-label probability table to CSV; index=False keeps the row index out of the file.
df_output.to_csv("submission01.csv", index=False)