In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
list_of_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
max_features = 20000
max_text_length = 400
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
batch_size = 32
epochs = 2

In [6]:
train=pd.read_csv('/home/chiransh/Downloads/Toxic Comment Classification Challenge/train.csv')
test=pd.read_csv('/home/chiransh/Downloads/Toxic Comment Classification Challenge/test.csv')
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [15]:
x = train['comment_text'].values
print(x.shape)
print x[0]

(95851,)
Nonsense?  kiss off, geek. what I said is true.  I'll have your account terminated.


In [19]:
y = train[list_of_classes].values
print(y[0])

[1 0 0 0 0 0]


In [20]:
x_tokenizer = text.Tokenizer(num_words=max_features)
print(x_tokenizer)
x_tokenizer.fit_on_texts(list(x))
print(x_tokenizer)
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = sequence.pad_sequences(x_tokenized, maxlen=max_text_length)

<keras.preprocessing.text.Tokenizer object at 0x7f98914dc690>
<keras.preprocessing.text.Tokenizer object at 0x7f98914dc690>


In [27]:
x_train_val[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [28]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y, test_size=0.1, random_state=1)

In [29]:
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=max_text_length))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto 6 output layers, and squash it with a sigmoid:
model.add(Dense(6))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           1000000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0     

In [30]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
validation_data=(x_val, y_val))

Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f988fdce250>

In [31]:
x_test = test['comment_text'].fillna('comment_missing').values
print(x_test)

['==Orphaned non-free media (Image:41cD1jboEvL. SS500 .jpg)=='
 '::Kentuckiana is colloquial.  Even though the area is often referred to as this, it (in my opinion) has never held the encyclopedic precision of "Louisville metropolitian area", which has a specific U.S. Census definition.  Also, apparently Kentuckiana often refers to the local television viewing area, which isn\'t nearly contiguous with the official metro area.  As you indicate, Kentuckiana seems to be more of a slang or marketing phenomena than anything we could pin down in encyclopedic terms here.  That\'s why we see Wikipedia language like "the Louisville metropolitan area, sometimes referred to as Kentuckiana". That\'s my take on it. \xe2\x80\x94   \xe2\x80\xa2'
 'Hello fellow Wikipedians,\nI have just modified  on [WIKI_LINK: Double Trouble (George Jones and Johnny Paycheck album)]. Please take a moment to review [EXTERNAL_LINK: my edit]. If you have any questions, or need the bot to ignore the links, or the page al

In [32]:
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(x_test_tokenized, maxlen=max_text_length)

In [33]:
y_testing = model.predict(x_testing, verbose = 1)



In [58]:
print y_testing.shape

(226998, 6)


In [62]:
sample_submission.head(5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.5,0.5,0.5,0.5,0.5,0.5
1,0.5,0.5,0.5,0.5,0.5,0.5
2,0.5,0.5,0.5,0.5,0.5,0.5
3,0.5,0.5,0.5,0.5,0.5,0.5
4,0.5,0.5,0.5,0.5,0.5,0.5


In [60]:
y_testing.reshape(226998,6)

array([[1.00698695e-02, 1.30238055e-04, 2.93819513e-03, 8.27523414e-04,
        1.44955853e-03, 9.85168735e-04],
       [2.28026125e-04, 5.35183950e-08, 1.51070171e-05, 2.02751266e-06,
        7.32274975e-06, 4.77998628e-06],
       [6.51912123e-05, 1.41051615e-08, 7.04486183e-06, 6.16867680e-07,
        1.60537797e-06, 1.45982449e-06],
       ...,
       [2.16477620e-03, 1.27921703e-05, 5.12004772e-04, 1.45046899e-04,
        2.29289755e-04, 1.77210386e-04],
       [1.46988537e-02, 2.90800584e-04, 5.26758889e-03, 1.59326429e-03,
        2.59735202e-03, 1.71030895e-03],
       [1.98391426e-04, 8.43000549e-08, 1.98666839e-05, 2.81378743e-06,
        6.37640869e-06, 5.94255607e-06]], dtype=float32)

In [63]:
df = pd.DataFrame({'toxic':y_testing[:,0],'severe_toxic':y_testing[:,1],'obscene':y_testing[:,2] ,
                  'threat':y_testing[:,0],'insult':y_testing[:,1],'identity_hate':y_testing[:,2]})

In [64]:
df.head(5)

Unnamed: 0,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,0.002938,0.0001302381,0.002938,0.0001302381,0.01007,0.01007
1,1.5e-05,5.35184e-08,1.5e-05,5.35184e-08,0.000228,0.000228
2,7e-06,1.410516e-08,7e-06,1.410516e-08,6.5e-05,6.5e-05
3,0.002515,7.75143e-05,0.002515,7.75143e-05,0.012599,0.012599
4,0.001531,4.836086e-05,0.001531,4.836086e-05,0.004444,0.004444


In [65]:
df.to_csv("toxic_comment_classification.csv", index=False)