**Perform the necessary imports**

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Input
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalMaxPooling2D, Conv2D
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


**Necessary global variables**

In [2]:
list_of_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
max_features = 20000
max_text_length = 400
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
batch_size = 32
epochs = 10
keepprob=0.2

**Quick peek into the data**

In [3]:
train_df = pd.read_csv('../data/input/train.csv')
print(train_df.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


**Printing using 'iloc' just for fun**

In [4]:
print(train_df.iloc[0, -7])
print(train_df.iloc[0, 1])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


**Checking if  NaNs exist in the training data**

In [5]:
print(np.where(pd.isnull(train_df)))

(array([], dtype=int64), array([], dtype=int64))


**Apparently no NaNs in the training set!**

**Converting pandas series to a numpy array using .values**

In [6]:
x = train_df['comment_text'].values
print(x)

[ "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"
 "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."
 ...,
 'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.'
 'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.'
 '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of community goes ""you have bad ideas"" go away, instead

In [7]:
print("properties of x")
print("type : {}, dimensions : {}, shape : {}, total no. of elements : {}, data type of each element: {}, size of each element {} bytes".format(type(x), x.ndim, x.shape, x.size, x.dtype, x.itemsize))

properties of x
type : <class 'numpy.ndarray'>, dimensions : 1, shape : (159571,), total no. of elements : 159571, data type of each element: object, size of each element 8 bytes


**Getting the labels**

In [8]:
y = train_df[list_of_classes].values
print(y)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ..., 
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [9]:
print("properties of y")
print("type : {}, dimensions : {}, shape : {}, total no. of elements : {}, data type of each element: {}, size of each element {} bytes".format(type(y), y.ndim, y.shape, y.size, y.dtype, y.itemsize))

properties of y
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (159571, 6), total no. of elements : 957426, data type of each element: int64, size of each element 8 bytes


**Keras makes our life easy. Using Tokenizer to get a list of sequence and then padding it form a 2D numpy array **

In [10]:
x_tokenizer = text.Tokenizer(num_words=max_features)
print(x_tokenizer)
x_tokenizer.fit_on_texts(list(x))
print(x_tokenizer)
x_tokenized = x_tokenizer.texts_to_sequences(x) #list of lists(containing numbers), so basically a list of sequences, not a numpy array
#pad_sequences:transform a list of num_samples sequences (lists of scalars) into a 2D Numpy array of shape 
x_train_val = sequence.pad_sequences(x_tokenized, maxlen=max_text_length)

<keras.preprocessing.text.Tokenizer object at 0x2aab2218e9e8>
<keras.preprocessing.text.Tokenizer object at 0x2aab2218e9e8>


In [11]:
print("properties of x_train_val")
print("type : {}, dimensions : {}, shape : {}, total no. of elements : {}, data type of each element: {}, size of each element {} bytes".format(type(x_train_val), x_train_val.ndim, x_train_val.shape, x_train_val.size, x_train_val.dtype, x_train_val.itemsize))

properties of x_train_val
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (159571, 400), total no. of elements : 63828400, data type of each element: int32, size of each element 4 bytes


**90% of the data is used for training and the rest for validation**

In [12]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y, test_size=0.1, random_state=1)

**Start building the model**

In [13]:
# print('Build model...')
# model = Sequential()
# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims,
#                     input_length=max_text_length))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
# # we use max pooling:
# model.add(GlobalMaxPooling1D())

# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.2))
# model.add(Activation('relu'))

# # We project onto 6 output layers, and squash it with a sigmoid:
# model.add(Dense(6))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])

# model.summary()

In [14]:
inputs = Input(shape=(max_text_length,), dtype='int32')
embedding = Embedding(input_dim=max_features, 
                      output_dim=embedding_dims, input_length=max_text_length)(inputs)
reshape = Reshape((max_text_length,embedding_dims,1))(embedding)
drop = Dropout(keepprob)(reshape)
conv_1 = Conv2D(filters, kernel_size=(kernel_size, embedding_dims), 
                padding='valid', kernel_initializer='normal', activation='relu')(drop)
maxpool = GlobalMaxPooling2D()(conv_1)
#conv_2 = Conv2D(num_filter_2, kernel_size=(filter_size_2, embedding_dim), 
 #               padding='valid', kernel_initializer='normal', activation='relu')(reshape)
#conv_3 = Conv2D(num_filter_3, kernel_size=(filter_size_3, embedding_dim), 
  #              padding='valid', kernel_initializer='normal', activation='relu')(reshape)

#maxpool_1 = MaxPooling2D(pool_size=(sequence_length - filter_size_1 + 1, 1), 
 #                     strides=(pool_stride_1, pool_stride_1), padding='valid')(conv_1)
#maxpool_2 = MaxPooling2D(pool_size=(sequence_length - filter_size_2 + 1, 1), 
 #                     strides=(pool_stride_2, pool_stride_2), padding='valid')(conv_2)
#maxpool_3 = MaxPooling2D(pool_size=(sequence_length - filter_size_3 + 1, 1), 
#                      strides=(pool_stride_3, pool_stride_3), padding='valid')(conv_3)

#concatenated_tensor = Concatenate(axis=1)([maxpool_1, maxpool_2, maxpool_3])
#flatten = Flatten()(maxpool)
dense_1 = Dense(units=hidden_dims, activation='relu')(maxpool)
dropout = Dropout(keepprob)(dense_1)
output = Dense(units=6, activation='softmax')(dropout)
model = Model(inputs=inputs, outputs=output)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 400)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 400, 50)           1000000   
_________________________________________________________________
reshape_1 (Reshape)          (None, 400, 50, 1)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 398, 1, 250)       37750     
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
__________

**Begin training**

In [15]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
validation_data=(x_val, y_val))

Train on 143613 samples, validate on 15958 samples
Epoch 1/1


<keras.callbacks.History at 0x2aab251446d8>


**Quick peek into the test set**

In [16]:
test_df = pd.read_csv('../data/input/test.csv')
print(test_df.head())

                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3  00017563c3f7919a  :If you have a look back at the source, the in...
4  00017695ad8997eb          I don't anonymously edit articles at all.


**Checking if  NaNs exist in the test data**

In [17]:
print(np.where(pd.isnull(test_df)))

(array([], dtype=int64), array([], dtype=int64))


**Hmmm**

In [18]:
test_df.iloc[52300, 1]

'SOmebody fucked up the homepage plz edit!! thanks, I need medevil knawledge.'

**Fill the NaN field**

In [19]:
x_test = test_df['comment_text'].fillna('comment_missing').values
print(x_test)

[ "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
 '== From RfC == \n\n The title is fine as it is, IMO.'
 '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "' ...,
 '" \n\n == Okinotorishima categories == \n\n I see your changes and agree this is ""more correct.""  I had gotten confused, but then found this: \n :... while acknowledging Japan\'s territorial rights to Okinotorishima itself ... \n However, is there a category for  \n :... did not acknowledge Japan\'s claim to an exclusive economic zone (EEZ) stemming from Okinotorishima. \n That is, is there a category for ""disputed EEZ""s?   "'
 '" \n\n == ""One of the founding nations of the EU - Germany - has 

**Tokenizing and padding similar to what we did before to training data**

In [20]:
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(x_test_tokenized, maxlen=max_text_length)

**Time to predict!**

In [21]:
y_testing = model.predict(x_testing, verbose = 1)



**Submit predictions!**

In [43]:
sample_submission = pd.read_csv("../data/input/sample_submission.csv")
sample_submission[list_of_classes] = y_testing
sample_submission.to_csv("../data/output/toxic_comment_classification.csv", index=False)

In [44]:
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.344461,0.090538,0.205128,0.00723,0.306264,0.046379
1,0000247867823ef7,0.165703,0.17249,0.161471,0.16944,0.160765,0.170131
2,00013b17ad220c46,0.195152,0.161928,0.161874,0.160031,0.162344,0.158671
3,00017563c3f7919a,0.134783,0.178342,0.175583,0.168711,0.170842,0.171739
4,00017695ad8997eb,0.206071,0.159492,0.161634,0.156599,0.16165,0.154554


In [75]:
len(sample_submission)

153164