In [18]:
import numpy as np
import pandas as pd
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Raise the interpreter's recursion limit to 1500.
# NOTE(review): nothing in the visible cells recurses deeply; confirm this is still needed.
sys.setrecursionlimit(1500)

In [19]:
# Read the training and test sets from CSV files in the working directory,
# then preview the first rows of the training frame.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [20]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Target matrix: one column per label, in the order given by list_classes.
y = train[list_classes].values
# Remove the id and label columns from the feature frames. The label names come
# from list_classes so the targets and the dropped columns cannot drift apart,
# and the frames are rebound rather than mutated with inplace=True.
train = train.drop(['id'] + list_classes, axis=1)
test = test.drop('id', axis=1)
# Show the first raw comment (positional access, independent of index labels).
train['comment_text'].iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [24]:
# Preprocessing training data
# NOTE(review): a PorterStemmer is instantiated here, but clean_data does not
# call it, so no stemming is applied to the comments in the cells shown.
ps = PorterStemmer()
# Stop-word sets already loaded from NLTK, keyed by language, so the corpus is
# read once rather than once per comment.
_STOPWORD_CACHE = {}


def _load_stopwords(language='english'):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_data(comment, stops=None):
    """Normalise one raw comment into a space-separated string of kept words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and stop words are removed.

    Parameters
    ----------
    comment : str
        Raw comment text.
    stops : collection of str, optional
        Lower-case words to remove. When omitted, the NLTK English stop-word
        list is used; it is loaded on first use and cached afterwards.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if none remain).
    """
    if stops is None:
        stops = _load_stopwords('english')
    words = re.sub('[^A-Za-z]', ' ', comment).lower().split()
    # removing stop words from data.
    return ' '.join(w for w in words if w not in stops)

n = train['comment_text'].size
# Iterate over the column values directly instead of looking each row up by
# label with train['comment_text'][i]; that lookup only works while the index
# is exactly 0..n-1 and repeats the column/label lookup on every iteration.
clean_review = [clean_data(comment) for comment in train['comment_text']]

clean_review[0] # cleaned first review.

'explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired'

In [25]:
# Preprocessing test data
# Reuses clean_data from the training-preprocessing cell rather than redefining
# an identical copy, so train and test text always go through the same cleaning.
n = test['comment_text'].size
# Iterate over the column values directly (no dependence on index labels).
test_comment = [clean_data(comment) for comment in test['comment_text']]

test_comment[0] # cleaned first test comment.

'yo bitch ja rule succesful ever whats hating sad mofuckas bitch slap ur pethedic white faces get kiss ass guys sicken ja rule pride da music man dont diss shit nothin wrong bein like tupac brother fuckin white boys get things right next time'

In [26]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# Fit the word -> integer index mapping on the cleaned training comments.
tokenizer.fit_on_texts(list(clean_review))
# Encode both corpora as sequences of token ids with that same fitted tokenizer
# (training comments first, then test comments).
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (clean_review, test_comment)
)
tokenized_train[:1]

[[532,
  46,
  48,
  521,
  4116,
  10690,
  862,
  215,
  10866,
  6374,
  2494,
  2672,
  38,
  1021,
  14500,
  2533,
  5,
  134,
  242,
  4,
  3,
  57,
  3084]]

In [27]:
# Bring every token sequence to the same length (maxlen) so they can be batched.
maxlen = 200
X_train, X_test = [
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_train, tokenized_test)
]

In [29]:
# Building model

# One padded sequence of maxlen token ids per comment.
inp = Input(shape=(maxlen,))

embed_size = 200
x = Embedding(max_features, embed_size)(inp)

# Remaining layers, constructed and then applied in this order.
for layer in (
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation='relu'),
    Dropout(0.1),
    Dense(6, activation='sigmoid'),  # 6 output labels
):
    x = layer(x)

# The six labels are not mutually exclusive (a comment can belong to more than
# one category), so each sigmoid output is trained with binary_crossentropy;
# categorical_crossentropy would place a comment in only one category.
model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
batch_size = 100
epochs = 2
model.fit(
    X_train,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2190f738a90>

In [30]:
# Run the fitted model on the padded test sequences to get per-label outputs.
y_pred = model.predict(X_test)

In [35]:
# Round the predictions to two decimal places (y_pred is overwritten), then display them.
y_pred = np.round(y_pred, decimals=2)
y_pred

array([[ 0.99000001,  0.31999999,  0.93000001,  0.05      ,  0.83999997,
         0.16      ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.01      ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.01      ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.93000001,  0.03      ,  0.69      ,  0.01      ,  0.56      ,
         0.04      ]], dtype=float32)