In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import keras.backend as K
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.layers import *
from keras.models import Model

import os
print(os.listdir("../input"))
print(os.listdir("../input/glove-global-vectors-for-word-representation"))
print(os.listdir("../input/jigsaw-unintended-bias-in-toxicity-classification"))


Using TensorFlow backend.


['glove-global-vectors-for-word-representation', 'jigsaw-unintended-bias-in-toxicity-classification']
['glove.6B.200d.txt', 'glove.6B.100d.txt', 'glove.6B.50d.txt']
['sample_submission.csv', 'test.csv', 'train.csv']


# Data Loading

we load the dataset and apply some transformations to use it in a deep learning model.

In [2]:
print("Loading data...")
df_train = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
print("Train shape:", df_train.shape)
df_test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv",encoding="latin-1")
print("Test shape:", df_test.shape)


Loading data...
Train shape: (1804874, 45)
Test shape: (97320, 2)


In [3]:
df_train = df_train.rename(columns=({"comment_text":"Reviews"}))
df_train = df_train.rename(columns=({"target":"Label"}))


In [4]:
df_test = df_test.rename(columns=({"comment_text":"Reviews"}))

In [5]:
df_train = df_train[:10000]
df_test = df_test[:10000]

In [6]:
df_train.head(2)

Unnamed: 0,id,Label,Reviews,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4


In [7]:
df_train.columns

Index(['id', 'Label', 'Reviews', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')

In [8]:
df_test.head(1)

Unnamed: 0,id,Reviews
0,7000000,Jeff Sessions is another one of Trump's Orwell...


In addition to the datasets, we load a pretrained word embeddings.

In [9]:
EMBEDDING_FILE =  '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'

def load_embeddings(filename):
    embeddings = {}
    with open(filename) as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

embeddings = load_embeddings(EMBEDDING_FILE)

# Preprocessings

After loading the datasets, we will preprocess the datasets. In this time, we will apply following techniques:

* Nigation handling
* Replacing numbers
* Tokenization
* Zero padding

Negation handling is the process of converting negation abbreviation to a canonical format. For example, "aren't" is converted to "are not". It is helpful for sentiment analysis.

Replacing numbers is the process of converting numbers to a specific character. For example, "1924" and "123" are converted to "0". It is helpful for reducing the vocabulary size.

Tokenization is the process of taking a text or set of texts and breaking it up into its individual words. In this step, we will tokenize text with the help of splitting text by space or punctuation marks.

Zero padding is the process of pad "0" to the dataset for the purpose of ensuring that all sentences has the same length.

## Negation handling

In [10]:
df_train.Reviews = df_train.Reviews.str.replace("n't", 'not')
df_test.Reviews = df_test.Reviews.str.replace("n't", 'not')

In [11]:
df_train['Label'].value_counts()

0.000000    7500
0.166667     653
0.200000     543
0.300000     286
0.400000     260
0.500000     151
0.600000     111
0.100000      70
0.700000      54
0.800000      30
0.142857      23
1.000000      18
0.111111      15
0.833333      11
0.900000       9
0.687500       4
0.587500       4
0.181818       3
0.728571       3
0.825000       3
0.545455       3
0.272727       3
0.653333       3
0.750000       2
0.609375       2
0.914286       2
0.893939       2
0.375000       2
0.651515       2
0.454545       2
            ... 
0.925450       1
0.603774       1
0.389831       1
0.005386       1
0.025000       1
0.973006       1
0.578125       1
0.507042       1
0.725806       1
0.537500       1
0.736842       1
0.887500       1
0.476923       1
0.819672       1
0.760563       1
0.629630       1
0.000935       1
0.108696       1
0.571429       1
0.145833       1
0.573529       1
0.557143       1
0.283019       1
0.472973       1
0.824324       1
0.457627       1
0.733333       1
0.432432      

In [12]:
target_count=len(df_train['Label'].value_counts())
target_count

231

In [13]:
df_train.dtypes

id                                       int64
Label                                  float64
Reviews                                 object
severe_toxicity                        float64
obscene                                float64
identity_attack                        float64
insult                                 float64
threat                                 float64
asian                                  float64
atheist                                float64
bisexual                               float64
black                                  float64
buddhist                               float64
christian                              float64
female                                 float64
heterosexual                           float64
hindu                                  float64
homosexual_gay_or_lesbian              float64
intellectual_or_learning_disability    float64
jewish                                 float64
latino                                 float64
male         

In [14]:
df_test.dtypes

id          int64
Reviews    object
dtype: object

In [15]:
df_train['Reviews'] = df_train['Reviews'].astype(str)
df_test['Reviews'] = df_test['Reviews'].astype(str)

## Replacing numbers

In [16]:
df_train.Reviews = df_train.Reviews.apply(lambda x: re.sub(r'[0-9]+', '0', x))
df_test.Reviews = df_test.Reviews.apply(lambda x: re.sub(r'[0-9]+', '0', x))



In [17]:
x_train = df_train['Reviews'].values
x_test  = df_test['Reviews'].values
y_train = df_train['Label'].values
x = np.r_[x_train, x_test]

## Tokenization

In [18]:
tokenizer = Tokenizer(lower=True, filters='\n\t')
tokenizer.fit_on_texts(x)
x_train = tokenizer.texts_to_sequences(x_train)
x_test  = tokenizer.texts_to_sequences(x_test)
vocab_size = len(tokenizer.word_index) + 1  # +1 is for zero padding.
print('vocabulary size: {}'.format(vocab_size))

vocabulary size: 83190


## Zero padding

In [19]:
maxlen = len(max((s for s in np.r_[x_train, x_test]), key=len))
x_train = sequence.pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='post')
print('maxlen: {}'.format(maxlen))
print(x_train.shape)
print(x_test.shape)

maxlen: 317
(10000, 317)
(10000, 317)


In [20]:
def filter_embeddings(embeddings, word_index, vocab_size, dim=300):
    embedding_matrix = np.zeros([vocab_size, dim])
    for word, i in word_index.items():
        if i >= vocab_size:
            continue
        vector = embeddings.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix

embedding_size = 200
embedding_matrix = filter_embeddings(embeddings, tokenizer.word_index,
                                     vocab_size, embedding_size)
print('OOV: {}'.format(len(set(tokenizer.word_index) - set(embeddings))))

OOV: 54240


# Building a model

In this time, we will use attention based LSTM model. First of all, we should define the attention layer as follows:

In [21]:
class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    :param kwargs:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

After defining the attention layer, we will define the entire model:

In [22]:
def build_model(maxlen, vocab_size, embedding_size, embedding_matrix):
    input_words = Input((maxlen, ))
    x_words = Embedding(vocab_size,
                        embedding_size,
                        weights=[embedding_matrix],
                        mask_zero=True,
                        trainable=False)(input_words)
    x_words = SpatialDropout1D(0.3)(x_words)
    x_words = Bidirectional(LSTM(50, return_sequences=True))(x_words)
    x = Attention(maxlen)(x_words)
    x = Dropout(0.2)(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(0.2)(x)
    pred = Dense(target_count, activation='softmax')(x)

    model = Model(inputs=input_words, outputs=pred)
    return model

model = build_model(maxlen, vocab_size, embedding_size, embedding_matrix)
model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 317)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 317, 200)          16638000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 317, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 317, 100)          100400    
_________________________________________________________________
attention_1 (Attention)      (None, 100)               417       
_________________________________________________________________
dropout_1 (Dropout)  

# Training the model

In [23]:
save_file = 'model.h5'
history = model.fit(x_train, y_train,
                    epochs=15, verbose=1,
                    batch_size=1024, shuffle=True)

Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# Making a submission file

After training the model, we make a submission file by predicting for the test dataset.

In [24]:
y_pred = model.predict(x_test, batch_size=1024)
y_pred = y_pred.argmax(axis=1).astype(int)
y_pred.shape

(10000,)

In [25]:
df_test['prediction'] = y_pred
df_test[['id', 'prediction']].to_csv('submission.csv', index=False)