In [None]:
import tensorflow as tf
import time, os
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder

In [None]:
class CharNetConfig(object):
    """Hyper-parameters for the character-level CNN (``CharNet``).

    Attributes:
        conv_layers: list of ``[num_filters, kernel_size, pool_size]`` triples,
            one per convolution layer; ``pool_size`` is ``None`` for layers
            that are not followed by max-pooling.
        fc_layers: list with the number of units of each fully-connected layer.
        l0: number of characters kept per document.
        alstr: string whose characters make up the alphabet.
        alphabet_size: ``len(alstr)``.
    """

    def __init__(self, params=None):
        """Build the default configuration, or load one from ``params``.

        Args:
            params: optional mapping with the keys ``'conv_layers'``,
                ``'fc_layers'``, ``'l0'`` and ``'alstr'``. When ``None`` the
                built-in defaults are used.

        Raises:
            KeyError: if ``params`` is given but lacks one of the keys above.
        """
        if params is None:
            self.conv_layers = [
                    [256, 7, 3],
                    [256, 7, 3],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, None],
                    [256, 3, 3]
                ]
            self.fc_layers = [1024, 512, 10]
            self.l0 = 128
            # The backslash is escaped explicitly: '\|' is not a valid escape
            # sequence, although it evaluates to the same two characters.
            self.alstr = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’’’/\\|_@#$%ˆ&* ̃‘+-=<>()[]{}]'
        else:
            self.conv_layers = params['conv_layers']
            self.fc_layers = params['fc_layers']
            self.l0 = params['l0']
            self.alstr = params['alstr']

        # Derived in both branches so the attribute always exists.
        self.alphabet_size = len(self.alstr)

In [None]:
class CharNet(object):
    """Character-level convolutional network with a six-way sigmoid output.

    The constructor builds the whole TensorFlow graph and exposes:

    * ``input_x`` -- int64 placeholder of shape ``[None, l0]`` holding
      character indices that are looked up in ``encoder``.
    * ``input_y`` -- float32 placeholder of shape ``[None, 6]`` for the labels.
    * ``dropout_keep_prob`` -- float32 placeholder with the dropout keep
      probability.
    * ``yhat`` -- sigmoid outputs, one column per label.
    * ``loss`` -- ``tf.losses.log_loss`` between ``input_y`` and ``yhat``.
    """
    def __init__(self, conv_layers,
                        fc_layers,
                        l0,
                        alphabet_size,
                        encoder,
                        **args
    ):
        """Build the graph.

        Args:
            conv_layers: list of ``[num_filters, kernel_size, pool_size]``
                triples; a ``pool_size`` of ``None`` skips max-pooling.
            fc_layers: list with the unit count of each fully-connected layer.
                Every one of them is followed by ReLU and dropout.
            l0: number of character indices per input row.
            alphabet_size: stored on the instance; not used to build the graph.
            encoder: embedding table handed to ``tf.nn.embedding_lookup``.
            **args: accepted and ignored.
        """
        super(CharNet, self).__init__()
        # The seed must be an integer; time.time() returns a float.
        tf.set_random_seed(int(time.time()))
        self.l0 = l0
        self.conv_layers = conv_layers
        self.fc_layers = fc_layers
        self.alphabet_size = alphabet_size

        # Width of the label placeholder and of the output layer.
        num_labels = 6

        initializer = tf.contrib.layers.xavier_initializer()

        with tf.name_scope('Input'):
            self.input_x = tf.placeholder(tf.int64, shape=[None, self.l0],
                                          name='input_x')
            self.input_y = tf.placeholder(tf.float32, shape=[None, num_labels],
                                          name='input_y')
            self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                    name='dropout_keep_prob')

        with tf.name_scope('Embedding'):
            x = tf.nn.embedding_lookup(encoder, self.input_x)
            # Add a trailing channel axis so conv2d can consume the tensor.
            x = tf.expand_dims(x, -1)

        # Configure conv layers
        for i, layer_params in enumerate(conv_layers):
            with tf.name_scope("Convolution"):
                # The filter spans the full width of x, so with VALID padding
                # the convolution only slides along axis 1.
                filter_param = [
                    layer_params[1],         # kernel size along axis 1
                    x.get_shape()[2].value,  # full feature width of x
                    x.get_shape()[3].value,  # input channels
                    layer_params[0]          # number of filters
                ]
                W = tf.Variable(initializer(filter_param), dtype='float32', name='filter')

                conv_layer = tf.nn.conv2d(x, W, [1, 1, 1, 1], 'VALID', name='conv')
                conv_layer = tf.nn.relu(conv_layer, name='act_relu')

            # Swapping the last two axes moves the filter outputs into the
            # width axis, which the next layer's filter spans entirely.
            if not layer_params[-1] is None:
                with tf.name_scope("MaxPooling"):
                    pool_layer = tf.nn.max_pool(conv_layer,
                                            ksize=[1, layer_params[-1], 1, 1],
                                            strides=[1, layer_params[-1], 1, 1],
                                            padding='VALID')
                    x = tf.transpose(pool_layer, [0, 1, 3, 2])
            else:
                x = tf.transpose(conv_layer, [0, 1, 3, 2])

        # flatten conv output for fc
        with tf.name_scope("Flatten"):
            x = tf.contrib.layers.flatten(x)

        # Configure fc layers
        for i, layer_units in enumerate(fc_layers):
            with tf.name_scope("FullyConnected"):
                W = tf.Variable(initializer([x.get_shape()[-1].value, layer_units]),
                                dtype='float32', name='W')
                # The bias gets its own name; it used to be called 'W' as well.
                b = tf.Variable(initializer([layer_units]),
                                dtype='float32', name='b')
                x = tf.nn.xw_plus_b(x, W, b, name='fully-connected')
                x = tf.nn.relu(x)

            with tf.name_scope("Dropout"):
                x = tf.nn.dropout(x, self.dropout_keep_prob)

        with tf.name_scope("Output"):
            W = tf.Variable(initializer([x.get_shape()[-1].value, num_labels]),
                            dtype='float32', name='W')
            b = tf.Variable(initializer([num_labels]),
                            dtype='float32', name='b')
            self.yhat = tf.nn.sigmoid(tf.matmul(x, W) + b, name='output')

        with tf.name_scope("Loss"):
            self.loss = tf.losses.log_loss(self.input_y, self.yhat)

In [None]:
class Data(object):
    """Loads a comments CSV and encodes every comment as character indices.

    Attributes set by the constructor:
        raw_data: the DataFrame read from ``file_path``.
        alphabet: list of the characters of ``alstr``.
        encoder: float32 one-hot table with ``len(alphabet) + 1`` rows; row 0
            (used for unknown characters) is all zeros.
        e_dict: maps each character to its row in ``encoder``; ``'UNK'`` -> 0.
        alphabet_size: ``len(alphabet)``.
        input_x: array of per-comment int64 index vectors of length ``l0``.
        y: label matrix; only present when ``is_dev`` is false.
    """

    def __init__(self, file_path, alstr, l0, is_dev=False, batch_size=128, **args):
        """Read ``file_path`` and encode its ``comment_text`` column.

        Args:
            file_path: path of the CSV file to load.
            alstr: string whose characters form the alphabet.
            l0: number of characters kept per comment.
            is_dev: when true, no label matrix is built.
            batch_size: number of rows returned by ``next_batch``.
            **args: accepted and ignored.
        """
        self.alstr = alstr
        self.l0 = l0
        self.is_dev = is_dev
        self.batch_size = batch_size
        self.raw_data = pd.read_csv(file_path)

        self.alphabet = self.make_alphabet(self.alstr)
        self.encoder, self.e_dict = self.one_hot_encoder(self.alphabet)
        self.alphabet_size = len(self.alphabet)

        self.input_x = self.process_full_description(self.raw_data)
        if not self.is_dev:
            self.y = self.generate_y(self.raw_data)

    def shuffling(self):
        """Shuffle the examples in place, keeping inputs and labels aligned.

        Dev data (``is_dev=True``) has no ``y`` attribute, so only the inputs
        are permuted in that case.
        """
        shuffle_indices = np.random.permutation(np.arange(len(self.input_x)))
        self.input_x = self.input_x[shuffle_indices]
        if not self.is_dev:
            self.y = self.y[shuffle_indices]

    def next_batch(self, batch_num):
        """Return the ``batch_num``-th batch as ``(batch_x, batch_y)``.

        The last batch may be shorter than ``batch_size``; a ``batch_num``
        past the end yields empty slices. ``batch_y`` is ``None`` for dev data.
        """
        data_size = len(self.input_x)
        start = batch_num * self.batch_size
        end = min((batch_num + 1) * self.batch_size, data_size)
        batch_x = self.input_x[start:end]
        batch_y = None if self.is_dev else self.y[start:end]
        return batch_x, batch_y

    def process_full_description(self, df):
        """Encode ``df['comment_text']`` and return the index vectors.

        Note that ``df`` is modified: ``comment_text`` is cast to ``str`` and
        a ``desc_vecs`` column holding the encoded vectors is added.
        """
        df['comment_text'] = df['comment_text'].astype('str')
        df['desc_vecs'] = df['comment_text'].apply(
                lambda x: self.doc_process(x, self.e_dict)
        )
        return df['desc_vecs'].values

    def generate_y(self, df):
        """Return the six label columns of ``df`` as a 2-D array."""
        list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        return df[list_classes].values

    def one_hot_encoder(self, alphabet):
        """Build the one-hot table and the character -> row lookup.

        Returns:
            ``(encoder, encoder_dict)``. ``encoder`` is a float32 array of
            shape ``(len(alphabet) + 1, len(alphabet))`` whose first row is all
            zeros and whose remaining rows form an identity matrix.
            ``encoder_dict`` maps ``'UNK'`` to 0 and the i-th alphabet entry to
            ``i + 1``; for a repeated character the last position wins.
        """
        size = len(alphabet)
        encoder_dict = {'UNK': 0}
        for i, alpha in enumerate(alphabet):
            encoder_dict[alpha] = i + 1

        encoder = np.concatenate([
            np.zeros((1, size), dtype='float32'),
            np.eye(size, dtype='float32'),
        ])
        return encoder, encoder_dict

    def doc_process(self, desc, e_dict):
        """Encode one document as an int64 vector of length ``self.l0``.

        The text is stripped and lower-cased, then cut to ``l0`` characters.
        Characters missing from ``e_dict`` get the ``'UNK'`` index, and unused
        trailing positions stay 0.
        """
        desc = desc.strip().lower()
        doc_vec = np.zeros(self.l0, dtype='int64')
        unknown = e_dict['UNK']
        for j, char in enumerate(desc[:self.l0]):
            doc_vec[j] = e_dict.get(char, unknown)
        return doc_vec

    def make_alphabet(self, alstr):
        """Split ``alstr`` into a list of single characters."""
        return list(alstr)

In [None]:
# Input CSVs for the character-level model. `dev_file` points at the test
# split; it is loaded with is_dev=True, so no label matrix is built for it.
train_file = '../inputs/train.csv'
dev_file = '../inputs/test.csv'
# Default CharNet hyper-parameters (no overrides passed).
config = CharNetConfig()

In [None]:
# Load and encode both splits with the same alphabet and document length.
train_data = Data(train_file, config.alstr, config.l0, is_dev=False, batch_size=128)
dev_data = Data(dev_file, config.alstr, config.l0, is_dev=True, batch_size=128)
# Session options: allow_growth asks TensorFlow to grow its GPU memory
# allocation as needed instead of reserving it all up front.
conf = tf.ConfigProto()
conf.gpu_options.allow_growth=True

In [None]:
# Build the CharNet graph, train it on `train_data`, then predict on `dev_data`.
# Everything runs inside one session; `preds` holds the final predictions.
with tf.Session(config=conf) as sess:
    NUM_EPOCHS = 50          # passes over the training set
    CHECKPOINT_EVERY = 1000  # save a checkpoint every this many batches

    charnet = CharNet(config.conv_layers, config.fc_layers, config.l0, config.alphabet_size, train_data.encoder)

    optimizer = tf.train.AdamOptimizer()
    grads = optimizer.compute_gradients(charnet.loss)
    train_op = optimizer.apply_gradients(grads)

    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss
    loss_summary = tf.summary.scalar("loss", charnet.loss)

    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables())

    sess.run(tf.global_variables_initializer())

    def count_batches(data):
        """Number of non-empty batches in `data` (ceiling division).

        `int(n / batch_size) + 1` would add an empty batch whenever `n` is an
        exact multiple of the batch size.
        """
        return (len(data.input_x) + data.batch_size - 1) // data.batch_size

    def to_dense(x_batch):
        """Turn an object array of equal-length index vectors into a 2-D array."""
        return np.array([row.tolist() for row in x_batch.tolist()])

    def train_step(x_batch, y_batch, step):
        """
        A single training step: one optimizer update, logged under `step`.
        """
        feed_dict = {
            charnet.input_x: to_dense(x_batch),
            charnet.input_y: np.reshape(y_batch, (-1, 6)),
            charnet.dropout_keep_prob: .5
        }

        _, summaries, loss = sess.run(
            [train_op,
             train_summary_op,
             charnet.loss],
            feed_dict
        )

        print("step {}, loss {:g}".format(step, loss))
        train_summary_writer.add_summary(summaries, step)

    def predict_on_test(x_batch, results):
        """Run the network on `x_batch` with dropout disabled and append the
        fetched `[yhat]` list to `results`."""
        feed_dict = {
            charnet.input_x: to_dense(x_batch),
            charnet.dropout_keep_prob: 1.0
        }
        result = sess.run([charnet.yhat], feed_dict)
        results.append(result)

    train_batches = count_batches(train_data)
    for epoch in range(NUM_EPOCHS):
        print("epoch is: {}".format(epoch))
        train_data.shuffling()
        for i in range(train_batches):
            input_x, y = train_data.next_batch(i)
            # A step counter that keeps increasing across epochs, so summaries
            # from different epochs do not land on the same step.
            global_step = epoch * train_batches + i
            train_step(input_x, y, global_step)

            if i % CHECKPOINT_EVERY == 0:
                # Save into the run's checkpoint directory created above.
                path = saver.save(sess, checkpoint_prefix, global_step=global_step)
                print("Epoch {}, Saved model checkpoint to {}\n".format(epoch, path))

    results = []
    submission = dev_data.raw_data['id']
    for i in range(count_batches(dev_data)):
        input_x, _ = dev_data.next_batch(i)
        predict_on_test(input_x, results)

    # Each entry of `results` is a one-element list holding a batch of
    # predictions; flatten them into one array with a row per test example.
    preds = np.array([elem for result in results for elem in result[0]])
    # Disabled: writing the CharNet predictions as a submission file.
#     list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
#     submission = pd.DataFrame(preds, columns=list_classes)
#     submission['id'] = dev_data.raw_data['id']
#     cols = submission.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     submission = submission[cols]
#     submission.to_csv('../outputs/submission.csv', index=False)

In [None]:
# Display the CharNet predictions collected from the test-set batches.
preds

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# `path` and `comp` are not referenced by any visible cell; the files below
# are read from '../inputs/'.
path = '../input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE = '../inputs/glove.6B.100d.txt'
TRAIN_DATA_FILE = '../inputs/train.csv'
TEST_DATA_FILE = '../inputs/test.csv'

embed_size = 100 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Missing comments are replaced by a placeholder token before tokenizing.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

# The tokenizer is fitted on the training comments only and then applied to
# both splits; sequences are padded/truncated to `maxlen` tokens.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

def get_coefs(word,*arr):
    """Split one parsed embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Read the pretrained vectors. The file is closed deterministically and
# decoded as UTF-8 instead of the platform default; blank lines are skipped.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.strip().split()) for o in embedding_file if o.strip()
    )

# np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# Tokenizer indices start at 1, so a vocabulary of N words needs N + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random row drawn from the same
# mean / standard deviation as the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the matrix's own row count, not max_features, so a small
    # vocabulary cannot index past the end.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional LSTM over pretrained word embeddings, max-pooled over time and
# followed by two dense layers and a six-way sigmoid output.
inp = Input(shape=(maxlen,))
# input_dim is taken from the matrix itself so the layer always matches the
# shape of the pretrained weights, even when the vocabulary is smaller than
# max_features.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the full training set, then score the test set.
model.fit(X_t, y, batch_size=512, epochs=5)
y_test = model.predict([X_te], batch_size=1024, verbose=1)

In [None]:
# Fill the sample submission's label columns with the Keras predictions and
# write the result out.
sample_submission = pd.read_csv('../inputs/sample_submission.csv')
sample_submission[list_classes] = y_test

# Create the output directory first so the write cannot fail on a missing
# folder after the whole training run.
submission_path = '../outputs/submission.csv'
submission_dir = os.path.dirname(submission_path)
if not os.path.exists(submission_dir):
    os.makedirs(submission_dir)
sample_submission.to_csv(submission_path, index=False)

In [None]:
# Display the submission frame that was just written to disk.
sample_submission