In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Directory holding the raw competition CSV files (train.csv, test.csv) read below.
BASEDIR = '../data/raw'

In [3]:
# Load the training comments (text plus the label columns) and preview the first rows.
train = pd.read_csv(os.path.join(BASEDIR, 'train.csv'))
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
# Load the test comments (id and text only, no label columns) and preview the first rows.
test = pd.read_csv(os.path.join(BASEDIR, 'test.csv'))
test.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [5]:
# Replace missing comment text with a single space so that every row holds a
# string; the preprocessing step calls `.lower()` on each comment.
train['comment_text'] = train['comment_text'].fillna(' ')
test['comment_text'] = test['comment_text'].fillna(' ')

In [26]:
# Label columns of `train`, in the order used for the target matrix.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Target array: one row per training comment, one column per label in `list_classes`.
y = train[list_classes].values

# Preprocessing

In [8]:
import spacy
# Load spaCy's 'en' model with the parser, NER and text-categorizer components
# disabled; the preprocessing below only reads `lemma_` and `is_alpha` from tokens.
nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])

In [9]:
def reduce_to_double_max(text):
    """Collapse exaggerated character repetition in ``text``.

    Two passes are applied, in this order:
        1. Any run of 3 or more identical word characters (letters, digits,
           underscore) is shortened to exactly 2 of that character.
        2. Any run of 2 or more identical non-word characters (punctuation,
           whitespace, ...) is shortened to a single character.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    import re

    # Pass 1: "sooooo" -> "soo" (a legitimate double letter such as "good" is kept).
    word_runs_capped = re.sub(r'(\w)\1{2,}', r'\1\1', text)

    # Pass 2: "!!!" -> "!", and repeated spaces/newlines -> a single one.
    all_runs_capped = re.sub(r'(\W)\1+', r'\1', word_runs_capped)
    return all_runs_capped

In [10]:
def preprocess_corpus(corpus):
    """Normalise and lemmatise every text in ``corpus``.

    Each text is lower-cased, passed through ``reduce_to_double_max`` and then
    run through the spaCy pipeline ``nlp``. Only alphabetic tokens are kept;
    they are replaced by their lemma and joined with single spaces.

    Args:
        corpus: iterable of strings.

    Returns:
        A list with one processed string per input text, in input order. A text
        without any alphabetic token yields an empty string.
    """
    # Lazy normalisation: texts are cleaned as spaCy pulls them in batches.
    normalized_texts = map(lambda raw: reduce_to_double_max(raw.lower()), corpus)

    processed = []
    for parsed in nlp.pipe(normalized_texts, batch_size=1000, n_threads=4):
        lemmas = [token.lemma_ for token in parsed if token.is_alpha]
        processed.append(' '.join(lemmas))
    return processed

In [11]:
fname_train_processed = '../data/processed/train.txt'

# Preprocessing the corpus is expensive, so the result is cached on disk as one
# processed comment per line and reused whenever the cache file exists.
if os.path.isfile(fname_train_processed):
    # Explicit UTF-8: lemmas may contain non-ASCII letters, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(fname_train_processed, 'r', encoding='utf-8') as fin:
        # Keep every line, including empty ones: a comment without alphabetic
        # tokens is stored as an empty line, and dropping it would misalign the
        # processed texts with the rows of `train`.
        train_processed = [line.strip() for line in fin]

else:
    train_processed = preprocess_corpus(train['comment_text'])

    # Create the cache directory if needed so the write cannot fail after the
    # long preprocessing run.
    os.makedirs(os.path.dirname(fname_train_processed), exist_ok=True)
    with open(fname_train_processed, 'w', encoding='utf-8') as fout:
        for doc in train_processed:
            fout.write('{}\n'.format(doc))

# Raises if the number of processed texts does not match the number of rows
# (e.g. a stale cache file from a different dataset).
train['comment_text_processed'] = train_processed

In [12]:
fname_test_processed = '../data/processed/test.txt'

# Preprocessing the corpus is expensive, so the result is cached on disk as one
# processed comment per line and reused whenever the cache file exists.
if os.path.isfile(fname_test_processed):
    # Explicit UTF-8: lemmas may contain non-ASCII letters, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(fname_test_processed, 'r', encoding='utf-8') as fin:
        # Keep every line, including empty ones: a comment without alphabetic
        # tokens is stored as an empty line, and dropping it would misalign the
        # processed texts with the rows of `test`.
        test_processed = [line.strip() for line in fin]

else:
    test_processed = preprocess_corpus(test['comment_text'])

    # Create the cache directory if needed so the write cannot fail after the
    # long preprocessing run.
    os.makedirs(os.path.dirname(fname_test_processed), exist_ok=True)
    with open(fname_test_processed, 'w', encoding='utf-8') as fout:
        for doc in test_processed:
            fout.write('{}\n'.format(doc))

# Raises if the number of processed texts does not match the number of rows
# (e.g. a stale cache file from a different dataset).
test['comment_text_processed'] = test_processed

In [22]:
embed_size = 50 # dimensionality of each word vector (must match the vectors in the pretrained embedding file)
max_features = 20000 # vocabulary cap passed to the Tokenizer (i.e. max number of rows in the embedding matrix)
maxlen = 100 # number of tokens per comment fed to the model (sequences are padded/truncated to this length)

In [20]:
# NOTE: machine-specific absolute path; point this at the local copy of the
# GloVe vectors before running.
EMBEDDING_FILE = '/Users/mathieu/datasets/glove.6B.50d.txt'


def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    Args:
        word: the first field of the line (the token).
        *arr: the remaining fields, i.e. the vector components (numeric strings
            or numbers).

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding text file into {word: float32 vector}. Each line holds a
# word followed by its vector components, separated by whitespace.
# The file is opened in a `with` block so the handle is closed once parsing is
# done, and decoded explicitly as UTF-8 rather than with the locale default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in glove_file)

In [21]:
# Stack all pretrained vectors into one 2-D array to measure their overall
# mean and standard deviation (used to initialise words without a pretrained
# vector). `np.stack` expects a real sequence, so the dict view is materialised
# as a list first; newer NumPy versions reject a bare `dict.values()` view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [23]:
# NOTE: this cell needs a fitted `tokenizer`; run the tokenization cell (the one
# that creates `tokenizer` and calls `fit_on_texts`) before this one.
word_index = tokenizer.word_index

# Keras word indices start at 1 (index 0 is reserved for padding), so a
# vocabulary of N words needs N + 1 rows. The matrix is capped at
# `max_features` rows, matching the Tokenizer's `num_words` limit.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn from the distribution of the pretrained
# embeddings, so words without a pretrained vector still get a plausible
# initialisation; known words are overwritten below.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words ranked beyond the vocabulary cap (they have no row).
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [17]:
# Preprocessed comment texts as NumPy arrays, used as input for the tokenizer.
list_sentences_train = train['comment_text_processed'].values
list_sentences_test = test['comment_text_processed'].values

In [18]:
# The vocabulary is learned from the training comments only and capped at
# `max_features`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training set: words -> integer ids, then sequences of length `maxlen`.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test set: same tokenizer, same sequence length.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

# Train Network

In [24]:
# Classifier: token ids -> pretrained embeddings -> bidirectional LSTM ->
# max over time steps -> dense layer -> one sigmoid output per label.
inp = Input(shape=(maxlen,))
# Take the vocabulary size from the weight matrix itself, so the layer's
# input dimension can never disagree with the shape of its initial weights.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# One output unit per label column, so the output width always matches `y`.
x = Dense(len(list_classes), activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
# Each label is an independent yes/no target, hence sigmoid + binary cross-entropy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
# Train on the full training set for 2 epochs; no validation data is passed,
# so no held-out metric is reported during training.
model.fit(X_t, y, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x116370358>

In [30]:
import time

# Model outputs for the test set: one row per comment, one column per label.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Read the submission template from the same raw-data directory as train/test
# instead of repeating the path.
sample_submission = pd.read_csv(os.path.join(BASEDIR, 'sample_submission.csv'))
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test.csv -- confirm (e.g. by comparing the `id` columns).
sample_submission[list_classes] = y_test

# Timestamped file name so successive runs do not overwrite each other.
submission_fname = '../data/external/submission-{}.csv'.format(
    time.strftime('%Y%m%d_%H%M', time.localtime()))
sample_submission.to_csv(submission_fname, index=False)

