In [31]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, Dropout

from sklearn import linear_model
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score

In [6]:
df_train = pd.read_csv('train.csv', encoding='utf8')
df_test = pd.read_csv('test.csv', encoding='utf8')

In [7]:
def parse_df(df):
    match_numbers = re.compile(r'[0-9]+', flags=re.IGNORECASE)
    df.comment_text = df.comment_text.str.replace(match_numbers, 'NUM')
    df.comment_text = df.comment_text.str.replace('[\n"]', ' ')

    return df

In [8]:
df_train = parse_df(df_train)

## Linear Model

In [29]:
v = CountVectorizer()
X = v.fit_transform(df_train['comment_text'].head(10000))
y = df_train['toxic'].head(10000)

clf = linear_model.LogisticRegression()

cross_val_score(clf, X, y, scoring='roc_auc')

array([0.93280218, 0.91289324, 0.92186392])

## GRU

In [32]:
X = pad_sequences(df_train['comment_text'].head(10000).map(lambda r: one_hot(r, 100000)), maxlen=300)
y = df_train['toxic'].head(10000)

X_train, X_val, y_train, y_val = train_test_split(X, y)

In [33]:
model = Sequential()
model.add(Embedding(input_dim=100000,
                    output_dim=128,
                    input_shape=(300,)))
model.add(GRU(128))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [36]:
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          epochs=2)

Train on 7500 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f91e9d9e2b0>

In [37]:
roc_auc_score(model.predict_classes(X_val), y_val)

0.8459782608695652