In [None]:
import numpy as np
import pandas as pd

# TODO: any Emoji or smileys in texts?
import re
# Tokenizer: a token is either a run of word characters / apostrophes, or a
# run of the punctuation characters . ! ? : ( ) "
PATTERN = re.compile(r"[\w\']+|[\.\!\?\:\(\)\"]+")
# Word-like token: one or more word characters or apostrophes.
WORD_PATTERN = re.compile(r"[\w\']+")
# A whole run of digits. The previous single-digit form (r'\d') capped every
# parsed number at 0-9, so multi-digit ranges such as years could never match.
DIGIT_PATTERN = re.compile(r'\d+')
# [math] / [/math] tags and backslash commands such as \frac.
MATH_PATTERN = re.compile(r'\[\/?math\]|\\\w+')

In [None]:
# Read the train/test CSVs; the training labels are also kept as a float32 array.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
y_train = train_data['target'].values.astype('float32')

In [None]:
# Remove [math] tags and backslash commands from every question.
# The cleaned column is assigned back to the frame: DataFrame.iterrows() gives
# no guarantee that a yielded row is a view of the frame (for mixed-dtype
# frames it is a copy), so writing to such a row can be silently lost.
for corpus in (train_data, test_data):
    corpus['question_text'] = corpus['question_text'].map(
        lambda text: MATH_PATTERN.sub('', text)
    )

In [None]:
# Sanity check: (rows, columns) of both frames.
print('train_data.shape = {}, test_data.shape = {}'.format(train_data.shape, test_data.shape))

In [None]:
import gensim

# Pretrained word vectors in binary word2vec format. Used later both for
# membership tests (`token in w2v`) and for vector lookup (`w2v[token]`).
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
    binary=True
)

from nltk.stem.snowball import SnowballStemmer
# NOTE(review): `stemmer` is not referenced anywhere else in the visible notebook.
stemmer = SnowballStemmer('english')

In [None]:
# Column layout of one ID_TO_VEC row: columns 0..299 hold the word2vec vector,
# every following column is a one-hot marker for one special token.
W2V_SLICE = slice(0, 300)
END_INDEX = 300
START_INDEX = 301
UNKNOWN_INDEX = 302  # XXX: may be a bad idea, also try skipping them altogether
NUM_1_TO_5_INDEX = 303
NUM_9_TO_18_INDEX = 304
NUM_YEAR_INDEX = 305
NUM_OTHER_INDEX = 306
JEW_OPEN_INDEX = 307  # tokens starting with '((('
JEW_CLOSE_INDEX = 308  # tokens starting with ')))'
QUESTION_INDEX = 309
EXCLAMATION_INDEX = 310
ELLIPSIS_INDEX = 311
PERIOD_INDEX = 312
COLON_INDEX = 313

# Total number of columns per row: 300 vector columns + 14 marker columns.
NUM_FEATURES = 314

# Row ids reserved for the special tokens, in the same order as the marker
# columns above. END_ID is 0, which is also the padding value of a batch.
END_ID = 0
START_ID = 1
UNKNOWN_ID = 2
NUM_1_TO_5_ID = 3
NUM_9_TO_18_ID = 4
NUM_YEAR_ID = 5
NUM_OTHER_ID = 6
JEW_OPEN_ID = 7
JEW_CLOSE_ID = 8
QUESTION_ID = 9
EXCLAMATION_ID = 10
ELLIPSIS_ID = 11
PERIOD_ID = 12
COLON_ID = 13

# Number of reserved ids; vocabulary words are numbered from here upwards.
NUM_SPECIAL_IDS = 14

# Maps a vocabulary token to its row id.
WORD_TO_ID = dict()

# TODO: add tests for the lookup helpers below
def lookup_word(words_list, passthrough_tokens):
    """Resolve the token at the head of `words_list` against the `w2v` vocabulary.

    Candidates are tried in this order: the first two tokens joined with '_',
    the first token with a trailing "'s" removed, then that token lower-cased.

    Returns a tuple (word, size). `size` is the number of tokens consumed
    (2 for a joined pair, otherwise 1). When no candidate is in `w2v`, `word`
    is the lower-cased token if `passthrough_tokens` is truthy, else None.
    """
    head = words_list[0]

    # Prefer a two-token phrase such as "New_York" when the vocabulary has it.
    if len(words_list) > 1:
        phrase = '_'.join((head, words_list[1]))
        if phrase in w2v:
            return phrase, 2

    base = head[:-2] if head.endswith("'s") else head
    if base in w2v:
        return base, 1

    lowered = base.lower()
    if lowered in w2v:
        return lowered, 1

    if passthrough_tokens:
        return lowered, 1
    return None, 1
    
def lookup_words(words_list, passthrough_tokens):
    """Yield the resolved form of each token in `words_list`, left to right.

    Resolution is delegated to `lookup_word`, which may consume two tokens at
    once; tokens it resolves to None are dropped from the output.
    """
    pending = words_list
    while pending:
        resolved, consumed = lookup_word(pending, passthrough_tokens)
        if resolved is not None:
            yield resolved
        pending = pending[consumed:]

# TODO: check for double quotes as they may be air quotes
# Give every distinct w2v-resolvable token of both datasets its own id,
# numbered in order of first appearance starting at NUM_SPECIAL_IDS.
for dataset in (train_data, test_data):
    for question in dataset['question_text']:
        tokens = [found.group(0).strip("'") for found in PATTERN.finditer(question)]
        for word in lookup_words(tokens, passthrough_tokens=False):
            if word is None or word in WORD_TO_ID:
                continue  # nothing to add: non-word token or already known
            if word in w2v:
                WORD_TO_ID[word] = len(WORD_TO_ID) + NUM_SPECIAL_IDS

# Size of the id space: reserved special ids plus the vocabulary.
NUM_IDS = len(WORD_TO_ID) + NUM_SPECIAL_IDS

In [None]:
# Embedding table with one row per id and NUM_FEATURES columns, zero-filled.
ID_TO_VEC = np.zeros((NUM_IDS, NUM_FEATURES), dtype='float32')

# Each special id gets a 1 in its own marker column.
for special_id, marker_column in (
    (END_ID, END_INDEX),
    (START_ID, START_INDEX),
    (UNKNOWN_ID, UNKNOWN_INDEX),
    (NUM_1_TO_5_ID, NUM_1_TO_5_INDEX),
    (NUM_9_TO_18_ID, NUM_9_TO_18_INDEX),
    (NUM_YEAR_ID, NUM_YEAR_INDEX),
    (NUM_OTHER_ID, NUM_OTHER_INDEX),
    (JEW_OPEN_ID, JEW_OPEN_INDEX),
    (JEW_CLOSE_ID, JEW_CLOSE_INDEX),
    (QUESTION_ID, QUESTION_INDEX),
    (EXCLAMATION_ID, EXCLAMATION_INDEX),
    (ELLIPSIS_ID, ELLIPSIS_INDEX),
    (PERIOD_ID, PERIOD_INDEX),
    (COLON_ID, COLON_INDEX),
):
    ID_TO_VEC[special_id, marker_column] = 1

# Vocabulary rows carry the pretrained vector in the first 300 columns.
for word, word_id in sorted(WORD_TO_ID.items(), key=lambda item: item[1]):
    ID_TO_VEC[word_id, W2V_SLICE] = w2v[word]

In [None]:
# Sanity check: vocabulary size and the (ids, features) shape of the table.
print(len(WORD_TO_ID), ID_TO_VEC.shape)

In [None]:
def word_to_id(word):
    """Map one token to its id, or None when the token should be dropped.

    Word-like tokens (those WORD_PATTERN matches at the start) resolve to
    their WORD_TO_ID entry when present. Otherwise the first DIGIT_PATTERN
    match is parsed with int() and bucketed (1..5, 9..18, 1200..2500, other),
    and tokens without any digit become UNKNOWN_ID.

    Punctuation tokens are mapped by prefix or exact value to the matching
    special id; anything unrecognised returns None.
    """
    if not WORD_PATTERN.match(word):
        # Prefix checks run in this order; the exact '...' test must come
        # before the generic leading-period test.
        for prefix, special_id in (
            ('(((', JEW_OPEN_ID),
            (')))', JEW_CLOSE_ID),
            ('?', QUESTION_ID),
            ('!', EXCLAMATION_ID),
        ):
            if word.startswith(prefix):
                return special_id
        if word == '...':
            return ELLIPSIS_ID
        if word.startswith('.'):
            return PERIOD_ID
        if word == ':':
            return COLON_ID
        return None  # to be filtered out

    known_id = WORD_TO_ID.get(word)
    if known_id is not None:
        return known_id

    # TODO: store magnitude for each number?
    # TODO: make reals their own class
    digits = DIGIT_PATTERN.search(word)
    if digits is None:
        return UNKNOWN_ID

    num = int(digits.group(0))
    if 1 <= num <= 5:
        return NUM_1_TO_5_ID
    if 9 <= num <= 18:
        return NUM_9_TO_18_ID
    if 1200 <= num <= 2500:
        return NUM_YEAR_ID
    return NUM_OTHER_ID

def text_to_ids(text):
    """Encode a question as a list of ids wrapped in START_ID ... END_ID.

    The text is tokenized with PATTERN (surrounding apostrophes stripped),
    resolved through `lookup_words` with unknown tokens passed through, and
    each token is mapped with `word_to_id`. Tokens mapped to None are skipped.
    """
    tokens = [found.group(0).strip("'") for found in PATTERN.finditer(text)]

    ids = [START_ID]
    for word in lookup_words(tokens, passthrough_tokens=True):
        word_id = word_to_id(word)
        if word_id is not None:
            ids.append(word_id)
    ids.append(END_ID)
    return ids

# Encode every question of both datasets as a list of ids.
X_train = list(map(text_to_ids, train_data.question_text))
X_test = list(map(text_to_ids, test_data.question_text))

# Labels straight from the target column, in its original dtype.
y_train = train_data['target'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training questions for validation. The split is seeded
# so the validation set (and therefore the tuned threshold and reported F1)
# is the same after a kernel restart instead of changing on every run.
X_train_train, X_train_val, \
y_train_train, y_train_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0
)

In [None]:
import keras

def div_up(a, b):
    """Return the floor of (a + b - 1) / b, i.e. a / b rounded up for positive b."""
    quotient, _remainder = divmod(a + b - 1, b)
    return quotient

def batch_slice(index, size):
    """Return the slice covering chunk number `index` when chunks hold `size` items."""
    start = index * size
    stop = (index + 1) * size
    return slice(start, stop)

def pad_samples(list_list_ids):
    """Pack variable-length id lists into one zero-padded int32 matrix.

    The matrix has one row per list; its width is the longest list's length
    rounded up to a multiple of 8. Unused trailing cells stay 0 (END_ID).
    """
    longest = max(len(ids) for ids in list_list_ids)
    width = 8 * div_up(longest, 8)
    padded = np.zeros((len(list_list_ids), width), dtype='int32')
    for row, ids in zip(padded, list_list_ids):
        row[:len(ids)] = ids  # `row` is a view, so this writes into `padded`
    return padded

# TODO: generate more data by splicing together more offensive texts
# TODO: mini-epochs for more precise early stopping
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence serving zero-padded batches of token-id lists.

    A full pass over the data is cut into `num_mini_epochs` consecutive
    chunks. Each call to `on_epoch_end` advances to the next chunk, and the
    sample order is reshuffled (when `shuffle` is true) whenever the chunk
    counter wraps back to 0.

    Batches are `samples` alone when no labels were given, otherwise
    `(samples, labels)` with labels cast to float32.
    """

    def __init__(
        self,
        list_list_ids,
        labels=None,
        batch_size=64,
        num_mini_epochs=1,
        shuffle=True,
    ):
        self.list_list_ids = list_list_ids
        self.labels = labels.astype('float32') if labels is not None else None
        self.batch_size = batch_size
        self.num_mini_epochs = num_mini_epochs
        self.shuffle = shuffle
        self.indices = list(range(len(list_list_ids)))
        # Start at -1 so the on_epoch_end() call below selects mini-epoch 0.
        self.mini_epoch = -1
        self.on_epoch_end()

    def __len__(self):
        """Number of batches in the current mini-epoch."""
        return div_up(len(self.mini_epoch_indices), self.batch_size)

    def __getitem__(self, index):
        """Return batch number `index` of the current mini-epoch."""
        chosen = self.mini_epoch_indices[batch_slice(index, self.batch_size)]
        samples = pad_samples([self.list_list_ids[i] for i in chosen])
        if self.labels is not None:
            return samples, self.labels[chosen]
        return samples

    def on_epoch_end(self):
        """Advance to the next mini-epoch, reshuffling when a new pass starts."""
        self.mini_epoch = (self.mini_epoch + 1) % self.num_mini_epochs
        starting_new_pass = self.mini_epoch == 0
        if starting_new_pass and self.shuffle:
            np.random.shuffle(self.indices)

        chunk_size = div_up(len(self.indices), self.num_mini_epochs)
        self.mini_epoch_indices = self.indices[batch_slice(self.mini_epoch, chunk_size)]

In [None]:
import keras.layers as K
from keras.models import Model, Sequential

def combine_stuff(input_layer, *layer_specs, pool=2, unpool=False):
    """Stack Conv1D -> BatchNormalization -> LeakyReLU stages, then max-pool.

    Each entry of `layer_specs` is a (kernel_size, filters) pair describing
    one 'same'-padded convolution stage. The stack is followed by
    MaxPooling1D(pool); with `unpool=True` the result is then upsampled by
    the same factor.
    """
    output = input_layer
    for kernel_size, num_filters in layer_specs:
        output = K.Conv1D(num_filters, kernel_size, padding='same')(output)
        output = K.BatchNormalization()(output)
        output = K.LeakyReLU(0.01)(output)

    output = K.MaxPooling1D(pool)(output)
    return K.UpSampling1D(pool)(output) if unpool else output


# TODO: compress the model a bit
# TODO: only use the tuning layer for subject matter, not linkage?
def build_model():
    """Build the convolutional question classifier on top of ID_TO_VEC.

    The input is a variable-length sequence of int32 token ids. Three
    embeddings of the same ids are combined, passed through stacked
    convolution blocks (see `combine_stuff`), globally max-pooled and fed to
    a dense head with a single sigmoid output.

    Returns the (uncompiled) keras Model.
    """
    # Variable-length sequence of token ids.
    input_layer = K.Input((None,), dtype='int32')
    
    # Frozen embedding holding the special-token marker columns (300 onwards).
    manual_embed_layer = K.Embedding(
        ID_TO_VEC.shape[0],
        NUM_FEATURES - 300,
        weights=[ID_TO_VEC[:, 300:]],
        trainable=False,
    )(input_layer)
    
    # Frozen embedding holding the first 300 columns (the word2vec vectors).
    pretrained_embed_layer = K.Embedding(
        ID_TO_VEC.shape[0],
        300,
        weights=[ID_TO_VEC[:, :300]],
        trainable=False,  # trainable=True baseline: val_loss = 0.1053
    )(input_layer)
    
    # 100-dimensional embedding with random initialisation (no weights given).
    tuning_embed_layer = K.Embedding(
        ID_TO_VEC.shape[0],
        100,
        embeddings_initializer='lecun_normal',
    )(input_layer)
    
    # Kernel-size-1 convolution projecting the tuning embedding to 300
    # channels so it can be added to the pretrained vectors.
    embedded_tuning_embed_layer = K.Conv1D(300, 1)(tuning_embed_layer)
    
    embed_layer = K.concatenate([
        manual_embed_layer,
        K.add([pretrained_embed_layer, embedded_tuning_embed_layer])
    ])

    # Pooled by 4 and upsampled by 4 again, then concatenated with the raw
    # embeddings; the concatenation needs matching sequence lengths.
    # NOTE(review): presumably why pad_samples rounds lengths up to a multiple of 8 - confirm.
    basic_layer = combine_stuff(embed_layer, (2, 200), (2, 300), pool=4, unpool=True)
    e_basic_layer = K.concatenate([manual_embed_layer, pretrained_embed_layer, basic_layer])
    
    # Two parallel branches with kernel sizes 4 and 6, both pooled by 8 and summed.
    linkage_layer_4 = combine_stuff(e_basic_layer, (4, 400), (4, 500), pool=8)
    linkage_layer_6 = combine_stuff(e_basic_layer, (6, 400), (6, 500), pool=8)
    
    linkage_layer = K.add([linkage_layer_4, linkage_layer_6])
    
    # NOTE(review): a batch padded to exactly 8 ids has length 1 after pool=8,
    # before this block pools by 2 again - confirm such batches cannot occur.
    sentence_layer = combine_stuff(linkage_layer, (2, 500))
    layer = K.GlobalMaxPooling1D()(sentence_layer)
    
    layer = K.Dense(256)(layer)
    layer = K.BatchNormalization()(layer)
    layer = K.LeakyReLU(0.01)(layer)
    #layer = K.Dropout(0.5)(layer)

    # Single sigmoid unit: one probability per question.
    layer = K.Dense(1, activation='sigmoid')(layer)
    return Model(input_layer, layer)

# Build the model once and print its layer summary.
keras_model = build_model()
keras_model.summary()

In [None]:
# TODO: add cross-validation
# TODO: proper custom-initialized embedding for fine-tuning?
keras_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
)

# TODO: filter the training set, drop noisy samples (like the ones with zero words)
# The training generator serves one fifth of the shuffled training set per
# Keras epoch, so a full pass over the data takes `num_mini_epochs` epochs.
num_mini_epochs = 5
train_generator = DataGenerator(X_train_train, y_train_train, num_mini_epochs=num_mini_epochs)
val_generator = DataGenerator(X_train_val, y_train_val, batch_size=256)

# Train for at most two full passes (2 * num_mini_epochs Keras epochs), with
# early stopping (patience 2) that restores the best weights seen.
keras_model.fit_generator(
    train_generator,
    validation_data=val_generator,
    epochs=2 * num_mini_epochs,
    use_multiprocessing=True,
    workers=2,
    callbacks=[keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)],
    verbose=2,
)

In [None]:
from sklearn.metrics import f1_score

# Predict on the validation split in its original order (shuffle=False) so the
# probabilities line up with y_train_val.
eval_val_generator = DataGenerator(X_train_val, y_train_val, shuffle=False, batch_size=256)
prob_val = keras_model.predict_generator(
    eval_val_generator,
    use_multiprocessing=True,
    workers=2,
)

In [None]:
def calc_f1(thresh):
    """F1 score on the validation split when `prob_val > thresh` counts as positive."""
    predicted = (prob_val > thresh).astype('int32')
    return f1_score(y_train_val, predicted)

# Scan 100 evenly spaced thresholds in [0.01, 0.5] and keep the first one that
# reaches the highest validation F1.
thresh_options = np.linspace(0.01, 0.5, 100)
f1_values = np.array(list(map(calc_f1, thresh_options)))
best_thresh_index = f1_values.argmax()
best_thresh = thresh_options[best_thresh_index]
print('best_thresh = {}, f1_score = {}'.format(best_thresh, f1_values[best_thresh_index]))

In [None]:
# Disabled error analysis: predicts on the whole training set and writes the
# rows whose thresholded prediction disagrees with the label to a CSV.
if False:  # Kaggle doesn't like multiple outputs in this comp
    train_error_generator = DataGenerator(X_train, shuffle=False, batch_size=256)
    prob_train = keras_model.predict_generator(
        train_error_generator,
        use_multiprocessing=True,
        workers=2,
    )

    # Boolean mask, True where the thresholded prediction differs from the label.
    mispred_train = y_train != (prob_train > best_thresh).ravel()
    print(mispred_train.mean(), mispred_train.sum())
    train_data[mispred_train.astype('bool')].to_csv('mispreditions.csv', index=False)

In [None]:
# Predict on the test set in file order (shuffle=False, no labels).
test_generator = DataGenerator(X_test, shuffle=False, batch_size=256)
prob_test = keras_model.predict_generator(
    test_generator,
    use_multiprocessing=True,
    workers=2 ,
)

# Binarise with the threshold tuned on the validation split.
pred_test = (prob_test > best_thresh).astype('int32')

In [None]:
# One row per test question: its qid and the flattened 0/1 prediction.
submission = pd.DataFrame({'qid': test_data.qid, 'prediction': pred_test.ravel()})

In [None]:
# Shown as the cell output: (rows, columns) of the submission frame.
submission.shape

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)