In [3]:
import pandas as pd
import numpy as np
import itertools
import logging
import seaborn
import pickle
import random
import nltk
import json
import csv
import os


from collections import Counter
from tqdm import tqdm
from gensim.models import FastText, Word2Vec, KeyedVectors

from gensim_w2v import tokenizers
from text_processing_utils import vectorize, build_vocab, get_embeddings, read_fasttext

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline

### This notebook contains and describes the procedures necessary for pre-training a deep NN on OPUS data and fine-tuning it on the competition data.

# Data preparation

For pre-training we will be using the OPUS dataset, freely available for download at http://opus.nlpl.eu/OpenSubtitles2018.php. 

The repository contains a small sample of the data, consider downloading the whole dataset to get the real results. Russian dataset can be downloaded at http://opus.nlpl.eu/download.php?f=OpenSubtitles2016/en-ru.txt.zip

In [7]:
# read OPUS data
# this might as well be english or spanish subtitle data

tokens = []
c = 0
with open("./assets/cleaned_subs.txt", "r") as fi:
    for l in fi:
        tokens.append(l.strip().split())

In this walkthrough we will use a pre-trained russian fasttext model (not included).

You can get one at https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.ru.300.vec.gz

In [5]:
# read russian fasttext model
w2v = read_fasttext("./assets/cc.ru.300.vec")

In [8]:
# prepare word-to-id and id-to-word mappings
voc, rvoc = build_vocab(tokens, 250000, emb_model=w2v)
print(len(voc))

250000


In [9]:
voc

{'NULL': 0,
 'UNKN': 1,
 '<S>': 2,
 '</S>': 3,
 '.': 4,
 ',': 5,
 'я': 6,
 '?': 7,
 '-': 8,
 'не': 9,
 'что': 10,
 'и': 11,
 'в': 12,
 'это': 13,
 'ты': 14,
 '!': 15,
 '...': 16,
 'на': 17,
 'с': 18,
 'он': 19,
 'мы': 20,
 'как': 21,
 'вы': 22,
 'у': 23,
 'но': 24,
 'мне': 25,
 'меня': 26,
 'а': 27,
 'так': 28,
 'его': 29,
 'она': 30,
 'да': 31,
 'нет': 32,
 'все': 33,
 'если': 34,
 'о': 35,
 'тебя': 36,
 'бы': 37,
 'за': 38,
 'они': 39,
 'тебе': 40,
 'чтобы': 41,
 'из': 42,
 'к': 43,
 'же': 44,
 'когда': 45,
 'для': 46,
 'есть': 47,
 'просто': 48,
 'был': 49,
 'по': 50,
 'то': 51,
 'только': 52,
 'было': 53,
 'ну': 54,
 'здесь': 55,
 'может': 56,
 'от': 57,
 'нас': 58,
 'вас': 59,
 'быть': 60,
 'знаю': 61,
 'всё': 62,
 'их': 63,
 'будет': 64,
 'или': 65,
 'вам': 66,
 'вот': 67,
 'кто': 68,
 'уже': 69,
 'еще': 70,
 'нам': 71,
 'почему': 72,
 'очень': 73,
 'была': 74,
 'ее': 75,
 'этого': 76,
 'там': 77,
 'могу': 78,
 'чем': 79,
 'хорошо': 80,
 'хочу': 81,
 'нужно': 82,
 'сейчас': 83,
 

In [10]:
# prepare embedding matrix
myembs = get_embeddings(w2v, rvoc)
myembs.shape

(250000, 300)

In [11]:
# represent data as a matrix of indices 
VT = vectorize(tokens, voc, max_len=20)

100%|██████████| 9016819/9016819 [00:36<00:00, 245561.64it/s]


In [12]:
# define batch generator

def generate_batch(dmatrix, batch_size=256):

    indices = np.arange(0, len(dmatrix)-4)
    
    def generate_sample():
        sid = random.choice(indices)
        l = random.choice([0,1])
        if l:
            # we either use 4 consecutive utterances (positive sample)
            sample = dmatrix[sid], dmatrix[sid+1], dmatrix[sid+2], dmatrix[sid+3]
        else:
            # or 3 consecutive and one random utterance (negative sample)
            sample = dmatrix[sid], dmatrix[sid+1], dmatrix[sid+2], dmatrix[random.choice(indices)]
        return sample, l
    
    while True:
        # then prepare a batch of given size
        C1, C2, C3, R, L = [], [], [], [], []
        for _ in range(batch_size):
            xx, yy = generate_sample()
            C1.append(xx[0])
            C2.append(xx[1])
            C3.append(xx[2])
            R.append(xx[3])
            L.append(yy)
            
        yield ([np.array(C1), np.array(C2), np.array(C3), np.array(R)], np.array(L))

# Model architecture

In [13]:
from keras import backend as K
from keras.models import Model, model_from_json

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Lambda, Reshape, Flatten, Input, CuDNNGRU, CuDNNLSTM
from keras.layers import Input, Dense, Dropout, Activation, Embedding
from keras.layers import Bidirectional, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras.layers.advanced_activations import LeakyReLU

from sklearn.model_selection import KFold

from T_ops import *
from commons import LossHistory, AUC_Saver, maybe_mkdir

os.environ['CUDA_VISIBLE_DEVICES']="0"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Define some re-usable building blocks 

In [14]:
def build_embedder(embs_matrix, seqlen, vsiz=None, 
                   weighted=False, transformed=False, default_dim=300, prefix='word'):
    
    inp = Input(shape=(seqlen,))
    
    if embs_matrix is not None:
        emb_dim = embs_matrix.shape[1]
        voc_siz = embs_matrix.shape[0]
        enc = Embedding(voc_siz, emb_dim, input_length=seqlen, 
                        weights=[embs_matrix], trainable = False)(inp)
    else:
        emb_dim = default_dim
        voc_siz = vsiz
        enc = Embedding(voc_siz, emb_dim, input_length=seqlen)(inp)
        
    if transformed:
        trf = Dense(emb_dim)(enc)
        act = LeakyReLU()(trf)
    else:
        act = enc
    
    if weighted:
        wwt = Embedding(voc_siz, 1, input_length=seqlen,
                        weights=[np.ones(shape=(voc_siz,1))])(inp)
        wac = Reshape((-1,1))(Activation("softmax")(Reshape((-1,))(wwt)))
        out = Lambda(pairwise_mul, name='MulLayer')([act, wac])
    else:
        out = act
    
    return Model(inputs=[inp], outputs=out, name=prefix+'_embedding_model')

def build_lstm_encoder(input_shape, return_sequences=False, bidirectional=False, lstm_dim=300, prefix="word", rdp=0.1):
    
    LSTM_DIM = lstm_dim
    inp = Input(shape=tuple(input_shape[-2:]))
    if bidirectional:
        rnn = Bidirectional(CuDNNLSTM(LSTM_DIM, return_sequences=return_sequences))(inp)
    else:
        rnn = CuDNNLSTM(LSTM_DIM, return_sequences=return_sequences)(inp)
        
    mod = Model(inputs=inp, outputs=rnn, name=prefix+'_lstm_encoder')
    return mod

def build_gru_encoder(input_shape, return_sequences=False, bidirectional=False, lstm_dim=300, prefix="word", rdp=0.1):
    
    LSTM_DIM = lstm_dim
    inp = Input(shape=tuple(input_shape[-2:]))
    if bidirectional:
        rnn = Bidirectional(CuDNNGRU(LSTM_DIM, return_sequences=return_sequences))(inp)
    else:
        rnn = CuDNNGRU(LSTM_DIM, return_sequences=return_sequences)(inp)
        
    mod = Model(inputs=inp, outputs=rnn, name=prefix+'_lstm_encoder')
    return mod

def build_deep_sim_net(input_shape, inr_dim=300, DROPOUT=0.3):
    
    input_a = Input(shape=(input_shape[-1],))
    input_b = Input(shape=(input_shape[-1],))
    
    mul_layer = Lambda(pairwise_mul, name='MultiplicationLayer')([input_a, input_b])
    dis_layer = Lambda(pairwise_dis, name='SubstractionLayer')([input_a, input_b])
    css_layer = Lambda(cosine_similarity)([input_a, input_b])

    dn1 = concatenate([mul_layer, dis_layer, css_layer])
    bn1 = BatchNormalization()(dn1)
    dr1 = Dropout(DROPOUT)(bn1)
    
    dn2 = Dense(inr_dim)(dr1)    
    bn2 = BatchNormalization()(dn2)
    ac2 = LeakyReLU(alpha=0.3)(bn2)
        
    mod = Model(inputs=[input_a, input_b], outputs=ac2, name='deep_sim_net')
    return mod

### Define the main model architecture

In [15]:
def get_sd_memnet_model():

    BDR=True
    
    DROPOUT=0.3
    LSTM_DIM=300
    SEQ_LEN = 20
    DENSE_DIM = 600

    # inputs for contexts and reply
    inp_ctx1 = Input(shape=(SEQ_LEN,), name='inp_ctx_0')
    inp_ctx2 = Input(shape=(SEQ_LEN,), name='inp_ctx_1')
    inp_ctx3 = Input(shape=(SEQ_LEN,), name='inp_ctx_2')
    inp_rpl = Input(shape=(SEQ_LEN,), name='inp_reply')

    # word embedding model
    embedder = build_embedder(myembs, SEQ_LEN)
    
    # shared sentence-level encoder
    encoder_ctx = build_lstm_encoder(embedder.output_shape, return_sequences=False, 
                                     lstm_dim=LSTM_DIM, bidirectional=BDR, prefix="sentence")

    emb_ctx1 = embedder(inp_ctx1)
    emb_ctx2 = embedder(inp_ctx2)
    emb_ctx3 = embedder(inp_ctx3)
    emb_rpl = embedder(inp_rpl)
    
    
    def dense_comb():
        # shared dense layer to combine context and reply vectors
        inp1 = Input(shape=encoder_ctx.output_shape)
        inp2 = Input(shape=encoder_ctx.output_shape)
        
        dens = Dense(DENSE_DIM, activation='relu')(concatenate([inp1, inp2]))
        dens2 = Dense(DENSE_DIM, activation='relu')(dens)
        return Model(inputs=[inp1, inp2], outputs=dens2, name='merge_model')
    
    dm = dense_comb()

    # encode contexts and reply
    enc_ctx1 = encoder_ctx(emb_ctx1)
    enc_ctx2 = encoder_ctx(emb_ctx2)
    enc_ctx3 = encoder_ctx(emb_ctx3)
    enc_rpl = encoder_ctx(emb_rpl)
    
    # condition context encoding on reply encoding
    ctx1_cmb = Reshape((1,DENSE_DIM))(dm([enc_ctx1, enc_rpl]))
    ctx2_cmb = Reshape((1,DENSE_DIM))(dm([enc_ctx2, enc_rpl]))
    ctx3_cmb = Reshape((1,DENSE_DIM))(dm([enc_ctx3, enc_rpl]))
    
    # encode the whole context into a single vector
    ctx_h = CuDNNGRU(LSTM_DIM*2, return_sequences=False, name='context_lstm_encoder')(
        concatenate([ctx1_cmb, ctx2_cmb, ctx3_cmb], axis=1))
    
    # 2-layer MLP to evaluate relatedness between context and reply
    dsm = build_deep_sim_net((-1,LSTM_DIM*2), inr_dim=DENSE_DIM)
    css = dsm([ctx_h, enc_rpl])

    # output neuron (during pretraining we do binary classification)
    fc2 = Dense(1, activation='sigmoid', name='relevance')(css)

    model = Model(inputs=[inp_ctx1,inp_ctx2,inp_ctx3,inp_rpl], outputs=fc2)


    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model

# Pre-training

### Build model and prepare to train

In [16]:
model = get_sd_memnet_model()

Instructions for updating:
dim is deprecated, use axis instead


In [19]:
gen = 3
class_name = 'memnet'
net_name = 'lstm_rus_fasttext'

model = get_sd_memnet_model()

maybe_mkdir("./models/gen{}".format(gen))
maybe_mkdir("./models/gen{}/{}".format(gen, class_name))

chfilepath = "./models/gen{}/{}/{}.hdf5".format(gen, class_name, net_name)

checkpointer = ModelCheckpoint(chfilepath, save_best_only=True)
histlogger = LossHistory("./models/gen{}/{}/{}.csv".format(gen, class_name, net_name))
json.dump(model.to_json(), open("./models/gen{}/{}/{}.json".format(gen, class_name, net_name), "w"))

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# adjust as needed 
traingen = generate_batch(VT[:7000000], batch_size=512)
evalgen = generate_batch(VT[7000000:], batch_size=512)

### Begin pre-training

The model reaches peak accuracy after 128-256 epochs and can be trained indefinetely without overfitting because of the sheer size of the dataset

In [20]:
model.fit_generator(generator=traingen, steps_per_epoch=2048, epochs=16, 
                    validation_data=evalgen, validation_steps=256, callbacks=[checkpointer, histlogger, reduce_lr])

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.History at 0x7f81741f8240>

In [21]:
pretrained_mpath = chfilepath

# Fine-tuning

Now we fine-tune the pre-trained model on the high-quality competition data 

In [22]:
# this loads the architecture and weights of the pretrained-model, then modifies it for the task of the competition

def make_pretrained_model(fpath):
    model = model_from_json(json.load(open(fpath)))
    model.load_weights(fpath.replace("json", "hdf5"))

    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=model.get_layer("deep_sim_net").get_output_at(1))
    
    # gets a list of model layers up to the MLP
    lr = model.get_layer("deep_sim_net").get_output_at(0)

    # freezes all model weights except the MLP part which will be finetuned
    for layer in intermediate_layer_model.layers[:-1]:
        layer.trainable=False

    SEQ_LEN = 20

    inp_ctx1 = Input(shape=(SEQ_LEN,))
    inp_ctx2 = Input(shape=(SEQ_LEN,))
    inp_ctx3 = Input(shape=(SEQ_LEN,))
    inp_rpl = Input(shape=(SEQ_LEN,))

    dns = intermediate_layer_model([inp_ctx1, inp_ctx2, inp_ctx3, inp_rpl])
    
    # during fine-tuning we do regression
    dns_out = Dense(1)(dns)

    fin_model = Model(inputs=[inp_ctx1, inp_ctx2, inp_ctx3, inp_rpl], outputs=dns_out)
    fin_model.compile(optimizer='adam',
                  loss='mean_squared_error',
                  metrics=['accuracy'])
    
    return fin_model

In [23]:
def read_data():
    columns = ['context_id','context_2','context_1','context_0','reply_id','reply','label','confidence']

    test_df = pd.read_csv("./data/final.tsv", sep="\t", header=None, quoting=csv.QUOTE_NONE)
    train_df = pd.read_csv("./data/train.tsv", sep="\t", header=None, quoting=csv.QUOTE_NONE)

    train_df = train_df.fillna("")
    test_df = test_df.fillna("")

    test_df.columns = columns[:-2]
    train_df.columns = columns
    
    return train_df, test_df

def tokenize_sents(sents, tokenizer):
    return [tokenizer(s) for s in sents]

### Load and prepare competition data

In [28]:
train_df, test_df = read_data()

In [29]:
cxv1 = vectorize(tokenize_sents(train_df['context_2'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
cxv2 = vectorize(tokenize_sents(train_df['context_1'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
cxv3 = vectorize(tokenize_sents(train_df['context_0'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
train_rpl_V = vectorize(tokenize_sents(train_df['reply'].tolist(), tokenizers.tokenize_split), voc, max_len=20)

100%|██████████| 97533/97533 [00:00<00:00, 481908.60it/s]
100%|██████████| 97533/97533 [00:00<00:00, 388859.63it/s]
100%|██████████| 97533/97533 [00:00<00:00, 357807.77it/s]
100%|██████████| 97533/97533 [00:00<00:00, 330870.01it/s]


In [30]:
ts_cxv1 = vectorize(tokenize_sents(test_df['context_2'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
ts_cxv2 = vectorize(tokenize_sents(test_df['context_1'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
ts_cxv3 = vectorize(tokenize_sents(test_df['context_0'].tolist(), tokenizers.tokenize_split), voc, max_len=20)
ts_rpl_V = vectorize(tokenize_sents(test_df['reply'].tolist(), tokenizers.tokenize_split), voc, max_len=20)

100%|██████████| 104834/104834 [00:00<00:00, 490874.41it/s]
100%|██████████| 104834/104834 [00:00<00:00, 406835.40it/s]
100%|██████████| 104834/104834 [00:00<00:00, 359543.03it/s]
100%|██████████| 104834/104834 [00:00<00:00, 346161.97it/s]


In [31]:
nl = []
for l in train_df['label'].tolist():
    if l == 'good':
        nl.append(1)
    elif l == 'neutral':
        nl.append(0.5)
    elif l == 'bad':
        nl.append(0)
        
y_train = np.array(nl)
        
confs = np.array(train_df['confidence'])

### Train the model with 10-fold CV

In [34]:
kf = KFold(n_splits=10, shuffle=True, random_state=34)

splits = []
oofpreds = []
tspreds = []
train = True


for i, split in enumerate(kf.split(list(range(cxv1.shape[0])))):
    splits.append(split)
    print("Processing fold {}".format(i+1))
    
    model = make_pretrained_model(pretrained_mpath.replace("hdf5", "json"))
    
    tX = [cxv1[split[0]], cxv2[split[0]], cxv3[split[0]], train_rpl_V[split[0]]]
    #tX = [train_ctx_V[split[0]], train_rpl_V[split[0]]]
    tY = y_train[split[0]]
    
    vX = [cxv1[split[1]], cxv2[split[1]], cxv3[split[1]], train_rpl_V[split[1]]]
    #vX = [train_ctx_V[split[1]], train_rpl_V[split[1]]]
    vY = y_train[split[1]]
    
    gen = 3
    class_name = 'pretrained'
    net_name = 'pretrained_lstm_rus_fasttext_'+str(i)
    
    maybe_mkdir("./models/gen{}".format(gen))
    maybe_mkdir("./models/gen{}/{}".format(gen, class_name))
    
    chfilepath = "./models/gen{}/{}/{}.hdf5".format(gen, class_name, net_name)
    if train:
        checkpointer = ModelCheckpoint(chfilepath, save_best_only=True)
        histlogger = LossHistory("./models/gen{}/{}/{}.csv".format(gen, class_name, net_name))
        json.dump(model.to_json(), open("./models/gen{}/{}/{}.json".format(gen, class_name, net_name), "w"))

        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                      patience=5, min_lr=0.0001)

        h = model.fit(tX, tY, validation_data=(vX, vY), 
                     batch_size=512, epochs=16,
                     verbose=1, callbacks=[checkpointer, histlogger, reduce_lr])
    
    model.load_weights(chfilepath)
    
    oofp = model.predict(vX, batch_size=512)

    oofpreds.append([oofp, split[1]])
    tspreds.append(model.predict([ts_cxv1, ts_cxv2, ts_cxv3, ts_rpl_V], batch_size=512))
    #tspreds.append(model.predict([ts_ctx_V, ts_rpl_V], batch_size=512))

Processing fold 1
Train on 87779 samples, validate on 9754 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 2
Train on 87779 samples, validate on 9754 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 3
Train on 87779 samples, validate on 9754 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 4
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16


Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 5
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 6
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 7
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 8
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16


Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 9
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Processing fold 10
Train on 87780 samples, validate on 9753 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


### Finally, save the vocabs for later

In [35]:
pickle.dump([voc, rvoc], 
            open("./assets/rus_fasttext.voc","wb"))