In [15]:
import pandas as pd
import numpy as np
import itertools
import nltk
import csv
import seaborn 

import os
import json
import pickle

from tqdm import tqdm
from glob import glob
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from gensim.models import FastText, Word2Vec
from text_processing_utils import vectorize, build_vocab, get_embeddings, read_fasttext

from keras.models import model_from_json, Model
from keras.layers import Input, Dense

os.environ['CUDA_VISIBLE_DEVICES']="0"

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### In this notebook we use the trained models to generate features for the ensembling model

In [16]:
def read_data():
    """Read the train and test TSV files and return them as ``(train_df, test_df)``.

    Both files are parsed as header-less, tab-separated text with quoting
    disabled. Missing values are replaced by empty strings. The test file
    carries the same columns as the train file minus the last two
    (``label`` and ``confidence``).
    """
    all_columns = ['context_id','context_2','context_1','context_0','reply_id','reply','label','confidence']
    tsv_options = dict(sep="\t", header=None, quoting=csv.QUOTE_NONE)

    test_df = pd.read_csv("./data/final.tsv", **tsv_options).fillna("")
    train_df = pd.read_csv("./data/train.tsv", **tsv_options).fillna("")

    test_df.columns = all_columns[:-2]
    train_df.columns = all_columns

    return train_df, test_df

In [17]:
train_df, test_df = read_data()

In [18]:
# Turn the textual labels into regression targets. A label that is not one of
# the three known values contributes nothing to the target list.
label_scores = {'good': 1, 'neutral': 0.5, 'bad': 0}
nl = [label_scores[l] for l in train_df['label'].tolist() if l in label_scores]

y_train = np.array(nl)

### Load vocabs

Here we load the word-to-id mappings used to train the different supervised models. These are needed for reproducibility, but are useless without the actual model weights (not included).

In [6]:
# Vocabularies used by the supervised models, indexed by name in the cells below.
# pickle executes arbitrary code on load: only open files you trust.
with open("./assets/repr_vocs.pkl", 'rb') as vocab_file:
    VD = pickle.load(vocab_file)

# Supervised models

In [19]:
from gensim_w2v import tokenizers

def tokenize_split(sents):
    """Apply ``tokenizers.tokenize_split`` to every sentence; one result per input, in order."""
    return list(map(tokenizers.tokenize_split, sents))

def tokenize_char(sents):
    """Return each item of ``sents`` converted with ``str`` (a string is its own character sequence)."""
    return list(map(str, sents))

def make_preds(model, wpaths, vocab, tokenizer, MAXLEN):
    """Produce out-of-fold and test predictions for one supervised model.

    Reads the module-level ``train_df``, ``test_df`` and ``y_train``. The four
    text columns of both frames are tokenized with ``tokenizer`` and turned
    into arrays with ``vectorize(..., vocab, max_len=MAXLEN)``. A fixed 10-fold
    shuffled split (``random_state=34``) is then walked; for every fold the
    weights are loaded and the model predicts both the held-out rows and the
    whole test set. The mean of the per-fold MSE values is printed.

    :param model: object exposing ``load_weights`` and ``predict``.
    :param wpaths: either a list/tuple with one weights path per fold, or a
                   single weights path that is reused for every fold.
    :param vocab: vocabulary passed through to ``vectorize``.
    :param tokenizer: callable mapping a list of texts to tokenized texts.
    :param MAXLEN: ``max_len`` passed through to ``vectorize``.
    :return: ``(splits, oofpreds, tspreds, test_inputs)`` where ``splits`` holds
             the ``(train_idx, val_idx)`` pairs, ``oofpreds`` holds
             ``[predictions, val_idx]`` per fold, ``tspreds`` holds the test
             predictions per fold and ``test_inputs`` is the list of four
             vectorized test arrays.
    :raises ValueError: if ``wpaths`` is a list/tuple whose length differs
                        from the number of folds.
    """
    n_folds = 10
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=34)

    # Decide by type, not by len(): a 10-character path string must not be
    # mistaken for ten per-fold paths, and a glob that matched the wrong
    # number of files should fail loudly before any expensive work starts.
    if isinstance(wpaths, (list, tuple)):
        if len(wpaths) != n_folds:
            raise ValueError(
                "expected {} weight files (one per fold), got {}".format(n_folds, len(wpaths)))
        fold_weights = list(wpaths)
    else:
        fold_weights = [wpaths] * n_folds

    text_columns = ['context_2', 'context_1', 'context_0', 'reply']
    train_inputs = [vectorize(tokenizer(train_df[col].tolist()), vocab, max_len=MAXLEN)
                    for col in text_columns]
    test_inputs = [vectorize(tokenizer(test_df[col].tolist()), vocab, max_len=MAXLEN)
                   for col in text_columns]

    splits = []
    oofpreds = []
    tspreds = []
    scores = []
    for fold, split in enumerate(kf.split(list(range(train_inputs[0].shape[0])))):
        splits.append(split)
        val_idx = split[1]

        vX = [arr[val_idx] for arr in train_inputs]
        vY = y_train[val_idx]

        model.load_weights(fold_weights[fold])

        oofp = model.predict(vX, batch_size=512)
        scores.append(mean_squared_error(vY, oofp))
        oofpreds.append([oofp, val_idx])
        tspreds.append(model.predict(test_inputs, batch_size=512))
    print(np.mean(scores))
    return splits, oofpreds, tspreds, test_inputs

def make_pretrained_model(mpath, seqlen):
    """Rebuild a saved architecture and wrap it with a single-unit regression head.

    The JSON file at ``mpath`` is decoded and the result handed to
    ``model_from_json`` (so the file is presumably a JSON-encoded string that
    itself contains the Keras model description -- confirm against the
    training code). The sub-model ending at output node 1 of the
    ``"deep_sim_net"`` layer is extracted, every layer of it except the last
    is frozen, and a ``Dense(1)`` layer is stacked on top.

    :param mpath: path to the JSON architecture file.
    :param seqlen: length of each of the four integer-sequence inputs.
    :return: compiled Keras model (adam optimizer, mean squared error loss)
             taking ``[ctx1, ctx2, ctx3, reply]`` as inputs.
    """
    # Context manager so the descriptor is released right after parsing.
    with open(mpath) as spec_file:
        model = model_from_json(json.load(spec_file))

    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=model.get_layer("deep_sim_net").get_output_at(1))

    for layer in intermediate_layer_model.layers[:-1]:
        layer.trainable = False

    # Four inputs, created in the order context_2, context_1, context_0, reply
    # used by make_preds.
    inputs = [Input(shape=(seqlen,)) for _ in range(4)]

    dns = intermediate_layer_model(inputs)
    dns_out = Dense(1)(dns)

    fin_model = Model(inputs=inputs, outputs=dns_out)
    fin_model.compile(optimizer='adam',
                      loss='mean_squared_error',
                      metrics=['accuracy'])

    return fin_model

def assert_equal(pack, path):
    """Check freshly computed out-of-fold predictions against a pickled reference.

    Element ``[1]`` of ``pack`` and of the unpickled reference is treated as a
    sequence of per-fold entries whose first item is the prediction array;
    every pair must satisfy ``np.allclose`` and both sequences must have the
    same length.

    :param pack: result tuple as returned by ``make_preds``.
    :param path: path of the pickled reference pack (only load trusted files).
    :return: ``0`` when every fold matches.
    :raises AssertionError: on a fold-count mismatch or differing predictions.
    """
    print("checking ... ", end="")
    with open(path, "rb") as ref_file:
        reference = pickle.load(ref_file)

    # zip() would silently ignore surplus folds on either side.
    if len(pack[1]) != len(reference[1]):
        raise AssertionError(
            "fold count mismatch: {} computed vs {} in reference".format(len(pack[1]), len(reference[1])))

    # Explicit raise instead of `assert` so the check survives `python -O`.
    for fold, (computed, expected) in enumerate(zip(pack[1], reference[1])):
        if not np.allclose(computed[0], expected[0]):
            raise AssertionError("predictions differ from reference in fold {}".format(fold))

    print("valid")
    return 0

# 0

Your model, trained on russian subtitles using fasttext embeddings.

In [20]:
model = make_pretrained_model("./models/gen3/memnet/lstm_rus_fasttext.json", 20)

Instructions for updating:
dim is deprecated, use axis instead


In [21]:
# Only the first element of the pickled object is used as the vocabulary.
with open("./assets/rus_fasttext.voc", "rb") as voc_file:
    my_voc = pickle.load(voc_file)[0]

In [22]:
ppack = make_preds(model, 
           sorted(glob("./models/gen3/pretrained/pretrained_lstm_rus_fasttext_*.hdf5")), 
           my_voc, tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 518115.87it/s]
100%|██████████| 97533/97533 [00:00<00:00, 439514.60it/s]
100%|██████████| 97533/97533 [00:00<00:00, 194084.08it/s]
100%|██████████| 97533/97533 [00:00<00:00, 355982.56it/s]
100%|██████████| 104834/104834 [00:00<00:00, 244108.69it/s]
100%|██████████| 104834/104834 [00:00<00:00, 442013.42it/s]
100%|██████████| 104834/104834 [00:00<00:00, 376094.54it/s]
100%|██████████| 104834/104834 [00:00<00:00, 198107.20it/s]


0.19421859127633517


In [26]:
# NOTE(review): this pack goes to ./features/supervised while the other supervised
# packs in this notebook go to ./final/supervised -- confirm that is intended.
with open("./features/supervised/rus_fasttext.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

I used the following 7 models to make the final submission.

# 1

Russian subtitles + CBOW word embeddings trained on OPUS using scripts from ./gensim_w2v directory

In [10]:
model = make_pretrained_model("./models/gen2/memnet/memnet_sd_lstm_dsm_binary_paranoid_xlarge.json", 20)

Instructions for updating:
dim is deprecated, use axis instead


In [11]:
ppack = make_preds(model, 
           sorted(glob("./models/gen2/pretrained/_tunedmemnet_xlarge_myembs_regr_*.hdf5")), 
           VD['cbow_1M_ppc_big'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 332026.64it/s]
100%|██████████| 97533/97533 [00:00<00:00, 423217.24it/s]
100%|██████████| 97533/97533 [00:00<00:00, 379208.62it/s]
100%|██████████| 97533/97533 [00:00<00:00, 256113.74it/s]
100%|██████████| 104834/104834 [00:00<00:00, 340717.12it/s]
100%|██████████| 104834/104834 [00:00<00:00, 354495.28it/s]
100%|██████████| 104834/104834 [00:00<00:00, 274571.33it/s]
100%|██████████| 104834/104834 [00:00<00:00, 259408.32it/s]


0.19139334832704172


In [13]:
assert_equal(ppack, "./reproducable/supervised/tunedmemnet_xlarge_myembs_regr.pkl")

checking ... valid


In [16]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/tunedmemnet_xlarge_myembs_regr.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

# 2

Russian subtitles + fasttext word embeddings 

In [17]:
model = make_pretrained_model("./models/gen2/memnet/memnet_sd_gru_dsm_binary_paranoid_big.json", 20)

In [18]:
ppack = make_preds(model,
           sorted(glob("./models/gen2/pretrained/_tunedmemnet_biggru_ft_regr_*.hdf5")), 
           VD['cc.ru.300_big'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 314462.78it/s]
100%|██████████| 97533/97533 [00:00<00:00, 285731.98it/s]
100%|██████████| 97533/97533 [00:00<00:00, 369019.03it/s]
100%|██████████| 97533/97533 [00:00<00:00, 214522.28it/s]
100%|██████████| 104834/104834 [00:00<00:00, 319658.77it/s]
100%|██████████| 104834/104834 [00:00<00:00, 264896.02it/s]
100%|██████████| 104834/104834 [00:00<00:00, 254760.20it/s]
100%|██████████| 104834/104834 [00:00<00:00, 239416.25it/s]


0.18832487898291178


In [19]:
assert_equal(ppack, "./reproducable/supervised/tunedmemnet_biggru_ft_regr.pkl")

checking ... valid


In [20]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/tunedmemnet_biggru_ft_regr.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

# 3

Russian subtitles - char-level model (no pretrained embeddings)

In [21]:
model = make_pretrained_model("./models/gen2/memnet/memnet_cl_sd_lstm_dsm_binary_paranoid_xlarge.json", 96)

In [22]:
ppack = make_preds(model,
           sorted(glob("./models/gen2/pretrained/pretrained_memnet_cl_sd_lstm_dsm_binary_paranoid_xlarge*.hdf5")), 
           VD['clevel_big'][0], tokenize_char, 96)

100%|██████████| 97533/97533 [00:00<00:00, 194610.73it/s]
100%|██████████| 97533/97533 [00:00<00:00, 119922.76it/s]
100%|██████████| 97533/97533 [00:00<00:00, 127411.66it/s]
100%|██████████| 97533/97533 [00:00<00:00, 115021.34it/s]
100%|██████████| 104834/104834 [00:00<00:00, 199373.76it/s]
100%|██████████| 104834/104834 [00:00<00:00, 129825.52it/s]
100%|██████████| 104834/104834 [00:00<00:00, 105224.51it/s]
100%|██████████| 104834/104834 [00:00<00:00, 132692.94it/s]


0.19324107112287126


In [23]:
assert_equal(ppack, "./reproducable/supervised/tuned_memnet_cl_sd_lstm_dsm_binary_paranoid_xlarge.pkl")

checking ... valid


In [24]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/tuned_memnet_cl_sd_lstm_dsm_binary_paranoid_xlarge.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

# 4

Russian subtitles lemmatized with mystem + CBOW word embeddings trained on them

In [25]:
model = make_pretrained_model("./models/gen2/memnet/memnet_sd_gru_dsm_binary_paranoid_lemma.json", 20)

In [26]:
from pymystem3 import Mystem
stemmer = Mystem()

def lemmatize(t):
    """Lemmatize ``t`` with the module-level Mystem instance; pieces are joined and the result stripped."""
    pieces = stemmer.lemmatize(t)
    joined = ''.join(pieces)
    return joined.strip()

In [27]:
# Overwrite every text column of both frames, in place, with its lemmatized form.
# The frames stay lemmatized until read_data() is called again.
for df in (train_df, test_df):
    for col in ('context_2', 'context_1', 'context_0', 'reply'):
        df[col] = [lemmatize(t) for t in df[col]]

In [28]:
ppack = make_preds(model,
                   sorted(glob("./models/gen2/pretrained/sd_memnet_lemma_mycbow_regr*.hdf5")), 
                   VD['lemma'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 489586.79it/s]
100%|██████████| 97533/97533 [00:00<00:00, 247923.41it/s]
100%|██████████| 97533/97533 [00:00<00:00, 224727.57it/s]
100%|██████████| 97533/97533 [00:00<00:00, 342395.39it/s]
100%|██████████| 104834/104834 [00:00<00:00, 486646.63it/s]
100%|██████████| 104834/104834 [00:00<00:00, 410380.30it/s]
100%|██████████| 104834/104834 [00:00<00:00, 357292.85it/s]
100%|██████████| 104834/104834 [00:00<00:00, 341661.47it/s]


0.19039269212295565


In [29]:
assert_equal(ppack, "./reproducable/supervised/memnet_lemma_mycbow_regr.pkl")

checking ... valid


In [31]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/memnet_lemma_mycbow_regr.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

## 5

English subtitles + fasttext word embeddings

In [33]:
model = make_pretrained_model("./models/gen2/memnet/memnet_lstm_eng_ft.json", 20)

Instructions for updating:
dim is deprecated, use axis instead


In [49]:
train_df, test_df = read_data()

In [29]:
# Lookup table used by the next cell to replace each original text with its
# English counterpart.
with open("./assets/translations/english_preprocessed.pkl","rb") as trans_file:
    trans = pickle.load(trans_file)
#trans1 = pickle.load(open("./assets/translations/fin.eng.pkl","rb"))

In [36]:
# Replace every text, in place, by its entry in `trans`; a text missing from the
# table raises KeyError.
text_columns = ['context_2','context_1','context_0','reply']
for df in (train_df, test_df):
    for col in text_columns:
        df[col] = list(map(trans.__getitem__, df[col]))

In [37]:
ppack = make_preds(model,
                   sorted(glob("./models/gen2/pretrained/pretrained_memnet_lstm_eng_ft_*.hdf5")), 
                   VD['eng_ft'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 329223.35it/s]
100%|██████████| 97533/97533 [00:00<00:00, 276766.89it/s]
100%|██████████| 97533/97533 [00:00<00:00, 269277.79it/s]
100%|██████████| 97533/97533 [00:00<00:00, 259398.30it/s]
100%|██████████| 104834/104834 [00:00<00:00, 339675.88it/s]
100%|██████████| 104834/104834 [00:00<00:00, 299523.35it/s]
100%|██████████| 104834/104834 [00:00<00:00, 265300.70it/s]
100%|██████████| 104834/104834 [00:00<00:00, 268936.42it/s]


0.19341991963321165


In [23]:
t = assert_equal(ppack, "./reproducable/supervised/pretrained_memnet_lstm_eng_ft.pkl")

checking ... valid


In [24]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/pretrained_memnet_lstm_eng_ft.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

## 6

English subtitles + glove word embeddings

In [25]:
model = make_pretrained_model("./models/gen2/memnet/memnet_lstm_eng_glove.json", 20)

In [26]:
ppack = make_preds(model,
                   sorted(glob("./models/gen2/pretrained/pretrained_memnet_lstm_eng_gl_*.hdf5")), 
                   VD['eng_glove'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 295888.96it/s]
100%|██████████| 97533/97533 [00:00<00:00, 419124.86it/s]
100%|██████████| 97533/97533 [00:00<00:00, 243955.89it/s]
100%|██████████| 97533/97533 [00:00<00:00, 239618.99it/s]
100%|██████████| 104834/104834 [00:00<00:00, 309389.55it/s]
100%|██████████| 104834/104834 [00:00<00:00, 267594.01it/s]
100%|██████████| 104834/104834 [00:00<00:00, 240605.15it/s]
100%|██████████| 104834/104834 [00:00<00:00, 248212.75it/s]


0.19370656250713264


In [27]:
t = assert_equal(ppack, "./reproducable/supervised/pretrained_memnet_lstm_eng_glove.pkl")

checking ... valid


In [28]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/pretrained_memnet_lstm_eng_glove.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

## 7

Spanish subtitles + fasttext word embeddings

In [73]:
train_df, test_df = read_data()

In [67]:
# Load the lookup table of Spanish texts and apply it in place to every text
# column; a text missing from the table raises KeyError.
with open("./assets/translations/all_esp.finn.pkl","rb") as trans_file:
    trans = pickle.load(trans_file)

for df in [train_df, test_df]:
    for col in ['context_2','context_1','context_0','reply']:
        df[col] = [trans[t] for t in df[col]]

In [69]:
model = make_pretrained_model("./models/gen2/memnet/memnet_lstm_esp_fst.json", 20)

In [70]:
ppack = make_preds(model,
                   sorted(glob("./models/gen2/pretrained/pretrained_memnet_lstm_esp_ft_*.hdf5")), 
                   VD['esp_ft'][0], tokenize_split, 20)

100%|██████████| 97533/97533 [00:00<00:00, 315864.74it/s]
100%|██████████| 97533/97533 [00:00<00:00, 279702.09it/s]
100%|██████████| 97533/97533 [00:00<00:00, 266336.87it/s]
100%|██████████| 97533/97533 [00:00<00:00, 410264.61it/s]
100%|██████████| 104834/104834 [00:00<00:00, 318784.14it/s]
100%|██████████| 104834/104834 [00:00<00:00, 291721.29it/s]
100%|██████████| 104834/104834 [00:00<00:00, 270777.40it/s]
100%|██████████| 104834/104834 [00:00<00:00, 399855.29it/s]


0.1916239633483493


In [34]:
t = assert_equal(ppack, "./reproducable/supervised/pretrained_memnet_lstm_esp_fasttext.pkl")

checking ... valid


In [35]:
# Context manager guarantees the pickle is flushed and closed.
with open("./final/supervised/pretrained_memnet_lstm_esp_fasttext.pkl", "wb") as out_file:
    pickle.dump(ppack, out_file)

# Unsupervised models

Here we compute assorted unsupervised NLP features.

Make sure you install wmd-relax from https://github.com/src-d/wmd-relax

In [27]:
import sys
sys.path.append("/home/aphex/Downloads/wmd-relax")
import libwmdrelax

import numpy as np
from gensim.models import FastText, Word2Vec, KeyedVectors
from text_processing_utils import read_fasttext

import nltk
import pickle
import pandas as pd
import numpy as np
import itertools
import nltk
import csv
import seaborn 

from collections import Counter, defaultdict
from tqdm import tqdm
from gensim.models import FastText

%matplotlib inline

In [28]:
train_df, test_df = read_data()

### WMD

In [29]:
from pymystem3 import Mystem
stemmer = Mystem()

def lemmatize(t):
    """Lemmatize ``t`` with the module-level Mystem instance; pieces are joined and the result stripped."""
    pieces = stemmer.lemmatize(t)
    joined = ''.join(pieces)
    return joined.strip()

In [30]:
class SpacySimilarityHook(object):
    """
    Earth Mover's Distance between two plain-text documents over word vectors.

    Despite the name, this class does not need spaCy for the computation:
    documents are strings tokenized with ``nltk.word_tokenize`` and ``nlp`` is
    any word-vector container supporting ``token in nlp`` and ``nlp[token]``.
    Only :meth:`__call__` still expects a spaCy-like document object.
    """

    # Value returned whenever no distance can be computed.
    FAILURE_DISTANCE = 100.0
    # Embedding width assumed when ``nlp`` has no ``vector_size`` attribute.
    DEFAULT_VECTOR_SIZE = 300

    def __init__(self, nlp, **kwargs):
        """
        Initializes a new instance of SpacySimilarityHook class.
        :param nlp: word-vector container (``in`` and ``[]`` lookups by token).
        :param lower: Lower-case both documents before tokenizing (default False).
        :param ignore_stops: Stored on the instance; not consulted by this class.
        :param only_alpha: Stored on the instance; not consulted by this class.
        :param frequency_processor: The function ``(token, count) -> weight`` \
                                    applied to raw token counts; defaults to \
                                    ``log(1 + count)``.
        :type lower: bool
        :type ignore_stops: bool
        :type only_alpha: bool
        :type frequency_processor: callable
        """
        self.lower = kwargs.get("lower", False)
        self.nlp = nlp
        self.ignore_stops = kwargs.get("ignore_stops", True)
        self.only_alpha = kwargs.get("only_alpha", True)
        self.frequency_processor = kwargs.get(
            "frequency_processor", lambda t, f: np.log(1 + f))

    def __call__(self, doc):
        """Register :meth:`compute_similarity` in the similarity hooks of ``doc``."""
        doc.user_hooks["similarity"] = self.compute_similarity
        doc.user_span_hooks["similarity"] = self.compute_similarity

    def compute_similarity(self, doc1, doc2):
        """
        Computes the EMD between the weighted bags of words of two texts.
        Note that the value is a distance: larger means less similar.
        :param doc1: first text.
        :param doc2: second text.
        :return: ``libwmdrelax.emd`` of the two documents, or
                 ``FAILURE_DISTANCE`` when either text has no token known to
                 ``nlp`` or the solver raises RuntimeError/MemoryError.
        :rtype: float.
        """
        if self.lower:
            doc1 = doc1.lower()
            doc2 = doc2.lower()

        doc1 = self._convert_document(doc1)
        doc2 = self._convert_document(doc2)

        # A text without any in-vocabulary token has zero total mass; dividing
        # by it would feed NaN weights to the solver. Report the same sentinel
        # that is used for solver failures instead.
        if not doc1 or not doc2:
            return self.FAILURE_DISTANCE

        vocabulary = {
            w: i for i, w in enumerate(sorted(set(doc1).union(doc2)))}

        w1 = self._generate_weights(doc1, vocabulary)
        w2 = self._generate_weights(doc2, vocabulary)

        vector_size = getattr(self.nlp, "vector_size", self.DEFAULT_VECTOR_SIZE)
        evec = np.zeros((len(vocabulary), vector_size), dtype=np.float32)
        for w, i in vocabulary.items():
            evec[i] = self.nlp[w]

        # Pairwise Euclidean distances via |a|^2 - 2ab + |b|^2; rounding can
        # push values slightly below zero, hence the clamp before sqrt.
        evec_sqr = (evec * evec).sum(axis=1)
        dists = evec_sqr - 2 * evec.dot(evec.T) + evec_sqr[:, np.newaxis]
        dists[dists < 0] = 0
        dists = np.sqrt(dists)
        try:
            return libwmdrelax.emd(w1, w2, dists)
        except (RuntimeError, MemoryError):
            return self.FAILURE_DISTANCE

    def compute_similarity_batch(self, doc1, docs):
        """Return an array with ``compute_similarity(doc1, doc)`` for every ``doc`` in ``docs``."""
        return np.array([self.compute_similarity(doc1, doc) for doc in docs])

    def _convert_document(self, doc):
        """Tokenize ``doc`` and map each token known to ``nlp`` to its processed count."""
        words = defaultdict(int)
        for t in nltk.word_tokenize(doc):
            if t in self.nlp:
                words[t] += 1
        return {t: self.frequency_processor(t, v) for t, v in words.items()}

    def _generate_weights(self, doc, vocabulary):
        """Spread the token weights of ``doc`` over ``vocabulary`` positions, scaled to sum to 1."""
        w = np.zeros(len(vocabulary), dtype=np.float32)
        for t, v in doc.items():
            w[vocabulary[t]] = v
        total = w.sum()
        # Leave an all-zero vector untouched rather than producing NaNs.
        if total > 0:
            w /= total
        return w
        
def get_wmd_similarities(w2v_model, dframe, lemma=False):
    """Compute the word-mover distance between each context and the reply, per row.

    :param w2v_model: word-vector container handed to ``SpacySimilarityHook``.
    :param dframe: frame with ``context_2``, ``context_1``, ``context_0`` and
                   ``reply`` text columns.
    :param lemma: lemmatize every text before comparing.
    :return: array with one row per frame row and three columns
             (context_2, context_1, context_0 versus reply).
    """
    ssh = SpacySimilarityHook(w2v_model)
    context_columns = ('context_2', 'context_1', 'context_0')

    preds = []

    for _, row in tqdm(dframe.iterrows(), total=len(dframe)):
        reply = row['reply']
        contexts = [row[col] for col in context_columns]
        if lemma:
            # Lemmatize the reply once per row instead of once per context.
            reply = lemmatize(reply)
            contexts = [lemmatize(text) for text in contexts]
        preds.append([ssh.compute_similarity(text, reply) for text in contexts])

    return np.array(preds)

In [13]:
# Word-vector models used by the WMD and averaged-embedding features below:
#   'fasttext'     - Russian fastText vectors (cc.ru.300.vec) read with read_fasttext
#   'araneum'      - Russian Araneum fastText CBOW model,
#                    http://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz
#   'my_ppc'       - Russian OPUS gensim CBOW model
#   'my_ppc_lemma' - Russian lemmatized OPUS gensim CBOW model
# Later cells test `'lemma' in key` to decide whether texts are lemmatized first.

word_vectors = {
    'fasttext': read_fasttext("./assets/cc.ru.300.vec"),
    'araneum': FastText.load("./assets/araneum_none_fasttextcbow_300_5_2018.model"),
    'my_ppc': Word2Vec.load("./assets/cbow_1M.w2v"),
    'my_ppc_lemma': Word2Vec.load("./assets/cbow_lemma.w2v")
}

In [31]:
# WMD features per embedding model, stored as [train, test]; models whose key
# contains 'lemma' are fed lemmatized text.
wmd_r = {}
for wv, vectors in word_vectors.items():
    print(wv)
    use_lemmas = 'lemma' in wv
    wmd_r[wv] = [get_wmd_similarities(vectors, frame, lemma=use_lemmas)
                 for frame in (train_df, test_df)]

  0%|          | 43/97533 [00:00<03:49, 424.97it/s]

fasttext


100%|██████████| 97533/97533 [02:09<00:00, 753.93it/s]
100%|██████████| 104834/104834 [02:19<00:00, 750.34it/s]
  0%|          | 33/97533 [00:00<04:56, 328.51it/s]

araneum


100%|██████████| 97533/97533 [03:40<00:00, 442.28it/s]
100%|██████████| 104834/104834 [03:57<00:00, 441.26it/s]
  0%|          | 58/97533 [00:00<02:49, 574.01it/s]

my_ppc


100%|██████████| 97533/97533 [02:27<00:00, 660.77it/s]
100%|██████████| 104834/104834 [02:39<00:00, 658.18it/s]
  0%|          | 0/97533 [00:00<?, ?it/s]

my_ppc_lemma


100%|██████████| 97533/97533 [07:00<00:00, 231.83it/s]
100%|██████████| 104834/104834 [07:36<00:00, 229.73it/s]


In [32]:
# One pickle per embedding model; the context manager closes each file promptly.
for w in wmd_r:
    with open("./features/unsupervised/wmd_{}.pkl".format(w), "wb") as out_file:
        pickle.dump(wmd_r[w], out_file)

### TF_IDF

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import paired_cosine_distances, cosine_similarity

In [34]:
# Every distinct text occurring in any text column of the train or test frame.
alltexts = set()

for df, col in itertools.product([train_df, test_df],
                                 ['context_2','context_1','context_0', 'reply']):
    alltexts.update(df[col].tolist())

alltexts = list(alltexts)

In [36]:
def get_tf_sims(vect, dframe):
    """Row-wise cosine distance between each vectorized context and the vectorized reply.

    :param vect: fitted vectorizer exposing ``transform``.
    :param dframe: frame with the three context columns and ``reply``.
    :return: array with columns (context_2, context_1, context_0) versus reply.
    """
    context_mats = [vect.transform(dframe[col].tolist())
                    for col in ('context_2', 'context_1', 'context_0')]
    reply_mat = vect.transform(dframe['reply'].tolist())

    columns = [paired_cosine_distances(ctx, reply_mat).reshape(-1, 1)
               for ctx in context_mats]
    return np.hstack(columns)

In [37]:
tf_r = {}
for i in range(3):
    # The upper n-gram bound must be >= the lower bound of 1. Using `i` directly
    # produced the invalid range (1, 0) on the first pass, which current
    # scikit-learn rejects. `i + 1` yields (1, 1), (1, 2), (1, 3) while keeping
    # the keys 0..2 used for the output file names.
    tfv = TfidfVectorizer(ngram_range=(1, i + 1), max_features=200000)
    tfv.fit(alltexts)
    tf_r[i] = [get_tf_sims(tfv, train_df), get_tf_sims(tfv, test_df)]
    print(i)

0
1
2


In [38]:
for w in tf_r:
    pickle.dump(tf_r[w], open("./features/unsupervised/tfv_{}.pkl".format(w), "wb"))

### AVG-W2V

In [40]:
import nltk

def encode(text, w2v, lemma=False):
    """Average the word vectors of the tokens of ``text`` that ``w2v`` knows.

    :param text: input text.
    :param w2v: word-vector container supporting ``token in w2v`` and ``w2v[token]``.
    :param lemma: lemmatize ``text`` before tokenizing (default False).
    :return: mean vector, or a zero vector when no token is known. The zero
             vector has ``w2v.vector_size`` entries when that attribute exists
             and 300 otherwise.
    """
    if lemma:
        text = lemmatize(text)
    vectors = [w2v[tok] for tok in nltk.word_tokenize(text) if tok in w2v]
    if not vectors:
        return np.zeros(getattr(w2v, "vector_size", 300))
    return np.mean(vectors, axis=0)
    
def get_cossim(t1, t2, w2v_model=None, lemma=False):
    """Cosine similarity between the averaged word vectors of two texts.

    :param t1: first text.
    :param t2: second text.
    :param w2v_model: word-vector container; when omitted the module-level
                      name ``w2v`` is used, as before.
    :param lemma: forwarded to ``encode`` (the call previously omitted this
                  required argument and therefore always failed).
    :return: scalar cosine similarity.
    """
    if w2v_model is None:
        w2v_model = w2v  # legacy fallback; raises NameError if no global `w2v` exists
    v1 = encode(t1, w2v_model, lemma).reshape(1, -1)
    v2 = encode(t2, w2v_model, lemma).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

def get_avgw2v_sims(w2v_model, dframe, lemma=False):
    """Row-wise cosine distance between averaged context vectors and the averaged reply vector.

    :return: array with columns (context_2, context_1, context_0) versus reply.
    """
    def embed_column(col):
        # One averaged vector per row of the given column.
        return np.array([encode(t, w2v_model, lemma) for t in dframe[col]])

    context_mats = [embed_column(col) for col in ('context_2', 'context_1', 'context_0')]
    reply_mat = embed_column('reply')

    columns = [paired_cosine_distances(ctx, reply_mat).reshape(-1, 1)
               for ctx in context_mats]
    return np.hstack(columns)
    

In [41]:
avw_r = {}
for wv in word_vectors:
    # NOTE: each entry is stored as [test, train], the opposite of the other feature
    # lists in this notebook; the dump cell reverses it with [::-1] before pickling.
    avw_r[wv] = [get_avgw2v_sims(word_vectors[wv], test_df, 'lemma' in wv), 
                 get_avgw2v_sims(word_vectors[wv], train_df, 'lemma' in wv)]
    print(wv, 'lemma' in wv)

fasttext False


  
  if sys.path[0] == '':


araneum False
my_ppc False
my_ppc_lemma True


In [42]:
# avw_r entries are held as [test, train]; [::-1] writes them out as [train, test].
for w in avw_r:
    with open("./features/unsupervised/avw_{}.pkl".format(w), "wb") as out_file:
        pickle.dump(avw_r[w][::-1], out_file)

## wms

In [43]:
import string
stops = nltk.corpus.stopwords.words('russian')

In [45]:
def word_match_share(s1, s2, lemma, stop_words=None):
    """Share of words two texts have in common.

    Both texts are lower-cased and split on whitespace; words found in
    ``stop_words`` or occurring as a substring of ``string.punctuation`` are
    dropped. The score is ``2 * |distinct shared words| / (n1 + n2)`` where
    ``n1`` and ``n2`` count the remaining words including repeats.

    :param s1: first text (converted with ``str``).
    :param s2: second text (converted with ``str``).
    :param lemma: lemmatize both texts first.
    :param stop_words: collection of words to ignore. When omitted, the
                       module-level ``stops`` is looked up at call time,
                       which is the previous behaviour.
    :return: the score, or ``0`` when both texts have no remaining words.
    """
    if stop_words is None:
        stop_words = stops
    if lemma:
        s1 = lemmatize(str(s1))
        s2 = lemmatize(str(s2))

    def content_words(text):
        # Words left after removing stop words and punctuation-only tokens.
        return [word for word in str(text).lower().split()
                if word not in stop_words and word not in string.punctuation]

    q1words = content_words(s1)
    q2words = content_words(s2)

    if not q1words and not q2words:
        return 0

    shared_words = len(set(q1words) & set(q2words))
    return 2 * shared_words / (len(q1words) + len(q2words))

def get_wms(dframe, lemma=False):
    """Word-match share of each context against the reply, one row per frame row.

    :return: array with columns (context_2, context_1, context_0) versus reply.
    """
    preds = []

    for _, record in tqdm(dframe.iterrows(), total=len(dframe)):
        reply = record['reply']
        preds.append([word_match_share(record[col], reply, lemma)
                      for col in ('context_2', 'context_1', 'context_0')])

    return np.array(preds)

In [46]:
wms_stops = [get_wms(train_df), get_wms(test_df)]

100%|██████████| 97533/97533 [00:13<00:00, 7350.82it/s]
100%|██████████| 104834/104834 [00:14<00:00, 7349.10it/s]


In [47]:
# Rebinding the module-level `stops` to an empty list switches off stop-word
# filtering in word_match_share, which resolves that name at call time.
# Re-run the cell that defines `stops` before recomputing `wms_stops`.
stops = []
wms_nostops = [get_wms(train_df), get_wms(test_df)]

100%|██████████| 97533/97533 [00:10<00:00, 9742.70it/s]
100%|██████████| 104834/104834 [00:10<00:00, 9999.10it/s] 


In [49]:
# Context managers guarantee both pickles are flushed and closed.
with open("./features/unsupervised/wms_stops.pkl","wb") as out_file:
    pickle.dump(wms_stops, out_file)
with open("./features/unsupervised/wms_nostops.pkl","wb") as out_file:
    pickle.dump(wms_nostops, out_file)

In [15]:
#pickle.dump(wms_lemma, open("./reproducable/unsupervised/wms_lemma.pkl","wb"))

## lengths

In [50]:
def get_lengths(dframe):
    """Character and whitespace-token counts of the four text columns.

    :return: array with eight columns: (characters, tokens) for context_2,
             context_1, context_0 and reply, in that order.
    """
    preds = []

    for _, record in tqdm(dframe.iterrows(), total=len(dframe)):
        counts = []
        for col in ('context_2', 'context_1', 'context_0', 'reply'):
            text = record[col]
            counts.extend((len(text), len(text.split())))
        preds.append(counts)

    return np.array(preds)

In [51]:
lengths = [get_lengths(train_df), get_lengths(test_df)]

100%|██████████| 97533/97533 [00:09<00:00, 10552.50it/s]
100%|██████████| 104834/104834 [00:10<00:00, 10481.80it/s]


In [52]:
pickle.dump(lengths, open("./features/unsupervised/lengths.pkl","wb"))

## fuzzy dists

In [53]:
from fuzzywuzzy import fuzz

In [54]:
def get_fuzzy_dists(dframe):
    """``fuzz.ratio`` of each context against the reply, one row per frame row.

    :return: array with columns (context_2, context_1, context_0) versus reply.
    """
    # Imported locally on purpose: the notebook later rebinds the global name
    # `fuzz` to the computed feature list, so resolving `fuzz.ratio` through
    # the global would fail on any re-run of this function.
    from fuzzywuzzy import fuzz as fuzz_scorer

    preds = []
    for _, record in tqdm(dframe.iterrows(), total=len(dframe)):
        reply = record['reply']
        preds.append([fuzz_scorer.ratio(record[col], reply)
                      for col in ('context_2', 'context_1', 'context_0')])

    return np.array(preds)

In [55]:
# NOTE(review): this rebinds `fuzz`, shadowing the fuzzywuzzy module imported above.
# Code that resolves `fuzz.ratio` through the global name fails once this cell has
# run; the dump cell below expects the list under this same name.
fuzz = [get_fuzzy_dists(train_df), get_fuzzy_dists(test_df)]

100%|██████████| 97533/97533 [00:09<00:00, 9958.37it/s]
100%|██████████| 104834/104834 [00:10<00:00, 10251.96it/s]


In [56]:
# Context manager guarantees the pickle is flushed and closed.
with open("./features/unsupervised/fuzz.pkl","wb") as out_file:
    pickle.dump(fuzz, out_file)

## markdown

In [57]:
# Occurrence count of every `context_0` text across the train and test frames.
train_qs = train_df['context_0']
test_qs = test_df['context_0']
rpc = Counter(itertools.chain(train_qs.tolist(), test_qs.tolist()))

In [58]:
def get_markdown_ftrs(dframe):
    """Surface features of the ``context_0`` and ``reply`` columns.

    For each of the two columns, five integer features are produced per row:
    contains ``'?'``, contains ``'.'``, number of punctuation characters,
    number of digit characters, and the count of the text in the module-level
    ``rpc`` counter.

    NOTE(review): ``rpc`` is built from ``context_0`` texts only, so for the
    ``reply`` column the last feature counts how often the reply text occurs
    as a ``context_0`` -- confirm that this is intended.

    :return: array with ten columns (five for ``context_0``, then five for
             ``reply``).
    """
    fs = []
    for col in ['context_0', 'reply']:
        ser = dframe[col]

        per_column = [
            ser.apply(lambda x: '?' in x),
            ser.apply(lambda x: '.' in x),
            ser.apply(lambda x: sum(ch in string.punctuation for ch in x)),
            ser.apply(lambda x: sum(ch.isdigit() for ch in x)),
            ser.apply(lambda x: rpc[x]),
        ]

        # `.values` replaces Series.as_matrix(), which no longer exists in
        # current pandas; it returns the same underlying array.
        fts = np.array([feature.values.astype('int') for feature in per_column])
        fs.append(fts.T)
    return np.hstack(fs)

In [59]:
mdf = [get_markdown_ftrs(train_df), get_markdown_ftrs(test_df)]

In [60]:
pickle.dump(mdf, open("./features/unsupervised/markdown.pkl",'wb'))

## ROUGE

In [61]:
from rouge import Rouge 

rouge = Rouge()

In [62]:
def get_rogue_scores(dframe):
    """ROUGE scores of ``context_0`` against ``reply``, one row per frame row.

    :return: array with nine columns: the ``f``, ``p`` and ``r`` values of
             ``rouge-1``, then ``rouge-2``, then ``rouge-l``.
    """
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    stats = ('f', 'p', 'r')

    preds = []
    for _, record in tqdm(dframe.iterrows(), total=len(dframe)):
        first = rouge.get_scores(record['context_0'], record['reply'])[0]
        preds.append([first[metric][stat] for metric in metrics for stat in stats])

    return np.array(preds)

In [63]:
rog = [get_rogue_scores(train_df), get_rogue_scores(test_df)]

100%|██████████| 97533/97533 [00:12<00:00, 7619.26it/s]
100%|██████████| 104834/104834 [00:13<00:00, 7562.71it/s]


In [64]:
pickle.dump(rog, open("./features/unsupervised/rogue.pkl",'wb'))

## props

In [None]:
# Result of `stemmer.analyze` for every distinct text, keyed by the text itself,
# so that each text is analyzed only once.
anl_cache = {t:stemmer.analyze(t) for t in tqdm(alltexts)}

In [40]:
def get_propn_scores(dframe):
    """Count, per text column, the cached analysis entries whose string form contains 'имя'.

    Analyses are read from the module-level ``anl_cache`` (presumably 'имя' is
    the tag Mystem uses for personal names -- verify against its grammar docs).

    :return: array with columns for context_2, context_1, context_0 and reply.
    """
    def count_name_entries(text):
        # Number of analysis entries of `text` mentioning the tag.
        return sum(1 for entry in anl_cache[text] if 'имя' in str(entry))

    preds = []
    for _, record in tqdm(dframe.iterrows(), total=len(dframe)):
        preds.append([count_name_entries(record[col])
                      for col in ('context_2', 'context_1', 'context_0', 'reply')])

    return np.array(preds)

In [None]:
propn = [get_propn_scores(train_df), get_propn_scores(test_df)]

In [48]:
# NOTE(review): every other unsupervised feature is written to ./features/unsupervised;
# this one goes to ./reproducable/unsupervised -- confirm the target directory.
with open("./reproducable/unsupervised/propn.pkl","wb") as out_file:
    pickle.dump(propn, out_file)