In [7]:
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from joblib import Parallel, delayed
from tqdm import tqdm
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
import keras.backend as K
import pickle as pkl

In [4]:
morph = pymorphy2.MorphAnalyzer()
data = pd.read_excel('data/task_10+kw_only_ru.xlsx', header=1)
#data = pd.read_csv('new_apps_compatible.csv', sep=';').drop('Пустая колонка для совместимости', axis=1)
fasttext_model_path = '/Users/egor/Downloads/187/model.model'

#Параллельно выполняющийся map функции func по массиву massive
def parallelization(func, massive, jobs=3, tq=True):
    
    num_cores = multiprocessing.cpu_count() # Число наших ядер
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results

def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]

    
def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def getWords(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 3, re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)


def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in getWords(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words

In [23]:
def build_weight_matrix(word2vec, target_vocab, emb_dim=300):
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    words_found = 0

    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = word2vec[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
    
    return weights_matrix

def get_vocab(texts):
    return Counter([word for text in texts for word in text]).keys()

In [84]:
texts = parallelization(text2canonicals, data.Core.values, tq=True)

100%|██████████| 8225/8225 [02:40<00:00, 51.19it/s]


# Loading data

In [9]:
texts = pkl.load(open('data/texts.pkl', 'rb'))

In [28]:
queries = np.load('data/all_keywords_keys.npy')[1:]
queries = np.array(list(map(lambda x: x.split(), queries)))

In [93]:
#pkl.dump(texts, open('texts.pkl', 'wb'))

In [94]:
import torch
from torch import nn

In [21]:
fasttext = FastTextKeyedVectors.load(fasttext_model_path)

In [32]:
target_vocab = get_vocab(texts)
query_vocab = get_vocab(queries)
context_emb_matrix = build_weight_matrix(fasttext, target_vocab)
query_emb_matrix = build_weight_matrix(fasttext, query_vocab)

In [99]:
emb_layer = nn.Embedding(*weight_matrix.shape)

In [None]:
emb_layer.load_state_dict()

In [101]:
torch.tensor([1,2,3])

# test

tensor([1, 2, 3])

In [100]:
embs, num_embeddings, embedding_dim = create_emb_layer(weight_matrix, False)

In [58]:
embs(torch.tensor([1]))

tensor([[-0.2697, -0.5741, -1.5527, -2.2854, -1.2620, -0.5197,  3.0912,  0.7927,
         -0.6073,  2.3412,  0.3576,  2.0995, -0.9263, -0.5159, -2.0464, -0.4677,
         -2.9064, -0.0943,  0.6719,  2.4594,  1.6578, -2.9194, -0.0200,  1.2851,
         -2.8199, -0.4918,  0.2292, -0.9744, -3.2194, -3.5566, -1.6003,  0.2985,
          0.5833, -0.0737,  1.0696, -2.0138, -0.1739,  0.6468,  0.0755,  0.0971,
         -0.1788,  2.0031,  0.4137, -2.9208,  1.8085,  1.6108,  1.3114, -1.9675,
         -1.4978, -2.3921, -0.3813,  0.7139,  1.5572,  0.0862, -0.5874,  1.2259,
         -2.1570,  0.3506,  0.5972,  0.7138,  2.2535, -0.2766,  0.2211, -0.9814,
          0.5154, -0.7607,  0.3884,  3.5891,  1.7439,  0.7744,  1.3007,  1.2588,
          1.2458, -1.9587,  0.3296,  1.2414, -1.3811, -2.7530, -0.3852,  1.8004,
         -1.3537,  3.0254,  3.0503, -1.7619, -1.0446,  0.6383, -0.6296,  2.4570,
          0.9960,  1.8724,  2.5050, -0.1670, -0.1157, -1.9065, -0.1082, -1.3729,
          0.9684, -0.8983, -

In [102]:
rnn = nn.GRU(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [425]:
class SiameseNetwork(nn.Module):
    def __init__(self, context_dim, query_dim):
        super(SiameseNetwork, self).__init__()

        self.linear_1 = nn.Linear(context_dim + hidden_size * 2, 128)
        self.linear_2 = nn.Linear(128, 1)
        self.relu = nn.LeakyReLU()
        
    def forward(self, context, query_pos, query_neg):
        # takes two inputs. first is the hidden representation of app description and the second is the queries batch
        _, q_repr_pos = self.query_gru(self.query_emb(query_pos))
        q_repr_pos = torch.cat([*q_repr_pos], dim=-1)
        
        _, q_repr_neg = self.query_gru(self.query_emb(query_neg))
        q_repr_neg = torch.cat([*q_repr_neg], dim=-1)
        print(q_repr_neg.shape)
        siamese_inp_pos = torch.cat([q_repr_pos, context], dim=-1)
        siamese_inp_neg = torch.cat([q_repr_neg, context], dim=-1)
        
        score_pos = self.linear_2(self.relu(self.linear_1(siamese_inp_pos)))
        score_neg = self.linear_2(self.relu(self.linear_1(siamese_inp_neg)))
        
        return score_pos - score_neg

In [426]:
class Encoder(nn.Module):
    def __init__(self, emb_matrix, hidden_size=64):
        super(ContextEncoder, self).__init__()
        
        self.embedding, num_embeddings, embedding_dim = self.create_emb_layer(emb_matrix)
        self.hidden_size = hidden_size
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=1, bidirectional=True, batch_first=True)
        
    
    def create_emb_layer(self, weights_matrix, non_trainable=False):
        num_embeddings, embedding_dim = weights_matrix.shape
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': torch.tensor(weights_matrix)})
        if non_trainable:
            emb_layer.weight.requires_grad = False

        return emb_layer, num_embeddings, embedding_dim
        

    def forward(self, X):
        # X = app vector
        embedded = self.embedding(X)
        output, hn = self.gru(embedded)
        return torch.cat([*hn], dim=-1)

In [415]:
context_enc = ContextEncoder(weight_matrix)

In [427]:

net = SiameseNetwork(len(target_vocab), 128, )

In [421]:
context_vec[:0]

torch.Size([1, 128])

In [433]:
samples[:11]

array([], shape=(0, 382), dtype=int32)

In [432]:
net(context_vec[:1], torch.tensor(samples[:1], dtype=torch.long), torch.tensor(samples[10:11], dtype=torch.long))

torch.Size([0, 128])


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 1. Got 0 and 1 in dimension 0 at ../aten/src/TH/generic/THTensor.cpp:689

In [154]:
word_idx = dict(zip(target_vocab, range(1, len(target_vocab) + 1)))

In [155]:
def text_to_ids(text):
    return list(map(lambda x: word_idx[x],text))

In [434]:
samples = list(map(text_to_ids, texts[:10]))

In [251]:
samples = pad_sequences(samples)

In [385]:
context_vec = context_enc(torch.tensor(samples, dtype=torch.long))

tensor([[ 0.2355,  0.3443,  0.8532,  ...,  0.9841,  0.8898,  0.9152],
        [ 0.2514,  0.0933, -0.1488,  ...,  0.9848,  0.9995,  0.9163],
        [-0.4006, -0.3471, -0.8224,  ...,  0.9850,  0.9995,  0.9163],
        ...,
        [-0.4236, -0.9753, -0.8926,  ...,  0.9846,  0.9996,  0.9163],
        [-0.6892, -0.7874, -0.5873,  ...,  0.9846,  0.9993,  0.9160],
        [ 0.4210, -0.9002, -0.2217,  ..., -0.0679, -0.8469,  0.2181]],
       grad_fn=<CatBackward>)

In [298]:
last_h = output[:, -1, :]

In [358]:
hn[1][0]

tensor([ 0.8601, -0.9466,  0.7250, -0.6369,  0.9348, -0.0059,  0.9198,  0.1797,
         0.9451, -0.9606,  0.9995,  0.5406,  0.9725, -0.4963,  0.9374,  0.9493,
         0.9917, -0.9907, -0.9998,  0.9675,  0.1656, -0.7281,  0.2684,  0.9403,
        -0.9107, -0.4866, -0.6494,  0.9715,  0.9971, -0.1840,  0.8659,  0.0870,
        -0.8100,  0.4066, -0.9718, -0.9794,  0.9616,  0.5550, -0.9980, -0.9904,
        -0.9829, -0.7062, -0.5788, -0.8205, -0.9125, -0.9986,  0.9630, -0.9921,
        -0.9903,  0.9998,  0.9872,  0.9989,  0.1507, -0.1279, -0.9667,  0.9838,
        -0.2132,  0.8981,  0.7400,  0.6980,  0.9585,  0.9114, -0.6787, -0.9613],
       grad_fn=<SelectBackward>)

In [344]:
torch.cat([*hn], dim=-1)[:, 64:][-1]

tensor([-0.3220, -0.7119,  0.9637, -0.1117, -0.4310,  0.5825,  0.3635, -0.7355,
        -0.4684,  0.4242, -0.8651, -0.1600,  0.2515, -0.3122, -0.2554,  0.7875,
        -0.5054,  0.2652, -0.5298, -0.6273, -0.2431,  0.0521,  0.5271,  0.0659,
         0.6781, -0.9272,  0.1873,  0.3154,  0.6975, -0.2104, -0.0027,  0.4963,
        -0.5183, -0.6684, -0.2173,  0.4662,  0.8450, -0.8730, -0.9728, -0.2313,
        -0.2085, -0.7575, -0.4352,  0.2833, -0.5789,  0.6157, -0.3858,  0.4125,
        -0.8934, -0.9073, -0.1653, -0.1109, -0.7207, -0.3492, -0.1671, -0.0198,
         0.4761,  0.4061, -0.8034,  0.3017,  0.8329,  0.6911,  0.5665, -0.2176],
       grad_fn=<SelectBackward>)

In [365]:
output[0, 0, 64:]

tensor([ 0.8601, -0.9466,  0.7250, -0.6369,  0.9348, -0.0059,  0.9198,  0.1797,
         0.9451, -0.9606,  0.9995,  0.5406,  0.9725, -0.4963,  0.9374,  0.9493,
         0.9917, -0.9907, -0.9998,  0.9675,  0.1656, -0.7281,  0.2684,  0.9403,
        -0.9107, -0.4866, -0.6494,  0.9715,  0.9971, -0.1840,  0.8659,  0.0870,
        -0.8100,  0.4066, -0.9718, -0.9794,  0.9616,  0.5550, -0.9980, -0.9904,
        -0.9829, -0.7062, -0.5788, -0.8205, -0.9125, -0.9986,  0.9630, -0.9921,
        -0.9903,  0.9998,  0.9872,  0.9989,  0.1507, -0.1279, -0.9667,  0.9838,
        -0.2132,  0.8981,  0.7400,  0.6980,  0.9585,  0.9114, -0.6787, -0.9613],
       grad_fn=<SliceBackward>)

In [350]:
hn

tensor([[[-0.8493,  0.6618,  0.3642,  ..., -0.4162, -0.9182,  0.1056],
         [-0.4135, -0.8876, -0.1321,  ..., -0.3913, -0.7934,  0.8455],
         [ 0.0877,  0.8097, -0.1604,  ..., -0.4779, -0.2237,  0.5258],
         ...,
         [ 0.7312, -0.7872, -0.7111,  ..., -0.3504, -0.3364, -0.5374],
         [ 0.4145, -0.6863,  0.0040,  ..., -0.2079, -0.7165,  0.2850],
         [ 0.4713, -0.6098,  0.2443,  ...,  0.5707,  0.2151,  0.8724]],

        [[ 0.8601, -0.9466,  0.7250,  ...,  0.9114, -0.6787, -0.9613],
         [ 0.8664, -0.9466,  0.2125,  ...,  0.9149, -0.6854, -0.9614],
         [ 0.8669, -0.9466, -0.2779,  ...,  0.9091, -0.6775, -0.9615],
         ...,
         [ 0.8663, -0.9466,  0.2378,  ...,  0.9158, -0.6866, -0.9614],
         [ 0.8674, -0.9467, -0.6760,  ...,  0.9055, -0.6742, -0.9616],
         [-0.3220, -0.7119,  0.9637,  ...,  0.6911,  0.5665, -0.2176]]],
       grad_fn=<StackBackward>)

In [353]:
output[:, 0, :]

tensor([[ 0.1923,  0.0825, -0.1506,  ...,  0.9114, -0.6787, -0.9613],
        [ 0.1923,  0.0825, -0.1506,  ...,  0.9149, -0.6854, -0.9614],
        [ 0.1923,  0.0825, -0.1506,  ...,  0.9091, -0.6775, -0.9615],
        ...,
        [ 0.1923,  0.0825, -0.1506,  ...,  0.9158, -0.6866, -0.9614],
        [ 0.1923,  0.0825, -0.1506,  ...,  0.9055, -0.6742, -0.9616],
        [-0.0946,  0.3019,  0.0660,  ...,  0.6911,  0.5665, -0.2176]],
       grad_fn=<SliceBackward>)