In [449]:
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from collections import Counter
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
import torch
from torch import nn
import pickle as pkl

In [413]:
# Morphological analyzer used as a module-level global by the lemmatization helpers below.
morph = pymorphy2.MorphAnalyzer()
# Source spreadsheet; header=1 takes the column labels from the second row of the sheet.
data = pd.read_excel('data/task_10+kw_only_ru.xlsx', header=1)
# Disabled alternative CSV source; the dropped column's name means "Empty column for compatibility".
#data = pd.read_csv('new_apps_compatible.csv', sep=';').drop('Пустая колонка для совместимости', axis=1)
# NOTE(review): absolute, machine-specific path -- the notebook cannot run elsewhere without editing it.
fasttext_model_path = '/Users/egor/Downloads/187/model.model'

def parallelization(func, massive, jobs=3, tq=True):
    """Apply ``func`` to every element of ``massive`` in parallel (a parallel ``map``).

    Args:
        func: Callable taking a single element of ``massive``.
        massive: Iterable of inputs.
        jobs: Accepted for backward compatibility but not used; the number of
            workers is always ``multiprocessing.cpu_count()``.
        tq: When True, show a ``tqdm`` progress bar and return a NumPy array;
            when False, return the plain list produced by joblib.

    Returns:
        ``np.ndarray`` of results when ``tq`` is True (a 1-D object array if the
        per-item results cannot form a rectangular array), otherwise a list.
    """
    num_cores = multiprocessing.cpu_count()  # number of available cores

    if not tq:
        return Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)

    results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive))
    try:
        return np.array(results)
    except ValueError:
        # Ragged results (e.g. lists of different lengths) are rejected by
        # newer NumPy versions; pack them into a 1-D object array, one result
        # per slot, which is what older NumPy produced implicitly.
        packed = np.empty(len(results), dtype=object)
        for pos, item in enumerate(results):
            packed[pos] = item
        return packed

    
def _word2canonical4w2v(word):
    """Return a ``(normal_form, tag)`` pair for ``word``.

    Every parse returned by the global ``morph`` analyzer is paired with a
    short tag: ``'V'`` once a parse carrying ``VERB``, ``GRND`` or ``INFN`` has
    been seen, ``'S'`` once a parse carrying ``NOUN`` has been seen, ``''``
    before that. The tag is not reset between parses, so it carries over to
    later ones. The first pair with a non-empty tag is returned; if there is
    none, the first pair is returned.
    """
    verb_like = ('VERB', 'GRND', 'INFN')
    current_tag = ''
    pairs = []
    for parse in morph.parse(word):
        grammemes = parse.tag
        if any(name in grammemes for name in verb_like):
            current_tag = 'V'
        if 'NOUN' in grammemes:
            current_tag = 'S'
        pairs.append((parse.normalized.word, current_tag))

    for pair in pairs:
        if pair[1] != '':
            return pair
    return pairs[0]

    
def word2canonical(word):
    """Return only the normal form of ``word``, dropping the tag."""
    normal_form, _tag = _word2canonical4w2v(word)
    return normal_form


def getWords(text, filter_short_words=False):
    """Split ``text`` into word tokens (runs of ``\\w`` characters).

    Args:
        text: Input string.
        filter_short_words: When True, keep only tokens longer than three
            characters.

    Returns:
        A list of tokens in order of appearance. Both branches return a list,
        so the result can be indexed, measured with ``len`` and iterated more
        than once regardless of the flag.
    """
    tokens = re.findall(r'(?u)\w+', text)
    if filter_short_words:
        return [token for token in tokens if len(token) > 3]
    return tokens


def text2canonicals(text, add_word=False, filter_short_words=True):
    """Convert ``text`` into a list of normal forms of its lower-cased words.

    Args:
        text: Input string.
        add_word: When True, each normal form is followed by the lower-cased
            original token.
        filter_short_words: Forwarded to ``getWords``.

    Returns:
        List of strings.
    """
    canonicals = []
    for token in getWords(text, filter_short_words=filter_short_words):
        lowered = token.lower()
        canonicals.append(word2canonical(lowered))
        if add_word:
            canonicals.append(lowered)
    return canonicals

In [503]:
def build_weight_matrix(word2vec, target_vocab, emb_dim=300):
    """Build an embedding matrix with one row per word of ``target_vocab``.

    Row ``i`` holds ``word2vec[word]`` for the i-th word (0-based, in iteration
    order). Words whose lookup raises ``KeyError`` get a random vector drawn
    from a normal distribution with scale 0.6.

    Args:
        word2vec: Object supporting ``word2vec[word]`` lookups.
        target_vocab: Sized iterable of words.
        emb_dim: Embedding dimensionality.

    Returns:
        ``np.ndarray`` of shape ``(len(target_vocab), emb_dim)``.
    """
    weights = np.zeros((len(target_vocab), emb_dim))

    for row, token in enumerate(target_vocab):
        try:
            vector = word2vec[token]
        except KeyError:
            # Unknown word: fall back to a random initialisation.
            vector = np.random.normal(scale=0.6, size=(emb_dim, ))
        weights[row] = vector

    return weights


def get_vocab(texts):
    """Return the distinct words of ``texts`` as the keys view of a ``Counter``.

    ``texts`` is an iterable of word sequences; words come out in order of
    first appearance.
    """
    counts = Counter()
    for text in texts:
        for word in text:
            counts[word] += 1
    return counts.keys()


def get_queries_vocab(queries):
    """Return the distinct words of nested ``queries`` as a ``Counter`` keys view.

    ``queries`` is an iterable of groups, each group an iterable of queries,
    each query an iterable of words; words come out in order of first
    appearance.
    """
    counts = Counter()
    for group in queries:
        for query in group:
            for word in query:
                counts[word] += 1
    return counts.keys()


def text_to_idx(text, word_idx):
    """Map each word of ``text`` to its index in ``word_idx``.

    Words that are missing from ``word_idx`` (or mapped to ``None``) get the
    out-of-vocabulary index ``len(word_idx) + 1``.

    Returns:
        List of indices, one per word.
    """
    indices = []
    for token in text:
        idx = word_idx.get(token)
        if idx is None:
            idx = len(word_idx) + 1
        indices.append(idx)
    return indices


def make_dataset(texts, queries, nb_train_samples=None, num_neg_samples=5):
    """Build ``[context, query_positive, query_negative]`` training triples.

    ``texts[i]`` is assumed to correspond to ``queries[i]``. For every positive
    query of the selected texts, ``num_neg_samples`` negatives are drawn by
    ``sample_negatives`` from the pool of all queries. The pool includes the
    text's own queries, so a sampled "negative" can coincide with a positive.

    Args:
        texts: Sequence of contexts.
        queries: Sequence of query groups, one group per text.
        nb_train_samples: Use only the first ``nb_train_samples`` texts; values
            larger than ``len(texts)`` are clamped. ``None`` uses all texts.
        num_neg_samples: Number of negatives per positive query.

    Returns:
        List of ``[text, positive_query, negative_query]`` lists.
    """
    assert len(texts) == len(queries)

    # Flat pool of every query, used as the negative-sampling space.
    q_space = [q for subspace in queries for q in subspace]

    n = len(texts)
    if nb_train_samples is not None:
        # Clamp: asking for more samples than exist used to raise IndexError.
        n = min(nb_train_samples, n)

    train_data = []
    for i in tqdm_notebook(range(n)):
        for positive in queries[i]:
            negatives = sample_negatives(q_space, num_neg_samples)
            for k in range(num_neg_samples):
                train_data.append([texts[i], positive, negatives[k]])

    return train_data
    
        
def sample_negatives(neg_space, n_samples):
    """Draw ``n_samples`` items uniformly, with replacement, from ``neg_space``.

    Args:
        neg_space: Sequence of candidate negatives. Items may themselves be
            sequences (e.g. lists of token indices) of equal or varying length.
        n_samples: Number of items to draw (an integer).

    Returns:
        ``np.ndarray`` of length ``n_samples``. For a pool of sequences this is
        a 1-D object array whose elements are the original items.
    """
    # TODO: probs
    try:
        return np.random.choice(neg_space, n_samples)
    except ValueError:
        # np.random.choice only accepts 1-D input, so a pool of token lists is
        # rejected (2-D when all lengths match, or an inhomogeneous-shape error
        # on newer NumPy when they do not). Sample positions instead and pick
        # the original items.
        if len(neg_space) == 0:
            raise
        picks = np.random.randint(0, len(neg_space), size=n_samples)
        chosen = np.empty(n_samples, dtype=object)
        for slot, pick in enumerate(picks):
            chosen[slot] = neg_space[pick]
        return chosen



def train(train_data, nb_epochs, test_size=0.3):
    """Split ``train_data`` into a training part and a held-out part.

    Only the split is implemented: ``nb_epochs`` is not used, no model is
    trained, and the function returns ``None``.
    """
    train_part, test_part = train_test_split(
        train_data,
        test_size=test_size,
        random_state=42,
    )
    

In [84]:
# Lemmatize every entry of the `Core` column in parallel, with a progress bar.
texts = parallelization(text2canonicals, data.Core.values, tq=True)

100%|██████████| 8225/8225 [02:40<00:00, 51.19it/s]


# Loading data

In [475]:
# Load texts and queries from saved .npy files; this rebinds `texts`,
# replacing whatever value it held earlier in the session.
# NOTE(review): if these files store Python objects (e.g. ragged lists), recent
# NumPy versions need allow_pickle=True here -- confirm against the saved data.
texts = np.load('data/all_descriptions_keys.npy')

queries = np.load('data/matched_keywords.npy', )

In [493]:
# Split every keyword string of every group into a list of whitespace-separated words.
# Rebinds `queries`, so this cell is not safe to run twice in a row.
queries = [[keyword.split() for keyword in group] for group in queries]

In [515]:
# Build [text, positive query, negative query] training triples.
# NOTE(review): `samples` is assigned in a cell further down this notebook, so
# running the cells top to bottom fails here on a fresh kernel.
td = make_dataset(samples, queries)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

In [523]:
# Take the second training triple: text, positive query, negative query.
t, q, _q = td[1]

In [529]:
# Smoke test on one triple; each sequence is wrapped in a list to add a leading batch dimension.
# NOTE(review): `net` is created in a cell further down this notebook.
net(torch.LongTensor([t]), torch.LongTensor([q]), torch.LongTensor([_q]))

tensor([[-0.1451]], grad_fn=<SubBackward0>)

In [28]:
#all_queries = np.load('data/all_keywords_keys.npy')[1:]
#all_queries = np.array(list(map(lambda x: x.split(), queries)))

In [None]:
# Collect the distinct words of the contexts and of the queries.
context_vocab = get_vocab(texts)
query_vocab = get_queries_vocab(queries)

# Number the words from 1 in vocabulary order; index 0 is left unassigned.
# NOTE(review): build_weight_matrix stores the i-th vocabulary word at row i
# (0-based), while these indices start at 1 -- confirm the embedding lookup is
# aligned with the matrix rows.
context_word_idx = {word: idx for idx, word in enumerate(context_vocab, start=1)}
query_word_idx = {word: idx for idx, word in enumerate(query_vocab, start=1)}

In [506]:
# Load the fastText vectors from `fasttext_model_path`, then build one
# embedding matrix per vocabulary (default emb_dim of build_weight_matrix).
fasttext = FastTextKeyedVectors.load(fasttext_model_path)
context_emb_matrix = build_weight_matrix(fasttext, context_vocab)
query_emb_matrix = build_weight_matrix(fasttext, query_vocab)

In [94]:
class SiameseNetwork(nn.Module):
    """Scores a query against a context with a shared two-layer head.

    The context and the query are encoded by their respective encoders, the
    two representations are concatenated and passed through
    ``linear_1 -> LeakyReLU -> linear_2`` to produce one score per example.
    The same head scores the positive and the negative query.

    Args:
        context_encoder: Module mapping a context batch to a representation
            whose last dimension is ``context_dim``.
        query_encoder: Module mapping a query batch to a representation whose
            last dimension is ``query_dim``.
        context_dim: Size of the context representation.
        query_dim: Size of the query representation.
    """

    def __init__(self, context_encoder, query_encoder, context_dim, query_dim):
        super(SiameseNetwork, self).__init__()
        self.context_encoder = context_encoder
        self.query_encoder = query_encoder

        # Scoring head shared by the positive and the negative branch.
        self.linear_1 = nn.Linear(context_dim + query_dim, 128)
        self.linear_2 = nn.Linear(128, 1)
        self.relu = nn.LeakyReLU()

    def _score(self, query_repr, context_repr):
        """Score one (query, context) representation pair with the shared head."""
        pair = torch.cat([query_repr, context_repr], dim=-1)
        # The activation between the two linear layers was previously created
        # but never applied, which made the head a single affine map.
        return self.linear_2(self.relu(self.linear_1(pair)))

    def forward(self, context, query_pos, query_neg=None, train=True):
        """Return ``score(pos) - score(neg)`` when training, else ``score(pos)``.

        Args:
            context: Input for ``context_encoder``.
            query_pos: Input for ``query_encoder`` (the matching query).
            query_neg: Input for ``query_encoder`` (a non-matching query);
                required when ``train`` is True, ignored otherwise.
            train: Selects between the pairwise difference and the plain score.

        Raises:
            AssertionError: If ``train`` is True and ``query_neg`` is None.
        """
        context_repr = self.context_encoder(context)
        score_pos = self._score(self.query_encoder(query_pos), context_repr)

        if not train:
            return score_pos

        assert query_neg is not None, "you have to provide a second input"
        score_neg = self._score(self.query_encoder(query_neg), context_repr)
        return score_pos - score_neg

In [46]:
class Encoder(nn.Module):
    """Embeds index sequences and encodes them with a bidirectional GRU.

    Args:
        emb_matrix: 2-D array of initial embedding weights with shape
            ``(num_embeddings, embedding_dim)``.
        hidden_size: Hidden size of each GRU direction.
    """

    def __init__(self, emb_matrix, hidden_size=64):
        super(Encoder, self).__init__()

        layer, _vocab_size, emb_dim = self.create_emb_layer(emb_matrix)
        self.embedding = layer
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

    def create_emb_layer(self, weights_matrix, non_trainable=False):
        """Create an ``nn.Embedding`` initialised from ``weights_matrix``.

        Returns:
            Tuple ``(layer, num_embeddings, embedding_dim)``. The weights are
            frozen when ``non_trainable`` is True.
        """
        rows, cols = weights_matrix.shape
        layer = nn.Embedding(rows, cols)
        # Overwrite the random initialisation with the supplied weights.
        with torch.no_grad():
            layer.weight.copy_(torch.tensor(weights_matrix))
        if non_trainable:
            layer.weight.requires_grad = False

        return layer, rows, cols

    def forward(self, X):
        """Encode a batch of index sequences.

        The final hidden states of the GRU (one per direction) are
        concatenated along the last dimension.
        """
        _outputs, final_states = self.gru(self.embedding(X))
        return torch.cat(tuple(final_states), dim=-1)

In [91]:
# One encoder per input type, each initialised from its own embedding matrix.
context_enc = Encoder(context_emb_matrix)
query_enc = Encoder(query_emb_matrix)

# Each encoder concatenates the final states of both GRU directions, so its
# output width is twice its hidden size (2 * 64 = 128 with the defaults).
net = SiameseNetwork(
    context_enc,
    query_enc,
    2 * context_enc.hidden_size,
    2 * query_enc.hidden_size,
)

In [508]:
# Encode every context and every query as a list of vocabulary indices.
# `queries` is rebound to its encoded form, so re-running this cell would try
# to encode already-encoded data.
samples = [text_to_idx(text, context_word_idx) for text in texts]
queries = [[text_to_idx(query, query_word_idx) for query in group] for group in queries]

In [67]:
#samples = pad_sequences(samples, 300)
#queries = pad_sequences(queries)

In [98]:
# Smoke test: first ten samples, with queries[:10] as positives and queries[10:20] as negatives.
# NOTE(review): torch.LongTensor needs rectangular (equal-length) input; presumably
# the sequences were padded when this cell was run -- confirm, since the padding
# cell above is commented out.
net(torch.LongTensor(samples[:10]), torch.LongTensor(queries[:10]), torch.LongTensor(queries[10:20]), train=True)

tensor([[-0.1957],
        [-0.2361],
        [-0.0274],
        [ 0.0900],
        [-0.0607],
        [ 0.0679],
        [-0.2605],
        [-0.0944],
        [ 0.1030],
        [-0.1079]], grad_fn=<SubBackward0>)