In [1]:
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from joblib import Parallel, delayed
from tqdm import tqdm
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
import keras.backend as K


Using TensorFlow backend.


In [82]:
# Morphological analyzer shared by the lemmatization helpers defined in this cell.
morph = pymorphy2.MorphAnalyzer()
# Source spreadsheet; header=1 takes the column names from the second row of the sheet.
data = pd.read_excel('task_10+kw_only_ru.xlsx', header=1)
# Alternative CSV input, currently disabled:
#data = pd.read_csv('new_apps_compatible.csv', sep=';').drop('Пустая колонка для совместимости', axis=1)
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere
# until this points at a local copy of the FastText model.
fasttext_model_path = '/Users/egor/Downloads/187/model.model'

def parallelization(func, massive, tq=True):
    """Apply ``func`` to every item of ``massive`` in parallel (a parallel ``map``).

    Args:
        func: Callable taking one item; it is dispatched through
            ``joblib.delayed``.
        massive: Iterable of items to process.
        tq: When true, wrap ``massive`` in ``tqdm`` to show a progress bar and
            return the results as a NumPy array. When false, return the plain
            list produced by joblib.

    Returns:
        ``numpy.ndarray`` if ``tq`` is true, otherwise ``list``. The results
        are in the order of ``massive``.
    """
    num_cores = multiprocessing.cpu_count()  # one worker per CPU core
    items = tqdm(massive) if tq else massive
    results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in items)
    if not tq:
        return results
    try:
        return np.array(results)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from ragged results (e.g. token
        # lists of different lengths) unless dtype=object is requested explicitly.
        return np.array(results, dtype=object)

def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]

    
def word2canonical(word):
    """Return only the normalised form of ``word``, dropping the POS tag."""
    canonical_pair = _word2canonical4w2v(word)
    return canonical_pair[0]


def getWords(text, filter_short_words=False):
    """Split ``text`` into word tokens (maximal runs of Unicode word characters).

    Args:
        text: String to tokenize.
        filter_short_words: When true, keep only tokens longer than three
            characters.

    Returns:
        A list of tokens in order of appearance. Both modes return a list;
        the filtered mode used to return a one-shot ``filter`` iterator, which
        could be consumed only once and had no ``len``.
    """
    words = re.findall(r'(?u)\w+', text)
    if filter_short_words:
        return [w for w in words if len(w) > 3]
    return words


def text2canonicals(text, add_word=False, filter_short_words=True):
    """Convert ``text`` into a list of lower-cased, normalised word forms.

    Args:
        text: String to process.
        add_word: When true, each normalised form is followed by the
            lower-cased original token.
        filter_short_words: Forwarded to ``getWords``.

    Returns:
        List of strings in token order.
    """
    canonicals = []
    for token in getWords(text, filter_short_words=filter_short_words):
        lowered = token.lower()
        canonicals.append(word2canonical(lowered))
        if not add_word:
            continue
        canonicals.append(lowered)
    return canonicals

In [21]:
def build_weight_matrix(word2vec, target_vocab, emb_dim=300, random_state=None):
    """Build an initial embedding matrix with one row per vocabulary word.

    Args:
        word2vec: Object supporting ``word2vec[word]`` that raises ``KeyError``
            for words it has no vector for.
        target_vocab: Iterable of words; row ``i`` of the result corresponds to
            the ``i``-th word yielded.
        emb_dim: Length of every embedding vector.
        random_state: Optional integer seed for the vectors drawn for words
            missing from ``word2vec``. ``None`` (the default) draws from the
            global NumPy random state, as before.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab), emb_dim)``. Rows of missing
        words are sampled from a normal distribution with scale 0.6.
    """
    vocab = list(target_vocab)  # accept any iterable (dict keys, generators, ...)
    matrix_len = len(vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    words_found = 0

    for i, word in enumerate(vocab):
        try:
            weights_matrix[i] = word2vec[word]
        except KeyError:
            weights_matrix[i] = sampler.normal(scale=0.6, size=(emb_dim, ))
        else:
            words_found += 1

    print(words_found, 'words found of', matrix_len, 'in target vocab')
    return weights_matrix

In [84]:
# Lemmatize every entry of the `Core` column in parallel, with a progress bar.
texts = parallelization(text2canonicals, data.Core.values, tq=True)

100%|██████████| 8225/8225 [02:40<00:00, 51.19it/s]


In [91]:
import pickle as pkl

In [93]:
#pkl.dump(texts, open('texts.pkl', 'wb'))

In [94]:
import torch
from torch import nn

In [95]:
# Load the saved FastText keyed vectors from `fasttext_model_path`.
fasttext = FastTextKeyedVectors.load(fasttext_model_path)

In [96]:
# Distinct words across all lemmatized texts, in first-seen order (the counts are discarded).
target_vocab = Counter(word for text in texts for word in text).keys()

In [97]:
# Initial embedding matrix: row i holds the FastText vector of the i-th vocabulary word.
weight_matrix = build_weight_matrix(fasttext, target_vocab)

53158 words found of 53158 in target vocab


In [98]:
# Sanity check: expect (vocabulary size, embedding dimension).
weight_matrix.shape

(53158, 300)

In [99]:
# Embedding layer sized (num_embeddings, embedding_dim) to match the weight matrix;
# its weights are not yet the pre-trained vectors at this point.
emb_layer = nn.Embedding(*weight_matrix.shape)

In [None]:
# Copy the pre-trained vectors into the layer. load_state_dict() requires the state
# mapping as an argument; nn.Embedding has a single parameter named 'weight'.
emb_layer.load_state_dict(
    {'weight': torch.as_tensor(weight_matrix, dtype=emb_layer.weight.dtype)}
)

In [46]:
# Scratch check: construct a small integer tensor.
torch.tensor([1,2,3])

tensor([1, 2, 3])

In [100]:
# NOTE(review): no module-level `create_emb_layer` is defined in the visible cells
# (only a method of that name on SiameseNetwork) -- presumably it came from a deleted
# or out-of-order cell, so this line fails on a fresh kernel. TODO: define it or call
# the method instead.
embs, num_embeddings, embedding_dim = create_emb_layer(weight_matrix, False)

In [58]:
# Look up the embedding stored for index 1 (presumably `embs` is an nn.Embedding -- confirm).
embs(torch.tensor([1]))

tensor([[-0.2697, -0.5741, -1.5527, -2.2854, -1.2620, -0.5197,  3.0912,  0.7927,
         -0.6073,  2.3412,  0.3576,  2.0995, -0.9263, -0.5159, -2.0464, -0.4677,
         -2.9064, -0.0943,  0.6719,  2.4594,  1.6578, -2.9194, -0.0200,  1.2851,
         -2.8199, -0.4918,  0.2292, -0.9744, -3.2194, -3.5566, -1.6003,  0.2985,
          0.5833, -0.0737,  1.0696, -2.0138, -0.1739,  0.6468,  0.0755,  0.0971,
         -0.1788,  2.0031,  0.4137, -2.9208,  1.8085,  1.6108,  1.3114, -1.9675,
         -1.4978, -2.3921, -0.3813,  0.7139,  1.5572,  0.0862, -0.5874,  1.2259,
         -2.1570,  0.3506,  0.5972,  0.7138,  2.2535, -0.2766,  0.2211, -0.9814,
          0.5154, -0.7607,  0.3884,  3.5891,  1.7439,  0.7744,  1.3007,  1.2588,
          1.2458, -1.9587,  0.3296,  1.2414, -1.3811, -2.7530, -0.3852,  1.8004,
         -1.3537,  3.0254,  3.0503, -1.7619, -1.0446,  0.6383, -0.6296,  2.4570,
          0.9960,  1.8724,  2.5050, -0.1670, -0.1157, -1.9065, -0.1082, -1.3729,
          0.9684, -0.8983, -

In [61]:
class SiameseNetwork(nn.Module):
    """Skeleton of a siamese network: a pre-trained embedding layer feeding a GRU.

    ``forward`` and ``get_loss`` are still unimplemented placeholders.

    Args:
        embeddings_path: Optional path to saved FastText keyed vectors; only
            used by ``get_embeddings``.
        weights_matrix: Optional 2-D array or tensor of shape
            ``(num_embeddings, embedding_dim)`` used to initialise the
            embedding layer. When omitted, ``embedding`` and ``gru`` stay
            ``None`` until ``build`` is called with a matrix.
        hidden_size: Hidden size of the GRU (placeholder default).
        num_layers: Number of stacked GRU layers (placeholder default).
        non_trainable: When true, the embedding weights are frozen.
    """

    def __init__(self, embeddings_path=None, weights_matrix=None,
                 hidden_size=128, num_layers=1, non_trainable=False):
        super(SiameseNetwork, self).__init__()
        # Everything is stored on the instance: the network must not depend on
        # notebook globals such as `weights_matrix` or `hidden_size`.
        self.embeddings_path = embeddings_path
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.non_trainable = non_trainable
        self.embedding = None
        self.gru = None
        if weights_matrix is not None:
            self.build(weights_matrix)

    def create_emb_layer(self, weights_matrix, non_trainable=False):
        """Create an ``nn.Embedding`` initialised from ``weights_matrix``.

        Args:
            weights_matrix: 2-D NumPy array or tensor of shape
                ``(num_embeddings, embedding_dim)``.
            non_trainable: When true, gradients are disabled for the weights.

        Returns:
            Tuple ``(emb_layer, num_embeddings, embedding_dim)``.
        """
        # as_tensor accepts arrays and tensors alike, so a tensor argument is not
        # re-wrapped in torch.tensor (which emits a copy-construct warning).
        weights = torch.as_tensor(weights_matrix, dtype=torch.get_default_dtype())
        num_embeddings, embedding_dim = weights.shape
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights})
        if non_trainable:
            emb_layer.weight.requires_grad = False

        return emb_layer, num_embeddings, embedding_dim

    def forward(self, X):
        """Placeholder: the forward pass is not implemented yet (returns ``None``)."""
        pass

    def get_embeddings(self):
        """Load and return the FastText keyed vectors stored at ``embeddings_path``.

        Raises:
            AssertionError: If no embeddings path was given to the constructor.
        """
        assert self.embeddings_path is not None, 'you must specify embeddings path'
        return FastTextKeyedVectors.load(self.embeddings_path)

    def get_loss(self):
        """Placeholder for the pairwise loss (not implemented, returns ``None``)."""
        # Implement pairwise loss
        pass

    def get_prob(self, s1, s2):
        """Return ``1 / (1 + exp(s1 - s2))`` element-wise for score tensors.

        Computed as ``sigmoid(s2 - s1)``, which is the same value but keeps
        gradients finite when ``exp(s1 - s2)`` would overflow.
        """
        # NOTE(review): this value grows as s2 exceeds s1 -- confirm that this is
        # the intended orientation of the pair.
        return torch.sigmoid(s2 - s1)

    def build(self, weights_matrix=None, non_trainable=None):
        """Build the embedding layer and the GRU of the network.

        Args:
            weights_matrix: 2-D array or tensor used to initialise the embedding.
            non_trainable: Overrides the constructor's ``non_trainable`` when
                not ``None``.

        Raises:
            ValueError: If ``weights_matrix`` is not provided.
        """
        if weights_matrix is None:
            raise ValueError('weights_matrix is required to build the network')
        if non_trainable is None:
            non_trainable = self.non_trainable
        self.embedding, _, embedding_dim = self.create_emb_layer(weights_matrix, non_trainable)
        self.gru = nn.GRU(embedding_dim, self.hidden_size, self.num_layers, batch_first=True)