In [1]:
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from collections import Counter
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
#from torch.utils.data import DataLoader
import pickle as pkl

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Using TensorFlow backend.


cuda:0


In [2]:
morph = pymorphy2.MorphAnalyzer()
#data = pd.read_excel('data/task_10+kw_only_ru.xlsx', header=1)
#data = pd.read_csv('new_apps_compatible.csv', sep=';').drop('Пустая колонка для совместимости', axis=1)
fasttext_model_path = '/home/egor/Downloads/187/model.model'

#Параллельно выполняющийся map функции func по массиву massive
def parallelization(func, massive, jobs=3, tq=True):
    
    num_cores = multiprocessing.cpu_count() # Число наших ядер
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results

    
def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]

    
def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def getWords(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 3, re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)


def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in getWords(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words

2

In [32]:
def build_weight_matrix(word2vec, target_vocab, emb_dim=300):
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    words_found = 0

    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = word2vec.get_vector(word)#word2vec[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
    
    return weights_matrix


def get_vocab(texts):
    return Counter([word for text in texts for word in text]).keys()


def get_queries_vocab(queries):
    return Counter([word for qs in queries for qury in qs for word in qury]).keys()


def text_to_idx(text, word_idx):
    return list(map(lambda x: word_idx.get(x) if word_idx.get(x) is not None else len(word_idx) + 1,text))


def filter_zero_length(train_data):
    i = 0
    cnt = 0
    while i < len(train_data):
        t, q, _q = train_data[i]
        if len(t) == 0 or len(q) == 0 or len(_q) == 0:
            train_data.pop(i)
            cnt += 1
            i -= 1
        
        i += 1
        
    print cnt, 'found'
    return train_data


def make_dataset(texts, queries, nb_train_samples=None, num_neg_samples=5):
    # construct a dataset in a format of (context, query_positive, query_negative)
    # assuming texts[i] maps to queries[i]
    assert len(texts) == len(queries)
    train_data = []
    q_space = [q for subspace in queries for q in subspace]
    # we have len(q_space) queries at all
    # let's just sample all negatives in one run
    negatives = np.random.choice(q_space, len(q_space) * num_neg_samples)
    # now write it all into train data
    k = 0
    for i in tqdm_notebook(range(len(texts))):
        for j in range(len(queries[i])):
            z = 0
            while k < len(negatives) and z < num_neg_samples:
                train_data.append([texts[i], queries[i][j], negatives[k]])
                k += 1
                z += 1
    
    if nb_train_samples is not None:
        return train_data[:nb_train_samples] 
    return train_data
    
    
def sample_negatives(neg_space, n_samples):
    # TODO: probs
    return np.random.choice(neg_space, n_samples)


def iterate_minibatches(inputs, batchsize, shuffle=False):
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)

    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
            batch = [inputs[x] for x in excerpt]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
            batch = inputs[excerpt]
            
        context, q_pos, q_neg = zip(*batch)
    
        clen = torch.cuda.LongTensor(list(map(len, context)))
        qposlen = torch.cuda.LongTensor(list(map(len, q_pos)))
        qneglen = torch.cuda.LongTensor(list(map(len, q_neg)))
        
        context = torch.cuda.LongTensor(pad_sequences(context, padding='post'))
        q_pos = torch.cuda.LongTensor(pad_sequences(q_pos, padding='post'))
        q_neg = torch.cuda.LongTensor(pad_sequences(q_neg, padding='post'))
        
        yield context, clen, q_pos, qposlen, q_neg, qneglen
        
# def get_data_loader(inputs, batch_size, shuffle=False):
    
#     context, q_pos, q_neg = zip(*inputs)
#     clen = list(map(len, context))
#     qposlen = list(map(len, q_pos))
#     qneglen = list(map(len, q_neg))
#     dataset = list(zip(context, clen, q_pos, qposlen, q_neg, qneglen)) # [context, clen, q_pos, qposlen, q_neg, qneglen]
#     #return dataset[0]
#     context = pad_sequences(context, padding='post')
#     q_pos = pad_sequences(q_pos, padding='post')
#     q_neg = pad_sequences(q_neg, padding='post')

#     print(len(dataset))
#     dataloader = DataLoader(dataset=context, batch_size=batch_size, shuffle=shuffle)
#     return dataloader


def train(train_data, batch_size, nb_epochs, test_size=0.3, shuffle=False, lr=0.001):
    #X_train, X_test = train_test_split(train_data, test_size=test_size, random_state=42)    
    loss_fn = lambda x: torch.mean(-F.logsigmoid(x)) # negative log likelihood
    optimizer = optim.Adam(net.parameters(), lr=lr)
    print('Start training...')
    for epoch in range(nb_epochs):
        running_loss = 0.0
        i = 0
        generator = iterate_minibatches(train_data, batch_size, shuffle=shuffle)
        for sample in generator:
            context, clen, q_pos, qposlen, q_neg, qneglen = sample
            
            optimizer.zero_grad()
            
            outputs = net(context, clen, q_pos, qposlen, q_neg, qneglen)
            loss = loss_fn(outputs)
            #print 'loss', loss.item()
            loss.backward()
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()#loss.data[0]
            if i % 200 == 199:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
                
            i += 1
            
    print('Finished Training')

# Loading data

In [84]:
# texts = parallelization(text2canonicals, data.Core.values, tq=True)

100%|██████████| 8225/8225 [02:40<00:00, 51.19it/s]


In [39]:
#all_queries = np.load('data/all_keywords_keys.npy')[1:]
#all_queries = np.array(list(map(lambda x: x.split(), queries)))

In [4]:
texts = np.load('all_descriptions_keys.npy')
queries = np.load('matched_keywords.npy')
queries = list(map(lambda x: list(map(lambda y: y.split(), x)), queries))

In [5]:
context_vocab = get_vocab(texts)
query_vocab = get_queries_vocab(queries)

#context_word_idx = dict(zip(context_vocab, range(1, len(context_vocab) + 1)))
#query_word_idx = dict(zip(query_vocab, range(1, len(query_vocab) + 1)))
context_word_idx, query_word_idx = pkl.load(open('word_idx_dicts.pkl', 'rb'))

samples = list(map(lambda x: text_to_idx(x, context_word_idx), texts))
queries = list(map(lambda x: list(map(lambda y: text_to_idx(y, query_word_idx), x)), queries))

train_data = make_dataset(samples, queries)
train_data = filter_zero_length(train_data)
np.random.shuffle(train_data)


692 found


In [6]:
#fasttext = FastTextKeyedVectors.load(fasttext_model_path)
#context_emb_matrix = build_weight_matrix(fasttext, context_vocab)
#query_emb_matrix = build_weight_matrix(fasttext, query_vocab)
context_emb_matrix = np.load('context_emb_matr.npy')
query_emb_matrix = np.load('query_emb_matr.npy')

In [7]:
class SiameseNetwork(nn.Module):
    def __init__(self, context_encoder, query_encoder, context_dim, query_dim):
        super(SiameseNetwork, self).__init__()
        self.context_encoder = context_encoder
        self.query_encoder = query_encoder
       
        # siamese network arch
        self.linear_1 = nn.Linear(context_dim + query_dim, 1024)
        self.linear_2 = nn.Linear(1024, 1)
        self.relu = nn.LeakyReLU()
       
    def forward(self, context, clens, query_pos, qposlens, query_neg=None, qneglens=None, train=True):
        # take both queries while training and only one while testing to assign a score
        # (second input just ignored if train=False)
        context_repr = self.context_encoder(context, clens)
        query_pos_repr = self.query_encoder(query_pos, qposlens)
        siamese_inp_pos = torch.cat([query_pos_repr, context_repr], dim=-1)
        score_pos = self.linear_2(self.linear_1(siamese_inp_pos))
       
        if train:
            assert query_neg is not None, "you have to provide a second input"
            query_neg_repr = self.query_encoder(query_neg, qneglens)
            siamese_inp_neg = torch.cat([query_neg_repr, context_repr], dim=-1)
            score_neg = self.linear_2(self.linear_1(siamese_inp_neg))
            return score_pos - score_neg
       
        else:
            return score_pos

In [8]:
class Encoder(nn.Module):
    def __init__(self, emb_matrix, hidden_size=64):
        super(Encoder, self).__init__()
       
        self.embedding, num_embeddings, embedding_dim = self.create_emb_layer(emb_matrix)
        self.hidden_size = hidden_size
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=1,
                          bidirectional=True, batch_first=True)
       
   
    def create_emb_layer(self, weights_matrix, non_trainable=False):
        num_embeddings, embedding_dim = weights_matrix.shape
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
 
        return emb_layer, num_embeddings, embedding_dim
       
 
    def forward(self, X, X_lengths):
        # X = app vector
        with torch.no_grad():
            embedded = self.embedding(X)
       
        embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, X_lengths, batch_first=True, enforce_sorted=False)
        output, hn = self.gru(embedded)
        #output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        output = torch.cat([hn[0], hn[1]], dim=-1)

        return output

In [9]:
context_emb_matrix = torch.cuda.FloatTensor(context_emb_matrix)
query_emb_matrix = torch.cuda.FloatTensor(query_emb_matrix)

In [10]:
context_enc = Encoder(context_emb_matrix)
query_enc = Encoder(query_emb_matrix)

net = SiameseNetwork(context_enc, query_enc, 128, 128)

net.to(device)

SiameseNetwork(
  (context_encoder): Encoder(
    (embedding): Embedding(53160, 300)
    (gru): GRU(300, 64, batch_first=True, bidirectional=True)
  )
  (query_encoder): Encoder(
    (embedding): Embedding(16908, 300)
    (gru): GRU(300, 64, batch_first=True, bidirectional=True)
  )
  (linear_1): Linear(in_features=256, out_features=128, bias=True)
  (linear_2): Linear(in_features=128, out_features=1, bias=True)
  (relu): LeakyReLU(negative_slope=0.01)
)

# Training

In [22]:
train_data?

In [54]:
gen = iterate_minibatches(train_data[:10000], 64)

In [55]:
sample = gen.next()

tensor([[  523,   560,   641,  ...,     0,     0,     0],
        [ 2956,  2957, 14435,  ...,   815,  3540,     0],
        [  400,   317,  1497,  ...,     0,     0,     0],
        ...,
        [ 6762,  1062,  5186,  ...,     0,     0,     0],
        [  227,  5550,  2008,  ...,     0,     0,     0],
        [   79,  2521, 19432,  ...,     0,     0,     0]], device='cuda:0')

In [60]:
net(sample[0], sample[1], sample[2], sample[3], train=False)

tensor([[ 2.6321],
        [ 4.0733],
        [ 4.4542],
        [ 2.7738],
        [ 6.5376],
        [-0.6955],
        [-2.6492],
        [-0.7273],
        [ 6.1513],
        [-0.0266],
        [-0.1195],
        [ 1.4893],
        [ 0.3598],
        [ 0.9696],
        [-1.5867],
        [ 6.6357],
        [ 2.7812],
        [ 6.9800],
        [10.9473],
        [ 5.0718],
        [-1.1097],
        [ 7.9970],
        [-0.1672],
        [-0.5453],
        [ 1.7603],
        [-3.2352],
        [ 0.9964],
        [10.1794],
        [ 1.1529],
        [ 3.0358],
        [ 5.1405],
        [ 2.3718],
        [ 0.5341],
        [ 6.3462],
        [18.2517],
        [-1.5369],
        [-1.8085],
        [ 4.8054],
        [-0.4793],
        [-0.0184],
        [ 2.4577],
        [ 1.5279],
        [ 3.2587],
        [ 5.3874],
        [ 2.3431],
        [ 3.3633],
        [-1.2965],
        [ 2.1588],
        [ 0.0718],
        [ 7.0235],
        [-5.5722],
        [ 7.4561],
        [-0.

In [None]:
train(train_data[:50000], 64 , 10, lr=0.001)

Start training...
[1,   200] loss: 0.033
[1,   400] loss: 0.020
[1,   600] loss: 0.041
[2,   200] loss: 0.025
[2,   400] loss: 0.015
[2,   600] loss: 0.042
[3,   200] loss: 0.027
[3,   400] loss: 0.015
[3,   600] loss: 0.041
[4,   200] loss: 0.028
[4,   400] loss: 0.014
[4,   600] loss: 0.041
[5,   200] loss: 0.029
[5,   400] loss: 0.014
[5,   600] loss: 0.039
[6,   200] loss: 0.030
[6,   400] loss: 0.014
[6,   600] loss: 0.037
[7,   200] loss: 0.031
[7,   400] loss: 0.015
[7,   600] loss: 0.032


In [16]:
gen = iterate_minibatches(train_data[:20000], 64)

In [18]:
64*200

12800