In [None]:
"""
From the Stepik course "Neural Networks and Text Processing"
GloVe implementation with a sparse co-occurrence matrix
"""

import sys
import ast
import numpy as np
from scipy.sparse import dok_matrix
from itertools import permutations


def read_array():
    """
    returns the Python literal parsed from the next line of standard input

    The line is evaluated with `ast.literal_eval`, so only literal
    expressions (numbers, strings, lists, tuples, dicts, ...) are accepted.
    """
    raw_line = sys.stdin.readline()
    parsed_value = ast.literal_eval(raw_line)
    return parsed_value

def write_array(arr):
    """
    prints `arr` to standard output as a nested Python list literal

    arr - object with a `tolist()` method (e.g. numpy array) - values to print
    """
    as_nested_list = arr.tolist()
    text = repr(as_nested_list)
    print(text)


def generate_coocurrence_matrix(texts, vocab_size):
    """
    returns scipy.sparse.dok_matrix of shape (vocab_size, vocab_size) with
    document-level coocurrence counts

    Cell (i, j), i != j, holds the number of documents that contain both
    token i and token j. Repeated tokens inside one document are counted
    once, and the diagonal is never incremented.

    texts - list of lists of ints - i-th sublist contains identifiers of tokens in i-th document
    vocab_size - int - size of vocabulary
    """
    cooc = dok_matrix((vocab_size, vocab_size), dtype=np.int_)
    for tokens in texts:
        # Each distinct token contributes at most once per document.
        unique_tokens = set(tokens)
        for first in unique_tokens:
            for second in unique_tokens:
                if first == second:
                    continue
                cooc[first, second] += 1
    return cooc

def update_glove_weights(x, w, d, alpha, max_x, learning_rate):
    """
    inplace updating of glove weights (one gradient step)

    The step follows the gradient of
        sum_ij f(x_ij) * (w_i . d_j - log(1 + x_ij)) ** 2
    where f(x) = (min(x, max_x) / max_x) ** alpha.

    x - square matrix VocabSize x VocabSize - coocurrence matrix; either a
        dense array-like or an object exposing `toarray()` (such as a
        scipy.sparse matrix), which is densified before use
    w - VocabSize x EmbSize - first word vectors, modified in place
    d - VocabSize x EmbSize - second word vectors, modified in place
    alpha - float - power in weight smoothing function f
    max_x - number - maximum coocurrence count in weight smoothing function f
    learning_rate - positive float - size of gradient step

    returns tuple (w, d) - the same array objects that were passed in
    """
    # Sparse matrices cannot be wrapped with np.array/np.asarray directly
    # (that yields a 0-d object array), so densify them explicitly first.
    if hasattr(x, "toarray"):
        x = x.toarray()
    x = np.asarray(x)

    # np.minimum clips without casting max_x to x's dtype, so a non-integer
    # max_x is not truncated when x holds integers.
    weights = (np.minimum(x, max_x) / max_x) ** alpha
    target = np.log1p(x)

    # Both gradients share this weighted residual; compute it once, and
    # before either matrix is modified so both use the pre-step values.
    residual = 2 * weights * (w @ d.T - target)
    grad_w = residual @ d
    grad_d = residual.T @ w

    w -= learning_rate * grad_w
    d -= learning_rate * grad_d
    return w, d

In [None]:
def get_nearest(embeddings, query_word_id, get_n):
    """
    returns list of `get_n` tuples (word_id, similarity) sorted by descending order of similarity value

    Similarity is the negated Euclidean distance between L2-normalised
    embedding rows, so the largest possible value is 0 (reached by the query
    word itself). Ties keep ascending word_id order. word_id is a Python int
    and similarity a Python float.

    embeddings - VocabSize x EmbSize - word embeddings
    query_word_id - integer - id of query word to find most similar to
    get_n - integer - number of most similar words to retrieve
    """
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # All-zero rows would divide by zero and inject NaN into the ranking;
    # dividing by 1 leaves them as zero vectors instead.
    norms[norms == 0] = 1.0
    unit_vectors = embeddings / norms

    distances = np.linalg.norm(unit_vectors - unit_vectors[query_word_id], axis=1)
    # Ascending distance == descending similarity; a stable sort keeps
    # equal-distance words in word_id order.
    nearest_ids = np.argsort(distances, kind="stable")[:get_n]
    return [(int(word_id), float(-distances[word_id])) for word_id in nearest_ids]