# In [None]:
"""
Из курса "Нейронные сети и обработка текста" со Stepic
Word2Vec Skip Gram Negative Sampling
"""

import sys
import ast
import numpy as np


def parse_array(s):
    """Parse the text of a Python literal (e.g. a nested list) into a NumPy array."""
    literal = ast.literal_eval(s)
    return np.array(literal)

def read_array():
    """Read one line from standard input and parse it with parse_array."""
    line = sys.stdin.readline()
    return parse_array(line)

def write_array(arr):
    """Print the array to standard output as a (nested) Python list literal."""
    as_list = arr.tolist()
    print(repr(as_list))


def generate_w2v_sgns_samples(text, window_size, vocab_size, ns_rate):
    """
    Generate Skip-Gram Negative Sampling training samples for a token sequence.

    Every neighbour of a position inside the window yields one positive
    sample; for each positive, ns_rate negative samples are drawn. The context
    token of a negative sample is taken from a uniformly random text position
    other than the center position.

    text - list or 1-D array of integer numbers - ids of tokens in text
    window_size - odd integer - width of window (center word included)
    vocab_size - positive integer - number of tokens in vocabulary
        (not used: negatives are drawn from the text itself)
    ns_rate - positive integer - number of negative tokens to sample per one positive sample

    returns array of training samples (CenterWord, CtxWord, Label); for each
    center position its positives come first (left to right), then its negatives
    """
    text = np.asarray(text)
    text_l = len(text)
    radius = window_size // 2
    samples = []

    for i, center in enumerate(text):
        # Slicing clips the window at the text borders, so a window wider than
        # the text (or a zero radius) needs no special handling.
        context = list(text[max(0, i - radius):i]) + list(text[i + 1:i + radius + 1])

        for ctx_word in context:
            samples.append([center, ctx_word, 1])

        # A non-empty context implies text_l >= 2, so the range below is valid.
        for _ in range(ns_rate * len(context)):
            # Draw from text_l - 1 slots and skip over the center position:
            # uniform over all other positions and never equal to i.
            position = np.random.randint(0, text_l - 1)
            if position >= i:
                position += 1
            samples.append([center, text[position], 0])

    return np.array(samples)
    
    
def update_w2v_weights(center_embeddings, context_embeddings, center_word, context_word, label, learning_rate):
    """
    Perform one gradient step for a single (center, context, label) sample,
    updating center_embeddings and context_embeddings inplace.

    The predicted probability is sigmoid(center . context); both rows are moved
    along the gradient of the logistic loss, each computed from the old values.

    center_embeddings - VocabSize x EmbSize
    context_embeddings - VocabSize x EmbSize
    center_word - int - identifier of center word
    context_word - int - identifier of context word
    label - 1 if context_word is real, 0 if it is negative
    learning_rate - float > 0 - size of gradient step

    returns the same (center_embeddings, context_embeddings) objects
    """
    center = center_embeddings[center_word]
    context = context_embeddings[context_word]

    score = np.dot(center, context)
    # sigmoid(score) == exp(-log(1 + exp(-score))); logaddexp evaluates the
    # logarithm without overflowing for large negative scores.
    probability = np.exp(-np.logaddexp(0.0, -score))
    error = probability - label

    # Build both new rows before writing anything back, so each gradient
    # is computed from the not-yet-updated counterpart.
    center_emb_new = center - learning_rate * (error * context)
    context_emb_new = context - learning_rate * (error * center)

    center_embeddings[center_word] = center_emb_new
    context_embeddings[context_word] = context_emb_new

    return center_embeddings, context_embeddings


# In [None]:
"""
Из курса "Нейронные сети и обработка текста" со Stepic
FastText SkipGram Negative Sampling 
"""

import sys
import ast
import numpy as np


def read_list():
    """Read one line from standard input and evaluate it as a Python literal."""
    line = sys.stdin.readline()
    return ast.literal_eval(line)

def parse_array(s):
    """Evaluate a string holding a Python literal and wrap the result in a NumPy array."""
    parsed = ast.literal_eval(s)
    return np.array(parsed)

def read_array():
    """Read a single line from standard input and convert it with parse_array."""
    raw_line = sys.stdin.readline()
    return parse_array(raw_line)

def write_array(arr):
    """Write the array to standard output in nested-list literal form."""
    nested = arr.tolist()
    print(repr(nested))


def generate_ft_sgns_samples(text, window_size, vocab_size, ns_rate, token2subwords):
    """
    Generate FastText Skip-Gram Negative Sampling training samples.

    Every neighbour of a position inside the window yields one positive
    sample; for each positive, ns_rate negative samples are drawn. The context
    token of a negative sample is taken from a uniformly random text position
    other than the center position. The center word is represented by its own
    id followed by its deduplicated subword ids.

    text - list or 1-D array of integer numbers - ids of tokens in text
    window_size - odd integer - width of window (center word included)
    vocab_size - positive integer - number of tokens in vocabulary
        (not used: negatives are drawn from the text itself)
    ns_rate - positive integer - number of negative tokens to sample per one positive sample
    token2subwords - list of lists of int - i-th sublist contains list of identifiers of n-grams for token #i (list of subword units)

    returns list of training samples (CenterSubwords, CtxWord, Label); for each
    center position its positives come first (left to right), then its negatives
    """
    text = np.asarray(text)
    text_l = len(text)
    radius = (window_size - 1) // 2
    samples = []

    for i, center in enumerate(text):
        # Slicing clips the window at the text borders, so a window wider than
        # the text (or a zero radius) needs no special handling.
        context = list(text[max(0, i - radius):i]) + list(text[i + 1:i + radius + 1])

        # The deduplicated subword ids are identical for all samples of this center.
        subwords = list(set(token2subwords[center]))

        for ctx_word in context:
            # `[center] + subwords` builds a fresh list, so samples never share state.
            samples.append(([center] + subwords, ctx_word, 1))

        # A non-empty context implies text_l >= 2, so the range below is valid.
        for _ in range(ns_rate * len(context)):
            # Draw from text_l - 1 slots and skip over the center position:
            # uniform over all other positions and never equal to i.
            position = np.random.randint(0, text_l - 1)
            if position >= i:
                position += 1
            samples.append(([center] + subwords, text[position], 0))

    return samples

def update_ft_weights(center_embeddings, context_embeddings, center_subwords, context_word, label, learning_rate):
    """
    Perform one gradient step for a single FastText sample, updating
    center_embeddings and context_embeddings inplace.

    The center vector is the mean of the embeddings listed in center_subwords;
    the predicted probability is sigmoid(center . context). Every listed
    subword embedding receives the same share of the center gradient.

    center_embeddings - VocabSize x EmbSize
    context_embeddings - VocabSize x EmbSize
    center_subwords - list of ints - list of identifiers of n-grams contained in center word
    context_word - int - identifier of context word
    label - 1 if context_word is real, 0 if it is negative
    learning_rate - float > 0 - size of gradient step

    returns the same (center_embeddings, context_embeddings) objects
    """
    center_l = len(center_subwords)
    center = sum(center_embeddings[word] for word in center_subwords) / center_l
    context = context_embeddings[context_word]

    score = np.dot(center, context)
    # sigmoid(score) == exp(-log(1 + exp(-score))); logaddexp evaluates the
    # logarithm without overflowing for large negative scores.
    probability = np.exp(-np.logaddexp(0.0, -score))
    error = probability - label

    # Both steps are computed from the old values before anything is written back.
    der_w = (error * context) / center_l
    context_emb_new = context - learning_rate * (error * center)

    # ufunc.at is unbuffered: an id listed several times gets one step per
    # occurrence, matching its weight in the mean (a fancy-indexed `+=` would
    # apply the step only once).
    np.add.at(center_embeddings, list(center_subwords), -learning_rate * der_w)

    context_embeddings[context_word] = context_emb_new
    return center_embeddings, context_embeddings
