In [5]:
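# A plain skip-gram model with 300-dimensional embeddings and a 10,000-word vocabulary trains slowly:
# each of its two weight matrices holds 300 * 10,000 = 3 million weights.
# This script uses the two remedies listed below (subsampling and negative sampling):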
#1. Subsampling frequent words to decrease the number of training examples.
#2. Modifying the optimization objective with a technique they called “Negative Sampling”, 
#which causes each training sample to update only a small percentage of the model’s weights.

#ref https://programmer.group/pytorch-implements-word2vec.html

import numpy as np
import torch
from torch import nn, optim
from collections import Counter
#import matplotlib.pyplot as plt
# Hyperparameters for the skip-gram / negative-sampling demo in this file.
embedding_dim = 2 # dimensionality of each word embedding
print_every = 100 # the training loop prints the loss every `print_every` steps
epochs = 100      # number of passes over the training words
batch_size = 6    # number of centre words per batch
N_samples = 3     # negative (noise) samples drawn per (centre, context) pair
window_size = 5   # maximum context-window radius around a centre word
threshhold = 0    # preprocessing keeps only words occurring more than this many times
keep_threshold = 0.001 # a word is kept when its computed keep score exceeds this value
a = 0.001         # constant in the keep-score formula; the smaller it is, the fewer words are kept

def basic_preprocess(text,freq):
    """Lower-case and whitespace-tokenise ``text``, dropping rare words.

    Args:
        text: Raw corpus string.
        freq: Count cut-off; a token is kept only if it occurs strictly
            more than ``freq`` times in the corpus.

    Returns:
        The surviving tokens as a list, in corpus order (duplicates kept).
    """
    tokens = text.lower().split()

    # token -> number of occurrences in the whole corpus
    occurrences = Counter(tokens)

    return list(filter(lambda token: occurrences[token] > freq, tokens))
    
text = 'Running gradient descent on a neural network that large is going to be slow. And to make matters worse, you need a huge amount of training data in order to tune that many weights and avoid over-fitting. millions of weights times billions of training samples means that training this model is going to be a beast'
# The corpus is tiny, so threshhold is 0 and every word is kept.
words = basic_preprocess(text,threshhold)

# Unique words of the corpus.
vocabulary = set(words)

# word -> index. The vocabulary is sorted before numbering because the
# iteration order of a set of strings can change from one interpreter run to
# the next (hash randomisation); sorting makes the numbering reproducible.
vocabulary2index = {w:c for c,w in enumerate(sorted(vocabulary))}

# index -> word, derived from the mapping above so the two can never disagree.
index2vocabulary = {c:w for w,c in vocabulary2index.items()}


# The corpus expressed as a list of word indices.
index_words = [vocabulary2index[w] for w in words]

# word index -> number of occurrences in the corpus.
word_count = Counter(index_words)

# Total number of tokens in the corpus. This must be the token count, not the
# number of distinct words, so that the values below are true relative
# frequencies (occurrences / corpus length) that sum to 1.
total_count = len(index_words)

# word index -> relative frequency in the corpus.
word_freqs = {w:c/total_count for w,c in word_count.items()}

#----------------------------------------------sampling rate------------------------------------------
# Keep score of every word, using the word2vec subsampling formula
#     P(w) = (sqrt(z(w) / a) + 1) * a / z(w)
# where z(w) is the word's frequency in the corpus. Note that the "+ 1" sits
# outside the square root. A smaller `a` yields smaller scores, i.e. fewer
# words survive the threshold below.
prob_keep = {w:(np.sqrt(freq/a) + 1) * a / freq for w,freq in word_freqs.items()}

# Keep the corpus positions whose word scores above keep_threshold. This is a
# deterministic cut-off on the score, not a random draw with probability P(w).
train_words = [w for w in index_words if prob_keep[w] >keep_threshold ]
#----------------------------------------------sampling rate End---------------------------------------

#----------------------------------------------Negative Sampling------------------------------------------
# Frequencies as an ndarray laid out by word index: entry i must belong to word
# index i, because torch.multinomial returns positions in the distribution and
# the model uses those positions directly as rows of its output embedding.
# (Taking dict.values() would give first-occurrence order instead.)
word_freqs = np.array([word_freqs[idx] for idx in range(len(vocabulary2index))])

# Normalised unigram distribution over the vocabulary.
unigram_dist = word_freqs / word_freqs.sum()

# Noise distribution used to draw negative samples: the unigram distribution
# raised to the power 3/4 and renormalised. More frequent words are still more
# likely to be drawn, but the exponent flattens the distribution; 3/4 is the
# value the word2vec authors found to work best empirically.
_smoothed = unigram_dist ** 0.75
noise_dist = torch.from_numpy(_smoothed / np.sum(_smoothed))

#----------------------------------------------Negative Sampling End------------------------------------------


def get_positive_words(words,index,window_size):
    """Return the distinct context words around the centre word at ``index``.

    A radius is drawn uniformly from 1..``window_size`` (inclusive) on every
    call, and the words up to that many positions before and after ``index``
    are collected. The window is clipped at the start of ``words``.

    Args:
        words: list of words (one batch of word indices).
        index: position of the centre word inside ``words``.
        window_size: largest radius that may be drawn.

    Returns:
        A list of the unique context words (the order is that of a set).
    """
    # Random radius in [1, window_size].
    radius = np.random.randint(1,window_size+1)

    # Never start before the beginning of the list.
    left = max(index - radius, 0)
    right = index + radius

    before = words[left:index]
    after = words[index + 1:right + 1]
    return list(set(before + after))

def batch_loader(words,batch_size,window_size):
    """Yield training batches of (centre words, context words).

    ``words`` is cut down to a whole number of batches (a trailing partial
    batch is dropped). For each batch, every word in turn acts as the centre
    word; its context words are looked up inside that same batch only, and the
    centre word is repeated once per context word so both lists line up.

    Args:
        words: list of word indices.
        batch_size: number of centre words per batch.
        window_size: maximum context radius passed to get_positive_words.

    Yields:
        Tuples ``(batch_x, batch_y)`` of equal-length lists: centre words and
        their context words.
    """
    # Keep only as many words as fill complete batches.
    full_batches = len(words)//batch_size
    words = words[:full_batches*batch_size]

    for start in range(0,len(words),batch_size):
        chunk = words[start:start+batch_size]
        centres, contexts = [],[]
        for position, centre in enumerate(chunk):
            neighbours = get_positive_words(chunk,position,window_size)
            centres.extend([centre]*len(neighbours))
            contexts.extend(neighbours)
        yield centres,contexts

# Skip-gram model with negative sampling. It derives from nn.Module, the base class of all PyTorch networks.
# ref https://pytorch.org/docs/stable/nn.html
class SkipGramNeg(nn.Module):
    '''Skip-gram word2vec model with separate input and output embeddings.

    Args:
        n_vocab: vocabulary size (number of distinct words).
        n_embed: dimensionality of each word vector.
        noise_dist: optional 1-D tensor of per-word sampling weights used to
            draw negative samples; position i is the weight of word index i.
            When None, negatives are drawn uniformly from the vocabulary.
    '''
    def __init__(self,n_vocab,n_embed,noise_dist=None):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # nn.Embedding is a lookup table of n_vocab rows with n_embed columns:
        # it maps a word index to that word's vector.
        # in_embed holds the vectors used for centre (input) words ...
        self.in_embed = nn.Embedding(n_vocab,n_embed)

        # ... and out_embed the vectors used for context and noise words.
        self.out_embed = nn.Embedding(n_vocab,n_embed)

        # Initialise both tables uniformly in [-1, 1].
        nn.init.uniform_(self.in_embed.weight,-1,1)
        nn.init.uniform_(self.out_embed.weight,-1,1)

    def forward_input(self,input_words):
        '''Return the input-embedding vectors of the given centre-word indices.'''
        return self.in_embed(input_words)

    def forward_output(self,output_words):
        '''Return the output-embedding vectors of the given context-word indices.'''
        return self.out_embed(output_words)

    def forward_noise(self,size,N_sample):
        '''Draw negative samples and return their output-embedding vectors.

        Args:
            size: number of (centre, context) pairs in the batch.
            N_sample: number of noise words to draw for each pair.

        Returns:
            Tensor of shape (size, N_sample, n_embed).
        '''
        # Sample on the same device as the embedding table that is indexed.
        device = self.out_embed.weight.device
        if self.noise_dist is None:
            # No distribution supplied: every word is equally likely.
            noise_dist = torch.ones(self.n_vocab,device=device)
        else:
            noise_dist = self.noise_dist.to(device)

        # size * N_sample word indices, drawn with replacement.
        noise_words = torch.multinomial(noise_dist,
                                        size * N_sample,
                                        replacement=True)
        return self.out_embed(noise_words).view(size, N_sample, self.n_embed)

# Negative-sampling loss
class NegativeSampleLoss(nn.Module):
    '''Negative-sampling objective for skip-gram.

    For each (centre, context) pair the loss is
        -( log sigmoid(u_context . v_centre)
           + sum over noise words of log sigmoid(-u_noise . v_centre) )
    and the result is averaged over the batch.
    '''
    def __init__(self):
        super().__init__()

    def forward(self,input_vectors,output_vectors,noise_vectors):
        '''Compute the mean negative-sampling loss of one batch.

        Args:
            input_vectors: centre-word vectors, shape (batch, embed).
            output_vectors: context-word vectors, shape (batch, embed).
            noise_vectors: noise-word vectors, shape (batch, n_noise, embed).

        Returns:
            Scalar tensor holding the batch-averaged loss.
        '''
        batch_size,emded_size = input_vectors.shape
        # logsigmoid is used instead of sigmoid().log(): the latter underflows
        # to log(0) = -inf for strongly negative scores.
        log_sigmoid = nn.functional.logsigmoid

        # Centre vectors as a batch of column vectors ...
        input_vectors = input_vectors.view(batch_size,emded_size,1)

        # ... and context vectors as a batch of row vectors, so that bmm
        # (batch matrix multiplication) yields one dot product per pair.
        output_vectors = output_vectors.view(batch_size,1,emded_size)

        # Log-probability of the true context word; bmm gives (batch, 1, 1).
        # Reshape explicitly rather than squeeze(), which would also drop the
        # batch dimension when batch_size is 1.
        out_loss = log_sigmoid(torch.bmm(output_vectors,input_vectors)).reshape(batch_size)

        # Log-probability of rejecting each noise word; bmm gives
        # (batch, n_noise, 1). Only the trailing dimension is removed so that
        # the sum over noise words also works for batch_size 1 or n_noise 1.
        noise_loss = log_sigmoid(torch.bmm(noise_vectors.neg(),input_vectors))
        noise_loss = noise_loss.squeeze(2).sum(1)

        return -(out_loss + noise_loss).mean()

# Build the model from the vocabulary size (number of distinct words), the
# embedding dimension and the per-word noise sampling distribution.
model = SkipGramNeg(len(vocabulary2index),embedding_dim,noise_dist)
criterion = NegativeSampleLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.001)

# Number of optimisation steps taken so far (one per batch, across all epochs).
steps = 0
# NOTE(review): re-assigns the same value already set with the other hyperparameters.
epochs = 100
for e in range(epochs):
    # Each batch is a list of centre words and the matching list of context words.
    for input_words,target_word in batch_loader(train_words,batch_size,window_size):
        steps +=1
        inputs,targets = torch.LongTensor(input_words),torch.LongTensor(target_word)
        
        # Look up the centre words in the model's input embedding (self.in_embed).
        # The embedding weights start from their random initialisation and are
        # updated by the optimiser, so later lookups return the trained vectors.
        input_vectors = model.forward_input(inputs)
        
        # Look up the context words in the model's output embedding.
        output_vectors = model.forward_output(targets)
        # Number of (centre, context) pairs in this batch.
        size,_ = input_vectors.shape
        
        # N_samples noise-word vectors for every pair.
        noise_vectors = model.forward_noise(size,N_samples)
        
        # Compute the negative-sampling loss.
        loss = criterion(input_vectors,output_vectors,noise_vectors)
        if steps%print_every ==0:
            print('loss: ',loss)
        
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()
        
        # Back-propagate: compute the gradient of the loss w.r.t. the parameters.
        loss.backward()
        
        # Update the weights using the gradients just computed.
        optimizer.step()

# NOTE(review): this prints the centre-word vectors of the last batch only,
# not the whole embedding table (model.in_embed.weight) — confirm the intent.
print(input_vectors)

loss:  tensor(2.9019, grad_fn=<NegBackward>)
loss:  tensor(2.8068, grad_fn=<NegBackward>)
loss:  tensor(2.8553, grad_fn=<NegBackward>)
loss:  tensor(2.9406, grad_fn=<NegBackward>)
loss:  tensor(2.7972, grad_fn=<NegBackward>)
loss:  tensor(2.7880, grad_fn=<NegBackward>)
loss:  tensor(2.8808, grad_fn=<NegBackward>)
loss:  tensor(2.8411, grad_fn=<NegBackward>)
loss:  tensor(2.7883, grad_fn=<NegBackward>)
tensor([[ 0.2576, -0.4979],
        [ 0.2576, -0.4979],
        [ 0.2576, -0.4979],
        [ 0.2576, -0.4979],
        [ 0.2576, -0.4979],
        [-0.6731,  0.6583],
        [-0.6731,  0.6583],
        [-0.2405,  0.0640],
        [-0.2405,  0.0640],
        [-0.2405,  0.0640],
        [-0.2405,  0.0640],
        [-0.2405,  0.0640],
        [-0.2466, -0.8853],
        [-0.2466, -0.8853],
        [ 0.4153, -0.1400],
        [ 0.4153, -0.1400],
        [ 0.4153, -0.1400],
        [ 0.4153, -0.1400],
        [ 0.1676, -0.0508],
        [ 0.1676, -0.0508],
        [ 0.1676, -0.0508],
       