In [9]:
#reference: https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb?gi=f106913ccce5
#https://zhangweifeng.top/2019/08/03/%E4%BD%BF%E7%94%A8Pytorch%E5%AE%9E%E7%8E%B0word2vec-skip-gram/ 中文说明
#Skip-gram的简单实现，不涉及负采样
import torch
import numpy as np
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
# Toy training corpus: a list of short lowercase sentences, one string per
# entry, with words separated by single spaces.
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list with one list of word tokens per input sentence.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(sentence.split())
    return tokenized

tokenized_corpus = tokenize_corpus(corpus)

#print(tokenized_corpus)

# Vocabulary: every distinct token, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# the quadratic `token not in list` membership test on every token.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables between a word and its integer position in `vocabulary`.
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)

#print(word2idx)
#print(idx2word)

window_size = 2

# Build every (center word index, context word index) training pair: for each
# word in each sentence, pair it with the words at most `window_size`
# positions to its left or right within the same sentence.
idx_pairs = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # treat each word in turn as the center word
    for center_pos, center_idx in enumerate(indices):
        # clamp the window so it never reaches outside the sentence
        first = max(0, center_pos - window_size)
        last = min(sentence_len, center_pos + window_size + 1)
        for context_pos in range(first, last):
            # a word is not its own context
            if context_pos != center_pos:
                idx_pairs.append((center_idx, indices[context_pos]))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
#print(idx_pairs)

#Input layer is just the center word encoded as a one-hot vector. Its shape is [vocabulary_size].
def get_input_layer(word_idx):
    """Return the one-hot encoding of a word index.

    Args:
        word_idx: position in the vocabulary to set to 1.

    Returns:
        A 1-D float tensor of length ``vocabulary_size`` that is 0 everywhere
        except for a 1.0 at ``word_idx``.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot
    
#Hidden layer makes our v vectors, so it has to have embedding_dims neurons. To compute its value we have to define the W1 weight matrix, which has to be of shape [embedding_dims, vocabulary_size]. There is no activation function — just plain matrix multiplication.
#embedding_dims = 5
#W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
#z1 = torch.matmul(W1, x)

#Last layer must have vocabulary_size neurons — because it generates probabilities for each word. Therefore, W2 is [vocabulary_size, embedding_dims] in terms of shape.
#W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
#z2 = torch.matmul(W2, z1)

#On top of that we have to use a softmax layer. PyTorch provides an optimized version of this, combined with log — because regular softmax is not really numerically stable:
#log_softmax = F.log_softmax(z2, dim=0)

#Now we can compute loss. As usual PyTorch provides everything we need:
#loss = F.nll_loss(log_softmax.view(1,-1), y_true)
#nll_loss computes the negative log-likelihood from the log-softmax output. y_true is the context word — we want its predicted probability to be as high as possible
# — because the pair (x, y_true) comes from the training data, so they are indeed a (center, context) pair.

#As we have finished the forward pass, now it's time to perform the backward pass. Simply:
#loss.backward()

#For optimization SGD is used. It is so simple that it was faster to write it by hand instead of creating an optimizer object:
#W1.data -= 0.01 * W1.grad.data
#W2.data -= 0.01 * W2.grad.data

#Last step is to zero gradients to make next pass clear:
#W1.grad.data.zero_()
#W2.grad.data.zero_()

#Training loop
#Time to compile it into training loop. It can look like:

# Train the two weight matrices with hand-written SGD, one
# (center, context) pair per update step.

# Seed the RNG so the random weight initialisation, and therefore the
# printed losses, are the same on every Restart & Run All.
torch.manual_seed(0)

embedding_dims = 5
# W1 maps a one-hot center word to its embedding: column i is word i's v vector.
# Plain tensors with requires_grad=True replace the deprecated Variable wrapper.
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
# W2 scores every vocabulary word against an embedding: row i is word i's u vector.
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    # Running sum of the per-pair losses for this epoch (plain Python float).
    loss_val = 0.0

    # data: index of the center word
    # target: index of the context word
    for data, target in idx_pairs:
        # One-hot encode the center word: a vector of vocabulary size with a
        # 1 at the word's position.
        x = get_input_layer(data)

        # The context word's index is the class label to predict.
        y_true = torch.tensor([int(target)], dtype=torch.long)

        # (embedding_dims, vocabulary_size) @ (vocabulary_size,) -> (embedding_dims,)
        z1 = torch.matmul(W1, x)

        # (vocabulary_size, embedding_dims) @ (embedding_dims,) -> (vocabulary_size,)
        z2 = torch.matmul(W2, z1)

        # Log of the softmax over the vocabulary scores.
        log_softmax = F.log_softmax(z2, dim=0)

        # Negative log-likelihood of the true context word; nll_loss expects a
        # (batch, classes) input, hence the view to (1, vocabulary_size).
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # .item() extracts the Python number without keeping the graph alive.
        loss_val += loss.item()

        # Back-propagate; gradients are accumulated into W1.grad and W2.grad.
        loss.backward()

        # Manual SGD step. The in-place updates must not be recorded by
        # autograd, so they run under no_grad() instead of going through .data.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

            # Reset the accumulated gradients before the next pair.
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 4.5037336349487305
Loss at epo 10: 3.858903169631958
Loss at epo 20: 3.518991470336914
Loss at epo 30: 3.3026914596557617
Loss at epo 40: 3.145211935043335
Loss at epo 50: 3.0224287509918213
Loss at epo 60: 2.9229726791381836
Loss at epo 70: 2.8402323722839355
Loss at epo 80: 2.7698750495910645
Loss at epo 90: 2.708897113800049


提取向量
现在我们训练了一个网络，最后一件事就是提取每个单词的向量，这里有三个可能的方式
- 使用W1的v向量
- 使用W2的u向量
- 使用u和v的平均
你可以自己思考什么时候用哪个

In [16]:
# Option 1: use the v vectors stored in W1
print(W1)
print(idx2word)
# each column of W1 is the vector of the word with that vocabulary index
def get_vect_by_word(word):
    """Return the column of W1 at the vocabulary index of ``word``."""
    return W1[:, word2idx[word]]
print(get_vect_by_word('he'))

tensor([[-0.2192, -0.3107,  0.1193,  0.5108, -0.0473, -1.0777,  0.2223, -1.2885,
          1.1545, -0.0242,  0.5189,  0.4307, -0.8447, -0.4843,  0.4388],
        [-0.5908, -0.4645,  1.0955, -0.3151,  0.3382,  0.7444,  0.3455, -0.5546,
          0.3973, -0.6931,  1.5637, -1.0222,  0.4685,  1.4698,  0.9312],
        [-0.6092, -0.3868, -0.7027, -1.6485, -0.1471,  0.8466,  0.6977, -0.4477,
         -0.0858, -1.2249,  0.4202,  0.6882,  1.5914,  1.0136,  0.8029],
        [ 0.8918, -0.4556,  0.5177,  0.1766,  0.4958,  0.1987,  1.3939, -0.4566,
          1.7754, -0.8591,  0.7134, -1.4443,  0.5845, -0.4377, -0.7417],
        [-0.8782,  0.2848, -1.1601,  0.6272,  0.3698,  0.3644, -1.4403,  0.1981,
         -0.5880,  1.3435,  0.0704,  0.6243, -0.6992,  0.6870,  0.9619]],
       requires_grad=True)
{0: 'he', 1: 'is', 2: 'a', 3: 'king', 4: 'she', 5: 'queen', 6: 'man', 7: 'woman', 8: 'warsaw', 9: 'poland', 10: 'capital', 11: 'berlin', 12: 'germany', 13: 'paris', 14: 'france'}
tensor([-0.2192, -0.590

In [24]:
# Option 2: use the u vectors stored in W2
print(W2)
def get_vect_by_word_from_w2(word):
    """Return the row of W2 at the vocabulary index of ``word``."""
    return W2[word2idx[word]]
get_vect_by_word_from_w2('is')

tensor([[ 1.0457e+00, -1.0377e+00,  4.8538e-01, -5.0248e-01, -1.9111e+00],
        [ 1.0480e-01,  1.4736e+00,  7.1006e-01,  2.0185e+00,  4.1934e-01],
        [-1.7788e-01, -1.1168e+00, -1.4770e-01,  8.1876e-01,  9.5763e-01],
        [-1.4697e+00,  4.1635e-01, -2.7296e-01,  1.4065e+00,  4.5451e-01],
        [ 2.5076e-01,  1.4806e+00,  1.0296e+00, -5.4036e-01, -1.0844e+00],
        [ 1.3061e+00,  1.8377e-01,  5.5527e-01,  6.5548e-01,  7.4693e-02],
        [-4.8760e-01, -2.9554e-01, -4.3771e-01, -3.1433e-01, -1.4220e+00],
        [-1.0272e+00,  4.0388e-01,  8.9364e-01, -1.2516e+00, -8.8262e-01],
        [ 1.5421e+00, -2.0172e-01, -4.2868e-01, -5.6718e-01, -3.4759e-01],
        [ 7.5248e-01,  7.6497e-01,  3.4004e-02,  3.7912e-01, -3.0912e-01],
        [-9.6537e-01, -7.3450e-02, -4.9049e-02,  1.9782e+00,  6.6227e-01],
        [-8.2628e-01, -1.3237e-01, -5.8901e-01,  1.3122e+00,  5.0472e-01],
        [ 3.1152e+00, -1.3643e+00,  3.7410e-01,  2.6550e-03, -1.2258e+00],
        [-2.2485e-01, -2.

tensor([0.1048, 1.4736, 0.7101, 2.0185, 0.4193], grad_fn=<SelectBackward>)

In [25]:
# Option 3: use the element-wise average of the word's v vector (from W1)
# and its u vector (from W2)
(get_vect_by_word('is')+get_vect_by_word_from_w2('is'))/2

tensor([-0.1029,  0.5046,  0.1616,  0.7815,  0.3521], grad_fn=<DivBackward0>)