### Practice 4: Skip-Gram

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.utils.data as Data
# Legacy typed-tensor class (32-bit float, CPU); usable with Tensor.type() for casting.
dtype = torch.FloatTensor
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def read_data(filename):
    """Read a whitespace-tokenized text file into a flat list of tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Every token in the file, in reading order. Blank lines and
        runs of consecutive whitespace contribute no empty-string tokens.
    """
    data = []
    with open(filename) as f:
        # Stream the file line by line; extend() appends in place, so the
        # total cost stays linear in the size of the file.
        for line in f:
            # split() with no separator splits on any whitespace run and
            # never yields '' (which would otherwise end up in the vocabulary).
            data.extend(line.split())
    return data
# Load the corpus as a flat list of tokens.
word_sequence = read_data('./toy_text.txt')
# Report the token count and preview the first 20 tokens as a sanity check.
print('Data size %d' % len(word_sequence))
print(word_sequence[:20])

In [None]:
# Build the vocabulary: the distinct tokens of the corpus in sorted order,
# so each word receives the same index on every run.
vocab = sorted(set(word_sequence))
print(vocab)
# Two lookup tables: word -> integer id (its position in `vocab`), and the
# inverse id -> word. enumerate() yields (index, element) pairs.
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))
print('word2idx=\n',word2idx)
print('idx2word=\n',idx2word)

# Hyperparameters.
batch_size = 32      # number of (center, context) pairs per mini-batch
embedding_size = 2   # width of each word vector
C = 2                # context window radius: C words on each side of the center
voc_size = len(vocab)

# Generate the skip-gram training pairs. Every position that has a full
# window of C neighbours on both sides acts as a center word; the first and
# last C tokens of the corpus therefore only ever appear as context.
skip_grams = []
for idx in range(C, len(word_sequence) - C):
    center = word2idx[word_sequence[idx]]  # center word id
    if idx == C:
        # Debug aid: show the id of the very first center word.
        print('center=',center)
    # C tokens to the left followed by C tokens to the right of the center.
    window = word_sequence[idx - C:idx] + word_sequence[idx + 1:idx + C + 1]
    # One [center, context] pair per neighbour, left to right.
    skip_grams.extend([center, word2idx[w]] for w in window)
print(skip_grams)

In [None]:
def make_data(skip_grams, vocab_size=None):
    """Turn (center, context) index pairs into one-hot inputs and class labels.

    Args:
        skip_grams: Sequence of ``[center_idx, context_idx]`` pairs.
        vocab_size: Length of each one-hot vector. Defaults to the
            module-level ``voc_size`` when omitted.

    Returns:
        tuple: ``(input_data, output_data)``. ``input_data`` is a list of 1-D
        float NumPy arrays, the one-hot encoding of each center word (the
        feature x). ``output_data`` is the list of context-word indices (the
        label y). Both follow the order of ``skip_grams``.
    """
    if vocab_size is None:
        vocab_size = voc_size
    # Explicit integer dtype so that an empty input still indexes cleanly.
    centers = np.asarray([pair[0] for pair in skip_grams], dtype=np.intp)
    # Build the identity matrix once and gather every needed row in a single
    # indexing step, instead of allocating a fresh matrix per pair.
    one_hot = np.eye(vocab_size)[centers]
    input_data = list(one_hot)
    output_data = [pair[1] for pair in skip_grams]
    return input_data, output_data
input_data, output_data = make_data(skip_grams)
# Stack the one-hot rows into a single ndarray before handing them to torch;
# converting a Python list of separate ndarrays element by element is slow.
input_data = torch.tensor(np.array(input_data), dtype=torch.float32)
# Class indices for CrossEntropyLoss must be 64-bit integers.
output_data = torch.tensor(output_data, dtype=torch.long)
print(input_data[:5])
print(output_data[:5])

# Pair each one-hot center with its context label and serve them in
# reshuffled mini-batches every epoch.
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
class Word2Vec(nn.Module):
    """Skip-gram model made of two weight matrices and no non-linearity.

    A one-hot center word is projected to an embedding by ``W`` and then to
    one score per vocabulary word by ``V``.

    Args:
        vocab_size: Number of words, i.e. the input and output width.
            Defaults to the module-level ``voc_size`` when omitted.
        embedding_dim: Width of the hidden embedding. Defaults to the
            module-level ``embedding_size`` when omitted.
    """

    def __init__(self, vocab_size=None, embedding_dim=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = voc_size
        if embedding_dim is None:
            embedding_dim = embedding_size
        # W is registered before V, so `parameters()` yields W first.
        # W: [vocab_size, embedding_dim] -- row i is the embedding of word i.
        self.W = nn.Parameter(torch.randn(vocab_size, embedding_dim, dtype=torch.float32))
        # V: [embedding_dim, vocab_size] -- maps an embedding to word scores.
        self.V = nn.Parameter(torch.randn(embedding_dim, vocab_size, dtype=torch.float32))

    def forward(self, X):
        """Compute unnormalised scores over the vocabulary.

        Args:
            X: Float tensor of shape [batch_size, vocab_size]; the training
                code feeds one-hot rows.

        Returns:
            Tensor of shape [batch_size, vocab_size] holding raw logits (no
            softmax is applied here).
        """
        hidden_layer = torch.matmul(X, self.W)  # [batch_size, embedding_dim]
        output_layer = torch.matmul(hidden_layer, self.V)  # [batch_size, vocab_size]
        return output_layer
# Model, loss and optimizer. CrossEntropyLoss applies log-softmax itself,
# so the model is expected to output raw scores.
model = Word2Vec()
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train for 2000 passes over the shuffled pairs.
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Clear gradients left over from the previous step, then run the
        # forward pass, back-propagate and update the weights.
        optimizer.zero_grad()
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        loss.backward()
        optimizer.step()

        # On every 1000th epoch, report the loss of each batch (the value
        # computed before this step's weight update).
        if not (epoch + 1) % 1000:
            print(epoch + 1, i, loss.item())

In [None]:
# draw figure
# Row i of the input-side matrix W is the embedding of vocab[i]. Fetch it
# once, detached from autograd and on the CPU, as plain Python floats.
coords = model.W.detach().cpu().tolist()
for label, point in zip(vocab, coords):
    # Only the first two embedding dimensions are drawn.
    x, y = point[0], point[1]
    # One scatter call per word so each point gets its own colour.
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()