<a href="https://colab.research.google.com/github/dahnhwang/cbow/blob/master/train_cbow_word_analogy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import time

torch.manual_seed(1)

use_cuda = True

EMBEDDING_DIM = 100
if torch.cuda.is_available():
  VOCAB_SIZE = 30000
  print('Using Cuda')
else:
  VOCAB_SIZE = 5000

UNK_TOKEN = "<UNK>"
WINDOW_SIZE = 5
BATCH_SIZE = 1024



Using Cuda


In [0]:
words = []
with open("text8.txt") as f:
  for line in f.readlines():
    words += line.strip().split(" ") #라인을 하나씩 읽어들이며 띄어쓰기 단위로 단어들을 분리하여 words array에 집어넣음
    
print("total words in corpus: %d" % (len(words))) #총 words array 안에 들어간 단어 개수

word_cnt = Counter() #컨테이너에 동일한 값의 자료가 몇개인지를 파악하는데 사용하는 객체

for w in words:
  if w not in word_cnt: #한번도 나타나지 않았던 단어라면 0으로 셋팅
    word_cnt[w] = 0
  word_cnt[w] += 1 #나타난 단어라면 카운트하기



total words in corpus: 17005207


In [0]:
# calculate word coverage of 30k most common words
total = 0

# print(word_cnt.most_common(VOCAB_SIZE)[0]) ('the', 1061396)

for cnt_tup in word_cnt.most_common(VOCAB_SIZE): #word_cnt에서 most common word 3만개 추출
  total += cnt_tup[1] 
print("coverage: %.4f " % (total * 1.0 / len(words))) #전체 단어대비 추출된 단어의 커버리지 퍼센트
# 95.94%




coverage: 0.9594 


In [0]:
# make vocabulary with most common words
word_to_ix = dict()
ix_to_word = dict()

for i, cnt_tup in enumerate(word_cnt.most_common(VOCAB_SIZE)):
  word_to_ix[cnt_tup[0]] = i # 3만개의 각 단어에 대해 고유숫자를 부여
  ix_to_word[i] = cnt_tup[0]
  last = i
  
  
  
print(word_to_ix['banana'])
if words[0] not in word_to_ix:
  print(words[0])

print(ix_to_word[29999])



10967
batavian


In [0]:
# add unk token to vocabulary
word_to_ix[UNK_TOKEN] = len(word_to_ix) #UNK_TOKEN에 가장 마지막 고유숫자를 부여
ix_to_word[len(word_to_ix)] = UNK_TOKEN

print(word_to_ix[UNK_TOKEN])

# replace rare words in train data with UNK_TOKEN
train_words = []
count = 0
for w in words:
  if w not in word_to_ix: #전체단어 중 word_to_ix(고유숫자가 부여된 3만개의 단어)에 없는 것이라면 UNK로 넣기
    train_words += [UNK_TOKEN]
    count = count + 1
  else:
    train_words += [w] #있는 것이라면 해당 단어를 그대로 넣기
    
print(train_words[3953])


30000
century


In [0]:
# make train samples for CBOW
train_input = []  #window size 만큼의 주변 context words
train_target = [] #context words를 통해 맞춰야 하는 center word
span = (WINDOW_SIZE - 1) // 2 #span = 2
for i in range(span, len(train_words) - span): # center word 등록하는 숫자
  context = []
  for j in range(WINDOW_SIZE): # center word 당 주변 context words 등록
    if j != span:
      context.append(word_to_ix[train_words[i + j - span]]) # center word에서 span뺀만큼에서 i더한 위치
  target = word_to_ix[train_words[i]]
  train_input.append(context)
  train_target.append(target)
print("data is generated!") 
print(train_input[0],train_target[0])

for k, v in word_to_ix.items(): #word_to_ix에 아이템을 하나씩 접근해서, key, value를 각각 k, v 저장
    if v == train_target[0]:
        print(k)
        


data is generated!
[5233, 3080, 5, 194] 11
as


In [0]:
# model class
class CBOW(nn.Module):

  def __init__(self, vocab_size, embedding_dim, window_size):
    super(CBOW, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim) #input layer to projection layer
    self.linear1 = nn.Linear(embedding_dim, vocab_size) #projection layer to output layer
    pass

  def forward(self, inputs): #stacking each layer together
    embeds = self.embeddings(inputs)
    out = self.linear1(torch.mean(embeds, dim=1)) #임베딩된 결과를 평균함
    log_probs = F.log_softmax(out, dim=1) #임베딩된 결과에 활성함수를 취함
    return log_probs

  def get_word_embedding(self, word):
    word = torch.cuda.LongTensor([word_to_ix[word]]) #int를 torch에서 사용하는 자료형으로 변환(?)
    return self.embeddings(word).view(1, -1) # ?????


In [0]:
# set up to train
losses = []
loss_function = nn.NLLLoss() #음의 로그 우도 손실 / NLLLoss에 대한 입력 은 로그 확률의 벡터이고 목표 레이블입니다.
model = CBOW(len(word_to_ix), EMBEDDING_DIM, WINDOW_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


In [0]:
# if gpu is available, then use it
if torch.cuda.is_available():
  model.cuda()

In [0]:
# make data loader for batch training
# Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
# into integer indices and wrap them in tensors)
train_input = torch.from_numpy(np.asarray(train_input)).long() # Creates a Tensor from a numpy.ndarray
train_target = torch.from_numpy(np.asarray(train_target)).long()
print(train_target[0])
dataset_train = data_utils.TensorDataset(train_input, train_target)
train_loader = data_utils.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=False)


tensor(11)


In [0]:
# training loop
for epoch in range(10):
  total_loss = 0
  start = time.time()
  for i, (context, target) in enumerate(train_loader): #첫번째 context & target set 부터 시작

    if torch.cuda.is_available():
      context = context.cuda()
      target = target.cuda()

      
    # Step 2. Recall that torch *accumulates* gradients. Before passing in a
    # new instance, you need to zero out the gradients from the old instance  
    model.zero_grad()

    # Step 3. Run the forward pass, getting log probabilities over next words
    log_probs = model(context)

    # Step 4. Compute your loss function. (Again, Torch wants the target
    # word wrapped in a tensor)
    loss = loss_function(log_probs, target)

    # Step 5. Do the backward pass and update the gradient
    loss.backward()
    optimizer.step()

    # Get the Python number from a 1-element Tensor by calling tensor.item()
    total_loss += loss.item()

    if (i+1) % 1000 == 0:
      print("loss: %.4f, steps: %dk" % (loss.item(), ((i+1)/1000)))

  end = time.time()
  print("epochs: %d" % (epoch+1))
  print("time eplased: %d seconds" % (end-start))
  print("mean loss: %.4f" % (total_loss / (train_input.shape[0] // BATCH_SIZE)))
  
  # torch.save: Saves a serialized object to disk. 
  # Models, tensors, and dictionaries of all kinds of objects can be saved using this function.
  torch.save(model.state_dict(), "cbow.epoch{}.model".format(epoch))

# Here you need to save the model's hidden layer which is V * D word embedding matrix.
# Then, use the word embedding matrix to get vectors for word


loss: 8.5509, steps: 1k
loss: 8.0301, steps: 2k
loss: 7.7606, steps: 3k
loss: 7.6806, steps: 4k
loss: 7.4551, steps: 5k
loss: 7.4861, steps: 6k
loss: 7.4873, steps: 7k
loss: 7.3449, steps: 8k
loss: 7.1948, steps: 9k
loss: 7.2088, steps: 10k
loss: 7.1543, steps: 11k
loss: 6.9786, steps: 12k
loss: 7.2583, steps: 13k
loss: 7.1724, steps: 14k
loss: 7.1015, steps: 15k
loss: 6.9712, steps: 16k
epochs: 1
time eplased: 495 seconds
mean loss: 7.4801
loss: 6.7463, steps: 1k
loss: 6.7881, steps: 2k
loss: 6.9220, steps: 3k
loss: 6.8098, steps: 4k
loss: 6.8238, steps: 5k
loss: 6.8017, steps: 6k
loss: 7.1033, steps: 7k
loss: 6.7951, steps: 8k
loss: 6.8073, steps: 9k
loss: 6.8608, steps: 10k
loss: 6.7857, steps: 11k
loss: 7.0295, steps: 12k
loss: 6.7807, steps: 13k
loss: 6.7096, steps: 14k
loss: 6.7414, steps: 15k
loss: 6.6999, steps: 16k
epochs: 2
time eplased: 481 seconds
mean loss: 6.8377
loss: 6.8977, steps: 1k
loss: 6.6177, steps: 2k
loss: 6.7630, steps: 3k
loss: 6.4671, steps: 4k
loss: 6.6834, 

In [0]:
def get_index_of_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i 
    return index
  
def get_max_prob_result(input, ix_to_word):
    return ix_to_word[get_index_of_max(input)]
  

In [0]:
def test_word_analogy(model, ix_to_word, word_to_ix):

    # test word analogy
    first_line = 1
    accuracy = 0
    with open("questions_words.txt") as f2:
      equal_count = 0
      wrong_count = 0
      for line in f2.readlines():
        analogy_set = line.lower().strip().split(" ")
        if(analogy_set[0]!=':'):
          word_vec = []
          for i in range(0, len(analogy_set)):
            check_word = analogy_set[i]
            if check_word not in word_to_ix:
                check_word = '<UNK>'
            word_vec.append(model.get_word_embedding(check_word)[0].cpu())
#             print(analogy_set[i])

          resulting_vector = word_vec[0] - word_vec[1] + word_vec[2]
          actual_result_vec = word_vec[3]
          resulting_word = get_max_prob_result(resulting_vector, ix_to_word)
#           print('resulting vector: ', resulting_vector)
#           print('actual correct word:', words[3])
          if(resulting_word == words[3]):
            equal_count = equal_count + 1
          else :
            wrong_count = wrong_count + 1
      print("accuracy of word analogy test:",equal_count / (equal_count+wrong_count))
test_word_analogy(model, ix_to_word, word_to_ix)


accuracy of word analogy test: 0.007419156774457634
