# 예제: CBOW 임베딩 학습하기

In [1]:
import pandas as pd

df = pd.read_csv("../../data/frankenstein_with_splits.csv")
df

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train
...,...,...,...
90693,newsletter to hear new ebooks .,about,test
90694,to hear about ebooks .,new,test
90695,hear about new .,ebooks,test
90696,about new ebooks,.,test


In [2]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [3]:
print("train : ",train_size)
print("valid : ",val_size)
print("test : ",test_size)

train :  63489
valid :  13605
test :  13604


In [4]:
lookup_dict = {'train': (train_df, train_size),
                             'val': (val_df, val_size),
                             'test': (test_df, test_size)}
# lookup_dict

In [5]:
train_df.tail(10)

Unnamed: 0,context,target,split
63479,i be alone,?,train
63480,be alone ?,,train
63481,had feelings of,i,train
63482,i feelings of affection,had,train
63483,"i had of affection ,",feelings,train
63484,"i had feelings affection , and",of,train
63485,"had feelings of , and they",affection,train
63486,feelings of affection and they were,",",train
63487,"of affection , they were requited",and,train
63488,"affection , and were requited by",they,train


### 2. Vocabulary

In [7]:
class Vocabulary:
    def __init__(self, mask_token="<MASK>",add_unk=True):
        self.token_to_idx = {}
        self.idx_to_token = {}
        
        self.mask_index = self.add_token(mask_token)
#         "UNK" 토큰이 추가되지 않는 경우에는 -1로 설정,
        self.unk_index = -1
        if add_unk:
#         "UNK" 토큰이 추가될 경우에는 UNK에 해당하는 인덱스로 설정,
            self.unk_index = self.add_token('<UNK>') 

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [8]:
# CBOW Vocabulary 객체 생성

cbow_vocab = Vocabulary()
for index, row in df.iterrows():
    for token in row.context.split(' '):
        cbow_vocab.add_token(token)
    cbow_vocab.add_token(row.target)


In [9]:
print(dict(list(cbow_vocab.token_to_idx.items())[:5]))

{'<MASK>': 0, '<UNK>': 1, ',': 2, 'or': 3, 'the': 4}


In [10]:
print(dict(list(cbow_vocab.idx_to_token.items())[:5]))

{0: '<MASK>', 1: '<UNK>', 2: ',', 3: 'or', 4: 'the'}


In [41]:
print(len(cbow_vocab.token_to_idx.items()))

7270


## 3. Vectorizer

In [11]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]
    

In [12]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

### Vectorize

In [13]:
import numpy as np 

def vectorize(context, vector_length=-1):
    
    indices = [lookup_token(cbow_vocab,token) for token in context.split(' ')]
    if vector_length < 0:
        vector_length = len(indices)
    
    
#   인덱스와 mask로 이루어진 out_vector를 만든다.
    out_vector = np.zeros(vector_length, dtype=np.int64)
    out_vector[:len(indices)] = indices
#     문장 길이가 작아서 padding 진행하면 해당 부분 mask 처리
    out_vector[len(indices):] = cbow_vocab.mask_index
    
    
    return out_vector

# vector_length 보다 문장의 토큰의 개수가 작으면 MASK 처리된다. 
print(vectorize("all dafs dfkdl",vector_length=10))

[215   1   1   0   0   0   0   0   0   0]


### Dataset class

In [14]:
# max_seq_length를 구해야 한다. 

import torch
from torch.utils.data import Dataset

class CBOWDataset(Dataset):
    def __init__(self, cbow_df):
        
        self.cbow_df = cbow_df
        
        measure_len = lambda context: len(context.split(" "))
        self.max_seq_length = max(map(measure_len, cbow_df.context))

    def __len__(self):
        return len(self.cbow_df)

    def __getitem__(self, index):
        
        row = self.cbow_df.iloc[index]

        context_vector = vectorize(row.context, self.max_seq_length)
        target_index = lookup_token(cbow_vocab,row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

In [16]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = CBOWDataset(train_df)
train_dataset

valid_dataset = CBOWDataset(val_df)
valid_dataset

test_dataset = CBOWDataset(test_df)
test_dataset


<__main__.CBOWDataset at 0x7fd4d3250bb0>

In [17]:
print(train_dataset.max_seq_length)

6


In [36]:
print(valid_dataset.max_seq_length)

6


In [37]:
print(test_dataset.max_seq_length)

6


In [18]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=False, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=False, drop_last=True)


In [19]:
print(len(train_dataset),len(Traindataloader))

63489 124


In [40]:
for batch_index, batch_dict in reversed(list(enumerate(Traindataloader))):
    print(batch_index)
    print(batch_dict)
    
    break
    

123
{'x_data': tensor([[ 115, 2917,   82,   15,   43,    0],
        [  72, 5566,   33,  176,  239,   48],
        [  33,  230, 1651,  225,  212,  908],
        ...,
        [ 642, 1164,   19,   15,   43,    0],
        [5232,   33, 1326,   19,    4, 5233],
        [ 496, 1359,   49,   33,  688, 1482]]), 'y_target': tensor([1383,    4,    2, 4968, 1166,   33,    4,  426,    8, 1844,  295,  698,
        3138,  676,   49,    4,   39,  273,  115,   19,    4,   33,   44,  209,
         239, 4081, 3854,  766,   39,    4,   19,  282,  377,   72, 1380, 1533,
         377,    2, 3451,  319,  192,  474,   39,  749,  334, 1160,  255,   85,
         337,   33,   60,  682,   86, 5513,   44,   19, 3228,  806, 1752,  204,
          49,  298,   19,   44,    2,   19, 1716,   33, 1651, 1229,   19, 2222,
         760,  515,   53,  230, 4213,  226,   72,   60,   44,  180,    2,    4,
          23,  406,  349, 3228,   15,  319,    4,  226, 5929,   72,  345,  590,
         732,  656,   48, 3412,  870,   39

### 모델정의 NameClassifier

In [21]:
# embedding_size를 지정해줘야 한다.
# 1. Embedding layer를 이용해서 문맥의 단어를 나타내는 인덱스를 각 단어에 대한 벡터로 만든다.
# 2. 전반적인 문맥을 감지하도록 벡터를 결합한다.(sum)
 
import torch.nn as nn
import torch.nn.functional as F

class CBOWClassifier(nn.Module):
    """
    매개변수:
        vocabulary_size (int): 어휘 사전 크기, 임베딩 개수와 예측 벡터 크기를 결정합니다
        embedding_size (int): 임베딩 크기
        padding_idx (int): 기본값 0; 임베딩은 이 인덱스를 사용하지 않습니다
    """
    
    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(CBOWClassifier, self).__init__()
        # Embedding layer를 이용해서 문맥의 단어를 나타내는 인덱스를 각 단어에 대한 벡터로 만듦
        
        self.embedding =  nn.Embedding(num_embeddings=vocabulary_size, 
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_idx)
#         embedding에서 나오면 embedding_size * vocab_size 차원이 된다. 
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)
    
    def forward(self, x_in, apply_softmax=False):
        # 전반적인 문맥을 감지하도록 벡터를 결합함(sum)
        x_embedded_sum = F.dropout(self.embedding(x_in).sum(dim=1), 0.3)
        y_out = self.fc1(x_embedded_sum)
        
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        return y_out

In [22]:
# embedding_size를 지정해줘야 한다. 
embedding_size=50 

classifier = CBOWClassifier(vocabulary_size=len(cbow_vocab.token_to_idx), 
                            embedding_size=embedding_size)
classifier

CBOWClassifier(
  (embedding): Embedding(7270, 50, padding_idx=0)
  (fc1): Linear(in_features=50, out_features=7270, bias=True)
)

### 모델 구조 뜯어보기(번외)

In [74]:

vocabulary_size = len(cbow_vocab.token_to_idx) 
embedding_size = embedding_size  
embedding_size=50
padding_idx=0

embedding =  nn.Embedding(num_embeddings=vocabulary_size, 
                           embedding_dim=embedding_size,
                           padding_idx=padding_idx)

fc1 = nn.Linear(in_features=embedding_size,
                     out_features=vocabulary_size)
    

# 데이터 로더에서 미니배치를 반복합니다.
for batch_index, batch in enumerate(Traindataloader):
    # 각 미니배치에서 샘플을 가져옵니다.
    x_in = batch['x_data']
    y_in = batch['y_target']
    
    
    embedded_sample = embedding(x_in)
    
    # 임베딩 결과를 출력합니다.
    print(f"미니배치 {batch_index}의 임베딩 결과:")
    print(embedded_sample.size())
    print(embedded_sample)
    
    
    sum_result = embedding(x_in).sum(dim=1)
    
    print(" ")
    print(f"미니배치 {batch_index}의 임베딩 sum 결과:")
    print(sum_result.size())
    print(sum_result)
    
    x_embedded_sum = F.dropout(sum_result, 0.3)
    
    print(" ")
    print(f"미니배치 {batch_index}의 임베딩 dropout 결과:")
    print(x_embedded_sum.size())
    print(x_embedded_sum)
    
    y_out = fc1(x_embedded_sum)
    
    print(" ")
    print(f"미니배치 {batch_index}의 fc1 결과:")
    print(y_out.size())
    print(y_out)
    
    y_out = F.softmax(y_out, dim=1)
    
    print(" ")
    print(f"미니배치 {batch_index}의 fc1 결과:")
    print(y_out.size())
    print(y_out)
    
    y_out = F.softmax(y_out, dim=1)
    
    print(" ")
    print(f"미니배치 {batch_index}의 fc1 결과:")
    print(y_out.size())
    print(y_out)
    
#     y_pred = y_out
#     loss =  loss_func(y_pred, y_in)
    
    break

미니배치 0의 임베딩 결과:
torch.Size([512, 6, 50])
tensor([[[-0.7889, -0.7210, -0.7649,  ...,  0.7189,  0.8779,  0.1937],
         [ 0.6209,  1.1932,  0.3220,  ..., -0.4130,  0.3254, -1.3003],
         [-0.9514,  2.3714, -0.8090,  ...,  0.2681, -1.5132,  1.7616],
         [-0.8146,  0.2504,  1.0204,  ..., -0.0242,  0.5439, -0.0686],
         [-1.3744,  0.5519,  0.4760,  ...,  1.2907,  0.5133, -0.5947],
         [ 2.2647, -1.1518,  0.4901,  ..., -0.8892, -0.9164,  0.1901]],

        [[-1.2474, -0.1531, -0.4585,  ..., -0.1053, -1.6733, -1.3185],
         [-0.1541,  1.0257, -1.8475,  ..., -0.4574, -0.8569,  0.6387],
         [ 0.0754,  0.3734, -1.3440,  ...,  0.6772, -1.1051,  1.0829],
         [-0.5872, -1.6810,  0.9565,  ..., -0.4432, -1.7029,  1.0940],
         [ 1.4295,  1.0614,  0.6999,  ...,  1.1473, -0.4687,  1.0768],
         [-0.1541,  1.0257, -1.8475,  ..., -0.4574, -0.8569,  0.6387]],

        [[-0.8634,  0.6928, -0.7762,  ..., -0.2246,  0.7483,  0.4033],
         [-0.6907, -0.8391,  0.6

### 옵티마이저, loss function

In [23]:
lr = 0.001
num_epochs = 100

In [24]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [25]:
loss_func = nn.CrossEntropyLoss()
loss_func

CrossEntropyLoss()

### Train

In [26]:
def compute_accuracy(y_pred, y_target):
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100
   

In [27]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [28]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [29]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):

        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['x_data'])
#       3. loss 계산하기
        loss =  loss_func(y_pred, batch_data['y_target'])
    
#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['x_data'])

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:07<11:36,  7.04s/it]

val_loss 7.613911078526424
val_acc 7.158954326923076


  2%|▊                                          | 2/100 [00:13<11:03,  6.77s/it]

val_loss 7.004817229050856
val_acc 10.223858173076922


  3%|█▎                                         | 3/100 [00:20<11:07,  6.88s/it]

val_loss 6.769168120164138
val_acc 11.70372596153846


  4%|█▋                                         | 4/100 [00:27<11:09,  6.97s/it]

val_loss 6.636771440505982
val_acc 12.620192307692308


  5%|██▏                                        | 5/100 [00:34<10:52,  6.87s/it]

val_loss 6.557459134321947
val_acc 13.45402644230769


  6%|██▌                                        | 6/100 [00:41<10:41,  6.82s/it]

val_loss 6.511043603603657
val_acc 13.536658653846153


  7%|███                                        | 7/100 [00:47<10:34,  6.82s/it]

val_loss 6.484455603819627
val_acc 13.581730769230768


  8%|███▍                                       | 8/100 [00:54<10:19,  6.74s/it]

val_loss 6.448257996485784
val_acc 14.212740384615387


  9%|███▊                                       | 9/100 [01:01<10:15,  6.76s/it]

val_loss 6.435615429511437
val_acc 14.002403846153845


 10%|████▏                                     | 10/100 [01:09<10:38,  7.09s/it]

val_loss 6.432023066740769
val_acc 14.024939903846152


 11%|████▌                                     | 11/100 [01:16<10:29,  7.08s/it]

val_loss 6.439559789804312
val_acc 14.04747596153846


 12%|█████                                     | 12/100 [01:23<10:21,  7.06s/it]

val_loss 6.439962717202995
val_acc 14.325420673076922


 13%|█████▍                                    | 13/100 [01:29<10:05,  6.96s/it]

val_loss 6.446129450431237
val_acc 14.197716346153848


 14%|█████▉                                    | 14/100 [01:36<09:51,  6.88s/it]

val_loss 6.4573691991659325
val_acc 14.460637019230768


 15%|██████▎                                   | 15/100 [01:43<09:40,  6.83s/it]

val_loss 6.452565504954411
val_acc 14.423076923076922


 16%|██████▋                                   | 16/100 [01:50<09:47,  7.00s/it]

val_loss 6.448926980678851
val_acc 14.693509615384615


 17%|███████▏                                  | 17/100 [01:57<09:30,  6.87s/it]

val_loss 6.4620731610518245
val_acc 14.678485576923078


 18%|███████▌                                  | 18/100 [02:04<09:25,  6.90s/it]

val_loss 6.488405814537636
val_acc 14.438100961538463


 19%|███████▉                                  | 19/100 [02:11<09:22,  6.95s/it]

val_loss 6.479242746646589
val_acc 14.595853365384613


 19%|███████▉                                  | 19/100 [02:18<09:52,  7.31s/it]

val_loss 6.48716009580172
val_acc 14.75360576923077





### Test 진행

In [30]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['x_data'])
    loss = loss_func(y_pred,batch_data['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['y_target'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [31]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 7.265
테스트 정확도: 13.17


In [32]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 6.432023066740769,
 'early_stopping_criteria': 10,
 'epoch_index': 20,
 'train_loss': [8.603957253117713,
  7.111192495592179,
  6.5485151544693965,
  6.213718360470187,
  5.98085355758667,
  5.796104023533482,
  5.652096659906449,
  5.525909473819118,
  5.419759035110474,
  5.318656921386717,
  5.229448260799533,
  5.148317594682014,
  5.074486763246597,
  5.004274406740742,
  4.942227005958556,
  4.885120161118047,
  4.825584430848396,
  4.779782606709388,
  4.728214671534876,
  4.682388713282925],
 'train_acc': [3.0257686491935494,
  9.452179939516125,
  11.364352318548386,
  12.614982358870972,
  13.289125504032262,
  13.623046875000002,
  14.134954637096778,
  14.501953125000007,
  14.703566028225806,
  14.990234375000002,
  15.368258568548384,
  15.661227318548383,
  15.854964717741938,
  16.396799395161285,
  16.692918346774196,
  17.16072328629032,
  17.634828629032267,
  17.89944556451614,
  18.329448

In [33]:
def pretty_print(results):
    """
    임베딩 결과를 출력합니다
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    """
    n개의 최근접 단어를 찾습니다.
    """

    # 다른 모든 단어까지 거리를 계산합니다
    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results

In [34]:
word = input('단어를 입력해 주세요: ')
embeddings = classifier.embedding.weight.data
word_to_idx = cbow_vocab.token_to_idx
pretty_print(get_closest(word, word_to_idx, embeddings, n=5))

단어를 입력해 주세요: of
...[6.83] - pained
...[6.89] - extremity
...[6.92] - dreaming
...[7.05] - contradictory
...[7.10] - elect
...[7.27] - expressed


In [35]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = cbow_vocab.token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[6.67] - shuddered
...[6.81] - ingratitude
...[6.85] - mannheim
...[6.92] - thoughts
...[6.96] - sincerity
...[7.01] - dragging
...[6.46] - deserted
...[6.55] - distinguished
...[6.57] - recovery
...[6.58] - almighty
...[6.62] - inclemency
...[6.63] - contrary
...[7.12] - playfellow
...[7.13] - meeting
...[7.19] - breezes
...[7.25] - teachers
...[7.27] - bestowing
...[7.32] - struck
...[7.01] - destroy
...[7.14] - somewhat
...[7.19] - copy
...[7.31] - doth
...[7.32] - bent
...[7.38] - classes
...[6.41] - luxury
...[6.61] - account
...[6.66] - moulded
...[6.66] - clasped
...[6.68] - married
...[6.70] - juras
...[6.46] - keep
...[6.53] - stump
...[6.54] - belonging
...[6.56] - brutality
...[6.74] - intoxicating
...[6.74] - avidity
