# 예제: CBOW 임베딩 학습하기

In [1]:
import pandas as pd

df = pd.read_csv("../../data/frankenstein_with_splits.csv")
df

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train
...,...,...,...
90693,newsletter to hear new ebooks .,about,test
90694,to hear about ebooks .,new,test
90695,hear about new .,ebooks,test
90696,about new ebooks,.,test


In [2]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [38]:
print("train : ",train_size)
print("valid : ",val_size)
print("test : ",test_size)

train :  63489
valid :  13605
test :  13604


In [3]:
lookup_dict = {'train': (train_df, train_size),
                             'val': (val_df, val_size),
                             'test': (test_df, test_size)}
# lookup_dict

In [22]:
train_df.tail(10)

Unnamed: 0,context,target,split
63479,i be alone,?,train
63480,be alone ?,,train
63481,had feelings of,i,train
63482,i feelings of affection,had,train
63483,"i had of affection ,",feelings,train
63484,"i had feelings affection , and",of,train
63485,"had feelings of , and they",affection,train
63486,feelings of affection and they were,",",train
63487,"of affection , they were requited",and,train
63488,"affection , and were requited by",they,train


### 2. Vocabulary

In [5]:
# from collections import Counter
# import string

# # Counter()를 통해 어떤 단어가 얼만큼의 횟수로 들어있는지를 알 수 있다.
# word_counts = Counter()
# for name_text in df.context:
#     for word in name_text.split(" "):
#         # word가 .(구두점,punctuation)이 아닐 경우 word에 추가
#         if word not in string.punctuation:
#             word_counts[word] += 1

# # word_counts

In [6]:

class Vocabulary:
    def __init__(self, mask_token="<MASK>",add_unk=True):
        self.token_to_idx = {}
        self.idx_to_token = {}
        
        self.mask_index = self.add_token(mask_token)
#         "UNK" 토큰이 추가되지 않는 경우에는 -1로 설정,
        self.unk_index = -1
        if add_unk:
#         "UNK" 토큰이 추가될 경우에는 UNK에 해당하는 인덱스로 설정,
            self.unk_index = self.add_token('<UNK>') 

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [7]:
# CBOW Vocabulary 객체 생성

cbow_vocab = Vocabulary()
for index, row in df.iterrows():
    for token in row.context.split(' '):
        cbow_vocab.add_token(token)
    cbow_vocab.add_token(row.target)


In [8]:
print(dict(list(cbow_vocab.token_to_idx.items())[:5]))

{'<MASK>': 0, '<UNK>': 1, ',': 2, 'or': 3, 'the': 4}


In [9]:
print(dict(list(cbow_vocab.idx_to_token.items())[:5]))

{0: '<MASK>', 1: '<UNK>', 2: ',', 3: 'or', 4: 'the'}


## 3. Vectorizer

In [10]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]
    

In [11]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

### Vectorize

In [12]:
import numpy as np 

def vectorize(context, vector_length=-1):
    
    indices = [lookup_token(cbow_vocab,token) for token in context.split(' ')]
    if vector_length < 0:
        vector_length = len(indices)
    
    
#   인덱스와 mask로 이루어진 out_vector를 만든다.
    out_vector = np.zeros(vector_length, dtype=np.int64)
    out_vector[:len(indices)] = indices
#     문장 길이가 작아서 padding 진행하면 해당 부분 mask 처리
    out_vector[len(indices):] = cbow_vocab.mask_index
    
    
    return out_vector

# vector_length 보다 문장의 토큰의 개수가 작으면 MASK 처리된다. 
print(vectorize("all dafs dfkdl",vector_length=10))

[215   1   1   0   0   0   0   0   0   0]


### Dataset class

In [13]:
# max_seq_length를 구해야 한다. 

import torch
from torch.utils.data import Dataset

class CBOWDataset(Dataset):
    def __init__(self, cbow_df):
        
        self.cbow_df = cbow_df
        
        measure_len = lambda context: len(context.split(" "))
        self.max_seq_length = max(map(measure_len, cbow_df.context))

    def __len__(self):
        return len(self.cbow_df)

    def __getitem__(self, index):
        
        row = self.cbow_df.iloc[index]

        context_vector = vectorize(row.context, self.max_seq_length)
        target_index = lookup_token(cbow_vocab,row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

In [21]:
print(train_dataset.max_seq_length)

6


In [14]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = CBOWDataset(train_df)
train_dataset

valid_dataset = CBOWDataset(val_df)
valid_dataset

test_dataset = CBOWDataset(test_df)
test_dataset


<__main__.CBOWDataset at 0x7fd83901fd30>

In [15]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=False, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=False, drop_last=True)


In [16]:
print(len(train_dataset),len(Traindataloader))

63489 124


In [17]:
for batch_index, batch_dict in enumerate(Traindataloader):
    print(batch_index)
    print(batch_dict)
    
    break
    

0
{'x_data': tensor([[ 455,  173,   26, 1637,  319, 1089],
        [  48,  319,  632,  648, 5482,   27],
        [  48, 1275,   23,   19,  794,   82],
        ...,
        [ 760,  196,  116,  983,  568,   48],
        [2449, 4453,    2,   48,  783,   28],
        [  72,  154,  203,  207,    8,    4]]), 'y_target': tensor([ 358,   72,  173, 1139,   19,   19, 5194, 3724, 4969,  226, 1383,   96,
           8,    2,   44, 6068, 6065,  760,  305,  225,    2,    2,   63, 2658,
         819,   27,  766,    4,  658,   44,  255, 3026,  534,  569, 5413,    4,
           2,   44, 2725,  255,    2, 2339,   15,   33, 2174, 2954,   50,   19,
          39,  642,   44, 1774, 5275,    2,  925,   50,   50,    3,   49,  715,
         535, 1206,  689,   85,   48,   33,   44,   15, 2528,   68,   10,   15,
           2, 1874,  280,  474,   50,   49,   48,   48, 1177,  724, 2252,   34,
          48,  194, 5264,  715,  104,    4, 4001,   33,  218, 4572,   48,   19,
        2222,   72,  535,  939, 3170, 1706, 

### 모델정의 NameClassifier

In [23]:
# embedding_size를 지정해줘야 한다.
# 1. Embedding layer를 이용해서 문맥의 단어를 나타내는 인덱스를 각 단어에 대한 벡터로 만든다.
# 2. 전반적인 문맥을 감지하도록 벡터를 결합한다.(sum)
 
import torch.nn as nn
import torch.nn.functional as F

class CBOWClassifier(nn.Module):
    """
    매개변수:
        vocabulary_size (int): 어휘 사전 크기, 임베딩 개수와 예측 벡터 크기를 결정합니다
        embedding_size (int): 임베딩 크기
        padding_idx (int): 기본값 0; 임베딩은 이 인덱스를 사용하지 않습니다
    """
    
    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(CBOWClassifier, self).__init__()
        # Embedding layer를 이용해서 문맥의 단어를 나타내는 인덱스를 각 단어에 대한 벡터로 만듦
        
        self.embedding =  nn.Embedding(num_embeddings=vocabulary_size, 
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_idx)
#         embedding에서 나오면 embedding_size * vocab_size 차원이 된다. 
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)
    
    def forward(self, x_in, apply_softmax=False):
        # 전반적인 문맥을 감지하도록 벡터를 결합함(sum)
        x_embedded_sum = F.dropout(self.embedding(x_in).sum(dim=1), 0.3)
        y_out = self.fc1(x_embedded_sum)
        
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        return y_out

In [24]:
# embedding_size를 지정해줘야 한다. 
embedding_size=50 

classifier = CBOWClassifier(vocabulary_size=len(cbow_vocab.token_to_idx), 
                            embedding_size=embedding_size)
classifier

CBOWClassifier(
  (embedding): Embedding(7270, 50, padding_idx=0)
  (fc1): Linear(in_features=50, out_features=7270, bias=True)
)

### 옵티마이저, loss function

In [25]:
lr = 0.001
num_epochs = 100

In [26]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [27]:
loss_func = nn.CrossEntropyLoss()
loss_func

CrossEntropyLoss()

### Train

In [28]:
def compute_accuracy(y_pred, y_target):
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100
   

In [29]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [30]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [31]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):

        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['x_data'])
#       3. loss 계산하기
        loss =  loss_func(y_pred, batch_data['y_target'])
    
#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['x_data'])

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:07<13:02,  7.91s/it]

val_loss 7.651246015842145
val_acc 6.865985576923077


  2%|▊                                          | 2/100 [00:14<11:39,  7.14s/it]

val_loss 7.0485306703127355
val_acc 9.983473557692308


  3%|█▎                                         | 3/100 [00:21<11:09,  6.90s/it]

val_loss 6.78089948800894
val_acc 11.20042067307692


  4%|█▋                                         | 4/100 [00:27<10:53,  6.80s/it]

val_loss 6.65202104128324
val_acc 12.454927884615383


  5%|██▏                                        | 5/100 [00:34<10:56,  6.91s/it]

val_loss 6.576415575467623
val_acc 13.025841346153845


  6%|██▌                                        | 6/100 [00:41<10:43,  6.85s/it]

val_loss 6.516040600263156
val_acc 13.341346153846153


  7%|███                                        | 7/100 [00:48<10:33,  6.81s/it]

val_loss 6.4738259865687455
val_acc 13.551682692307692


  8%|███▍                                       | 8/100 [00:54<10:19,  6.73s/it]

val_loss 6.448409869120671
val_acc 13.694411057692307


  9%|███▊                                       | 9/100 [01:01<10:09,  6.70s/it]

val_loss 6.4282675706423245
val_acc 13.987379807692307


 10%|████▏                                     | 10/100 [01:08<10:00,  6.67s/it]

val_loss 6.419538589624257
val_acc 14.115084134615383


 11%|████▌                                     | 11/100 [01:14<09:51,  6.64s/it]

val_loss 6.429953593474168
val_acc 13.949819711538463


 12%|█████                                     | 12/100 [01:21<09:47,  6.68s/it]

val_loss 6.435051624591534
val_acc 14.43810096153846


 13%|█████▍                                    | 13/100 [01:28<09:39,  6.66s/it]

val_loss 6.436679418270405
val_acc 14.25030048076923


 14%|█████▉                                    | 14/100 [01:34<09:38,  6.72s/it]

val_loss 6.42897947017963
val_acc 14.51322115384615


 15%|██████▎                                   | 15/100 [01:41<09:30,  6.71s/it]

val_loss 6.430670866599449
val_acc 14.415564903846153


 16%|██████▋                                   | 16/100 [01:48<09:19,  6.66s/it]

val_loss 6.454676756492028
val_acc 14.197716346153847


 17%|███████▏                                  | 17/100 [01:54<09:14,  6.68s/it]

val_loss 6.468279893581684
val_acc 14.235276442307692


 18%|███████▌                                  | 18/100 [02:01<09:11,  6.73s/it]

val_loss 6.464673794232881
val_acc 14.505709134615385


 19%|███████▉                                  | 19/100 [02:08<09:04,  6.73s/it]

val_loss 6.4765980427081775
val_acc 14.520733173076922


 19%|███████▉                                  | 19/100 [02:15<09:35,  7.11s/it]

val_loss 6.495692601570716
val_acc 14.077524038461537





### Test 진행

In [32]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['x_data'])
    loss = loss_func(y_pred,batch_data['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['y_target'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [33]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 7.267
테스트 정확도: 12.97


In [34]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 6.419538589624257,
 'early_stopping_criteria': 10,
 'epoch_index': 20,
 'train_loss': [8.67147666023624,
  7.120615939940177,
  6.559917034641385,
  6.242215045036807,
  6.003153427954644,
  5.81934589724387,
  5.665385280886002,
  5.540672867528853,
  5.432552322264639,
  5.332696237871722,
  5.24340657264956,
  5.161360375342832,
  5.083277917677356,
  5.013531242647476,
  4.955493523228553,
  4.896792207994769,
  4.838583561681934,
  4.792905761349587,
  4.7379609231025945,
  4.693015256235678],
 'train_acc': [3.0572706653225814,
  9.088331653225808,
  11.143838205645162,
  12.375567036290326,
  13.20407006048387,
  13.730153729838714,
  14.201108870967744,
  14.544480846774198,
  14.764994959677422,
  15.141444052419361,
  15.39188508064516,
  15.615549395161285,
  15.940020161290317,
  16.37317288306451,
  16.70236895161289,
  16.9465095766129,
  17.412739415322573,
  17.61750252016129,
  18.1388608870967

In [35]:
def pretty_print(results):
    """
    임베딩 결과를 출력합니다
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    """
    n개의 최근접 단어를 찾습니다.
    """

    # 다른 모든 단어까지 거리를 계산합니다
    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results

In [36]:
word = input('단어를 입력해 주세요: ')
embeddings = classifier.embedding.weight.data
word_to_idx = cbow_vocab.token_to_idx
pretty_print(get_closest(word, word_to_idx, embeddings, n=5))

단어를 입력해 주세요: good
...[6.79] - ascii
...[6.80] - bestowing
...[6.84] - court
...[6.86] - furs
...[6.90] - delicate
...[6.91] - parts


In [37]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = cbow_vocab.token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[7.17] - computers
...[7.32] - submit
...[7.50] - youngest
...[7.55] - played
...[7.56] - status
...[7.60] - tranquillize
...[7.23] - luxuriances
...[7.36] - golden
...[7.43] - frightened
...[7.45] - machines
...[7.47] - passages
...[7.47] - damp
...[6.68] - considered
...[6.78] - wreaked
...[6.83] - image
...[6.88] - changed
...[6.93] - gigantic
...[6.93] - inconceivable
...[7.12] - cursed
...[7.19] - bernard
...[7.20] - dash
...[7.21] - blow
...[7.25] - construct
...[7.28] - angelic
...[7.14] - ramble
...[7.34] - deprived
...[7.42] - the
...[7.42] - diabolically
...[7.48] - daring
...[7.54] - modulated
...[6.75] - air
...[6.83] - trials
...[6.87] - precisely
...[6.90] - deprecate
...[6.92] - might
...[6.95] - animated
