# 예제: CNN으로 성씨 분류하기

## 데이터 로드

In [1]:
import pandas as pd

df = pd.read_csv("../../data/surnames_with_splits.csv")
df


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh
...,...,...,...,...
10975,Vietnamese,11,test,Dinh
10976,Vietnamese,11,test,Phung
10977,Vietnamese,11,test,Quang
10978,Vietnamese,11,test,Vu


In [2]:
df['split'].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

### 데이터 split(train/valid/test)

In [3]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [4]:
lookup_dict = {'train': (train_df, train_size), 
              'val': (val_df, val_size), 
              'test': (test_df, test_size)}


## 2. Vocabulary

In [5]:
class Vocabulary:
    def __init__(self, token_to_idx=None):

        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx

        self.idx_to_token = {idx: token 
                              for token, idx in self.token_to_idx.items()}

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [6]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)
        
        

In [7]:
# Vocab 생성
char_vocab = SequenceVocabulary()
nationality_vocab = Vocabulary()

for index, row in df.iterrows():
    for char in row.surname:
        char_vocab.add_token(char)
    nationality_vocab.add_token(row.nationality)

### Char Vocabulary

In [8]:
print(dict(list(char_vocab.token_to_idx.items())[:10]))

{'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3, 'T': 4, 'o': 5, 't': 6, 'a': 7, 'h': 8, 'A': 9}


In [9]:
print(dict(list(char_vocab.idx_to_token.items())[:10]))

{0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'T', 5: 'o', 6: 't', 7: 'a', 8: 'h', 9: 'A'}


## 3. Vectorizer

In [10]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):
    return vocabulary_class.token_to_idx[token]
    

In [11]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

In [12]:
vocab_length = len(char_vocab.token_to_idx)
print("토큰의 수:", vocab_length)

토큰의 수: 88


### 텍스트(surname)에 대한 원 핫 인코딩

In [13]:
import numpy as np 

# vector_length (int): 인덱스 벡터의 길이를 맞추기 위한 매개변수
def vectorize(surname, vector_length=-1):

    indices = [char_vocab.begin_seq_index]
    indices.extend(lookup_token(char_vocab,token) 
                   for token in surname)
    indices.append(char_vocab.end_seq_index)

    if vector_length < 0:
        vector_length = len(indices)
    
    from_vector = np.empty(vector_length, dtype=np.int64)         
    from_indices = indices[:-1]
    from_vector[:len(from_indices)] = from_indices
    from_vector[len(from_indices):] = char_vocab.mask_index

    to_vector = np.empty(vector_length, dtype=np.int64)
    to_indices = indices[1:]
    to_vector[:len(to_indices)] = to_indices
    to_vector[len(to_indices):] = char_vocab.mask_index
        
    return from_vector, to_vector

print("예시")
example = vectorize("Choi", 10)
print(example)
print(len(example[0]))

예시
(array([ 2, 20,  8,  5, 23,  0,  0,  0,  0,  0]), array([20,  8,  5, 23,  3,  0,  0,  0,  0,  0]))
10


### SurnameDataset class

In [14]:
import torch
from torch.utils.data import Dataset

class SurnameDataset(Dataset):
    def __init__(self, surname_df):
        self.surname_df = surname_df
        self.max_seq_length = max(map(len, df.surname)) + 2
        
    def __len__(self):
        return len(self.surname_df)

    def __getitem__(self, index):
        row = self.surname_df.iloc[index]
        
        from_vector, to_vector = vectorize(row.surname, self.max_seq_length)

        return {'x_data': from_vector, 
                'y_target': to_vector}
    

### 데이터셋 class

In [15]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = SurnameDataset(train_df)
train_dataset

valid_dataset = SurnameDataset(val_df)
valid_dataset

test_dataset = SurnameDataset(test_df)
test_dataset


<__main__.SurnameDataset at 0x7fe3e96e4550>

In [16]:
print(train_dataset.max_seq_length)
print(valid_dataset.max_seq_length)
print(test_dataset.max_seq_length)

19
19
19


In [17]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=True, drop_last=True)


In [18]:
print(len(train_dataset),len(Traindataloader))

7680 15


In [19]:
for batch_index, batch_dict in enumerate(Traindataloader):
#     print(batch_index)
    print(batch_dict)
    
    break
    

{'x_data': tensor([[ 2,  4,  5,  ...,  0,  0,  0],
        [ 2,  9, 10,  ...,  0,  0,  0],
        [ 2, 13,  5,  ...,  0,  0,  0],
        ...,
        [ 2, 29, 11,  ...,  0,  0,  0],
        [ 2, 44,  7,  ...,  0,  0,  0],
        [ 2, 26,  7,  ...,  0,  0,  0]]), 'y_target': tensor([[ 4,  5, 25,  ...,  0,  0,  0],
        [ 9, 10,  7,  ...,  0,  0,  0],
        [13,  5, 21,  ...,  0,  0,  0],
        ...,
        [29, 11, 18,  ...,  0,  0,  0],
        [44,  7, 25,  ...,  0,  0,  0],
        [26,  7, 15,  ...,  0,  0,  0]])}


## 모델 정의

In [20]:
# 사전 정의 함수
# 주어진 배치의 각 데이터 포인트에 대해 시퀀스의 마지막 벡터를 추출
# => y_out에 있는 각 데이터 포인트에서 마지막 벡터 추출
def column_gather(y_out, x_lengths):
    
#     x_lengths = x_lengths.long().detach().cpu().numpy() - 1
    x_lengths = x_lengths-1
    out = []
    for batch_index, length in enumerate(x_lengths):
        out.append(y_out[batch_index, length])

    return torch.stack(out)

### column_gather 출력예시

In [21]:
import torch

# 가상의 입력 데이터
y_out = torch.tensor([
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],  # 첫 번째 시퀀스: 길이 3
    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],  # 두 번째 시퀀스: 길이 3 (길이를 맞춤)
])

x_lengths = torch.tensor([3, 3])  # 각 시퀀스의 길이 (동일하게 맞춤)
# column_gather 함수 호출
result = column_gather(y_out, x_lengths)
print(result)

tensor([[ 7,  8,  9],
        [16, 17, 18]])


### 조건이 없는 성씨 생성 모델 

In [22]:
import torch.nn.functional as F 
import torch.nn as nn

class SurnameGenerationModel(nn.Module):
    def __init__(self, char_embedding_size, char_vocab_size, rnn_hidden_size, 
                 batch_first=True, padding_idx=0, dropout_p=0.5):
        
        super(SurnameGenerationModel, self).__init__()
        
        self.emb = nn.Embedding(num_embeddings = char_vocab_size,
                               embedding_dim = char_embedding_size,
                               padding_idx = padding_idx)
        
        self.rnn = nn.GRU(input_size=char_embedding_size, 
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)
        
        self.fc = nn.Linear(in_features=rnn_hidden_size, 
                            out_features=char_vocab_size)
        
        self.dropout_p = dropout_p
        
    def forward(self, x_in, apply_softmax=False):
        x_embedded = self.emb(x_in)

        y_out, _ = self.rnn(x_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p=self.dropout_p))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out


In [23]:
len(char_vocab.token_to_idx)

88

In [24]:
len(nationality_vocab.token_to_idx)

18

In [25]:
char_embedding_size = 32
rnn_hidden_size = 32

model = SurnameGenerationModel(char_embedding_size=char_embedding_size,
                               char_vocab_size=len(char_vocab.token_to_idx),
                               rnn_hidden_size=rnn_hidden_size,
                               padding_idx=char_vocab.mask_index)
model

SurnameGenerationModel(
  (emb): Embedding(88, 32, padding_idx=0)
  (rnn): GRU(32, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=88, bias=True)
)

### 모델 뜯어보기

In [26]:

char_vocab_size = len(char_vocab.token_to_idx) 
char_embedding_size = 32
rnn_hidden_size = 64
padding_idx=0
batch_first=True


emb = nn.Embedding(num_embeddings = char_vocab_size,
                           embedding_dim = char_embedding_size,
                           padding_idx = padding_idx)
        
rnn = nn.GRU(input_size=char_embedding_size, 
              hidden_size=rnn_hidden_size,
              batch_first=batch_first)

fc = nn.Linear(in_features=rnn_hidden_size, 
                out_features=char_vocab_size)
    

# 데이터 로더에서 미니배치를 반복합니다.
for batch_index, batch in enumerate(Traindataloader):
    # 각 미니배치에서 샘플을 가져옵니다.
    x_in = batch['x_data']
    y_in = batch['y_target']
    
    
    embedded_sample = emb(x_in)
    
    # 임베딩 결과를 출력합니다.
    print(f"미니배치 {batch_index}의 임베딩 결과:")
    print(embedded_sample.size())
    print(embedded_sample)
    
    y_out, _ = rnn(embedded_sample)
    
    
    print(" ")
    print(f"미니배치 {batch_index}의 rnn 결과:")
    print(y_out.size())
    print(y_out)
    
    batch_size, seq_size, feat_size = y_out.shape
    y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)
    
    print(" ")
    print(f"미니배치 {batch_index}의 y_out:")
    print(y_out.size())
    print(y_out)
    
    y_out = fc(F.dropout(y_out, p=0.5))
    
    print(" ")
    print(f"미니배치 {batch_index}의 dropout 결과:")
    print(y_out.size())
    print(y_out)
    
    new_feat_size = y_out.shape[-1]
    print(" ")
    print(f"미니배치 {batch_index}의 y_out.shape[-1] 결과:")
    print(new_feat_size)  # 88

    
    y_out = y_out.view(batch_size, seq_size, new_feat_size)
    print(" ")
    print(f"미니배치 {batch_index}의 view 결과:")
    print(y_out.size())
    print(y_out)
        
        
#     y_pred = y_out
#     loss =  loss_func(y_pred, y_in)
    
    break

미니배치 0의 임베딩 결과:
torch.Size([512, 19, 32])
tensor([[[-1.6768, -0.1081, -0.3991,  ..., -0.8560,  0.2349, -1.7395],
         [-0.7472, -0.4895, -0.9260,  ..., -1.1073,  0.1093, -0.8658],
         [ 0.9645,  0.2061,  0.6376,  ...,  0.1630, -1.1543, -0.5506],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.6768, -0.1081, -0.3991,  ..., -0.8560,  0.2349, -1.7395],
         [ 0.3681,  2.4369,  0.9258,  ...,  1.3521,  0.6009, -0.8445],
         [ 0.9645,  0.2061,  0.6376,  ...,  0.1630, -1.1543, -0.5506],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.6768, -0.1081, -0.3991,  ..., -0.8560,  0.2349, -1.7395],
   

### 옵티마이저, loss function

In [27]:
lr = 0.001
num_epochs = 100

In [28]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [29]:
def normalize_sizes(y_pred, y_true):
    """
        y_pred (torch.Tensor): 모델의 출력
            3차원 텐서이면 행렬로 변환합니다.
        y_true (torch.Tensor): 타깃 예측
            행렬이면 벡터로 변환합니다.
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

In [30]:
# loss function

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

## normalize_sizes 예시

In [42]:
import torch
import torch.nn.functional as F

# normalize_sizes 함수 정의
def normalize_sizes(y_pred, y_true):
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

# 예측값과 실제값 생성 (예시)
y_pred = torch.tensor([[[0.1, 0.9], [0.8, 0.2]], [[0.3, 0.7], [0.6, 0.4]]])
y_true = torch.tensor([[1, 0], [0, 1]])

# 손실 함수 계산을 위해 normalize_sizes 함수 적용
y_pred, y_true = normalize_sizes(y_pred, y_true)
print("y_pred",y_pred)
print("y_true",y_true)

# # 손실 함수 계산
# loss = F.cross_entropy(y_pred, y_true)

# print("손실 함수 값:", loss.item())


y_pred tensor([[0.1000, 0.9000],
        [0.8000, 0.2000],
        [0.3000, 0.7000],
        [0.6000, 0.4000]])
y_true tensor([1, 0, 0, 1])


### Train

In [31]:
def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float() #마스킹 안된 것만 유효 index로 정의
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

In [32]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [33]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [34]:
import tqdm

mask_index = char_vocab.mask_index

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 model.eval()을 사용
    model.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):
        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = model(x_in=batch_data['x_data'])
#       3. loss 계산하기
        loss =  sequence_loss(y_pred, batch_dict['y_target'], mask_index)

    
#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        # 이동 손실과 이동 정확도를 계산
        running_loss += (loss.item() - running_loss) / (batch_idx + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx + 1)


    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    model.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = model(x_in=batch_data['x_data'])

#       2. loss 계산
        loss_t = sequence_loss(y_pred, batch_dict['y_target'], mask_index)
        running_loss += (loss_t.item() - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['y_target'],mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=model,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:01<01:46,  1.07s/it]

val_loss 4.338480790456136
val_acc 3.571173238759035


  2%|▊                                          | 2/100 [00:02<01:44,  1.06s/it]

val_loss 4.03706423441569
val_acc 6.031034312102194


  3%|█▎                                         | 3/100 [00:03<01:38,  1.02s/it]

val_loss 3.5977984269460044
val_acc 6.727069450698828


  4%|█▋                                         | 4/100 [00:04<01:37,  1.02s/it]

val_loss 3.34366774559021
val_acc 11.192793862357634


  5%|██▏                                        | 5/100 [00:05<01:44,  1.10s/it]

val_loss 3.2401487827301025
val_acc 13.753772314946723


  6%|██▌                                        | 6/100 [00:06<01:40,  1.07s/it]

val_loss 3.17496657371521
val_acc 15.113712079552988


  7%|███                                        | 7/100 [00:07<01:39,  1.07s/it]

val_loss 3.116319179534912
val_acc 16.14586598483967


  8%|███▍                                       | 8/100 [00:08<01:37,  1.06s/it]

val_loss 3.0664554437001548
val_acc 17.00798293737608


  9%|███▊                                       | 9/100 [00:09<01:42,  1.12s/it]

val_loss 3.027183771133423
val_acc 17.5762851541794


 10%|████▏                                     | 10/100 [00:10<01:38,  1.09s/it]

val_loss 2.9928225676218667
val_acc 18.336815574030105


 11%|████▌                                     | 11/100 [00:11<01:36,  1.08s/it]

val_loss 2.9631017049153647
val_acc 18.417614312445465


 12%|█████                                     | 12/100 [00:12<01:35,  1.08s/it]

val_loss 2.9487913449605307
val_acc 18.572449588932653


 13%|█████▍                                    | 13/100 [00:14<01:35,  1.10s/it]

val_loss 2.929863691329956
val_acc 18.58534787877949


 14%|█████▉                                    | 14/100 [00:15<01:32,  1.07s/it]

val_loss 2.9179724057515464
val_acc 19.08209421005182


 15%|██████▎                                   | 15/100 [00:16<01:29,  1.05s/it]

val_loss 2.902268568674723
val_acc 19.358338036378132


 16%|██████▋                                   | 16/100 [00:17<01:28,  1.06s/it]

val_loss 2.8942235310872397
val_acc 19.110822358891692


 17%|███████▏                                  | 17/100 [00:18<01:26,  1.04s/it]

val_loss 2.8841446240743003
val_acc 19.123605735181307


 18%|███████▌                                  | 18/100 [00:19<01:27,  1.07s/it]

val_loss 2.883004347483317
val_acc 19.190782645675124


 19%|███████▉                                  | 19/100 [00:20<01:30,  1.11s/it]

val_loss 2.8709401289621987
val_acc 19.409990873955028


 20%|████████▍                                 | 20/100 [00:21<01:27,  1.09s/it]

val_loss 2.8671154181162515
val_acc 19.52571277191273


 21%|████████▊                                 | 21/100 [00:22<01:28,  1.11s/it]

val_loss 2.8499996662139893
val_acc 19.463957820660113


 22%|█████████▏                                | 22/100 [00:23<01:24,  1.09s/it]

val_loss 2.8508729139963784
val_acc 19.332606371946675


 23%|█████████▋                                | 23/100 [00:24<01:25,  1.11s/it]

val_loss 2.8507049878438315
val_acc 19.398504026242794


 24%|██████████                                | 24/100 [00:25<01:21,  1.07s/it]

val_loss 2.847710688908895
val_acc 19.71851742392217


 25%|██████████▌                               | 25/100 [00:27<01:27,  1.17s/it]

val_loss 2.8415218194325766
val_acc 19.60734698328811


 26%|██████████▉                               | 26/100 [00:28<01:22,  1.12s/it]

val_loss 2.840250571568807
val_acc 19.660589030492744


 27%|███████████▎                              | 27/100 [00:29<01:21,  1.12s/it]

val_loss 2.836102326711019
val_acc 19.609267167167992


 28%|███████████▊                              | 28/100 [00:30<01:20,  1.11s/it]

val_loss 2.8380130926767984
val_acc 19.46621071192484


 29%|████████████▏                             | 29/100 [00:31<01:15,  1.06s/it]

val_loss 2.834988594055176
val_acc 19.863904901665837


 30%|████████████▌                             | 30/100 [00:32<01:12,  1.03s/it]

val_loss 2.8341036637624106
val_acc 19.63281454054676


 31%|█████████████                             | 31/100 [00:33<01:10,  1.02s/it]

val_loss 2.8288962046305337
val_acc 19.434850527000723


 32%|█████████████▍                            | 32/100 [00:34<01:09,  1.02s/it]

val_loss 2.8313562870025635
val_acc 19.61444231273554


 33%|█████████████▊                            | 33/100 [00:35<01:09,  1.03s/it]

val_loss 2.8309008280436196
val_acc 19.790272187458292


 34%|██████████████▎                           | 34/100 [00:36<01:10,  1.07s/it]

val_loss 2.8258833090464273
val_acc 19.71260708975842


 35%|██████████████▋                           | 35/100 [00:37<01:08,  1.06s/it]

val_loss 2.8300113677978516
val_acc 19.915663237624766


 36%|███████████████                           | 36/100 [00:38<01:08,  1.08s/it]

val_loss 2.8294379711151123
val_acc 19.97507863688726


 37%|███████████████▌                          | 37/100 [00:39<01:05,  1.04s/it]

val_loss 2.8261839548746743
val_acc 19.683996904020407


 38%|███████████████▉                          | 38/100 [00:40<01:03,  1.02s/it]

val_loss 2.822040637334188
val_acc 19.768650793842685


 39%|████████████████▍                         | 39/100 [00:41<01:02,  1.02s/it]

val_loss 2.821429173151652
val_acc 19.661931062596945


 40%|████████████████▊                         | 40/100 [00:42<01:00,  1.01s/it]

val_loss 2.8205695947011313
val_acc 19.874470435224556


 41%|█████████████████▏                        | 41/100 [00:43<01:01,  1.05s/it]

val_loss 2.8208481470743814
val_acc 19.78664915404898


 42%|█████████████████▋                        | 42/100 [00:44<01:00,  1.05s/it]

val_loss 2.8189258575439453
val_acc 19.68589204665957


 43%|██████████████████                        | 43/100 [00:45<01:00,  1.06s/it]

val_loss 2.8229783376057944
val_acc 19.89131576260042


 44%|██████████████████▍                       | 44/100 [00:47<01:01,  1.09s/it]

val_loss 2.8183223406473794
val_acc 19.774457397321605


 45%|██████████████████▉                       | 45/100 [00:48<00:58,  1.07s/it]

val_loss 2.8187952836354575
val_acc 20.064061898670317


 46%|███████████████████▎                      | 46/100 [00:49<01:01,  1.14s/it]

val_loss 2.815053860346476
val_acc 19.98252765369282


 47%|███████████████████▋                      | 47/100 [00:50<01:01,  1.16s/it]

val_loss 2.815638542175293
val_acc 19.99134385072527


 48%|████████████████████▏                     | 48/100 [00:51<00:57,  1.11s/it]

val_loss 2.8174219131469727
val_acc 19.863510030244573


 49%|████████████████████▌                     | 49/100 [00:52<00:54,  1.07s/it]

val_loss 2.8171549638112388
val_acc 20.16505461374033


 50%|█████████████████████                     | 50/100 [00:53<00:52,  1.05s/it]

val_loss 2.811586062113444
val_acc 20.01267133200839


 51%|█████████████████████▍                    | 51/100 [00:54<00:50,  1.04s/it]

val_loss 2.815774838129679
val_acc 19.811902703422096


 52%|█████████████████████▊                    | 52/100 [00:55<00:50,  1.04s/it]

val_loss 2.8127740224202475
val_acc 20.200689927426854


 53%|██████████████████████▎                   | 53/100 [00:57<00:54,  1.15s/it]

val_loss 2.8114918073018393
val_acc 20.03608629247736


 54%|██████████████████████▋                   | 54/100 [00:58<00:53,  1.16s/it]

val_loss 2.8117194175720215
val_acc 20.037731557478182


 55%|███████████████████████                   | 55/100 [00:59<00:57,  1.29s/it]

val_loss 2.81473700205485
val_acc 19.844138621318574


 56%|███████████████████████▌                  | 56/100 [01:00<00:53,  1.22s/it]

val_loss 2.8089207808176675
val_acc 19.787784076929924


 57%|███████████████████████▉                  | 57/100 [01:02<00:50,  1.18s/it]

val_loss 2.8120240370432534
val_acc 19.705099045670107


 58%|████████████████████████▎                 | 58/100 [01:03<00:48,  1.15s/it]

val_loss 2.8101742267608643
val_acc 20.083776248166426


 59%|████████████████████████▊                 | 59/100 [01:04<00:46,  1.13s/it]

val_loss 2.8147141138712564
val_acc 19.766063863976058


 60%|█████████████████████████▏                | 60/100 [01:05<00:43,  1.10s/it]

val_loss 2.8055564562479653
val_acc 20.047123730730117


 61%|█████████████████████████▌                | 61/100 [01:06<00:41,  1.07s/it]

val_loss 2.8102962176005044
val_acc 19.87765890768293


 62%|██████████████████████████                | 62/100 [01:07<00:39,  1.04s/it]

val_loss 2.811798175175985
val_acc 20.117200487714157


 63%|██████████████████████████▍               | 63/100 [01:08<00:37,  1.01s/it]

val_loss 2.808818737665812
val_acc 20.09970075867353


 64%|██████████████████████████▉               | 64/100 [01:09<00:37,  1.04s/it]

val_loss 2.811549107233683
val_acc 20.120360106392315


 65%|███████████████████████████▎              | 65/100 [01:10<00:37,  1.07s/it]

val_loss 2.810814460118612
val_acc 19.813665836870637


 66%|███████████████████████████▋              | 66/100 [01:11<00:39,  1.17s/it]

val_loss 2.810210386912028
val_acc 19.856039839495082


 67%|████████████████████████████▏             | 67/100 [01:12<00:37,  1.12s/it]

val_loss 2.8090453147888184
val_acc 20.231247010381864


 68%|████████████████████████████▌             | 68/100 [01:13<00:35,  1.10s/it]

val_loss 2.808390220006307
val_acc 19.954277413450164


 69%|████████████████████████████▉             | 69/100 [01:15<00:35,  1.15s/it]

val_loss 2.8096299966176352
val_acc 19.90861066379202


 69%|████████████████████████████▉             | 69/100 [01:16<00:34,  1.10s/it]

val_loss 2.8064301013946533
val_acc 20.032327534671822





### Test 진행

In [35]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

model.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
model.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = model(x_in=batch_data['x_data'])
    loss = sequence_loss(y_pred,batch_data['y_target'],mask_index)
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['y_target'],mask_index)
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [36]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 2.836
테스트 정확도: 20.25


### 추론

In [37]:
def decode_samples(sampled_indices):
    """인덱스를 성씨 문자열로 바꿈
    """
    decoded_surnames = []
    vocab = char_vocab
    
    for sample_index in range(sampled_indices.shape[0]):
        surname = ""
        for time_step in range(sampled_indices.shape[1]):
            sample_item = sampled_indices[sample_index, time_step].item()
            if sample_item == vocab.begin_seq_index:
                continue
            elif sample_item == vocab.end_seq_index:
                break
            else:
                surname += lookup_index(vocab,sample_item)
        decoded_surnames.append(surname)
    return decoded_surnames


In [38]:
def sample_from_model(model, num_samples=1, sample_size=20, 
                      temperature=1.0):
    """모델이 만든 인덱스 시퀀스를 샘플링
    
    매개변수:
        model (SurnameGenerationModel): 훈련 모델
        num_samples (int): 샘플 개수
        sample_size (int): 샘플의 최대 길이
        temperature (float): 무작위성 정도
            0.0 < temperature < 1.0 이면 최대 값을 선택할 가능성이 높습니다
            temperature > 1.0 이면 균등 분포에 가깝습니다
    반환값:
        indices (torch.Tensor): 인덱스 행렬
        shape = (num_samples, sample_size)
    """
    begin_seq_index = [char_vocab.begin_seq_index 
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index, 
                                   dtype=torch.int64).unsqueeze(dim=1)
    indices = [begin_seq_index]
    h_t = None
    
    for time_step in range(sample_size):
        x_t = indices[time_step]
        x_emb_t = model.emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices

In [39]:
# 생성할 이름 개수
num_names = 10
model = model.cpu()
# 이름 생성
sampled_surnames = decode_samples(
    sample_from_model(model, num_samples=num_names))
# 결과 출력
print ("-"*15)
for i in range(num_names):
    print (sampled_surnames[i])

---------------
Hhurnv
Wookico
Fasttilo
ñslseswa
Kaavi
Kooosuhta
Taouluannvo
Yhontade
Dozas
Ssle
