# 예제: 국적도 고려한 RNN으로 성씨 생성하기

## 데이터 로드

In [2]:
import pandas as pd

df = pd.read_csv("../../data/surnames_with_splits.csv")
df


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh
...,...,...,...,...
10975,Vietnamese,11,test,Dinh
10976,Vietnamese,11,test,Phung
10977,Vietnamese,11,test,Quang
10978,Vietnamese,11,test,Vu


In [3]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [4]:
df['split'].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

### 데이터 split(train/valid/test)

In [5]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [6]:
lookup_dict = {'train': (train_df, train_size), 
              'val': (val_df, val_size), 
              'test': (test_df, test_size)}


## 2. Vocabulary

In [7]:
class Vocabulary:
    def __init__(self, token_to_idx=None):

        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx

        self.idx_to_token = {idx: token 
                              for token, idx in self.token_to_idx.items()}

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [8]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)
        
        

In [9]:
# Vocab 생성
char_vocab = SequenceVocabulary()
nationality_vocab = Vocabulary()

for index, row in df.iterrows():
    for char in row.surname:
        char_vocab.add_token(char)
    nationality_vocab.add_token(row.nationality)

### Char Vocabulary

In [10]:
print(dict(list(char_vocab.token_to_idx.items())[:10]))

{'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3, 'T': 4, 'o': 5, 't': 6, 'a': 7, 'h': 8, 'A': 9}


In [11]:
print(dict(list(char_vocab.idx_to_token.items())[:10]))

{0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'T', 5: 'o', 6: 't', 7: 'a', 8: 'h', 9: 'A'}


### 국적 Vocabulary

In [12]:
print(dict(list(nationality_vocab.token_to_idx.items())[:10]))

{'Arabic': 0, 'Chinese': 1, 'Czech': 2, 'Dutch': 3, 'English': 4, 'French': 5, 'German': 6, 'Greek': 7, 'Irish': 8, 'Italian': 9}


In [13]:
print(dict(list(nationality_vocab.idx_to_token.items())[:10]))

{0: 'Arabic', 1: 'Chinese', 2: 'Czech', 3: 'Dutch', 4: 'English', 5: 'French', 6: 'German', 7: 'Greek', 8: 'Irish', 9: 'Italian'}


## 3. Vectorizer

In [14]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):
    return vocabulary_class.token_to_idx[token]
    

In [15]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

In [16]:
vocab_length = len(nationality_vocab.token_to_idx)
print("토큰의 수:", vocab_length)

토큰의 수: 18


In [17]:
vocab_length = len(char_vocab.token_to_idx)
print("토큰의 수:", vocab_length)

토큰의 수: 88


### 텍스트(surname)에 대한 원 핫 인코딩

In [18]:
import numpy as np 

# vector_length (int): 인덱스 벡터의 길이를 맞추기 위한 매개변수
def vectorize(surname, vector_length=-1):

    indices = [char_vocab.begin_seq_index]
    indices.extend(lookup_token(char_vocab,token) 
                   for token in surname)
    indices.append(char_vocab.end_seq_index)

    if vector_length < 0:
        vector_length = len(indices)
    
    from_vector = np.empty(vector_length, dtype=np.int64)         
    from_indices = indices[:-1]
    from_vector[:len(from_indices)] = from_indices
    from_vector[len(from_indices):] = char_vocab.mask_index

    to_vector = np.empty(vector_length, dtype=np.int64)
    to_indices = indices[1:]
    to_vector[:len(to_indices)] = to_indices
    to_vector[len(to_indices):] = char_vocab.mask_index
        
    return from_vector, to_vector

print("예시")
example = vectorize("Choi", 10)
print(example)
print(len(example[0]))

예시
(array([ 2, 20,  8,  5, 23,  0,  0,  0,  0,  0]), array([20,  8,  5, 23,  3,  0,  0,  0,  0,  0]))
10


### SurnameDataset class

In [19]:
import torch
from torch.utils.data import Dataset

class SurnameDataset(Dataset):
    def __init__(self, surname_df):
        self.surname_df = surname_df
        self.max_seq_length = max(map(len, df.surname)) + 2
        
     

    def __len__(self):
        return len(self.surname_df)

    def __getitem__(self, index):
        row = self.surname_df.iloc[index]
        
        from_vector, to_vector = vectorize(row.surname, self.max_seq_length)
        
        nationality_index = lookup_token(nationality_vocab,row.nationality)

        return {'x_data': from_vector, 
                'y_target': to_vector, 
                'class_index': nationality_index}
    

### 데이터셋 class

In [20]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = SurnameDataset(train_df)
train_dataset

valid_dataset = SurnameDataset(val_df)
valid_dataset

test_dataset = SurnameDataset(test_df)
test_dataset


<__main__.SurnameDataset at 0x7f9ac1d8fdf0>

In [21]:
print(train_dataset.max_seq_length)
print(valid_dataset.max_seq_length)
print(test_dataset.max_seq_length)

19
19
19


In [22]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=True, drop_last=True)


In [23]:
print(len(train_dataset),len(Traindataloader))

7680 15


In [24]:
for batch_index, batch_dict in enumerate(Traindataloader):
#     print(batch_index)
    print(batch_dict)
    
    break
    

{'x_data': tensor([[ 2, 22,  7,  ...,  0,  0,  0],
        [ 2, 17,  8,  ...,  0,  0,  0],
        [ 2, 24,  7,  ...,  0,  0,  0],
        ...,
        [ 2, 34,  7,  ...,  0,  0,  0],
        [ 2, 35,  8,  ...,  0,  0,  0],
        [ 2, 45, 18,  ...,  0,  0,  0]]), 'y_target': tensor([[22,  7, 31,  ...,  0,  0,  0],
        [17,  8, 23,  ...,  0,  0,  0],
        [24,  7, 25,  ...,  0,  0,  0],
        ...,
        [34,  7, 15,  ...,  0,  0,  0],
        [35,  8,  7,  ...,  0,  0,  0],
        [45, 18, 12,  ...,  0,  0,  0]]), 'class_index': tensor([ 0,  4,  4,  2, 14,  3,  2, 14, 13,  2, 14, 14,  4,  4,  9,  0,  7,  4,
         9,  9,  0, 12, 10,  9, 17,  4,  0,  4,  4,  8,  4,  0,  4,  4,  0, 10,
         4,  4,  0,  9,  0, 10, 14,  6,  4,  0, 14, 14,  0,  3, 14, 16,  4,  0,
        14,  4,  1, 17, 10, 14, 14,  0, 10,  5, 14, 16,  4,  4,  7, 14,  0,  0,
        14, 10, 14, 10,  3, 10,  9,  6,  4,  9,  6,  4, 14,  4,  4,  0,  4, 14,
        14,  4, 14,  4,  9, 10,  4, 14,  2,  2, 10, 

## 모델 정의

In [25]:
# 사전 정의 함수
# 주어진 배치의 각 데이터 포인트에 대해 시퀀스의 마지막 벡터를 추출
# => y_out에 있는 각 데이터 포인트에서 마지막 벡터 추출
def column_gather(y_out, x_lengths):
    
#     x_lengths = x_lengths.long().detach().cpu().numpy() - 1
    x_lengths = x_lengths-1
    out = []
    for batch_index, length in enumerate(x_lengths):
        out.append(y_out[batch_index, length])

    return torch.stack(out)

### column_gather 출력예시

In [26]:
import torch

# 가상의 입력 데이터
y_out = torch.tensor([
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],  # 첫 번째 시퀀스: 길이 3
    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],  # 두 번째 시퀀스: 길이 3 (길이를 맞춤)
])

x_lengths = torch.tensor([3, 3])  # 각 시퀀스의 길이 (동일하게 맞춤)
# column_gather 함수 호출
result = column_gather(y_out, x_lengths)
print(result)

tensor([[ 7,  8,  9],
        [16, 17, 18]])


### 조건이 없는 성씨 생성 모델 

In [43]:
import torch.nn.functional as F 
import torch.nn as nn

class SurnameGenerationModel(nn.Module):
    def __init__(self, char_embedding_size, char_vocab_size, num_nationalities, rnn_hidden_size, 
                 batch_first=True, padding_idx=0, dropout_p=0.5):
        
        super(SurnameGenerationModel, self).__init__()
        
        self.emb = nn.Embedding(num_embeddings = char_vocab_size,
                               embedding_dim = char_embedding_size,
                               padding_idx = padding_idx)
        
        self.nation_emb = nn.Embedding(embedding_dim=rnn_hidden_size,
                                            num_embeddings=num_nationalities)
        
        self.rnn = nn.GRU(input_size=char_embedding_size, 
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)
        
        self.fc = nn.Linear(in_features=rnn_hidden_size, 
                            out_features=char_vocab_size)
        
        self.dropout_p = dropout_p
        
    def forward(self, x_in, nationality_index, apply_softmax=False):
        x_embedded = self.emb(x_in)
        nationality_embedded = self.nation_emb(nationality_index).unsqueeze(0)
        
        y_out, _ = self.rnn(x_embedded,nationality_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p=self.dropout_p))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out


In [44]:
len(char_vocab.token_to_idx)

88

In [45]:
len(nationality_vocab.token_to_idx)

18

In [46]:
char_embedding_size = 32
rnn_hidden_size = 32

model = SurnameGenerationModel(char_embedding_size=char_embedding_size,
                               char_vocab_size=len(char_vocab.token_to_idx),
                               num_nationalities=len(nationality_vocab.token_to_idx),
                               rnn_hidden_size=rnn_hidden_size,
                               padding_idx=char_vocab.mask_index)
model

SurnameGenerationModel(
  (emb): Embedding(88, 32, padding_idx=0)
  (nation_emb): Embedding(18, 32)
  (rnn): GRU(32, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=88, bias=True)
)

### 옵티마이저, loss function

In [47]:
lr = 0.001
num_epochs = 100

In [48]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [49]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [50]:
# numSample_list = df['nationality'].value_counts().tolist()
# numSample_list
# # weights 계산
# weights = [1 - (x / sum(numSample_list)) for x in numSample_list]

# # weights를 torch.FloatTensor로 변환
# weights = torch.FloatTensor(weights)
# weights

In [51]:
def normalize_sizes(y_pred, y_true):
    """텐서 크기 정규화
    
    매개변수:
        y_pred (torch.Tensor): 모델의 출력
            3차원 텐서이면 행렬로 변환합니다.
        y_true (torch.Tensor): 타깃 예측
            행렬이면 벡터로 변환합니다.
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

In [52]:
# loss function

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

### Train

In [53]:
def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

In [54]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [55]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [56]:
import tqdm

mask_index = char_vocab.mask_index

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 model.eval()을 사용
    model.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):
        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = model(x_in=batch_data['x_data'], 
                           nationality_index=batch_data['class_index'])
#       3. loss 계산하기
        loss =  sequence_loss(y_pred, batch_data['y_target'], mask_index)

    
#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        # 이동 손실과 이동 정확도를 계산
        running_loss += (loss.item() - running_loss) / (batch_idx + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx + 1)


    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    model.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = model(x_in=batch_data['x_data'], 
                           nationality_index=batch_data['class_index'])

#       2. loss 계산
        loss_t = sequence_loss(y_pred, batch_dict['y_target'], mask_index)
        running_loss += (loss_t.item() - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['y_target'],mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=model,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:02<04:02,  2.45s/it]

val_loss 4.38468599319458
val_acc 4.240601526834645


  2%|▊                                          | 2/100 [00:04<03:40,  2.25s/it]

val_loss 4.146158854166667
val_acc 11.143917595981465


  3%|█▎                                         | 3/100 [00:07<04:06,  2.54s/it]

val_loss 3.77731990814209
val_acc 14.358069765971136


  4%|█▋                                         | 4/100 [00:09<03:46,  2.36s/it]

val_loss 3.4539973735809326
val_acc 15.687638372598947


  5%|██▏                                        | 5/100 [00:11<03:30,  2.22s/it]

val_loss 3.300968647003174
val_acc 16.837265709518334


  6%|██▌                                        | 6/100 [00:13<03:23,  2.16s/it]

val_loss 3.212869723637899
val_acc 17.67955726524717


  7%|███                                        | 7/100 [00:15<03:14,  2.09s/it]

val_loss 3.1570770740509033
val_acc 18.236864825511372


  8%|███▍                                       | 8/100 [00:17<03:06,  2.03s/it]

val_loss 3.105630954106649
val_acc 19.420657010840685


  9%|███▊                                       | 9/100 [00:19<03:05,  2.04s/it]

val_loss 3.071368932723999
val_acc 19.177238303592617


 10%|████▏                                     | 10/100 [00:21<02:59,  1.99s/it]

val_loss 3.038522958755493
val_acc 20.05096690361011


 11%|████▌                                     | 11/100 [00:23<03:04,  2.07s/it]

val_loss 3.0228164196014404
val_acc 20.319826482831836


 12%|█████                                     | 12/100 [00:25<02:57,  2.02s/it]

val_loss 3.012853225072225
val_acc 20.686607354735553


 13%|█████▍                                    | 13/100 [00:27<02:52,  1.98s/it]

val_loss 3.0072545210520425
val_acc 20.941600808262283


 14%|█████▉                                    | 14/100 [00:29<02:48,  1.96s/it]

val_loss 2.991237163543701
val_acc 21.040790431445316


 15%|██████▎                                   | 15/100 [00:31<02:53,  2.04s/it]

val_loss 2.994898478190104
val_acc 21.22007312225309


 16%|██████▋                                   | 16/100 [00:33<02:51,  2.04s/it]

val_loss 3.0061072508494058
val_acc 21.993889433352848


 17%|███████▏                                  | 17/100 [00:35<02:45,  2.00s/it]

val_loss 3.0020174980163574
val_acc 21.75070365270316


 18%|███████▌                                  | 18/100 [00:37<02:39,  1.95s/it]

val_loss 3.011943260828654
val_acc 21.86098003256209


 19%|███████▉                                  | 19/100 [00:39<02:37,  1.94s/it]

val_loss 3.017106533050537
val_acc 22.767459311378456


 20%|████████▍                                 | 20/100 [00:41<02:35,  1.94s/it]

val_loss 3.023940324783325
val_acc 22.939004006695402


 21%|████████▊                                 | 21/100 [00:43<02:34,  1.96s/it]

val_loss 3.0361363887786865
val_acc 23.14873753498164


 22%|█████████▏                                | 22/100 [00:45<02:34,  1.98s/it]

val_loss 3.02583114306132
val_acc 23.72851730122807


 23%|█████████▋                                | 23/100 [00:47<02:32,  1.98s/it]

val_loss 3.032292445500692
val_acc 23.606516535284776


 23%|█████████▋                                | 23/100 [00:49<02:44,  2.14s/it]

val_loss 3.0546260674794516
val_acc 23.887859490414918





### Test 진행

In [57]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

model.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
model.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = model(x_in=batch_data['x_data'], 
                   nationality_index=batch_data['class_index'])
    
    loss = sequence_loss(y_pred,batch_data['y_target'],mask_index)
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['y_target'],mask_index)
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [58]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 2.794
테스트 정확도: 21.25


### 추론

In [60]:
def decode_samples(sampled_indices):
    """인덱스를 성씨 문자열로 변환합니다
    
    매개변수:
        sampled_indices (torch.Tensor): `sample_from_model` 함수에서 얻은 인덱스
    """
    decoded_surnames = []
    vocab = char_vocab
    
    for sample_index in range(sampled_indices.shape[0]):
        surname = ""
        for time_step in range(sampled_indices.shape[1]):
            sample_item = sampled_indices[sample_index, time_step].item()
            if sample_item == vocab.begin_seq_index:
                continue
            elif sample_item == vocab.end_seq_index:
                break
            else:
                surname += lookup_index(vocab,sample_item)
        decoded_surnames.append(surname)
    return decoded_surnames


In [64]:
def sample_from_model(model, nationalities, sample_size=20, 
                      temperature=1.0):
    """모델이 만든 인덱스 시퀀스를 샘플링합니다.
    
    매개변수:
        model (SurnameGenerationModel): 훈련 모델
        num_samples (int): 샘플 개수
        sample_size (int): 샘플의 최대 길이
        temperature (float): 무작위성 정도
            0.0 < temperature < 1.0 이면 최대 값을 선택할 가능성이 높습니다
            temperature > 1.0 이면 균등 분포에 가깝습니다
    반환값:
        indices (torch.Tensor): 인덱스 행렬
        shape = (num_samples, sample_size)
    """
    
    num_samples = len(nationalities)
    begin_seq_index = [char_vocab.begin_seq_index 
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index, 
                                   dtype=torch.int64).unsqueeze(dim=1)
    indices = [begin_seq_index]
    
    nationality_indices = torch.tensor(nationalities, dtype=torch.int64).unsqueeze(dim=0)
    h_t = model.nation_emb(nationality_indices)
    
    h_t = None
    
    for time_step in range(sample_size):
        x_t = indices[time_step]
        x_emb_t = model.emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices

In [67]:
model = model.cpu()
for index in range(len(nationality_vocab.token_to_idx)):
    nationality = lookup_index(nationality_vocab,index)
    print("{} 샘플: ".format(nationality))
    sampled_indices = sample_from_model(model,  
                                        nationalities=[index] * 3, 
                                        temperature=0.7)
    for sampled_surname in decode_samples(sampled_indices):
        print("-  " + sampled_surname)

Arabic 샘플: 
-  raut
-  Talev
-  Jasovt
Chinese 샘플: 
-  Belar
-  Tatam
-  Bsan
Czech 샘플: 
-  Gal
-  Van
-  Biernenb
Dutch 샘플: 
-  Bafr
-  Dnadsa
-  Aarst
English 샘플: 
-  Foekot
-  Tnirun
-  Mar
French 샘플: 
-  Minie
-  Pcuikir
-  Krinens
German 샘플: 
-  gane
-  Jeson
-  Mgaer
Greek 샘플: 
-  Cáetonibad
-  Mhana
-  Klav
Irish 샘플: 
-  Hbeevsn
-  Atblee
-  Srnoin
Italian 샘플: 
-  Tlorren
-  Tuidun
-  Bitol
Japanese 샘플: 
-  Glemeo
-  Gaska
-  Aeenica
Korean 샘플: 
-  Dltsrime
-  Bkizery
-  Drni
Polish 샘플: 
-  Gaba
-  Seagi
-  Glillern
Portuguese 샘플: 
-  Hene
-  Caelami
-  Var
Russian 샘플: 
-  Yter
-  Sgioin
-  Hhaorbm
Scottish 샘플: 
-  Tani
-  Cekrir
-  Qanale
Spanish 샘플: 
-  Dalsov
-  Grceut
-  YMa
Vietnamese 샘플: 
-  Khenikae
-  Heada
-  Cetene
