### CNN 모델

In [1]:
import torch
import torch.nn as nn

batch_size = 2
one_hot_size = 10
sequence_width = 7

data = torch.randn(batch_size, one_hot_size, sequence_width)
conv1 = nn.Conv1d(in_channels=one_hot_size, out_channels=16,
              kernel_size=3)
intermediate1 = conv1(data)
print(data.size())
print(intermediate1.size())

torch.Size([2, 10, 7])
torch.Size([2, 16, 5])


In [2]:
conv2 = nn.Conv1d(in_channels=16, out_channels=32,kernel_size=3)
conv3 = nn.Conv1d(in_channels=32, out_channels=64,kernel_size=3)

intermediate2 = conv2(intermediate1)
intermediate3 = conv3(intermediate2)

print(intermediate2.size())
print(intermediate3.size())

torch.Size([2, 32, 3])
torch.Size([2, 64, 1])


In [3]:
y_output = intermediate3.squeeze()
print(y_output.size())

torch.Size([2, 64])


In [4]:
# 특성 벡터를 줄이는 방법 1
print(intermediate1.view(batch_size,-1).size())

# 특성 벡터를 줄이는 방법 2
print(torch.mean(intermediate1,dim=2).size())


torch.Size([2, 80])
torch.Size([2, 16])


### 데이터 로드

In [5]:
import pandas as pd

df = pd.read_csv("surnames_with_splits.csv")
df


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh
...,...,...,...,...
10975,Vietnamese,11,test,Dinh
10976,Vietnamese,11,test,Phung
10977,Vietnamese,11,test,Quang
10978,Vietnamese,11,test,Vu


In [6]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [7]:
df['split'].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

### 데이터 split(train/valid/test)

In [8]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

### 2. Vocabulary

In [9]:
from collections import Counter
import string

# Counter()를 통해 어떤 단어가 얼만큼의 횟수로 들어있는지를 알 수 있다.
word_counts = Counter()
for name_text in df.surname:
#     print(review)
    for word in name_text.split(" "):
        # word가 .(구두점,punctuation)이 아닐 경우 word에 추가
        if word not in string.punctuation:
            word_counts[word] += 1

# word_counts

In [10]:
# add_unk=True를 하면 '<UNK>': 0 토큰을 추가해줌 !

class Vocabulary:
    def __init__(self, add_unk=False):
        self.token_to_idx = {}
        self.idx_to_token = {}
    
#         "UNK" 토큰이 추가되지 않는 경우에는 -1로 설정,
        self.unk_index = -1
        if add_unk:
#         "UNK" 토큰이 추가될 경우에는 UNK에 해당하는 인덱스로 설정,
            self.unk_index = self.add_token('<UNK>') 

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [11]:
# cutoff 보다 수가 많은 단어만 vocab에 추가
cutoff = 0

# Vocabulary 객체 생성
# cutoff보다 작으면 unk토큰으로 지정해줄 것이기 때문에 True
charater_vocab = Vocabulary(add_unk=True)
nation_vocab = Vocabulary(add_unk=False)

max_surname_length = 0

for index, row in df.iterrows():
    max_surname_length = max(max_surname_length,len(row.surname))
    for alpha in row.surname:
        charater_vocab.add_token(alpha)
    nation_vocab.add_token(row.nationality)
    

In [12]:
max_surname_length

17

In [13]:
charater_vocab.token_to_idx

{'<UNK>': 0,
 'T': 1,
 'o': 2,
 't': 3,
 'a': 4,
 'h': 5,
 'A': 6,
 'b': 7,
 'u': 8,
 'd': 9,
 'F': 10,
 'k': 11,
 'r': 12,
 'y': 13,
 'S': 14,
 'e': 15,
 'g': 16,
 'C': 17,
 'm': 18,
 'H': 19,
 'i': 20,
 'K': 21,
 'n': 22,
 'W': 23,
 's': 24,
 'f': 25,
 'G': 26,
 'M': 27,
 'l': 28,
 'B': 29,
 'z': 30,
 'N': 31,
 'I': 32,
 'w': 33,
 'D': 34,
 'Q': 35,
 'j': 36,
 'E': 37,
 'R': 38,
 'Z': 39,
 'c': 40,
 'Y': 41,
 'J': 42,
 'L': 43,
 'O': 44,
 '-': 45,
 'P': 46,
 'X': 47,
 'p': 48,
 ':': 49,
 'v': 50,
 'U': 51,
 '1': 52,
 'V': 53,
 'x': 54,
 '/': 55,
 'q': 56,
 'é': 57,
 'É': 58,
 "'": 59,
 'ç': 60,
 'ê': 61,
 'ß': 62,
 'ö': 63,
 'ä': 64,
 'ü': 65,
 'ú': 66,
 'à': 67,
 'ò': 68,
 'è': 69,
 'ó': 70,
 'ù': 71,
 'ì': 72,
 'Ś': 73,
 'ą': 74,
 'ń': 75,
 'á': 76,
 'ż': 77,
 'Ż': 78,
 'ł': 79,
 'õ': 80,
 'ã': 81,
 'í': 82,
 'ñ': 83,
 'Á': 84}

In [14]:
charater_vocab.idx_to_token

{0: '<UNK>',
 1: 'T',
 2: 'o',
 3: 't',
 4: 'a',
 5: 'h',
 6: 'A',
 7: 'b',
 8: 'u',
 9: 'd',
 10: 'F',
 11: 'k',
 12: 'r',
 13: 'y',
 14: 'S',
 15: 'e',
 16: 'g',
 17: 'C',
 18: 'm',
 19: 'H',
 20: 'i',
 21: 'K',
 22: 'n',
 23: 'W',
 24: 's',
 25: 'f',
 26: 'G',
 27: 'M',
 28: 'l',
 29: 'B',
 30: 'z',
 31: 'N',
 32: 'I',
 33: 'w',
 34: 'D',
 35: 'Q',
 36: 'j',
 37: 'E',
 38: 'R',
 39: 'Z',
 40: 'c',
 41: 'Y',
 42: 'J',
 43: 'L',
 44: 'O',
 45: '-',
 46: 'P',
 47: 'X',
 48: 'p',
 49: ':',
 50: 'v',
 51: 'U',
 52: '1',
 53: 'V',
 54: 'x',
 55: '/',
 56: 'q',
 57: 'é',
 58: 'É',
 59: "'",
 60: 'ç',
 61: 'ê',
 62: 'ß',
 63: 'ö',
 64: 'ä',
 65: 'ü',
 66: 'ú',
 67: 'à',
 68: 'ò',
 69: 'è',
 70: 'ó',
 71: 'ù',
 72: 'ì',
 73: 'Ś',
 74: 'ą',
 75: 'ń',
 76: 'á',
 77: 'ż',
 78: 'Ż',
 79: 'ł',
 80: 'õ',
 81: 'ã',
 82: 'í',
 83: 'ñ',
 84: 'Á'}

### 국적 Vocabulary

In [15]:
nation_vocab.token_to_idx

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [16]:
nation_vocab.idx_to_token

{0: 'Arabic',
 1: 'Chinese',
 2: 'Czech',
 3: 'Dutch',
 4: 'English',
 5: 'French',
 6: 'German',
 7: 'Greek',
 8: 'Irish',
 9: 'Italian',
 10: 'Japanese',
 11: 'Korean',
 12: 'Polish',
 13: 'Portuguese',
 14: 'Russian',
 15: 'Scottish',
 16: 'Spanish',
 17: 'Vietnamese'}

## 3. Vectorizer

In [17]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]
    

In [18]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

In [19]:
max_surname_length

17

In [20]:
vocab_length = len(charater_vocab.token_to_idx)
print("토큰의 수:", vocab_length)

토큰의 수: 85


### 텍스트(surname)에 대한 원 핫 인코딩

In [21]:
import numpy as np 

def vectorize(voca, max_surname_length,surname):

#     전체 85(character Voca 크기)*17(최대단어길이) 사이즈만큼을 미리 0으로 채워둠
    one_hot_matrix_size = (len(voca.token_to_idx), max_surname_length)
    one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)
#     print("one_hot",one_hot_matrix)
    
    for position_index, character in enumerate(surname):
        chracter_index = lookup_token(voca,character)
        one_hot_matrix[chracter_index][position_index]=1
    
    return one_hot_matrix

print("예시")
example = vectorize(charater_vocab,max_surname_length,"Choi")
print(example)
print(len(example))
print(len(example[0]))

예시
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
85
17


### CNN Dataset class

In [22]:
import torch
from torch.utils.data import Dataset

class NameDataset(Dataset):
    def __init__(self, names, nations):
        self.names = names
        self.nations = nations

    def __len__(self):
        return len(self.names)

    def __getitem__(self, index):
        name = self.names[index]
        nation = self.nations[index]
        
#         여기다가 vectorize함수 사용해서 charater return
        vectorized_name = vectorize(charater_vocab,max_surname_length,name)
#       nation 숫자로 return
        vectorized_nation = lookup_token(nation_vocab,nation)
    
        return {
            'surname': vectorized_name,
            'nationality': vectorized_nation
        }

### 데이터셋 class

In [23]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = NameDataset(train_df["surname"].values, train_df["nationality"].values)
train_dataset

valid_dataset = NameDataset(val_df["surname"].values, val_df["nationality"].values)
valid_dataset

test_dataset = NameDataset(test_df["surname"].values, test_df["nationality"].values)
test_dataset


<__main__.NameDataset at 0x7fb530ecd580>

In [24]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=True, drop_last=True)


In [25]:
print(len(train_dataset),len(Traindataloader))

7680 15


In [26]:
for batch_index, batch_dict in enumerate(Traindataloader):
#     print(batch_index)
    print(batch_dict)
    
    break
    

{'surname': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [

### CNN모델 정의 및 옵티마이저, loss func 설정

### 모델정의 NameClassifier

In [27]:
# nn은 neural network로 torch의 신경망 모듈이다.
# CNN사용 
import torch.nn as nn
import torch.nn.functional as F

class NameClassifier(nn.Module):
    
    def __init__(self, initial_num_channels, num_classes, num_channels):
        """
        initial_num_channels : 입력 특성 벡터의 크기 -> character Voca size(85)
        num_classes : 출력 예측 벡터의 크기 -> 국가 Voca size(18)
        num_channels : 신경망 전체에 사용될 채널 크기 -> 32, 64... 지정 
        """
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(NameClassifier, self).__init__()
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=initial_num_channels,
                     out_channels=num_channels,kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                     kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                     kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
                     kernel_size=3),
            nn.ELU()
        )
        self.fc = nn.Linear(num_channels, num_classes)
    
    
    def forward(self, x_in, apply_softmax=False):
        """
        x_in.shape => batch, initial_num_channels(85), max_surname_length(17)이다. 
        """
        
        features = self.convnet(x_in).squeeze(dim=2) #2차원 출력을 1차원으로 변환해서 펴주기
        predition_vector = self.fc(features)
        
        if apply_softmax:
            predition_vector = F.softmax(predition_vector,dim=1)
        
        return predition_vector
        

In [28]:
len(charater_vocab.token_to_idx)

85

In [29]:
len(nation_vocab.token_to_idx)

18

In [30]:
classifier = NameClassifier(initial_num_channels=len(charater_vocab.token_to_idx), # 분류할 것(85)
                            num_classes=len(nation_vocab.token_to_idx), # 분류하려는 것(18)
                           num_channels=64) # 출력 채널 수 (32,64,128) 지정 가능
classifier

NameClassifier(
  (convnet): Sequential(
    (0): Conv1d(85, 64, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (fc): Linear(in_features=64, out_features=18, bias=True)
)

### 옵티마이저, loss function

In [31]:
lr = 0.001
num_epochs = 100

In [32]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [33]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [34]:
numSample_list = df['nationality'].value_counts().tolist()
numSample_list
# weights 계산
weights = [1 - (x / sum(numSample_list)) for x in numSample_list]

# weights를 torch.FloatTensor로 변환
weights = torch.FloatTensor(weights)
weights

tensor([0.7293, 0.7839, 0.8540, 0.9294, 0.9454, 0.9475, 0.9623, 0.9765, 0.9785,
        0.9791, 0.9800, 0.9833, 0.9858, 0.9891, 0.9930, 0.9932, 0.9947, 0.9950])

In [35]:
# loss function
#  dataset.class_weights -> 각 클래스에 대해 다른 가중치를 적용할 수 있음(데이터 불균형시에)
# 소수의 클래스가 다수의 클래스보다 훨씬 적은 수의 샘플을 가지고 있는 경우, 
# 소수 클래스에 더 높은 가중치를 부여하여 모델이 불균형한 데이터에 대해 더 잘 학습

# loss_func = nn.CrossEntropyLoss(weights)
loss_func = nn.CrossEntropyLoss()
loss_func


CrossEntropyLoss()

### Train

In [36]:
def compute_accuracy(y_pred, y_target):
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100
   

In [37]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [38]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [39]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):
        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['surname'])
#       3. loss 계산하기
        loss =  loss_func(y_pred, batch_data['nationality'])
    
#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['nationality'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['surname'])

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['nationality'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['nationality'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:01<02:05,  1.27s/it]

val_loss 2.2941468556722007
val_acc 26.953125


  2%|▊                                          | 2/100 [00:02<01:42,  1.05s/it]

val_loss 2.2031826972961426
val_acc 27.278645833333332


  3%|█▎                                         | 3/100 [00:03<01:37,  1.00s/it]

val_loss 2.1640730698903403
val_acc 35.286458333333336


  4%|█▋                                         | 4/100 [00:04<01:43,  1.08s/it]

val_loss 2.080089489618937
val_acc 37.434895833333336


  5%|██▏                                        | 5/100 [00:05<01:36,  1.02s/it]

val_loss 1.9209178288777669
val_acc 42.903645833333336


  6%|██▌                                        | 6/100 [00:06<01:32,  1.02it/s]

val_loss 1.7434579531351726
val_acc 50.520833333333336


  7%|███                                        | 7/100 [00:07<01:30,  1.03it/s]

val_loss 1.6590429941813152
val_acc 52.669270833333336


  8%|███▍                                       | 8/100 [00:07<01:27,  1.06it/s]

val_loss 1.5907117923100789
val_acc 54.947916666666664


  9%|███▊                                       | 9/100 [00:08<01:26,  1.06it/s]

val_loss 1.5388100147247314
val_acc 56.119791666666664


 10%|████▏                                     | 10/100 [00:09<01:25,  1.05it/s]

val_loss 1.5057705243428547
val_acc 56.966145833333336


 11%|████▌                                     | 11/100 [00:11<01:30,  1.02s/it]

val_loss 1.4711943070093791
val_acc 57.942708333333336


 12%|█████                                     | 12/100 [00:12<01:44,  1.19s/it]

val_loss 1.4463011423746746
val_acc 58.919270833333336


 13%|█████▍                                    | 13/100 [00:13<01:46,  1.22s/it]

val_loss 1.439347743988037
val_acc 58.333333333333336


 14%|█████▉                                    | 14/100 [00:15<02:05,  1.46s/it]

val_loss 1.4116290012995403
val_acc 59.765625


 15%|██████▎                                   | 15/100 [00:17<01:57,  1.38s/it]

val_loss 1.3848724762598674
val_acc 60.15625


 16%|██████▋                                   | 16/100 [00:18<01:46,  1.27s/it]

val_loss 1.351871093114217
val_acc 62.174479166666664


 17%|███████▏                                  | 17/100 [00:19<01:50,  1.33s/it]

val_loss 1.3333966334660847
val_acc 61.848958333333336


 18%|███████▌                                  | 18/100 [00:20<01:49,  1.33s/it]

val_loss 1.2968440850575764
val_acc 62.174479166666664


 19%|███████▉                                  | 19/100 [00:22<01:45,  1.30s/it]

val_loss 1.2819708983103435
val_acc 62.955729166666664


 20%|████████▍                                 | 20/100 [00:23<01:55,  1.44s/it]

val_loss 1.240401268005371
val_acc 63.606770833333336


 21%|████████▊                                 | 21/100 [00:25<01:45,  1.34s/it]

val_loss 1.2250923713048298
val_acc 64.453125


 22%|█████████▏                                | 22/100 [00:26<01:39,  1.27s/it]

val_loss 1.2027342716852825
val_acc 66.2109375


 23%|█████████▋                                | 23/100 [00:27<01:33,  1.21s/it]

val_loss 1.192394455273946
val_acc 65.69010416666667


 24%|██████████                                | 24/100 [00:28<01:25,  1.13s/it]

val_loss 1.1770106951395671
val_acc 66.40625


 25%|██████████▌                               | 25/100 [00:29<01:20,  1.07s/it]

val_loss 1.1697304248809814
val_acc 66.6015625


 26%|██████████▉                               | 26/100 [00:30<01:16,  1.04s/it]

val_loss 1.1684833367665608
val_acc 66.66666666666667


 27%|███████████▎                              | 27/100 [00:30<01:13,  1.00s/it]

val_loss 1.1270057360331218
val_acc 68.48958333333333


 28%|███████████▊                              | 28/100 [00:31<01:10,  1.01it/s]

val_loss 1.116470495859782
val_acc 68.42447916666667


 29%|████████████▏                             | 29/100 [00:32<01:10,  1.00it/s]

val_loss 1.1222044626871746
val_acc 68.1640625


 30%|████████████▌                             | 30/100 [00:33<01:08,  1.03it/s]

val_loss 1.102547526359558
val_acc 69.01041666666667


 31%|█████████████                             | 31/100 [00:34<01:05,  1.06it/s]

val_loss 1.0906385978062947
val_acc 69.40104166666667


 32%|█████████████▍                            | 32/100 [00:35<01:04,  1.05it/s]

val_loss 1.092143177986145
val_acc 69.3359375


 33%|█████████████▊                            | 33/100 [00:36<01:02,  1.06it/s]

val_loss 1.0988272825876872
val_acc 68.1640625


 34%|██████████████▎                           | 34/100 [00:37<01:01,  1.07it/s]

val_loss 1.0755671461423237
val_acc 69.66145833333333


 35%|██████████████▋                           | 35/100 [00:38<01:03,  1.02it/s]

val_loss 1.0658969084421794
val_acc 70.5078125


 36%|███████████████                           | 36/100 [00:39<01:01,  1.05it/s]

val_loss 1.0569876829783122
val_acc 70.37760416666667


 37%|███████████████▌                          | 37/100 [00:40<00:59,  1.06it/s]

val_loss 1.0730501413345337
val_acc 69.59635416666667


 38%|███████████████▉                          | 38/100 [00:41<00:57,  1.07it/s]

val_loss 1.06015145778656
val_acc 69.7265625


 39%|████████████████▍                         | 39/100 [00:42<00:56,  1.08it/s]

val_loss 1.0604577859242756
val_acc 70.05208333333333


 40%|████████████████▊                         | 40/100 [00:43<00:54,  1.10it/s]

val_loss 1.0662401914596558
val_acc 69.46614583333333


 41%|█████████████████▏                        | 41/100 [00:44<00:54,  1.09it/s]

val_loss 1.064160664876302
val_acc 69.20572916666667


 42%|█████████████████▋                        | 42/100 [00:45<00:53,  1.09it/s]

val_loss 1.0468943516413372
val_acc 69.921875


 43%|██████████████████                        | 43/100 [00:45<00:52,  1.09it/s]

val_loss 1.0348653594652812
val_acc 70.24739583333333


 44%|██████████████████▍                       | 44/100 [00:46<00:53,  1.04it/s]

val_loss 1.0292507608731587
val_acc 70.18229166666667


 45%|██████████████████▉                       | 45/100 [00:48<00:54,  1.01it/s]

val_loss 1.0218332211176555
val_acc 70.18229166666667


 46%|███████████████████▎                      | 46/100 [00:49<00:55,  1.04s/it]

val_loss 1.0221168796221416
val_acc 70.3125


 47%|███████████████████▋                      | 47/100 [00:50<00:54,  1.04s/it]

val_loss 1.0502192775408428
val_acc 69.46614583333333


 48%|████████████████████▏                     | 48/100 [00:51<00:52,  1.02s/it]

val_loss 1.019181231657664
val_acc 71.15885416666667


 49%|████████████████████▌                     | 49/100 [00:52<00:51,  1.01s/it]

val_loss 1.022276798884074
val_acc 70.1171875


 50%|█████████████████████                     | 50/100 [00:53<00:50,  1.00s/it]

val_loss 1.007240653038025
val_acc 70.44270833333333


 51%|█████████████████████▍                    | 51/100 [00:54<00:47,  1.03it/s]

val_loss 1.026952823003133
val_acc 70.8984375


 52%|█████████████████████▊                    | 52/100 [00:55<00:47,  1.01it/s]

val_loss 1.0062974492708843
val_acc 71.74479166666667


 53%|██████████████████████▎                   | 53/100 [00:56<00:45,  1.03it/s]

val_loss 1.0101464192072551
val_acc 71.02864583333333


 54%|██████████████████████▋                   | 54/100 [00:56<00:44,  1.04it/s]

val_loss 1.0179474353790283
val_acc 70.5078125


 55%|███████████████████████                   | 55/100 [00:58<00:45,  1.01s/it]

val_loss 0.9937800168991089
val_acc 71.74479166666667


 56%|███████████████████████▌                  | 56/100 [00:59<00:47,  1.08s/it]

val_loss 1.0029486219088237
val_acc 71.484375


 57%|███████████████████████▉                  | 57/100 [01:00<00:48,  1.12s/it]

val_loss 0.9977965354919434
val_acc 71.41927083333333


 58%|████████████████████████▎                 | 58/100 [01:01<00:47,  1.13s/it]

val_loss 1.0039403637250264
val_acc 70.96354166666667


 59%|████████████████████████▊                 | 59/100 [01:02<00:44,  1.08s/it]

val_loss 1.0075672467549641
val_acc 71.15885416666667


 60%|█████████████████████████▏                | 60/100 [01:03<00:41,  1.03s/it]

val_loss 1.0233927567799885
val_acc 70.57291666666667


 61%|█████████████████████████▌                | 61/100 [01:04<00:39,  1.00s/it]

val_loss 1.002232591311137
val_acc 71.74479166666667


 62%|██████████████████████████                | 62/100 [01:05<00:37,  1.03it/s]

val_loss 0.9899790485699972
val_acc 72.0703125


 63%|██████████████████████████▍               | 63/100 [01:06<00:35,  1.04it/s]

val_loss 0.9939115246136984
val_acc 71.74479166666667


 64%|██████████████████████████▉               | 64/100 [01:07<00:34,  1.04it/s]

val_loss 0.9867205818494161
val_acc 72.59114583333333


 65%|███████████████████████████▎              | 65/100 [01:08<00:37,  1.07s/it]

val_loss 1.0019646286964417
val_acc 71.875


 66%|███████████████████████████▋              | 66/100 [01:09<00:36,  1.07s/it]

val_loss 1.0090186198552449
val_acc 71.484375


 67%|████████████████████████████▏             | 67/100 [01:10<00:34,  1.05s/it]

val_loss 1.0121209621429443
val_acc 71.09375


 68%|████████████████████████████▌             | 68/100 [01:11<00:33,  1.04s/it]

val_loss 1.0076925158500671
val_acc 72.0703125


 69%|████████████████████████████▉             | 69/100 [01:12<00:31,  1.01s/it]

val_loss 1.002161681652069
val_acc 72.00520833333333


 70%|█████████████████████████████▍            | 70/100 [01:13<00:29,  1.02it/s]

val_loss 0.9887951612472534
val_acc 72.39583333333333


 71%|█████████████████████████████▊            | 71/100 [01:14<00:27,  1.04it/s]

val_loss 1.0070250630378723
val_acc 71.41927083333333


 72%|██████████████████████████████▏           | 72/100 [01:15<00:26,  1.04it/s]

val_loss 0.9835207263628641
val_acc 72.65625


 73%|██████████████████████████████▋           | 73/100 [01:16<00:25,  1.06it/s]

val_loss 1.0136265556017559
val_acc 73.17708333333333


 74%|███████████████████████████████           | 74/100 [01:17<00:24,  1.05it/s]

val_loss 1.0162049531936646
val_acc 72.4609375


 75%|███████████████████████████████▌          | 75/100 [01:18<00:24,  1.02it/s]

val_loss 0.9984452923138937
val_acc 73.11197916666667


 76%|███████████████████████████████▉          | 76/100 [01:19<00:24,  1.03s/it]

val_loss 1.0084959864616394
val_acc 72.13541666666667


 77%|████████████████████████████████▎         | 77/100 [01:20<00:22,  1.00it/s]

val_loss 1.017244021097819
val_acc 71.54947916666667


 78%|████████████████████████████████▊         | 78/100 [01:21<00:22,  1.00s/it]

val_loss 1.0339492758115132
val_acc 72.13541666666667


 79%|█████████████████████████████████▏        | 79/100 [01:22<00:23,  1.11s/it]

val_loss 1.0433433254559834
val_acc 72.0703125


 80%|█████████████████████████████████▌        | 80/100 [01:23<00:21,  1.07s/it]

val_loss 1.0358987053235371
val_acc 72.78645833333333


 81%|██████████████████████████████████        | 81/100 [01:25<00:23,  1.21s/it]

val_loss 1.0570435126622517
val_acc 72.33072916666667


 81%|██████████████████████████████████        | 81/100 [01:26<00:20,  1.07s/it]

val_loss 1.0462886492411296
val_acc 72.0703125





### Test 진행

In [40]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['surname'])
    loss = loss_func(y_pred,batch_data['nationality'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['nationality'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

### MLP 보다 30%의 Accuracy 향상

In [41]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 1.049
테스트 정확도: 71.16


In [42]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 0.9835207263628641,
 'early_stopping_criteria': 10,
 'epoch_index': 82,
 'train_loss': [2.5587686697642007,
  2.252599716186524,
  2.1958210945129393,
  2.1378581047058103,
  2.0014691988627114,
  1.8231967846552528,
  1.6801792224248249,
  1.577084747950236,
  1.5090062300364178,
  1.4615716298421222,
  1.4177862564722696,
  1.3828231414159138,
  1.3572803735733032,
  1.326089866956075,
  1.2966793855031333,
  1.2672155777613323,
  1.2340983867645263,
  1.2058131138483683,
  1.170122210184733,
  1.1371918916702268,
  1.1104252258936564,
  1.084042231241862,
  1.059811254342397,
  1.0383695165316262,
  1.017719495296478,
  0.9993720253308613,
  0.975432042280833,
  0.9597695390383402,
  0.9454202612241109,
  0.9282543261845906,
  0.9154825647672017,
  0.9017051498095195,
  0.8910457094510396,
  0.8761918743451436,
  0.8599290529886882,
  0.8513031840324402,
  0.8438669721285502,
  0.8377117196718852,
  0.81997

### 추론

In [43]:
new_surname = "Suzuki"

In [44]:
# 벡터화 + 텐서화

vectorized_surname = torch.tensor(vectorize(charater_vocab,max_surname_length,new_surname))
vectorized_surname

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [45]:
print(vectorized_surname.shape)

torch.Size([85, 17])


In [46]:
# 이전에는 view(1,-1) 사용했으나 
before = vectorized_surname.view(1, -1)
print(before.shape)

torch.Size([1, 1445])


In [47]:
# CNN에는 unsqueeze(0)을 통해서 차원을 하나 늘려준다. 
x_data = torch.tensor(vectorized_surname).unsqueeze(0)
print(x_data.shape)

torch.Size([1, 85, 17])


  x_data = torch.tensor(vectorized_surname).unsqueeze(0)


In [48]:
# 모델에 test 데이터 넣어주기
result = classifier(x_data,apply_softmax=True)
print(result)

tensor([[2.7602e-10, 4.7359e-11, 4.9214e-02, 2.3482e-06, 3.8571e-07, 1.4107e-07,
         1.0961e-06, 1.2776e-05, 3.8232e-07, 1.1281e-08, 9.4579e-01, 6.8485e-08,
         3.2575e-03, 1.0687e-08, 1.7236e-03, 2.6550e-09, 7.6680e-07, 6.2029e-10]],
       grad_fn=<SoftmaxBackward0>)


[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


In [49]:
result.max()

tensor(0.9458, grad_fn=<MaxBackward1>)

In [50]:
probability_value, indices = result.max(dim=1)

print("probability_value",probability_value)
print("indices",indices)

probability_value tensor([0.9458], grad_fn=<MaxBackward0>)
indices tensor([10])


In [51]:
print("새로운 데이터에 대해 추론한 label",indices.item())

새로운 데이터에 대해 추론한 label 10


In [52]:
prediction = lookup_index(nation_vocab,indices.item())
prediction

'Japanese'

In [53]:
print("{} -> {}".format(new_surname, prediction))

Suzuki -> Japanese
