### MLP 모델

In [1]:
import torch.nn as nn
import torch.nn.functional as F

class MultilayerPerceptron(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        
        super(MultilayerPerceptron,self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x_in, apply_softmax=False):
        
#         relu 활성화 함수를 통해 비선형성을 추가
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        
#         softmax 적용여부 
        if apply_softmax:
#           dim = 행렬의 각 행에 대해 소프트맥스 함수를 적용
            output = F.softmax(output, dim=1)
        return output

In [2]:
# example1

input_dim = 3
hidden_dim = 100
output_dim = 4

mlp = MultilayerPerceptron(input_dim,hidden_dim,output_dim)
print(mlp)

MultilayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)


### 데이터 로드

In [3]:
import pandas as pd

df = pd.read_csv("surnames_with_splits.csv")
df


Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh
...,...,...,...,...
10975,Vietnamese,11,test,Dinh
10976,Vietnamese,11,test,Phung
10977,Vietnamese,11,test,Quang
10978,Vietnamese,11,test,Vu


In [4]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [5]:
df['split'].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

### 데이터 split(train/valid/test)

In [6]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

### 2. Vocabulary

In [7]:
from collections import Counter
import string

# Counter()를 통해 어떤 단어가 얼만큼의 횟수로 들어있는지를 알 수 있다.
word_counts = Counter()
for name_text in df.surname:
#     print(review)
    for word in name_text.split(" "):
        # word가 .(구두점,punctuation)이 아닐 경우 word에 추가
        if word not in string.punctuation:
            word_counts[word] += 1

# word_counts

In [8]:
# add_unk=True를 하면 '<UNK>': 0 토큰을 추가해줌 !

class Vocabulary:
    def __init__(self, add_unk=False):
        self.token_to_idx = {}
        self.idx_to_token = {}
    
#         "UNK" 토큰이 추가되지 않는 경우에는 -1로 설정,
        self.unk_index = -1
        if add_unk:
#         "UNK" 토큰이 추가될 경우에는 UNK에 해당하는 인덱스로 설정,
            self.unk_index = self.add_token('<UNK>') 

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [9]:
# cutoff 보다 수가 많은 단어만 vocab에 추가
cutoff = 0

# Vocabulary 객체 생성
# cutoff보다 작으면 unk토큰으로 지정해줄 것이기 때문에 True
name_vocab = Vocabulary(add_unk=True)

# word_counts.items() -> ex) ('all', 24160)
for word, count in word_counts.items():
    if count > cutoff:
        name_vocab.add_token(word)

In [10]:
name_vocab.token_to_idx

{'<UNK>': 0,
 'Totah': 1,
 'Abboud': 2,
 'Fakhoury': 3,
 'Srour': 4,
 'Sayegh': 5,
 'Cham': 6,
 'Haik': 7,
 'Kattan': 8,
 'Khouri': 9,
 'Antoun': 10,
 'Wasem': 11,
 'Seif': 12,
 'Guirguis': 13,
 'Sarkis': 14,
 'Said': 15,
 'Malouf': 16,
 'Bishara': 17,
 'Ganim': 18,
 'Baz': 19,
 'Assaf': 20,
 'Nader': 21,
 'Isa': 22,
 'Awad': 23,
 'Deeb': 24,
 'Kanaan': 25,
 'Quraishi': 26,
 'Atiyeh': 27,
 'Boutros': 28,
 'Sabbagh': 29,
 'Mustafa': 30,
 'Mansour': 31,
 'Hadad': 32,
 'Salib': 33,
 'Sabbag': 34,
 'Kassab': 35,
 'Moghadam': 36,
 'Najjar': 37,
 'Gerges': 38,
 'Safar': 39,
 'Mifsud': 40,
 'Shalhoub': 41,
 'Koury': 42,
 'Kalb': 43,
 'Harb': 44,
 'Toma': 45,
 'Maalouf': 46,
 'Kouri': 47,
 'Shadid': 48,
 'Dagher': 49,
 'Tahan': 50,
 'Bahar': 51,
 'Boulos': 52,
 'Attia': 53,
 'Amari': 54,
 'Naser': 55,
 'Bazzi': 56,
 'Bata': 57,
 'Shamoon': 58,
 'Hanania': 59,
 'Masih': 60,
 'Halabi': 61,
 'Saliba': 62,
 'Antar': 63,
 'Qureshi': 64,
 'Kassis': 65,
 'Sarraf': 66,
 'Essa': 67,
 'Touma': 68,
 'Mar

In [11]:
name_vocab.idx_to_token

{0: '<UNK>',
 1: 'Totah',
 2: 'Abboud',
 3: 'Fakhoury',
 4: 'Srour',
 5: 'Sayegh',
 6: 'Cham',
 7: 'Haik',
 8: 'Kattan',
 9: 'Khouri',
 10: 'Antoun',
 11: 'Wasem',
 12: 'Seif',
 13: 'Guirguis',
 14: 'Sarkis',
 15: 'Said',
 16: 'Malouf',
 17: 'Bishara',
 18: 'Ganim',
 19: 'Baz',
 20: 'Assaf',
 21: 'Nader',
 22: 'Isa',
 23: 'Awad',
 24: 'Deeb',
 25: 'Kanaan',
 26: 'Quraishi',
 27: 'Atiyeh',
 28: 'Boutros',
 29: 'Sabbagh',
 30: 'Mustafa',
 31: 'Mansour',
 32: 'Hadad',
 33: 'Salib',
 34: 'Sabbag',
 35: 'Kassab',
 36: 'Moghadam',
 37: 'Najjar',
 38: 'Gerges',
 39: 'Safar',
 40: 'Mifsud',
 41: 'Shalhoub',
 42: 'Koury',
 43: 'Kalb',
 44: 'Harb',
 45: 'Toma',
 46: 'Maalouf',
 47: 'Kouri',
 48: 'Shadid',
 49: 'Dagher',
 50: 'Tahan',
 51: 'Bahar',
 52: 'Boulos',
 53: 'Attia',
 54: 'Amari',
 55: 'Naser',
 56: 'Bazzi',
 57: 'Bata',
 58: 'Shamoon',
 59: 'Hanania',
 60: 'Masih',
 61: 'Halabi',
 62: 'Saliba',
 63: 'Antar',
 64: 'Qureshi',
 65: 'Kassis',
 66: 'Sarraf',
 67: 'Essa',
 68: 'Touma',
 69: 

### 국적 Vocabulary

In [12]:
nation_vocab = Vocabulary(add_unk=False)
nation_vocab

<__main__.Vocabulary at 0x7fb4d33f3610>

In [13]:
# 국적 Vocabulary 추가 

for nation in sorted(set(df.nationality)):
    nation_vocab.add_token(nation)
    print(nation)

Arabic
Chinese
Czech
Dutch
English
French
German
Greek
Irish
Italian
Japanese
Korean
Polish
Portuguese
Russian
Scottish
Spanish
Vietnamese


In [14]:
nation_vocab.token_to_idx

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [52]:
nation_vocab.idx_to_token

{0: 'Arabic',
 1: 'Chinese',
 2: 'Czech',
 3: 'Dutch',
 4: 'English',
 5: 'French',
 6: 'German',
 7: 'Greek',
 8: 'Irish',
 9: 'Italian',
 10: 'Japanese',
 11: 'Korean',
 12: 'Polish',
 13: 'Portuguese',
 14: 'Russian',
 15: 'Scottish',
 16: 'Spanish',
 17: 'Vietnamese'}

## 3. Vectorizer

In [16]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]
    

In [17]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

### 텍스트(surname)에 대한 원 핫 인코딩

In [18]:
import numpy as np 

def vectorize(voca, column):

#     전체 리뷰 사이즈만큼을 미리 0으로 채워둠
    one_hot = np.zeros(len(voca.token_to_idx), dtype=np.float32)
    one_hot
    
    for token in column.split(" "):
        
#         토큰이 .(구두점)이 아닐 경우 
#         토큰에 해당되는 인덱스에 1를 부여한 one_hot encoding 만듦
        if token not in string.punctuation:
            one_hot[lookup_token(voca,token)] = 1

    return one_hot

print(vectorize(name_vocab,"all i can say is that a i had no other option"))

[1. 0. 0. ... 0. 0. 0.]


### Dataset class

In [19]:
import torch
from torch.utils.data import Dataset

class NameDataset(Dataset):
    def __init__(self, names, nations):
        self.names = names
        self.nations = nations

    def __len__(self):
        return len(self.names)

    def __getitem__(self, index):
        name = self.names[index]
        nation = self.nations[index]
        
#         여기다가 vectorize함수 사용해서 name return
        vectorized_name = vectorize(name_vocab,name)
#       nation 숫자로 return
#         vectorized_nation = vectorize(nation_vocab,nation)
        vectorized_nation = lookup_token(nation_vocab,nation)
    
        return {
            'surname': vectorized_name,
            'nationality': vectorized_nation
        }

### 데이터셋 class

In [21]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = NameDataset(train_df["surname"].values, train_df["nationality"].values)
train_dataset

valid_dataset = NameDataset(val_df["surname"].values, val_df["nationality"].values)
valid_dataset

test_dataset = NameDataset(test_df["surname"].values, test_df["nationality"].values)
test_dataset


<__main__.NameDataset at 0x7fb4d34c6700>

In [22]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=True, drop_last=True)


In [23]:
print(len(train_dataset),len(Traindataloader))

7680 15


In [24]:
for batch_index, batch_dict in enumerate(Traindataloader):
    print(batch_index)
    print(batch_dict)
    
    break
    

0
{'surname': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'nationality': tensor([ 4,  4, 14, 14,  9, 14, 14, 10,  4, 16,  4,  4,  0,  4, 14,  9,  9,  0,
        14, 14,  4,  8,  0,  9,  4,  4,  6,  2,  8,  4,  6,  4,  7, 10, 10,  0,
         4,  8,  4,  0, 10,  2, 14, 14, 16, 15, 14,  0,  8, 14,  4,  4,  0,  9,
        14, 14,  4, 14, 14, 14,  6, 12,  5, 10, 10,  4,  4,  4,  4,  0, 10,  4,
         4, 13,  0,  0,  3,  0,  4, 14,  9,  6, 14, 14,  0,  0, 10,  5, 14,  4,
        12,  4,  3,  0, 14,  4,  6,  6,  2, 14, 12,  9,  0,  9,  4, 14,  0, 14,
        13, 10,  2, 10, 14,  4,  6,  9,  0, 14,  4, 17,  0, 14,  4,  0,  4, 14,
         6,  4,  4,  1,  4, 10, 16, 14,  0,  0, 10,  9,  9,  9,  0,  9,  4,  4,
         1,  4, 14,  4, 10,  9, 14, 14, 14,  4, 15,  4,  5,  5, 14,  6,  4, 

### 모델 정의 및 옵티마이저, loss func 설정

### 모델정의 ReviewClassifier

In [25]:
# nn은 neural network로 torch의 신경망 모듈이다.
# MLP사용 
import torch.nn as nn
import torch.nn.functional as F

class NameClassifier(nn.Module):
    
    def __init__(self,input_dim, hidden_dim, output_dim):
        
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(NameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,output_dim)
    
    def forward(self, x_in, apply_softmax=False):
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        
        if apply_softmax:
            output = F.softmax(output,dim=1)
        
        return output
        

### 드롭 아웃 적용한 모델

In [26]:
# nn은 neural network로 torch의 신경망 모듈이다.
import torch.nn as nn
import torch.nn.functional as F

class NameClassifier(nn.Module):
    
    def __init__(self,input_dim, hidden_dim, output_dim):
        
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(NameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,output_dim)
    
    def forward(self, x_in, apply_softmax=False):
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(F.dropout(intermediate,p=0.5))
        
        if apply_softmax:
            output = F.softmax(output,dim=1)
        
        return output
        

In [27]:
len(name_vocab.token_to_idx)

9042

In [28]:
len(nation_vocab.token_to_idx)

18

In [29]:
hidden_dim = 300

classifier = NameClassifier(input_dim=len(name_vocab.token_to_idx),
                            hidden_dim=hidden_dim,
                           output_dim=len(nation_vocab.token_to_idx))
classifier

NameClassifier(
  (fc1): Linear(in_features=9042, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=18, bias=True)
)

### 옵티마이저, loss function

In [30]:
lr = 0.001
num_epochs = 100

In [31]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [32]:
df['nationality'].value_counts()

nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64

In [33]:
numSample_list = df['nationality'].value_counts().tolist()
numSample_list
# weights 계산
weights = [1 - (x / sum(numSample_list)) for x in numSample_list]

# weights를 torch.FloatTensor로 변환
weights = torch.FloatTensor(weights)
weights

tensor([0.7293, 0.7839, 0.8540, 0.9294, 0.9454, 0.9475, 0.9623, 0.9765, 0.9785,
        0.9791, 0.9800, 0.9833, 0.9858, 0.9891, 0.9930, 0.9932, 0.9947, 0.9950])

In [34]:
# loss function
#  dataset.class_weights -> 각 클래스에 대해 다른 가중치를 적용할 수 있음(데이터 불균형시에)
# 소수의 클래스가 다수의 클래스보다 훨씬 적은 수의 샘플을 가지고 있는 경우, 
# 소수 클래스에 더 높은 가중치를 부여하여 모델이 불균형한 데이터에 대해 더 잘 학습

loss_func = nn.CrossEntropyLoss(weights)
loss_func


CrossEntropyLoss()

### Train

In [35]:
def compute_accuracy(y_pred, y_target):
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100
   

In [36]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [37]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [38]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):

        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['surname'])
#       3. loss 계산하기
        loss =  loss_func(y_pred, batch_data['nationality'])
    
#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['nationality'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['surname'])

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['nationality'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['nationality'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:01<03:02,  1.84s/it]

val_loss 2.8252743085225425
val_acc 15.169270833333334


  2%|▊                                          | 2/100 [00:03<02:40,  1.64s/it]

val_loss 2.6660370031992593
val_acc 33.59375


  3%|█▎                                         | 3/100 [00:04<02:35,  1.60s/it]

val_loss 2.42344339688619
val_acc 34.895833333333336


  4%|█▋                                         | 4/100 [00:06<02:33,  1.60s/it]

val_loss 2.2233364582061768
val_acc 27.473958333333332


  5%|██▏                                        | 5/100 [00:08<02:30,  1.59s/it]

val_loss 2.1553154786427817
val_acc 27.669270833333332


  6%|██▌                                        | 6/100 [00:09<02:25,  1.55s/it]

val_loss 2.123735268910726
val_acc 35.872395833333336


  7%|███                                        | 7/100 [00:11<02:22,  1.54s/it]

val_loss 2.0613616704940796
val_acc 41.145833333333336


  8%|███▍                                       | 8/100 [00:12<02:18,  1.51s/it]

val_loss 2.0356032053629556
val_acc 41.666666666666664


  9%|███▊                                       | 9/100 [00:14<02:19,  1.53s/it]

val_loss 1.9932376941045125
val_acc 41.731770833333336


 10%|████▏                                     | 10/100 [00:15<02:17,  1.53s/it]

val_loss 1.9527780612309773
val_acc 41.2109375


 11%|████▌                                     | 11/100 [00:17<02:15,  1.52s/it]

val_loss 1.9177780946095784
val_acc 41.731770833333336


 12%|█████                                     | 12/100 [00:18<02:11,  1.50s/it]

val_loss 1.9209449291229248
val_acc 41.2109375


 13%|█████▍                                    | 13/100 [00:20<02:12,  1.53s/it]

val_loss 1.909353494644165
val_acc 39.908854166666664


 14%|█████▉                                    | 14/100 [00:22<02:20,  1.63s/it]

val_loss 1.903205116589864
val_acc 40.690104166666664


 15%|██████▎                                   | 15/100 [00:23<02:25,  1.71s/it]

val_loss 1.911732792854309
val_acc 42.1875


 16%|██████▋                                   | 16/100 [00:25<02:30,  1.80s/it]

val_loss 1.9057999451955159
val_acc 42.96875


 17%|███████▏                                  | 17/100 [00:27<02:21,  1.71s/it]

val_loss 1.9144363403320312
val_acc 42.7734375


 18%|███████▌                                  | 18/100 [00:29<02:22,  1.73s/it]

val_loss 1.92490021387736
val_acc 42.643229166666664


 19%|███████▉                                  | 19/100 [00:31<02:25,  1.79s/it]

val_loss 1.9463210503260295
val_acc 41.080729166666664


 20%|████████▍                                 | 20/100 [00:33<02:26,  1.83s/it]

val_loss 1.9434895118077595
val_acc 41.666666666666664


 21%|████████▊                                 | 21/100 [00:34<02:22,  1.80s/it]

val_loss 1.9605814615885417
val_acc 40.755208333333336


 22%|█████████▏                                | 22/100 [00:36<02:13,  1.71s/it]

val_loss 1.9811700582504272
val_acc 41.341145833333336


 23%|█████████▋                                | 23/100 [00:37<02:05,  1.63s/it]

val_loss 1.9675931930541992
val_acc 42.513020833333336


 23%|█████████▋                                | 23/100 [00:39<02:11,  1.70s/it]

val_loss 1.9982776641845703
val_acc 41.536458333333336





### Test 진행

In [39]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['surname'])
    loss = loss_func(y_pred,batch_data['nationality'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['nationality'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [40]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 1.931
테스트 정확도: 40.82


In [41]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 1.903205116589864,
 'early_stopping_criteria': 10,
 'epoch_index': 24,
 'train_loss': [2.8662541389465335,
  2.7537015279134116,
  2.5309797128041587,
  2.2651388963063552,
  2.1040698051452638,
  2.008525323867798,
  1.9041026989618937,
  1.790707572301229,
  1.6639065821965535,
  1.5314770062764487,
  1.3943700393040976,
  1.2507047812143959,
  1.111471382776896,
  0.9782052159309388,
  0.8578760425249735,
  0.7463638663291932,
  0.6523718317349751,
  0.5717738866806031,
  0.5038931707541149,
  0.4438691755135855,
  0.39415636857350667,
  0.3524436811606089,
  0.3109319766362508,
  0.28348709146181744],
 'train_acc': [12.786458333333334,
  27.669270833333336,
  47.1875,
  39.96093750000001,
  32.91666666666667,
  30.546875,
  41.848958333333336,
  49.28385416666667,
  57.044270833333336,
  60.625,
  63.60677083333333,
  69.59635416666667,
  78.58072916666667,
  82.64322916666666,
  84.89583333333333,
  87.04

### 추론

In [42]:
new_surname = "Kim"

In [43]:
# 벡터화 + 텐서화

vectorized_surname = torch.tensor(vectorize(name_vocab,new_surname))
vectorized_surname

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [44]:
print(vectorized_surname.shape)

torch.Size([9042])


In [45]:
# 첫번째 차원을 1로 만들고, 나머지는 다른 차원으로 알아서 되도록 !
x_data = vectorized_surname.view(1, -1)
print(x_data.shape)

torch.Size([1, 9042])


In [46]:
# 모델에 test 데이터 넣어주기
result = classifier(x_data,apply_softmax=True)
print(result)

tensor([[0.0221, 0.0546, 0.0646, 0.0541, 0.0962, 0.0577, 0.0853, 0.0434, 0.0438,
         0.0837, 0.1167, 0.0345, 0.0370, 0.0220, 0.0719, 0.0253, 0.0667, 0.0204]],
       grad_fn=<SoftmaxBackward0>)


In [47]:
result.max()

tensor(0.1167, grad_fn=<MaxBackward1>)

In [48]:
probability_value, indices = result.max(dim=1)

print("probability_value",probability_value)
print("indices",indices)

probability_value tensor([0.1167], grad_fn=<MaxBackward0>)
indices tensor([10])


In [49]:
print("새로운 데이터에 대해 추론한 label",indices.item())

새로운 데이터에 대해 추론한 label 10


In [50]:
prediction = lookup_index(nation_vocab,indices.item())
prediction

'Japanese'

In [51]:
print("{} -> {}".format(new_surname, prediction))

Kim -> Japanese
