## **1. seq2seq**

### **라이브러리 호출**

In [1]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### **데이터 준비**

In [2]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20

### 딕셔너리 제작을 위한 클래스
class Lang:
    # 단어의 인덱스를 저장하기 위한 컨테이너 초기화
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"} # SOS: 문장의 시작, EOS: 문장의 끝
        self.n_words = 2 # SOS와 EOS에 대한 카운트
        
    # 문장을 단어 단위로 분리한 후 컨테이너에 추가
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
            
    # 컨테이너에 단어가 없다면 추가하고, 있다면 카운트를 업데이트
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

**데이터 정규화**

In [3]:
def normalizeString(df, lang):
    sentence = df[lang].str.lower() # 모두 소문자로
    sentence = sentence.str.replace('[^A-Za-z\s]+', '')
    sentence = sentence.str.normalize('NFD') # 유니코드 정규화
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8') # unicode -> ASCII
    return sentence

def read_sentence(df, lang1, lang2):
    sentence1 = normalizeString(df, lang1) # 데이터셋의 첫 번째 열(영어)
    sentence2 = normalizeString(df, lang2) # 데이터셋의 두 번째 열(프랑스어)
    return sentence1, sentence2

def read_file(loc, lang1, lang2):
    df = pd.read_csv(loc, delimiter = '\t', header = None, names = [lang1, lang2])
    return df

def process_data(lang1,lang2):
    df = read_file('./data/eng-fra.txt', lang1, lang2) 
    sentence1, sentence2 = read_sentence(df, lang1, lang2)

    input_lang = Lang()
    output_lang = Lang()
    pairs = []
    for i in range(len(df)):
        if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH:
            full = [sentence1[i], sentence2[i]] # 첫 번째와 두 번째 열을 합쳐서 저장
            input_lang.addSentence(sentence1[i]) # 입력으로 영어 사용
            output_lang.addSentence(sentence2[i]) # 출력으로 프랑스어 사용
            pairs.append(full) # 입력과 출력이 합쳐진 것을 활용

    return input_lang, output_lang, pairs

**텐서로 변환**

In [4]:
# 문장을 단어로 분리하고 인덱스 반환
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

# 딕셔너리에 단어에 대한 인덱스를 가져오고 문장 끝에 토큰을 추가
def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype = torch.long, device = device).view(-1, 1)

# 입력과 출력 문장을 텐서로 변환하여 반환
def tensorsFromPair(input_lang, output_lang, pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

### **모델링**

**인코더 네트워크**

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
        super(Encoder, self).__init__()       
        self.input_dim = input_dim # 입력층
        self.embbed_dim = embbed_dim # 임베딩 계층
        self.hidden_dim = hidden_dim # 은닉층(이전 은닉층)
        self.num_layers = num_layers # GRU 계층 개수
        self.embedding = nn.Embedding(input_dim, self.embbed_dim) # 임베딩 계층 초기화
        self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers) # gru 계층 초기화
              
    def forward(self, src):      
        embedded = self.embedding(src).view(1,1,-1) # 임베딩 처리
        outputs, hidden = self.gru(embedded) # 임베딩 결과를 gru 모델에 적용
        return outputs, hidden

**디코더 네트워크**

In [6]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
        super(Decoder, self).__init__()

        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_dim, self.embbed_dim) # 임베딩 계층 초기화
        self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers = self.num_layers) # GRU 계층 초기화
        self.out = nn.Linear(self.hidden_dim, output_dim) # 선형 계층 초기화
        self.softmax = nn.LogSoftmax(dim=1) # 기울기 소멸 문제를 방지하기 위해 로그 적용
      
    def forward(self, input, hidden):
        input = input.view(1, -1) # 입력을 (1, 배치 크기)로 변경
        embedded = F.relu(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)       
        prediction = self.softmax(self.out(output[0]))      
        return prediction, hidden

**seq2seq**
- 어텐션 적용 x

In [7]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, MAX_LENGTH = MAX_LENGTH):
        super().__init__()
      
        self.encoder = encoder # 인코더 초기화
        self.decoder = decoder # 디코더 초기화
        self.device = device 
     
    def forward(self, input_lang, output_lang, teacher_forcing_ratio = 0.5):
        input_length = input_lang.size(0) # 입력 문자 길이(문장의 단어 수)
        batch_size = output_lang.shape[1] 
        target_length = output_lang.shape[0]
        vocab_size = self.decoder.output_dim      
        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device) # 예측된 출력을 저장하기 위한 변수 초기화

        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(input_lang[i]) # 문장 내의 모든 단어 인코딩

        decoder_hidden = encoder_hidden.to(device)  # 인코더의 은닉층을 디코더의 은닉층으로 사용
        decoder_input = torch.tensor([SOS_token], device = device) # 첫 번째 예측 단어 앞에 토큰(SOS) 추가

        for t in range(target_length): # 현재 단어에서 출력 단어 예측
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            topv, topi = decoder_output.topk(1)
            input = (output_lang[t] if teacher_force else topi) # teacher_force를 활용하면 목표 단어를 다음 입력으로 사용
            if(teacher_force == False and input.item() == EOS_token): # teacher_force 활성화 안할 시 자체 예측 값을 다음 입력으로 사용
                break
        return outputs

**오차 계산 함수 정의**

In [8]:
teacher_forcing_ratio = 0.5

def Model(model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0
    output = model(input_tensor, target_tensor)
    num_iter = output.size(0)

    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot]) # 모델의 예측 결과와 정답을 이용하여 오차 계산

    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item() / num_iter
    return epoch_loss

**모델 훈련 함수 정의**

In [12]:
def trainModel(model, input_lang, output_lang, pairs, num_iteration = 20000):
    model.train()
    optimizer = optim.SGD(model.parameters(), lr = 0.01)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0

    training_pairs = [tensorsFromPair(input_lang, output_lang, random.choice(pairs))
                      for i in range(num_iteration)]
  
    for iter in range(1, num_iteration + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = Model(model, input_tensor, target_tensor, optimizer, criterion) # Model 객체를 이용하여 오차 계산
        total_loss_iterations += loss

        if iter % 500 == 0: # 500번마다 오차 값에 대한 결과 출력
            avarage_loss= total_loss_iterations / 500
            total_loss_iterations = 0
            print('%d %.4f' % (iter, avarage_loss))
          
    torch.save(model.state_dict(), './data/mytraining.pt') 
    return model

**모델 평가**

In [13]:
def evaluate(model, input_lang, output_lang, sentences, max_length = MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentences[0]) # 입력 문자열 -> 텐서
        output_tensor = tensorFromSentence(output_lang, sentences[1]) # 출력 문자열 -> 텐서
        decoded_words = []  
        output = model(input_tensor, output_tensor)
  
        for ot in range(output.size(0)):
            topv, topi = output[ot].topk(1) # 각 출력에서 가장 높은 값을 찾아 인덱스 반환

            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>') # EOS 토큰을 만나면 종료
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()]) # 예측 결과를 문자열에 추가
    return decoded_words

## 훈련 데이터셋으로부터 임의의 문장을 가져와서 모델 평가
def evaluateRandomly(model, input_lang, output_lang, pairs, n = 10):
    for i in range(n):
        pair = random.choice(pairs) # 임의의 문장 가져오기
        print('input {}'.format(pair[0]))
        print('output {}'.format(pair[1]))
        output_words = evaluate(model, input_lang, output_lang, pair) # 모델 평가 저장
        output_sentence = ' '.join(output_words)
        print('predicted {}'.format(output_sentence))

**Run!!**

In [14]:
lang1 = 'eng' # 입력(영어)
lang2 = 'fra' # 출력(프랑스어)
input_lang, output_lang, pairs = process_data(lang1, lang2)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

input_size = input_lang.n_words
output_size = output_lang.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 10000

# 인코더에 훈련 데이터셋을 입력하고 모든 출력과 은닉 상태 저장
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
# 디코더의 첫 번째 입력으로 <SOS> 토큰이 제공되고, 인코더의 마지막 은닉 상태가 디코더의 첫 번째 은닉 상태로 제공됨
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device) # 모델 객체 생성
 
print(encoder)
print(decoder)

model = trainModel(model, input_lang, output_lang, pairs, num_iteration) # 모델 학습

  This is separate from the ipykernel package so we can avoid doing imports until


random sentence ['i have until tomorrow to finish this', 'jai jusqu demain pour finir a']
Input : 13366 Output : 25937
Encoder(
  (embedding): Embedding(13366, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(25937, 256)
  (gru): GRU(256, 512)
  (out): Linear(in_features=512, out_features=25937, bias=True)
  (softmax): LogSoftmax(dim=1)
)
500 5.1146
1000 5.0612
1500 5.1296
2000 4.9518
2500 4.9390
3000 4.7116
3500 4.7034
4000 4.7821
4500 4.5422
5000 4.6091
5500 4.5701
6000 4.7347
6500 4.6486
7000 4.6795
7500 4.7418
8000 4.6883
8500 4.5681
9000 4.6370
9500 4.7267
10000 4.5974


### **예측**

**임의의 문장에 대한 평가**

In [15]:
evaluateRandomly(model, input_lang, output_lang, pairs)

input he gave her an engagement ring last night
output il lui a donn une bague de fianailles hier soir
predicted je ne pas pas <EOS>
input you should prepare for the future
output vous devriez vous apprter pour le futur
predicted je ne pas pas <EOS>
input i hope your brother is better
output jespre que votre frre se porte mieux
predicted je ne pas pas <EOS>
input tom had never seen mary that angry
output tom navait jamais vu marie autant en colre
predicted je ne pas pas <EOS>
input you dont seem very sure
output vous ne semblez pas trs sres
predicted je ne pas pas <EOS>
input we looked around the property
output on a fait le tour de la proprit
predicted je ne pas pas <EOS>
input their goods are of the highest quality
output leurs marchandises sont de la plus haute qualit
predicted je ne pas pas <EOS>
input no security system is foolproof
output aucun systme de scurit nest infaillible
predicted je ne pas pas <EOS>
input the place is surrounded by cops
output lendroit est cern par les fl

꽤나 비슷하게 번역된 듯 하다.

### **모델링-2**

**어텐션이 적용된 디코더**

In [18]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p = 0.1, max_length = MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size) # 임베딩 계층 초기화
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length) # 어텐션: 입력을 디코더로 변환
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim = 1)
        ### 배치 행렬 곱 수행
        # 가중치와 인코더의 출력 벡터를 곱하겠다
        # 그 결과 입력 시퀀스의 특정 부분에 관한 정보를 포함하고 있음 -> 디코더가 적절한 출력 단어를 선택하도록 도움
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

**어텐션 디코더 모델 학습을 위한 함수**

In [19]:
def trainIters(encoder, decoder, n_iters, print_every = 1000, plot_every = 100, learning_rate = 0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  

    encoder_optimizer = optim.SGD(encoder.parameters(), lr = learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr = learning_rate)
    training_pairs = [tensorsFromPair(input_lang, output_lang, random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0] # 입력 + 출력 쌍에서 입력을 input_tensor로
        target_tensor = training_pair[1] # 입력 + 출력 쌍에서 출력을 target_tensor로
        loss = Model(model, input_tensor, target_tensor, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % 500 == 0:
            print_loss_avg = print_loss_total / 500
            print_loss_total = 0
            print('%d,  %.4f' % (iter, print_loss_avg))

**모델 훈련**

In [20]:
import time

embed_size = 256
hidden_size = 512
num_layers = 1
input_size = input_lang.n_words
output_size = output_lang.n_words

encoder1 = Encoder(input_size, hidden_size, embed_size, num_layers)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_size, dropout_p = 0.1).to(device)

print(encoder1)
print(attn_decoder1)

attn_model = trainIters(encoder1, attn_decoder1, 10000, print_every = 500, plot_every=50, learning_rate=0.01)

Encoder(
  (embedding): Embedding(13366, 256)
  (gru): GRU(256, 512)
)
AttnDecoderRNN(
  (embedding): Embedding(25937, 512)
  (attn): Linear(in_features=1024, out_features=20, bias=True)
  (attn_combine): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(512, 512)
  (out): Linear(in_features=512, out_features=25937, bias=True)
)
500,  4.8628
1000,  5.0121
1500,  5.0067
2000,  5.0031
2500,  4.9993
3000,  5.0115
3500,  4.9337
4000,  5.0260
4500,  4.7825
5000,  5.0638
5500,  5.0040
6000,  4.9593
6500,  4.8815
7000,  5.0100
7500,  4.9463
8000,  4.9405
8500,  4.9531
9000,  4.8978
9500,  4.9829
10000,  4.9810


현 예제에서는 두드러지는 성능 차이는 보이고 있지 않음

## **2. 버트(BERT)**

In [1]:
# !pip install transformers
# !pip install pytorch-transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
     -------------------------------------- 113.6/113.6 kB 3.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp37-cp37m-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 18.4 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp37-none-win_amd64.whl.metadata (3.8 kB)
Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
   ---------------------------------------- 7.2/7.2 MB 38.1 MB/s eta 0:00:00
Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
   ---------------------------------------- 268.8/268.8 kB 8.1 MB/s eta 0:00:00
Downloading safetensors-0.4.1-cp37-none-win_amd64.whl (277 kB)
   --------------------



Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
     -------------------------------------- 176.4/176.4 kB 3.5 MB/s eta 0:00:00
Collecting sentencepiece (from pytorch-transformers)
  Downloading sentencepiece-0.1.99-cp37-cp37m-win_amd64.whl (977 kB)
     ------------------------------------- 977.7/977.7 kB 15.6 MB/s eta 0:00:00
Collecting sacremoses (from pytorch-transformers)
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
     ------------------------------------- 880.6/880.6 kB 18.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=83a3f9e0f49327c5fc30c65b1ab440ecc55569da1079c0305b1a7062342bf0ef
  St



### **라이브러리 호출**

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from pytorch_transformers import BertTokenizer, BertForSequenceClassification 
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### **데이터 준비**

**데이터셋 불러오기**

In [2]:
train_df = pd.read_csv('./data/training.txt', sep = '\t') 
valid_df = pd.read_csv('./data/validing.txt', sep = '\t') 
test_df = pd.read_csv('./data/testing.txt', sep = '\t') 

**일부 데이터만 사용하기**
- 실행 시간 단축 위함..

In [3]:
train_df = train_df.sample(frac = 0.1, random_state = 500)
valid_df = valid_df.sample(frac = 0.1, random_state = 500)
test_df = test_df.sample(frac = 0.1, random_state = 500)

**데이터셋 생성**

In [4]:
class Datasets(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]
        return text, label

**데이터셋의 데이터를 데이터로더로 전달**

In [5]:
train_dataset = Datasets(train_df)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle = True, num_workers = 0)

valid_dataset = Datasets(valid_df)
valid_loader = DataLoader(valid_dataset, batch_size = 2, shuffle = True, num_workers = 0)

test_dataset = Datasets(test_df)
test_loader = DataLoader(test_dataset, batch_size = 2, shuffle = True, num_workers = 0)

### **토큰화**

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # 토크나이저 정의
model = BertForSequenceClassification.from_pretrained('bert-base-uncased') # 모델 정의
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

- BERT 모델의 경우 인코더와 어텐션이 반복되고 있는 것을 확인할 수 있음

### **모델링**

**최적 모델 저장을 위한 함수**

In [7]:
### 모델 평가를 위해 훈련 과정을 저장
def save_checkpoint(save_path, model, valid_loss):
    if save_path == None:
        return    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

### save_checkpoint 함수에서 저장된 모델을 가져옴
def load_checkpoint(load_path, model):    
    if load_path == None:
        return    
    state_dict = torch.load(load_path, map_location = device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

### 훈련, 검증에 대한 오차와 epoch 저장
def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    if save_path == None:
        return    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

### save_metrics에 저장해 둔 정보 불러오기
def load_metrics(load_path):
    if load_path==None:
        return    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

**학습**

In [8]:
def train(model,
          optimizer,
          criterion = nn.BCELoss(), # 손실함수
          num_epochs = 5,
          eval_every = len(train_loader) // 2,
          best_valid_loss = float("Inf")):
    
    total_correct = 0.0
    total_len = 0.0
    
    running_loss = 0.0
    valid_running_loss = 0.0
    
    global_step = 0
    
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    model.train() # 모델 훈련
    
    for epoch in range(num_epochs):
        for text, label in train_loader:
            
            optimizer.zero_grad()        
            
            encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
            padded_list =  [e + [0] * (512-len(e)) for e in encoded_list] # 인코딩 결과에 제로패딩 적용
        
            sample = torch.tensor(padded_list)
            sample, label = sample.to(device), label.to(device)
            labels = torch.tensor(label)
            outputs = model(sample, labels = labels)
            loss, logits = outputs

            pred = torch.argmax(F.softmax(logits), dim = 1) # 가장 큰 값을 최종 예측값으로
            correct = pred.eq(labels)
            total_correct += correct.sum().item()
            total_len += len(labels)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()        
            global_step += 1
            
            ### 모델 평가
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    
                    for text, label in valid_loader:
                        encoded_list = [tokenizer.encode(t, add_special_tokens = True) for t in text]
                        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]        
                        sample = torch.tensor(padded_list)
                        sample, label = sample.to(device), label.to(device)
                        labels = torch.tensor(label)
                        outputs = model(sample, labels = labels)
                        loss, logits = outputs                        
                        valid_running_loss += loss.item()

                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                running_loss = 0.0                
                valid_running_loss = 0.0
                
                model.train()

                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint('./model/model.pt', model, best_valid_loss)
                    save_metrics('./model/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics('./model/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('훈련 종료!')

**파라미터 미세 조정 & 모델 훈련**

In [9]:
optimizer = optim.Adam(model.parameters(), lr = 2e-5) # 옵티마이져, 학습률 설정
train(model = model, optimizer = optimizer) # 모델 학습



Epoch [1/5], Step [510/5100], Train Loss: 0.7033, Valid Loss: 0.6950
Model saved to ==> ./model/model.pt
Model saved to ==> ./model/metrics.pt
Epoch [1/5], Step [1020/5100], Train Loss: 0.7034, Valid Loss: 0.6936
Model saved to ==> ./model/model.pt
Model saved to ==> ./model/metrics.pt


KeyboardInterrupt: 

**오차 정보 확인**

In [10]:
train_loss_list, valid_loss_list, global_steps_list = load_metrics('./data/metrics.pt') # 최종 저장된 모델 불러오기

plt.plot(global_steps_list, train_loss_list, label = 'Train')
plt.plot(global_steps_list, valid_loss_list, label = 'Valid')
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show() 

FileNotFoundError: [Errno 2] No such file or directory: './data/metrics.pt'

### **평가**

**모델 평가 함수**

In [None]:
def evaluate(model, test_loader):
    y_pred = [] # 예측값
    y_true = [] # 실제값

    model.eval() # 테스트 데이터셋으로 모델 평가
    
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [tokenizer.encode(t, add_special_tokens = True) for t in text]
            padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
            sample = torch.tensor(padded_list)
            sample, label = sample.to(device), label.to(device)
            labels = torch.tensor(label)
            output = model(sample, labels = labels)
            
            _, output = output
            y_pred.extend(torch.argmax(output, 1).tolist())
            y_true.extend(labels.tolist())
                    
    print('Classification 결과:')
    print(classification_report(y_true, y_pred, labels=[1,0], digits=4))
    
    cm = confusion_matrix(y_true, y_pred, labels = [1,0]) # 혼동 행렬
    ax= plt.subplot() # 히트맵으로 시각화
    sns.heatmap(cm, annot = True, ax = ax, cmap = 'Blues', fmt = "d")
    
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.xaxis.set_ticklabels(['0', '1'])
    ax.yaxis.set_ticklabels(['0', '1'])

**모델 평가**

In [None]:
import warnings
warnings.filterwarnings('ignore') 

best_model = model.to(device)
load_checkpoint('./model/model.pt', best_model)
evaluate(best_model, test_loader)

- 그닥 결과가 좋진 않음
    - 모델을 훈련시키기 위한 데이터가 작았음
    - 사전 훈련된 모델도 다국어를 지원하는 모델이 아니였음