## 예제: 문서 분류에 사전 훈련된 임베딩을 사용한 전이 학습

In [15]:
import pandas as pd

df = pd.read_csv("news_with_splits.csv")
df

Unnamed: 0,category,split,title
0,Business,train,"Jobs, tax cuts key issues for Bush"
1,Business,train,Jarden Buying Mr. Coffee #39;s Maker
2,Business,train,Retail sales show festive fervour
3,Business,train,Intervoice's Customers Come Calling
4,Business,train,Boeing Expects Air Force Contract
...,...,...,...
119995,World,test,Genesis Space Capsule Crashes Into Desert
119996,World,test,U.S.: Too Early to Tell Iraq Unit's Fate
119997,World,test,AFGHAN OPIUM GROWING UP TWO THIRDS
119998,World,test,At least one Saudi policeman killed in clashes...


In [16]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [17]:
lookup_dict = {'train': (train_df, train_size),
                             'val': (val_df, val_size),
                             'test': (test_df, test_size)}
# lookup_dict

In [18]:
train_df

Unnamed: 0,category,split,title
0,Business,train,"Jobs, tax cuts key issues for Bush"
1,Business,train,Jarden Buying Mr. Coffee #39;s Maker
2,Business,train,Retail sales show festive fervour
3,Business,train,Intervoice's Customers Come Calling
4,Business,train,Boeing Expects Air Force Contract
...,...,...,...
110995,World,train,UAE president dies at 86
110996,World,train,Iran says it won #39;t halt nuclear technology...
110997,World,train,Republicans Assail Kerry at Convention
110998,World,train,Muslim envoys off to Baghdad in bid to free Br...


### 2. Vocabulary

In [22]:
# add_unk=True를 하면 '<UNK>': 0 토큰을 추가해줌 !

class Vocabulary:
    def __init__(self, unk_token="<UNK>",
                 mask_token="<MASK>", 
                 begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        
        self.token_to_idx = {}
        self.idx_to_token = {}
        
        self.mask_token = mask_token
        self.unk_token = unk_token
        self.begin_seq_token = begin_seq_token
        self.end_seq_token = end_seq_token

        
        self.mask_index = self.add_token(self.mask_token)
        self.unk_index = self.add_token(self.unk_token)
        self.begin_seq_index = self.add_token(self.begin_seq_token)
        self.end_seq_index = self.add_token(self.end_seq_token)

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [45]:
# word counter 생성 

from collections import Counter
import string

# Counter()를 통해 어떤 단어가 얼만큼의 횟수로 들어있는지를 알 수 있다.
word_counts = Counter()
for t in df.title:
    for word in t.split(" "):
        # word가 .(구두점,punctuation)이 아닐 경우 word에 추가
        if word not in string.punctuation:
            word_counts[word] += 1

# word_counts

In [47]:
# Vocabulary 객체 생성

# cutoff 보다 수가 많은 단어만 vocab에 추가
cutoff = 25
vocab = Vocabulary()

# word_counts.items() -> ex) ('all', 24160)
for word, count in word_counts.items():
    if count > cutoff:
        vocab.add_token(word)

In [50]:
print(dict(list(vocab.token_to_idx.items())[:20]))

{'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3, 'Jobs,': 4, 'tax': 5, 'cuts': 6, 'key': 7, 'issues': 8, 'for': 9, 'Bush': 10, 'Buying': 11, '#39;s': 12, 'Maker': 13, 'Retail': 14, 'sales': 15, 'show': 16, 'Customers': 17, 'Come': 18, 'Calling': 19}


In [51]:
print(dict(list(vocab.idx_to_token.items())[:20]))

{0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'Jobs,', 5: 'tax', 6: 'cuts', 7: 'key', 8: 'issues', 9: 'for', 10: 'Bush', 11: 'Buying', 12: '#39;s', 13: 'Maker', 14: 'Retail', 15: 'sales', 16: 'show', 17: 'Customers', 18: 'Come', 19: 'Calling'}


### Category Vocabulary

In [52]:
category_vocab = Vocabulary()   
category_vocab

<__main__.Vocabulary at 0x7fca40678d00>

In [56]:
for category in sorted(set(df.category)):
    category_vocab.add_token(category)


In [57]:
print(dict(list(category_vocab.token_to_idx.items())[:20]))

{'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3, 'Business': 4, 'Sci/Tech': 5, 'Sports': 6, 'World': 7}


In [58]:
print(dict(list(category_vocab.idx_to_token.items())[:20]))

{0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'Business', 5: 'Sci/Tech', 6: 'Sports', 7: 'World'}


## 3. Vectorizer

In [59]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]
    

In [60]:
# 주어진 인덱스에 대응하는 토큰 반환

def lookup_index(vocabulary_class, index):
        if index not in vocabulary_class.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return vocabulary_class.idx_to_token[index]
    

### Vectorize

In [164]:
import numpy as np 

def vectorize(title, vector_length=-1):
    
    indices = [vocab.begin_seq_index]
    indices.extend(lookup_token(vocab,token) for token in title.split(' '))
    if vector_length < 0:
        vector_length = len(indices)
    
    out_vector = np.zeros(vector_length, dtype=np.int64)
    out_vector[:len(indices)] = indices
#     문장 길이가 작아서 padding 진행하면 해당 부분 mask 처리
    out_vector[len(indices):] = vocab.mask_index
    
    
    return out_vector

# 무조건 첫번째 토큰은 2(BEGIN) 토큰이다. 
print(vectorize("all dafs dfkdl dd good",vector_length=10))

[  2 880   1   1   1 237   0   0   0   0]


### Dataset class

In [68]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, cbow_df):
        
        self.cbow_df = cbow_df
        
        measure_len = lambda title: len(title.split(" "))
        self.max_seq_length = max(map(measure_len, cbow_df.title))+2 #시작 및 끝 토큰 고려

    def __len__(self):
        return len(self.cbow_df)

    def __getitem__(self, index):
        
        row = self.cbow_df.iloc[index]
        
        title_vector = vectorize(row.title, self.max_seq_length)
        category_index = lookup_token(category_vocab,row.category)

        return {'x_data': title_vector,
                'y_target': category_index}

In [69]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = NewsDataset(train_df)
train_dataset

valid_dataset = NewsDataset(val_df)
valid_dataset

test_dataset = NewsDataset(test_df)
test_dataset


<__main__.NewsDataset at 0x7fc9f0972880>

In [70]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=False, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=False, drop_last=True)


In [71]:
print(len(train_dataset),len(Traindataloader))

84000 164


In [72]:
for batch_index, batch_dict in enumerate(Traindataloader):
    print(batch_index)
    print(batch_dict)
    
    break
    

0
{'x_data': tensor([[   2, 3830, 2002,  ...,    0,    0,    0],
        [   2,   49,  606,  ...,    0,    0,    0],
        [   2,    1, 2414,  ...,    0,    0,    0],
        ...,
        [   2,    1, 3188,  ...,    0,    0,    0],
        [   2,  167,    1,  ...,    0,    0,    0],
        [   2,    1,  946,  ...,    0,    0,    0]]), 'y_target': tensor([7, 4, 5, 6, 7, 7, 5, 7, 4, 6, 6, 6, 4, 4, 6, 5, 5, 5, 5, 7, 6, 6, 6, 5,
        7, 7, 5, 4, 6, 5, 6, 7, 4, 5, 4, 6, 7, 7, 7, 6, 6, 7, 6, 6, 5, 7, 6, 5,
        6, 4, 7, 7, 6, 5, 7, 5, 6, 7, 7, 5, 5, 5, 7, 5, 4, 5, 4, 5, 5, 4, 4, 6,
        5, 6, 5, 5, 4, 5, 5, 6, 7, 6, 4, 5, 7, 5, 6, 7, 7, 5, 7, 4, 5, 5, 6, 4,
        7, 7, 7, 6, 6, 5, 6, 4, 7, 5, 6, 4, 6, 5, 4, 7, 6, 4, 7, 6, 4, 4, 6, 5,
        7, 5, 6, 6, 7, 5, 4, 6, 4, 4, 6, 4, 7, 5, 4, 6, 4, 6, 7, 5, 6, 7, 7, 6,
        6, 5, 7, 4, 5, 4, 5, 7, 4, 6, 4, 4, 5, 7, 5, 4, 5, 5, 6, 4, 7, 4, 7, 6,
        4, 5, 6, 4, 6, 6, 5, 7, 7, 5, 4, 5, 5, 6, 4, 6, 7, 5, 4, 4, 6, 4, 5, 7,
        

### 모델정의 NewsClassifier

In [172]:
# nn은 neural network로 torch의 신경망 모듈이다.

import torch.nn as nn
import torch.nn.functional as F

class NewsClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels,
                hidden_dim, num_classes, dropout_p,
                pretrained_embeddings=None, padding_idx=0):
        """
            embedding_size (int): 임베딩 벡터의 크기
            num_embeddings (int): 임베딩 벡터의 개수
            num_channels (int): 합성곱 커널 개수 ex) 32, 64, ..
            hidden_dim (int): 은닉 차원 크기
            num_classes (int): 클래스 개수
            dropout_p (float): 드롭아웃 확률
            pretrained_embeddings (numpy.array): 사전에 훈련된 단어 임베딩
            padding_idx (int): 패딩 인덱스
        """
        
        super(NewsClassifier, self).__init__()
        
        # 사전 학습 모델 사용여부 
        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim = embedding_size,
                                   num_embeddings = num_embeddings,
                                   padding_idx = padding_idx)
        else:
#             pretrained_embeddings -> 사전 학습된 임베딩 matrix
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim = embedding_size,
                                   num_embeddings=num_embeddings,
                                   padding_idx = padding_idx,
                                   _weight = pretrained_embeddings)
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels = embedding_size,
                     out_channels = num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels = num_channels, out_channels=num_channels,
                     kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels = num_channels, out_channels=num_channels,
                     kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels = num_channels, out_channels=num_channels,
                     kernel_size=3),
            nn.ELU()
        
        )
        self.dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        
    def forward(self,x_in, apply_softmax=False):
#       permute -> 차원의 순서 바꾸기 
#       임베딩 레이어의 출력은 (배치 크기, 시퀀스 길이, 임베딩 크기) 형태
#       Conv1d 레이어의 입력은 일반적으로 (배치 크기, 채널 수, 시퀀스 길이) 형태

        x_embedded = self.emb(x_in).permute(0,2,1)
        features = self.convnet(x_embedded)
        
        
        # 평균값 계산하기 
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self.dropout_p)
        
        # MLP classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self.dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [176]:
# 모델 들고와보기 (Permute 왜 하는지 )
import torch

# 모델 초기화
embedding_size = 100
num_embeddings = 10000
num_channels = 32
hidden_dim = 64
num_classes = 10
dropout_p = 0.5
padding_idx = 0

model = NewsClassifier(embedding_size, num_embeddings, num_channels, hidden_dim, num_classes, dropout_p)

# 임의의 입력 데이터 생성
batch_size = 32
sequence_length = 50

# 임의의 입력 데이터 생성 (배치 크기, 시퀀스 길이)
input_data = torch.randint(0, num_embeddings, (batch_size, sequence_length))

# 임베딩 후 permute
embedded_data = model.emb(input_data).permute(0, 2, 1)

print("Original input data shape:", input_data.shape)
print("Embedded data shape after permute:", embedded_data.shape)


Original input data shape: torch.Size([32, 50])
Embedded data shape after permute: torch.Size([32, 100, 50])


In [175]:
import torch
import torch.nn as nn

# 사전 학습된 임베딩 matrix 불러오기 
words = vocab.token_to_idx.keys()
pretrained_embeddings = make_embedding_matrix(glove_filepath='glove.6B.100d.txt', 
                                   words=words)
pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()  # NumPy 배열을 텐서로 변환

# 예시로 임의의 데이터 생성
input_data = torch.tensor([1, 2, 3, 4, 5])  # 임의의 데이터 (배치 크기 1)

# 임베딩 레이어 정의
embedding_size = 100  # 임베딩 차원 크기
num_embeddings = len(words)  # 임베딩 벡터의 개수
padding_idx = 0  # 패딩에 사용될 인덱스


emb = nn.Embedding(embedding_dim=embedding_size,
                         num_embeddings=num_embeddings,
                         padding_idx=padding_idx,
                         _weight=pretrained_embeddings)

# 임베딩 레이어에 입력 데이터 통과
embedded_data = emb(input_data)

print("Embedded data shape:", embedded_data.shape)
print("Embedded data:", embedded_data)


Embedded data shape: torch.Size([5, 100])
Embedded data: tensor([[ 1.2704e-01,  1.7831e-02, -2.3641e-02, -6.5017e-02, -1.2157e-02,
          1.5102e-01,  5.3289e-02, -1.4382e-02, -2.0612e-01, -1.4185e-01,
         -1.6901e-01, -1.7447e-01, -3.4675e-03, -2.1077e-01, -1.0859e-01,
         -1.2947e-01, -1.6271e-01, -8.8284e-02, -1.2512e-01,  1.6890e-01,
         -6.2940e-02,  1.2273e-01, -1.4146e-01, -1.7785e-01,  2.3973e-03,
         -6.0074e-02,  9.0401e-02, -3.3368e-03, -8.4645e-02, -1.4040e-01,
          2.0587e-01,  1.8994e-01,  1.2982e-01, -1.9011e-01, -1.7127e-01,
         -1.5373e-01, -8.6781e-02,  3.0449e-02, -2.2515e-02, -2.2082e-01,
         -4.5941e-02, -1.0774e-01, -1.8684e-01,  1.4941e-01, -1.9609e-01,
         -2.1336e-01, -1.4369e-01,  1.8261e-01, -1.2049e-01, -2.3666e-01,
          1.6654e-01,  1.1389e-01,  1.4494e-01,  2.9041e-02,  1.5172e-02,
          8.0321e-02, -5.0443e-02,  2.1568e-01,  1.3720e-01, -1.1069e-01,
          1.2359e-01,  1.2072e-01, -2.3217e-02,  2.0497

In [None]:
def make_embedding_matrix(glove_filepath, words):
    """
    특정 단어 집합에 대한 임베딩 행렬을 만듭니다.
    
    매개변수:
        glove_filepath (str): 임베딩 파일 경로
        words (list): 단어 리스트
    """
    
    word_to_index = {}
    embeddings = []

    with open(glove_filepath, "r") as f:
        for index, line in enumerate(f):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)

    glove_embeddings = np.stack(embeddings)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_index:
            final_embeddings[i, :] = glove_embeddings[word_to_index[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings

In [None]:
# 예시
words = vocab.token_to_idx.keys()
embeddings = make_embedding_matrix(glove_filepath='glove.6B.100d.txt', 
                                       words=words)
embeddings

In [128]:
len(category_vocab.token_to_idx)

8

In [129]:
use_glove = True
embedding_size = 100
num_channels = 100
hidden_dim = 100
dropout_p = 0.1

# GloVe를 사용하거나 랜덤하게 임베딩을 초기화합니다
if use_glove:
    words = vocab.token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath='glove.6B.100d.txt', 
                                       words=words)
    print("사전 훈련된 임베딩을 사용합니다")
else:
    printv("사전 훈련된 임베딩을 사용하지 않습니다")
    embeddings = None

classifier = NewsClassifier(embedding_size=embedding_size, 
                            num_embeddings=len(vocab.token_to_idx),
                            num_channels=num_channels,
                            hidden_dim=hidden_dim, 
                            num_classes=len(category_vocab.token_to_idx), 
                            dropout_p=dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)
classifier

사전 훈련된 임베딩을 사용합니다


NewsClassifier(
  (emb): Embedding(4794, 100, padding_idx=0)
  (convnet): Sequential(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=8, bias=True)
)

### 옵티마이저, loss function

In [130]:
lr = 0.001
num_epochs = 100

In [131]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [132]:
loss_func = nn.CrossEntropyLoss()
loss_func

CrossEntropyLoss()

### Train

In [133]:
def compute_accuracy(y_pred, y_target):
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100
   

In [134]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [135]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [136]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):

        

#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['x_data'])
#       3. loss 계산하기
        loss =  loss_func(y_pred, batch_data['y_target'])
    
#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['x_data'])

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['y_target'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
    
    print("val_loss",running_loss)
    print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



  1%|▍                                          | 1/100 [00:15<26:12, 15.88s/it]

val_loss 0.9191021455185752
val_acc 62.31026785714285


  2%|▊                                          | 2/100 [00:32<26:20, 16.13s/it]

val_loss 0.7589134518589293
val_acc 71.22209821428574


  3%|█▎                                         | 3/100 [00:47<25:30, 15.78s/it]

val_loss 0.6675949322325841
val_acc 74.96651785714283


  4%|█▋                                         | 4/100 [01:03<25:06, 15.69s/it]

val_loss 0.6571825223309652
val_acc 75.234375


  5%|██▏                                        | 5/100 [01:18<24:42, 15.61s/it]

val_loss 0.6320760582174573
val_acc 75.9933035714286


  6%|██▌                                        | 6/100 [01:35<25:09, 16.06s/it]

val_loss 0.67661657844271
val_acc 74.55357142857142


  7%|███                                        | 7/100 [01:50<24:33, 15.85s/it]

val_loss 0.6571794024535588
val_acc 75.20647321428572


  8%|███▍                                       | 8/100 [02:06<24:07, 15.74s/it]

val_loss 0.6180685698986055
val_acc 76.46205357142858


  9%|███▊                                       | 9/100 [02:21<23:34, 15.54s/it]

val_loss 0.616837908540453
val_acc 76.5625


 10%|████▏                                     | 10/100 [02:40<25:00, 16.67s/it]

val_loss 0.6654306109462467
val_acc 74.7098214285714


 11%|████▌                                     | 11/100 [03:00<26:08, 17.63s/it]

val_loss 0.6410550892353059
val_acc 75.93749999999999


 12%|█████                                     | 12/100 [03:15<24:50, 16.94s/it]

val_loss 0.6652209094592504
val_acc 74.97767857142858


 13%|█████▍                                    | 13/100 [03:31<23:51, 16.45s/it]

val_loss 0.6656059878213065
val_acc 74.94977678571428


 14%|█████▉                                    | 14/100 [03:46<23:12, 16.19s/it]

val_loss 0.6555714087826866
val_acc 75.53571428571429


 15%|██████▎                                   | 15/100 [04:02<22:39, 15.99s/it]

val_loss 0.7289681877408708
val_acc 73.90066964285715


 16%|██████▋                                   | 16/100 [04:17<22:07, 15.81s/it]

val_loss 0.7440870612859726
val_acc 72.53348214285712


 17%|███████▏                                  | 17/100 [04:33<21:45, 15.73s/it]

val_loss 0.7328091310603279
val_acc 73.13058035714285


 18%|███████▌                                  | 18/100 [04:48<21:20, 15.61s/it]

val_loss 0.7174163673605237
val_acc 74.09598214285714


 18%|███████▌                                  | 18/100 [05:04<23:09, 16.94s/it]

val_loss 0.7390894676957813
val_acc 72.9575892857143





### Test 진행

In [137]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['x_data'])
    loss = loss_func(y_pred,batch_data['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['y_target'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [138]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 0.519
테스트 정확도: 80.97


In [141]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 0.616837908540453,
 'early_stopping_criteria': 10,
 'epoch_index': 19,
 'train_loss': [1.07824518913176,
  0.5811618224876679,
  0.5040205122857562,
  0.470424688080462,
  0.45024730646755634,
  0.4311915592449466,
  0.4177757503419388,
  0.40504673286909015,
  0.38667869186256004,
  0.3764421701794717,
  0.3623963018379561,
  0.34775477011756195,
  0.3371951899877407,
  0.32293680010408904,
  0.3086075559258461,
  0.29571367518567454,
  0.2822118337016279,
  0.2710341033412189,
  0.25818121651323805],
 'train_acc': [53.102372332317074,
  78.55492568597563,
  81.63586128048784,
  82.81369092987806,
  83.5675495426829,
  84.10227705792684,
  84.65844131097563,
  85.0550209603659,
  85.80530678353665,
  86.08755716463415,
  86.70803163109753,
  87.17606707317071,
  87.60956554878051,
  88.17644817073166,
  88.70879382621953,
  89.1946932164634,
  89.7246570121951,
  90.06764481707313,
  90.57140815548783],
 'val

### 추론

In [160]:
import re

# 리뷰 텍스트를 전처리합니다
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [161]:
def predict_category(title, classifier, max_length):
    """뉴스 제목을 기반으로 카테고리를 예측합니다
    
    매개변수:
        title (str): 원시 제목 문자열
        classifier (NewsClassifier): 훈련된 분류기 객체
        vectorizer (NewsVectorizer): 해당 Vectorizer
        max_length (int): 최대 시퀀스 길이
            노트: CNN은 입력 텐서 크기에 민감합니다. 
                 훈련 데이터처럼 동일한 크기를 갖도록 만듭니다.
    """
#   전처리 
    title = preprocess_text(title)
#   벡터화 
    vectorized_title = torch.tensor(vectorize(title, vector_length=max_length))
#   모델에 적용
#   추론과정에서는 loss function이 쓰이지 않기 때문에 소프트맥스를 사용
    result = classifier(vectorized_title.unsqueeze(0), apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    predicted_category = lookup_index(category_vocab,indices.item())

    return {'category': predicted_category, 
            'probability': probability_values.item()}

In [162]:
def get_samples():
    samples = {}
    for cat in val_df.category.unique():
        samples[cat] = val_df.title[val_df.category==cat].tolist()[:5]
    return samples

val_samples = get_samples()
val_samples

{'Business': ['AZ suspends marketing of cancer drug',
  'Business world has mixed reaction to Perez move',
  'Betting Against Bombay',
  'Malpractice Insurers Face a Tough Market',
  'NVIDIA Is Vindicated'],
 'Sci/Tech': ['Spies prize webcam #39;s eyes',
  'Sober worm causes headaches',
  'Local Search: Missing Pieces Falling into Place',
  'Hackers baiting Internet users with Beckham pix',
  'Nokia adds BlackBerry support to Series 80 handsets'],
 'Sports': ['Is Meyer the man to get Irish up?',
  'Who? Who? And Clemens',
  'Baseball Today (AP)',
  'Mark Kreidler: Yao Ming epitomizes the Chinese athlete who is &lt;b&gt;...&lt;/b&gt;',
  'No. 5 Miami Rebounds to Beat FSU in Overtime'],
 'World': ['Arafat in pain but expected to recover-Shaath',
  'Maoist rebels bomb Kathmandu building, no injuries (Reuters)',
  "Son Running for Ill. Rep.'s House Seat (AP)",
  'Strong Quake Hits in Japan',
  'Israel assassinates Hamas militant in Damascus']}

In [163]:
#title = input("Enter a news title to classify: ")

measure_len = lambda title: len(title.split(" "))
max_seq_length = max(map(measure_len, df.title))+2 #시작 및 끝 토큰 고려


for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_category(sample, classifier, 
                                      max_seq_length + 1)
        print("예측: {} (p={:0.2f})".format(prediction['category'],
                                                  prediction['probability']))
        print("\t + 샘플: {}".format(sample))
    print("-"*30 + "\n")

True Category: Business
예측: World (p=0.56)
	 + 샘플: AZ suspends marketing of cancer drug
예측: Business (p=0.37)
	 + 샘플: Business world has mixed reaction to Perez move
예측: Sports (p=0.77)
	 + 샘플: Betting Against Bombay
예측: Business (p=0.74)
	 + 샘플: Malpractice Insurers Face a Tough Market
예측: Sports (p=0.78)
	 + 샘플: NVIDIA Is Vindicated
------------------------------

True Category: Sci/Tech
예측: Sci/Tech (p=0.30)
	 + 샘플: Spies prize webcam #39;s eyes
예측: Sci/Tech (p=0.77)
	 + 샘플: Sober worm causes headaches
예측: Sports (p=0.69)
	 + 샘플: Local Search: Missing Pieces Falling into Place
예측: Sci/Tech (p=0.98)
	 + 샘플: Hackers baiting Internet users with Beckham pix
예측: Sports (p=0.85)
	 + 샘플: Nokia adds BlackBerry support to Series 80 handsets
------------------------------

True Category: Sports
예측: Sports (p=0.48)
	 + 샘플: Is Meyer the man to get Irish up?
예측: Sports (p=0.72)
	 + 샘플: Who? Who? And Clemens
예측: Sports (p=0.99)
	 + 샘플: Baseball Today (AP)
예측: World (p=0.44)
	 + 샘플: Mark Kreidler:

[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
