In [121]:
## 프로그래밍 시작
import torch
import pandas as pd
import math
import collections
import tqdm
from torch.utils.data import IterableDataset
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from konlpy.tag import Komoran
import datasets
import numpy as np
import re
import warnings
warnings.filterwarnings(action='ignore')

In [122]:
# torch version: 2.3.0
torch.__version__

'2.3.0'

In [123]:
# 사용 GPU: GTX 1650
torch.cuda.get_device_name(device = 0)

'NVIDIA GeForce GTX 1650'

In [124]:
seed = 2024
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [125]:
# 데이터프레임 전처리
def preprocess_dataframe(input_dataframe:pd.DataFrame)->pd.DataFrame:
    return_dataframe = input_dataframe.copy(deep=True)
    return_dataframe.drop_duplicates(subset=['summary'], inplace=True)
    # 한글, 공백만 포함
    return_dataframe['korean'] = return_dataframe['summary'].str.replace(pat=r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9 ]", repl=r"", regex=True)
    # 영어도 포함시킬려면
    # return_dataframe['document'] = return_dataframe['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z ]","")
    # 공백만 존재할 경우 na로 대체한다.
    return_dataframe['korean'] = return_dataframe['korean'].replace('', np.nan)
    # na만 존재하는 행들을 제거한다
    return_dataframe = return_dataframe.dropna(how='any')
    return return_dataframe

In [126]:
train_df = pd.read_csv("summarized_df.tsv", sep='\t')
len(train_df)

40800

In [127]:
train_df.sample(n=len(train_df))

Unnamed: 0,path,summary,label
35256,source_data\speech\REPORT-speech-09483-00006.json,지배구조 연차보고서에는 사외이사 후보 추천과 사외이사 활동 및 보상에 관한 사항을 ...,speech
29128,source_data\speech\REPORT-speech-04350-00001.json,서울경찰청은 여론조사기관에서 실시한 설문조사 결과를 바탕으로 보복 운전 등의 사회적...,speech
4561,source_data\briefing\REPORT-briefing-63316-000...,코로나바이러스감염증19 중앙재난안전대책본부는 영상회의실에서 코로나 블루 극복을 위한...,briefing
1954,source_data\briefing\REPORT-briefing-31343-000...,2021년 소셜벤처 경연대회는 사회문제 해결에 관심 있는 국민이 4개 부문 중 하나...,briefing
10206,source_data\edit\REPORT-edit-20389-00012.json,서양에 못지않게 일찍부터 법률체제를 구축한 중국은 주나라 대에 이르러서는 관리의 문...,edit
...,...,...,...
7977,source_data\briefing\REPORT-briefing-87674-000...,환경부는 2020 환경 데이터 활용 그린 뉴딜 아이디어 공모전 최종 경연 대회 및 ...,briefing
2494,source_data\briefing\REPORT-briefing-34860-000...,코로나19 상황에서 다양한 형태의 긴급 돌봄을 제공하는 등 가정의 돌봄 부담을 완화...,briefing
30244,source_data\speech\REPORT-speech-05450-00005.json,공공재개발을 통해서 사업 장애요인이 해소된다면 실수요자가 원하는 양질의 주택을 공급...,speech
35456,source_data\speech\REPORT-speech-09792-00001.json,국립외교원과 유럽연합안보연구원이 공동으로 주최하는 한-EU 동북아 평화협력구상 세미...,speech


데이터 전처리 후, korean 컬럼에 한국어만을 담은 데이터를 저장
- train: 145791
- test: 48995

In [128]:
preprocessed_train_df = preprocess_dataframe(train_df)
preprocessed_train_df.shape

(40799, 4)

In [129]:
preprocessed_train_df['korean']

0        노 국토교통부 장관은 서울역을 방문하여 방역실태를 점검했고 치밀한 시설물 점검 등을...
1        경남 혁신도시에는 동남권 주태 건설 수요 및 지역내 중소기업 진흥을 감안해 한국토지...
2        사례집엔 경북 성주군 별의별팀 전북 전주시 물왕멀 공동체 이야기 등이 간단하게 소개...
3        사업 추진 과정에서 지정권자는 사전검토기구를 구성하여 용적률 등을 사전에 검토하고 ...
4        공공택지에서 공급되는 공공분양주택은 분양가상한제가 적용되며 사전 청약 시행 단지 분...
                               ...                        
40795     전국기능경기대회는 심각한 코로나 상황으로 인해 방역에 특히 신경을 써서 추진하고자 한다
40796    이륜차 사고예방 시범사업은 이동시간 논란이 많은 배달업 종사원들에 대해 안전 배달시...
40797    강원도 철원군에서 2021년 통일로가요 결선 경연을 개최하며 결선에 진출한 12개 ...
40798    국민권익위는 경기도 소재 21개 중고등학교를 표본으로 실태조사를 한 결과 저소득층 ...
40799    국립중앙과학관은 국민이 직접 만든 영상 콘텐츠 국민들과 과학관이 함께 만드는 영상 ...
Name: korean, Length: 40799, dtype: object

In [130]:
# 각 데이터프레임을 datasets로 표현
train_data = datasets.Dataset.from_pandas(preprocessed_train_df)

In [131]:
# konlp의 Okt tokenizer를 사용
kor_tokenizer = get_tokenizer(Komoran().pos)

In [132]:
stop_words = ['의','가','이','은', 
              '을','들','는','좀',
              '잘','걍','과','도',
              '를','으로','자','에',
              '와','한','하다', '겠', 
              '음', '에', '에게', '다', '이다', 
              '됨', '데','내', '네', '게', '나', '하게', '중']

In [133]:
# 앞에서 설정한 불용어를 제거하고, otk tokenizer를 사용하여 리뷰 데이터를 tokenize한다
poses = ["VA", "VV", "VCP","VCN", "NNG", "NR", "MM", "IC"]
def kor_tokenize(review, tokenizer, max_length):
    tokens = tokenizer(review["korean"])
    no_stop_words_tokens = [token[0] for token in tokens if (token[0] not in stop_words) and (len(token[0])>1)  and (token[1] in poses)]
    no_stop_words_tokens = no_stop_words_tokens[:max_length]
    length = len(no_stop_words_tokens)
    return {"tokens": no_stop_words_tokens, "length": length}

In [134]:
# 앞에서 설정한 불용어를 제거하고, otk tokenizer를 사용하여 리뷰 데이터를 tokenize한다
poses = ["VA", "VV", "VCP","VCN", "NNG", "NR", "MM", "IC"]
def ex_kor_tokenize(review, tokenizer, max_length):
    tokens = tokenizer(review)
    no_stop_words_tokens = [token[0] for token in tokens if (token[0] not in stop_words) and (len(token[0])>1)  and (token[1] in poses)]
    no_stop_words_tokens = no_stop_words_tokens[:max_length]
    length = len(no_stop_words_tokens)
    return {"tokens": no_stop_words_tokens, "length": length}

In [135]:
tokens = preprocessed_train_df['korean'].apply(lambda x: ex_kor_tokenize(x, kor_tokenizer, 200))

In [136]:
lengths = [tokens.iloc[index]['length'] for index in range(len(tokens))]

In [137]:
np.mean(lengths)

11.169219833819456

In [138]:
max_length=15
train_data = train_data.map(
    kor_tokenize, fn_kwargs={"tokenizer": kor_tokenizer, "max_length": max_length}
)

Map: 100%|██████████| 40799/40799 [00:38<00:00, 1052.02 examples/s]


In [139]:
# 불용어만 존재하여 토큰이 없는 데이터를 삭제
def filter_empty_tokens(example):
    return len(example["tokens"]) > 0

train_data = train_data.filter(filter_empty_tokens)

Filter: 100%|██████████| 40799/40799 [00:00<00:00, 51797.97 examples/s]


In [140]:
# 학습 데이터를 6:2:2로 나누어 학습 데이터, 검증 데이터로 분리
test_size = 0.2
train_test_data = train_data.train_test_split(test_size=test_size)
train_data = train_test_data["train"]
test_data = train_test_data["test"]
valid_size=0.25
train_valid_data = train_data.train_test_split(test_size=valid_size)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [141]:
train_data['tokens'][0]

['보금자리', '금리', '시장', '불안', '생기', '조달', '방안', '방법', '시장', '안정', '위하', '노력']

In [142]:
# <unk>, <pad> 특수 토큰을 추가한다
special_tokens = ["<unk>", "<pad>"]
min_freq = 3

vocab = build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [143]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [144]:
vocab.set_default_index(unk_index)

In [145]:
def numericalize(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

In [146]:
# 리뷰 데이터들을 vocab에 존재하는 단어들로 mapping한다
train_data = train_data.map(numericalize, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize, fn_kwargs={"vocab": vocab})

Map:   0%|          | 0/24469 [00:00<?, ? examples/s]

Map: 100%|██████████| 24469/24469 [00:03<00:00, 7724.40 examples/s]
Map: 100%|██████████| 8157/8157 [00:00<00:00, 8233.72 examples/s]
Map: 100%|██████████| 8157/8157 [00:01<00:00, 7413.42 examples/s]


In [147]:
# 데이터셋을 torch 형태로 변환한다.
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

In [148]:
label_dict = {label:index for index, label in enumerate(train_data.unique('label'))}

In [149]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["ids"] for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_length = [i["length"] for i in batch]
        batch_length = torch.stack(batch_length)
        batch_label = [label_dict[i["label"]] for i in batch]
        batch_label = torch.tensor(batch_label, dtype=torch.long)
        batch = {"ids": batch_ids, "length": batch_length, "label": batch_label}
        return batch

    return collate_fn

In [150]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [151]:
# 각 데이터를 data_loader로 변환하여 모델에 실을 준비를 한다.
batch_size = 512

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [152]:
vectors = GloVe()
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

### NBoW(Neaural Bag of Words) 모델

In [153]:
len(vocab), len(train_data.unique("label"))

(7049, 6)

In [154]:
class NBoW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, ids):
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction

In [165]:
vocab_size = len(vocab)
embedding_dim = 300
output_dim = len(train_data.unique("label"))

NBoW_model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)

In [166]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(NBoW_model):,} trainable parameters")

The model has 2,116,506 trainable parameters


In [167]:
NBoW_model.embedding.weight.data = pretrained_embedding

In [168]:
lr = 5e-4

optimizer = optim.Adam(NBoW_model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [169]:
# cuda 사용함
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [170]:
NBoW_model = NBoW_model.to(device)
criterion = criterion.to(device)

In [171]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [172]:
def train_NBoW(data_loader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(data_loader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [173]:
def evaluate_NBoW(data_loader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            label = batch["label"].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [177]:
import torch.nn.functional as F

In [202]:
def predict_NBoW(data_loader, model, criterion, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            prediction = model(ids)
            softmax_predictions = F.softmax(prediction, dim=1)
            prediction = torch.argmax(softmax_predictions, dim=1)
            preds+=prediction
            # print(prediction)
    preds = pd.Series([pred.item() for pred in preds])
    return preds

In [174]:
n_epochs = 10
best_valid_loss = float("inf")

metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train_NBoW(
        train_data_loader, NBoW_model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate_NBoW(valid_data_loader, NBoW_model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(NBoW_model.state_dict(), "base_nbow.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...: 100%|██████████| 48/48 [00:01<00:00, 33.38it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 44.19it/s]


epoch: 0
train_loss: 1.759, train_acc: 0.350
valid_loss: 1.686, valid_acc: 0.461


training...: 100%|██████████| 48/48 [00:01<00:00, 34.86it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 45.20it/s]


epoch: 1
train_loss: 1.574, train_acc: 0.471
valid_loss: 1.470, valid_acc: 0.477


training...: 100%|██████████| 48/48 [00:01<00:00, 27.86it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 43.55it/s]


epoch: 2
train_loss: 1.388, train_acc: 0.495
valid_loss: 1.345, valid_acc: 0.498


training...: 100%|██████████| 48/48 [00:01<00:00, 27.63it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 41.74it/s]


epoch: 3
train_loss: 1.288, train_acc: 0.508
valid_loss: 1.274, valid_acc: 0.500


training...: 100%|██████████| 48/48 [00:01<00:00, 27.33it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 42.94it/s]


epoch: 4
train_loss: 1.219, train_acc: 0.512
valid_loss: 1.221, valid_acc: 0.503


training...: 100%|██████████| 48/48 [00:01<00:00, 27.51it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 42.03it/s]


epoch: 5
train_loss: 1.162, train_acc: 0.527
valid_loss: 1.178, valid_acc: 0.521


training...: 100%|██████████| 48/48 [00:01<00:00, 27.71it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 42.65it/s]


epoch: 6
train_loss: 1.112, train_acc: 0.556
valid_loss: 1.142, valid_acc: 0.541


training...: 100%|██████████| 48/48 [00:01<00:00, 25.54it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 42.11it/s]


epoch: 7
train_loss: 1.065, train_acc: 0.583
valid_loss: 1.109, valid_acc: 0.566


training...: 100%|██████████| 48/48 [00:01<00:00, 27.37it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 42.90it/s]


epoch: 8
train_loss: 1.020, train_acc: 0.612
valid_loss: 1.079, valid_acc: 0.583


training...: 100%|██████████| 48/48 [00:01<00:00, 27.54it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 41.86it/s]

epoch: 9
train_loss: 0.978, train_acc: 0.635
valid_loss: 1.052, valid_acc: 0.595





In [186]:
test_data.shape

(8157, 8)

In [212]:
label_dict

{'speech': 0,
 'paper': 1,
 'public': 2,
 'edit': 3,
 'literature': 4,
 'briefing': 5}

In [213]:
reversed_label_dict =  {value:key for key, value in label_dict.items()}
reversed_label_dict

{0: 'speech',
 1: 'paper',
 2: 'public',
 3: 'edit',
 4: 'literature',
 5: 'briefing'}

In [215]:
preds = predict_NBoW(test_data_loader, NBoW_model, criterion, device)
preds = preds.map(reversed_label_dict)
print(preds)

evaluating...: 100%|██████████| 16/16 [00:00<00:00, 32.42it/s]


0           speech
1           speech
2         briefing
3           speech
4         briefing
           ...    
8152    literature
8153    literature
8154        speech
8155      briefing
8156        speech
Length: 8157, dtype: object


In [218]:
expected_df = pd.DataFrame({'path':test_data['path'], 'prediction':preds})
expected_df.head()

Unnamed: 0,path,prediction
0,source_data\briefing\REPORT-briefing-32676-000...,speech
1,source_data\speech\REPORT-speech-10748-00001.json,speech
2,source_data\briefing\REPORT-briefing-65558-000...,briefing
3,source_data\public\REPORT-public-00080-00222.json,speech
4,source_data\briefing\REPORT-briefing-33600-000...,briefing


In [219]:
expected_df.to_csv("expected_df.tsv", index=False, sep='\t')

### LSTM(RNN) 모델

In [None]:
class LSTM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        # ids = [batch size, seq len]
        # length = [batch size]
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)
        # output = [batch size, seq len, hidden dim * n directions]
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]
        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

In [None]:
vocab_size = len(vocab)
embedding_dim = 1000
hidden_dim = 512
output_dim = len(train_data.unique("label"))
n_layers = 4
bidirectional = True
dropout_rate = 0.2

lstm_model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(lstm_model):,} trainable parameters")

The model has 3,742,354 trainable parameters


In [None]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "weight" in name:
                nn.init.orthogonal_(param)

In [None]:
lstm_model.apply(initialize_weights)

LSTM(
  (embedding): Embedding(7049, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=6, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
lstm_model.embedding.weight.data = pretrained_embedding

In [None]:
lr = 5e-4

optimizer = optim.Adam(lstm_model.parameters(), lr=lr)

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
# cuda 사용함
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [None]:
lstm_model = lstm_model.to(device)
criterion = criterion.to(device)

In [None]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(dataloader, desc="training..."):
        ids = batch["ids"].to(device)
        length = batch["length"]
        label = batch["label"].to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [None]:
def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            length = batch["length"]
            label = batch["label"].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

### 모델 학습

In [None]:
n_epochs = 10
best_valid_loss = float("inf")

metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, lstm_model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, lstm_model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(lstm_model.state_dict(), "base_lstm.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...: 100%|██████████| 48/48 [00:03<00:00, 13.25it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 27.56it/s]


epoch: 0
train_loss: 1.485, train_acc: 0.423
valid_loss: 1.242, valid_acc: 0.479


training...: 100%|██████████| 48/48 [00:03<00:00, 13.92it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 25.81it/s]


epoch: 1
train_loss: 1.159, train_acc: 0.499
valid_loss: 1.161, valid_acc: 0.498


training...: 100%|██████████| 48/48 [00:03<00:00, 13.54it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.93it/s]


epoch: 2
train_loss: 1.047, train_acc: 0.540
valid_loss: 1.120, valid_acc: 0.534


training...: 100%|██████████| 48/48 [00:03<00:00, 13.89it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.38it/s]


epoch: 3
train_loss: 0.971, train_acc: 0.582
valid_loss: 1.125, valid_acc: 0.533


training...: 100%|██████████| 48/48 [00:03<00:00, 13.57it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.73it/s]


epoch: 4
train_loss: 0.932, train_acc: 0.608
valid_loss: 1.145, valid_acc: 0.542


training...: 100%|██████████| 48/48 [00:03<00:00, 13.85it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 27.03it/s]


epoch: 5
train_loss: 0.907, train_acc: 0.622
valid_loss: 1.152, valid_acc: 0.542


training...: 100%|██████████| 48/48 [00:03<00:00, 13.70it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.69it/s]


epoch: 6
train_loss: 0.877, train_acc: 0.641
valid_loss: 1.181, valid_acc: 0.536


training...: 100%|██████████| 48/48 [00:03<00:00, 13.84it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.62it/s]


epoch: 7
train_loss: 0.834, train_acc: 0.665
valid_loss: 1.209, valid_acc: 0.541


training...: 100%|██████████| 48/48 [00:03<00:00, 13.92it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 26.55it/s]


epoch: 8
train_loss: 0.790, train_acc: 0.687
valid_loss: 1.247, valid_acc: 0.553


training...: 100%|██████████| 48/48 [00:03<00:00, 13.55it/s]
evaluating...: 100%|██████████| 16/16 [00:00<00:00, 27.91it/s]

epoch: 9
train_loss: 0.747, train_acc: 0.711
valid_loss: 1.304, valid_acc: 0.544





In [None]:
_, test_acc = evaluate(test_data_loader, lstm_model, criterion, device)
print(f"test_acc: {test_acc:.3f}")

evaluating...: 100%|██████████| 16/16 [00:00<00:00, 27.00it/s]

test_acc: 0.541



