
# PyTorch + HuggingFace를 활용한 NSMC (네이버 영화평 감정분류) 모델
## KoElectra Model
박장원님의 KoElectra-base_v3 모델 사용<br>
https://github.com/monologg/KoELECTRA

## Dataset
네이버 영화 리뷰 데이터셋<br>
https://github.com/e9t/nsmc

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

## 주의사항
GPU 필수 - 1epoch 당 약 30분 소요

In [None]:
#  transformers 설치 ( 그 외 도구는 설치되었다고 가정 )
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 5.7MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 45.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=32f3a2d62184e8c1452

In [None]:
# 모델에 필요한 도구 불러오기

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [None]:
# Select the compute device: use the GPU when one is available, otherwise fall back to
# the CPU so the notebook still runs (CPU training will be much slower).

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the selected device as the cell output.
device

device(type='cuda')

In [None]:
# To reuse weights from an already fine-tuned model, uncomment the line below.
# NOTE(review): `model` is only created further down in this notebook, so this line
# must be executed after the model-creation cell — confirm the intended order.

#model.load_state_dict(torch.load("nsmc KoElectra_sanghunCHO v2.pt"))

# 모델 생성

In [None]:
# Use the KoELECTRA model (base v1/v2/v3 or small v1/v2/v3 checkpoints can be chosen).
# Load the pretrained discriminator with a sequence-classification head, then move it
# to the selected device.

model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator"
)
model = model.to(device)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

 # 전처리 & 훈련 

In [None]:
# Data preprocessing
# Default max_length: 50

class NSMCDataset(Dataset):
  """Labelled NSMC dataset read from a tab-separated file.

  Rows containing NaN values and rows whose ``document`` value is duplicated are
  dropped. Each item is tokenized on access with the KoELECTRA tokenizer and
  padded/truncated to ``max_length`` tokens.

  Args:
    csv_file: Path to the tab-separated NSMC file.
    max_length: Fixed token length of every encoded sample (default 50).
  """

  def __init__(self, csv_file, max_length=50):
    # Some rows contain NaN values, so drop them first.
    frame = pd.read_csv(csv_file, sep="\t").dropna(axis=0)
    # Remove duplicated reviews (returns a new frame instead of mutating in place).
    self.dataset = frame.drop_duplicates(subset=['document'])
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows kept after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, y)`` for the row at position ``idx``.

    Columns 1 and 2 of the frame are read as the text and the label respectively.
    """
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer returns a batch of size one; take its only row.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
# Load the training and test splits (each constructor prints summary statistics).

train_path, test_path = "ratings_train.txt", "ratings_test.txt"
train_dataset = NSMCDataset(train_path)
test_dataset = NSMCDataset(test_path)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [None]:
# Training hyperparameters: number of epochs and training mini-batch size.

epochs = 10
batch_size = 32

In [None]:
# Optimizer over all model parameters with a small fine-tuning learning rate.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not benefit from shuffling; keep a fixed order so runs are repeatable.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
 # Ask PyTorch to release unused cached GPU memory before training starts.
 torch.cuda.empty_cache()

In [None]:
# Fine-tune the pretrained KoELECTRA model on the NSMC training split.
# NOTE(review): in the recorded run the sixth epoch collapsed to ~50% training accuracy
# before recovering; gradient clipping or a learning-rate schedule may help — TODO confirm.


losses = []      # summed training loss of each epoch
accuracies = []  # training accuracy of each epoch, stored as plain Python floats

for epoch in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        # The first element of the model output holds the classification logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted = y_pred.argmax(dim=1)
        # `.item()` keeps the running count as a Python int instead of a CUDA tensor.
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            # The loss printed here is the running sum since the start of the epoch.
            print("Batch Loss:", total_loss, "Accuracy:", correct / total)

    losses.append(total_loss)
    accuracies.append(correct / total)
    print("Train Loss:", total_loss, "Accuracy:", correct / total)

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))



Batch Loss: 62.727893620729446 Accuracy: tensor(0.6438, device='cuda:0')
Batch Loss: 102.63869740068913 Accuracy: tensor(0.7405, device='cuda:0')
Batch Loss: 139.2818687558174 Accuracy: tensor(0.7765, device='cuda:0')
Batch Loss: 172.3182421028614 Accuracy: tensor(0.7978, device='cuda:0')
Batch Loss: 206.59232857823372 Accuracy: tensor(0.8099, device='cuda:0')
Batch Loss: 240.09853337705135 Accuracy: tensor(0.8185, device='cuda:0')
Batch Loss: 270.6736473888159 Accuracy: tensor(0.8257, device='cuda:0')
Batch Loss: 300.18492233753204 Accuracy: tensor(0.8316, device='cuda:0')
Batch Loss: 329.21897130459547 Accuracy: tensor(0.8369, device='cuda:0')
Batch Loss: 357.4603087976575 Accuracy: tensor(0.8411, device='cuda:0')
Batch Loss: 385.90168929845095 Accuracy: tensor(0.8446, device='cuda:0')
Batch Loss: 414.62633626908064 Accuracy: tensor(0.8472, device='cuda:0')
Batch Loss: 443.3482260480523 Accuracy: tensor(0.8498, device='cuda:0')
Batch Loss: 472.2353938445449 Accuracy: tensor(0.8519, d

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 19.38347551971674 Accuracy: tensor(0.9209, device='cuda:0')
Batch Loss: 39.06046496704221 Accuracy: tensor(0.9217, device='cuda:0')
Batch Loss: 60.67682448402047 Accuracy: tensor(0.9181, device='cuda:0')
Batch Loss: 80.75022358819842 Accuracy: tensor(0.9187, device='cuda:0')
Batch Loss: 100.54403660818934 Accuracy: tensor(0.9189, device='cuda:0')
Batch Loss: 121.5021339468658 Accuracy: tensor(0.9192, device='cuda:0')
Batch Loss: 142.22625272348523 Accuracy: tensor(0.9189, device='cuda:0')
Batch Loss: 163.4764384366572 Accuracy: tensor(0.9186, device='cuda:0')
Batch Loss: 185.60775032639503 Accuracy: tensor(0.9180, device='cuda:0')
Batch Loss: 205.3148831911385 Accuracy: tensor(0.9183, device='cuda:0')
Batch Loss: 227.39072467014194 Accuracy: tensor(0.9175, device='cuda:0')
Batch Loss: 246.31686520203948 Accuracy: tensor(0.9180, device='cuda:0')
Batch Loss: 266.8925540931523 Accuracy: tensor(0.9179, device='cuda:0')
Batch Loss: 287.7547939866781 Accuracy: tensor(0.9176, devi

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 16.619200541637838 Accuracy: tensor(0.9381, device='cuda:0')
Batch Loss: 31.257050818763673 Accuracy: tensor(0.9409, device='cuda:0')
Batch Loss: 46.19612168427557 Accuracy: tensor(0.9408, device='cuda:0')
Batch Loss: 63.025988708250225 Accuracy: tensor(0.9401, device='cuda:0')
Batch Loss: 79.94791651610285 Accuracy: tensor(0.9392, device='cuda:0')
Batch Loss: 96.11818210501224 Accuracy: tensor(0.9388, device='cuda:0')
Batch Loss: 111.38344196323305 Accuracy: tensor(0.9392, device='cuda:0')
Batch Loss: 127.31670966465026 Accuracy: tensor(0.9394, device='cuda:0')
Batch Loss: 143.8165367813781 Accuracy: tensor(0.9390, device='cuda:0')
Batch Loss: 160.34214471559972 Accuracy: tensor(0.9389, device='cuda:0')
Batch Loss: 176.8901331750676 Accuracy: tensor(0.9389, device='cuda:0')
Batch Loss: 192.24606916774064 Accuracy: tensor(0.9389, device='cuda:0')
Batch Loss: 208.8072731057182 Accuracy: tensor(0.9385, device='cuda:0')
Batch Loss: 225.0924085592851 Accuracy: tensor(0.9386, de

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 11.250052258372307 Accuracy: tensor(0.9566, device='cuda:0')
Batch Loss: 23.31558086257428 Accuracy: tensor(0.9558, device='cuda:0')
Batch Loss: 35.19163529248908 Accuracy: tensor(0.9555, device='cuda:0')
Batch Loss: 45.67866594390944 Accuracy: tensor(0.9579, device='cuda:0')
Batch Loss: 57.884259179700166 Accuracy: tensor(0.9577, device='cuda:0')
Batch Loss: 69.81135906325653 Accuracy: tensor(0.9579, device='cuda:0')
Batch Loss: 81.33387581119314 Accuracy: tensor(0.9576, device='cuda:0')
Batch Loss: 92.89939645584673 Accuracy: tensor(0.9577, device='cuda:0')
Batch Loss: 105.23379173874855 Accuracy: tensor(0.9571, device='cuda:0')
Batch Loss: 117.08100616559386 Accuracy: tensor(0.9569, device='cuda:0')
Batch Loss: 130.79708660952747 Accuracy: tensor(0.9563, device='cuda:0')
Batch Loss: 143.3087985869497 Accuracy: tensor(0.9558, device='cuda:0')
Batch Loss: 155.7455927822739 Accuracy: tensor(0.9554, device='cuda:0')
Batch Loss: 168.59463110752404 Accuracy: tensor(0.9553, dev

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 9.93597868271172 Accuracy: tensor(0.9638, device='cuda:0')
Batch Loss: 17.907934652641416 Accuracy: tensor(0.9686, device='cuda:0')
Batch Loss: 25.600026432890445 Accuracy: tensor(0.9704, device='cuda:0')
Batch Loss: 35.31203445279971 Accuracy: tensor(0.9697, device='cuda:0')
Batch Loss: 43.883934958837926 Accuracy: tensor(0.9696, device='cuda:0')
Batch Loss: 53.80042913090438 Accuracy: tensor(0.9692, device='cuda:0')
Batch Loss: 62.02052327338606 Accuracy: tensor(0.9693, device='cuda:0')
Batch Loss: 71.07288398360834 Accuracy: tensor(0.9690, device='cuda:0')
Batch Loss: 81.00647577410564 Accuracy: tensor(0.9686, device='cuda:0')
Batch Loss: 89.67494412558153 Accuracy: tensor(0.9686, device='cuda:0')
Batch Loss: 99.66869783168659 Accuracy: tensor(0.9680, device='cuda:0')
Batch Loss: 109.95705482689664 Accuracy: tensor(0.9673, device='cuda:0')
Batch Loss: 119.11946289474145 Accuracy: tensor(0.9670, device='cuda:0')
Batch Loss: 128.47891566483304 Accuracy: tensor(0.9666, devi

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 69.36388540267944 Accuracy: tensor(0.5075, device='cuda:0')
Batch Loss: 138.66681170463562 Accuracy: tensor(0.5056, device='cuda:0')
Batch Loss: 208.01927584409714 Accuracy: tensor(0.5039, device='cuda:0')
Batch Loss: 277.42557913064957 Accuracy: tensor(0.5039, device='cuda:0')
Batch Loss: 346.81019270420074 Accuracy: tensor(0.5034, device='cuda:0')
Batch Loss: 416.3123852610588 Accuracy: tensor(0.5029, device='cuda:0')
Batch Loss: 485.7103818655014 Accuracy: tensor(0.5018, device='cuda:0')
Batch Loss: 555.0413938760757 Accuracy: tensor(0.5023, device='cuda:0')
Batch Loss: 624.5336722135544 Accuracy: tensor(0.5015, device='cuda:0')
Batch Loss: 693.8322353363037 Accuracy: tensor(0.5023, device='cuda:0')
Batch Loss: 763.0780945420265 Accuracy: tensor(0.5036, device='cuda:0')
Batch Loss: 832.3949919939041 Accuracy: tensor(0.5036, device='cuda:0')
Batch Loss: 901.763164460659 Accuracy: tensor(0.5036, device='cuda:0')
Batch Loss: 971.3261370658875 Accuracy: tensor(0.5024, device

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 24.03402891755104 Accuracy: tensor(0.8891, device='cuda:0')
Batch Loss: 45.82479493319988 Accuracy: tensor(0.8920, device='cuda:0')
Batch Loss: 64.72921433672309 Accuracy: tensor(0.8997, device='cuda:0')
Batch Loss: 80.90169796720147 Accuracy: tensor(0.9080, device='cuda:0')
Batch Loss: 99.27392704039812 Accuracy: tensor(0.9114, device='cuda:0')
Batch Loss: 117.79027356207371 Accuracy: tensor(0.9111, device='cuda:0')
Batch Loss: 135.73120976611972 Accuracy: tensor(0.9121, device='cuda:0')
Batch Loss: 154.79890466481447 Accuracy: tensor(0.9134, device='cuda:0')
Batch Loss: 171.87933537736535 Accuracy: tensor(0.9148, device='cuda:0')
Batch Loss: 188.0569290407002 Accuracy: tensor(0.9162, device='cuda:0')
Batch Loss: 202.9686826299876 Accuracy: tensor(0.9177, device='cuda:0')
Batch Loss: 217.8480694880709 Accuracy: tensor(0.9193, device='cuda:0')
Batch Loss: 232.1805742615834 Accuracy: tensor(0.9207, device='cuda:0')
Batch Loss: 247.61005281563848 Accuracy: tensor(0.9211, devi

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 8.276737586129457 Accuracy: tensor(0.9678, device='cuda:0')
Batch Loss: 15.863050474319607 Accuracy: tensor(0.9700, device='cuda:0')
Batch Loss: 23.74425406381488 Accuracy: tensor(0.9702, device='cuda:0')
Batch Loss: 31.50534084951505 Accuracy: tensor(0.9716, device='cuda:0')
Batch Loss: 39.2675892082043 Accuracy: tensor(0.9714, device='cuda:0')
Batch Loss: 45.98719253204763 Accuracy: tensor(0.9719, device='cuda:0')
Batch Loss: 54.167441082187 Accuracy: tensor(0.9719, device='cuda:0')
Batch Loss: 63.51743168383837 Accuracy: tensor(0.9711, device='cuda:0')
Batch Loss: 70.32397087174468 Accuracy: tensor(0.9712, device='cuda:0')
Batch Loss: 78.31500539672561 Accuracy: tensor(0.9715, device='cuda:0')
Batch Loss: 86.24742109770887 Accuracy: tensor(0.9711, device='cuda:0')
Batch Loss: 94.04643202316947 Accuracy: tensor(0.9710, device='cuda:0')
Batch Loss: 102.94085830403492 Accuracy: tensor(0.9708, device='cuda:0')
Batch Loss: 110.98163256933913 Accuracy: tensor(0.9707, device='c

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 6.716459071263671 Accuracy: tensor(0.9775, device='cuda:0')
Batch Loss: 11.226067865733057 Accuracy: tensor(0.9808, device='cuda:0')
Batch Loss: 17.0179624510929 Accuracy: tensor(0.9814, device='cuda:0')
Batch Loss: 22.63810487696901 Accuracy: tensor(0.9815, device='cuda:0')
Batch Loss: 27.81828311784193 Accuracy: tensor(0.9821, device='cuda:0')
Batch Loss: 34.06993270316161 Accuracy: tensor(0.9818, device='cuda:0')
Batch Loss: 39.936727962456644 Accuracy: tensor(0.9816, device='cuda:0')
Batch Loss: 45.74873391864821 Accuracy: tensor(0.9814, device='cuda:0')
Batch Loss: 51.44323090906255 Accuracy: tensor(0.9813, device='cuda:0')
Batch Loss: 56.92750448640436 Accuracy: tensor(0.9813, device='cuda:0')
Batch Loss: 62.96115191956051 Accuracy: tensor(0.9813, device='cuda:0')
Batch Loss: 68.38423057179898 Accuracy: tensor(0.9814, device='cuda:0')
Batch Loss: 74.35287963692099 Accuracy: tensor(0.9810, device='cuda:0')
Batch Loss: 80.47044061403722 Accuracy: tensor(0.9807, device='

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 5.344634755863808 Accuracy: tensor(0.9809, device='cuda:0')
Batch Loss: 11.704383883741684 Accuracy: tensor(0.9780, device='cuda:0')
Batch Loss: 19.568022827268578 Accuracy: tensor(0.9758, device='cuda:0')
Batch Loss: 25.38783131132368 Accuracy: tensor(0.9766, device='cuda:0')
Batch Loss: 31.673510704771616 Accuracy: tensor(0.9768, device='cuda:0')
Batch Loss: 39.5917835686123 Accuracy: tensor(0.9761, device='cuda:0')
Batch Loss: 45.54735690692905 Accuracy: tensor(0.9769, device='cuda:0')
Batch Loss: 52.5618799807271 Accuracy: tensor(0.9764, device='cuda:0')
Batch Loss: 58.30699869489763 Accuracy: tensor(0.9766, device='cuda:0')
Batch Loss: 64.47006690024864 Accuracy: tensor(0.9769, device='cuda:0')
Batch Loss: 70.8777633557329 Accuracy: tensor(0.9768, device='cuda:0')
Batch Loss: 76.31799377966672 Accuracy: tensor(0.9771, device='cuda:0')
Batch Loss: 82.91371634835377 Accuracy: tensor(0.9770, device='cuda:0')
Batch Loss: 88.32415757875424 Accuracy: tensor(0.9771, device='c

In [None]:
# Display the per-epoch summed training losses and training accuracies.
losses, accuracies

([1301.1318032741547,
  948.3374948650599,
  740.1298047462478,
  589.6774791488424,
  1041.6840516240336,
  2575.104144103825,
  615.1428137994371,
  390.00026818411425,
  434.93712106719613,
  288.69053939846344],
 [tensor(0.8796, device='cuda:0'),
  tensor(0.9169, device='cuda:0'),
  tensor(0.9374, device='cuda:0'),
  tensor(0.9516, device='cuda:0'),
  tensor(0.8679, device='cuda:0'),
  tensor(0.6146, device='cuda:0'),
  tensor(0.9442, device='cuda:0'),
  tensor(0.9682, device='cuda:0'),
  tensor(0.9607, device='cuda:0'),
  tensor(0.9775, device='cuda:0')])

 # 학습 & 테스트


In [None]:
# Evaluate the fine-tuned model on the NSMC test split.
model.eval()

test_correct = 0
test_total = 0

# Inference only: disable autograd so no computation graph is kept for each batch.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        predicted = y_pred.argmax(dim=1)
        # `.item()` keeps the running count as a Python int instead of a CUDA tensor.
        test_correct += (predicted == y_batch).sum().item()
        test_total += len(y_batch)

print("Accuracy:", test_correct / test_total)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.9054, device='cuda:0')


In [None]:
# Save the fine-tuned weights (state dict only) to disk.
checkpoint_path = "nsmc KoElectra_sanghunCHO v2.pt"
torch.save(model.state_dict(), checkpoint_path)

# 결과 예측(샘플 파일 적용)

In [None]:
class NSMCDataset_ko(Dataset):
  """Unlabelled dataset read from a comma-separated, CP949-encoded CSV file.

  Rows containing NaN values are dropped; duplicates are kept (unlike
  ``NSMCDataset``). Each item is tokenized on access with the KoELECTRA tokenizer
  and padded/truncated to ``max_length`` tokens.

  Args:
    csv_file: Path to the CSV file; column 1 is read as the text.
    max_length: Fixed token length of every encoded sample (default 64).
      NOTE(review): ``NSMCDataset`` uses 50 for training — confirm that the
      mismatch is intended.
  """

  def __init__(self, csv_file, max_length=64):
    # Some rows may contain NaN values, so drop them.
    # NOTE(review): any dropped row makes this dataset shorter than the raw CSV, which
    # matters if predictions are later paired with the file's Id column — verify.
    self.dataset = pd.read_csv(csv_file, sep=',', encoding='CP949').dropna(axis=0)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows kept after dropping NaN rows."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for the row at position ``idx``."""
    # This file has no label column, so only the text in column 1 is read.
    text = self.dataset.iloc[idx, 1]

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer returns a batch of size one; take its only row.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [None]:
# Upload the test dataset file through the Colab file-upload dialog.
from google.colab import files
myfile = files.upload()

Saving ko_data.csv to ko_data.csv


In [None]:
# Build the unlabelled inference dataset from the uploaded CSV.
# NOTE: this rebinds `test_dataset`, which previously held the NSMC test split.
test_dataset = NSMCDataset_ko("ko_data.csv")

                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


In [None]:
# Loader for the unlabelled inference set. Order is preserved (shuffle=False) so the
# predictions line up with the rows of the CSV. A batch size of 1 is not required:
# the per-batch outputs are arg-maxed along axis 1 and concatenated afterwards, so
# batching only speeds up inference.

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Run inference on the unlabelled set and collect the raw model outputs per batch.
model.eval()

test_preds = []

# Inference only: disable autograd so no computation graph is kept for each batch.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch in tqdm(test_loader):
    # The first element of the model output holds the classification logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    # Store the logits as a NumPy array on the host; the class index is derived later.
    test_preds.append(y_pred.cpu().numpy())


HBox(children=(FloatProgress(value=0.0, max=11187.0), HTML(value='')))






In [None]:
import numpy as np

# For every batch of model outputs, take the arg-max along axis 1 as the predicted
# class index. A named loop variable is used instead of `_`, which IPython reserves
# for the last cell output.
outputs = [np.argmax(batch_logits, axis=1).tolist() for batch_logits in test_preds]

# Flatten the per-batch lists into a single 1-D array of predicted labels.
result = np.concatenate(outputs)

In [None]:
# Check that the number of predictions matches the expected number of rows.
len(result)

11187

In [None]:
import pandas as pd

# Re-read the uploaded CSV to recover its Id column.
# The same `dropna` filter used by NSMCDataset_ko is applied so that the remaining
# rows stay aligned one-to-one with the predictions in `result`.
ko_data = pd.read_csv('ko_data.csv', delimiter=',', encoding='CP949').dropna(axis=0)



In [None]:
# Assemble the submission table: the Ids from the input file paired with the
# predicted labels, in that column order.
outfile_df = pd.DataFrame({'Id': ko_data['Id'], 'Predicted': result})

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Write the submission table to Google Drive as CSV, omitting the DataFrame index.
outfile_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Submission6.csv", index=False)