
# Pytorch + HuggingFace를 활용한 NSMC (네이버 영화평 감정분류) 모델 
## KoElectra Model
박장원님의 KoElectra-base_v3 모델 사용<br>
https://github.com/monologg/KoELECTRA

## Dataset
네이버 영화 리뷰 데이터셋<br>
https://github.com/e9t/nsmc

## References
- https://huggingface.co/transformers/training.html
- https://tutorials.pytorch.kr/beginner/data_loading_tutorial.html
- https://tutorials.pytorch.kr/beginner/blitz/cifar10_tutorial.html
- https://wikidocs.net/44249

## 주의사항
꼭 GPU로 해주세요 - 1epoch 당 약 20분 소요

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#  transformers 설치 ( 그 외 도구는 설치되었다고 가정 )
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

--2020-12-21 20:45:06--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [text/plain]
Saving to: ‘ratings_test.txt.1’


2020-12-21 20:45:06 (32.4 MB/s) - ‘ratings_test.txt.1’ saved [4893335/4893335]

--2020-12-21 20:45:06--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt.1’


2020-12-21 20:45:06 (64.8 MB/s) - ‘ratings_train.txt.1’ save

In [6]:
# 모델에 필요한 도구 불러오기

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [7]:
# GPU 활성화

device = torch.device("cuda")

In [8]:
device

device(type='cuda')

In [9]:
# 이미 학습된 모델 로딩하여 사용할 경우  

#model.load_state_dict(torch.load("nsmc kobert.pt"))

# Create Model

In [10]:
# Koelectra 모델 사용 ( 버젼은 Base v1,v2,v3 / small v1, v2, v3 선택 가능)
# 가장 높은 성능을 보여주는 Base v3 버젼으로 Fine Tuing 진행

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

 # Train & Test Model
 ## Train Model

In [11]:
# 데이터 전처리 
# max_lenth 설정 값 : 50

class NSMCDataset(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep="\t").dropna(axis=0) 
    # 중복제거
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=50,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [12]:
# 데이터 로드

train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [13]:
# 에포크, 배치 사이즈 지정, 데이터 로딩

epochs = 10
batch_size = 32

In [14]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [15]:
 torch.cuda.empty_cache()

In [16]:
# Koelectra Pretrained 모델을 활용한 FineTuing 진행


losses = []
accuracies = []

for i in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0
    
    model.train()
    
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)
        
        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
            
    losses.append(total_loss)
    accuracies.append(correct.float() / total)
    print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))



Batch Loss: 61.982283532619476 Accuracy: tensor(0.6675, device='cuda:0')
Batch Loss: 104.4811854660511 Accuracy: tensor(0.7459, device='cuda:0')
Batch Loss: 142.8386540710926 Accuracy: tensor(0.7769, device='cuda:0')
Batch Loss: 177.36984972655773 Accuracy: tensor(0.7972, device='cuda:0')
Batch Loss: 211.83253636956215 Accuracy: tensor(0.8095, device='cuda:0')
Batch Loss: 243.04109231382608 Accuracy: tensor(0.8198, device='cuda:0')
Batch Loss: 274.92407973110676 Accuracy: tensor(0.8272, device='cuda:0')
Batch Loss: 307.1132615059614 Accuracy: tensor(0.8317, device='cuda:0')
Batch Loss: 337.3032695353031 Accuracy: tensor(0.8360, device='cuda:0')
Batch Loss: 366.0610908046365 Accuracy: tensor(0.8408, device='cuda:0')
Batch Loss: 394.9615585952997 Accuracy: tensor(0.8445, device='cuda:0')
Batch Loss: 424.5290277376771 Accuracy: tensor(0.8475, device='cuda:0')
Batch Loss: 452.0566579774022 Accuracy: tensor(0.8504, device='cuda:0')
Batch Loss: 480.11598071455956 Accuracy: tensor(0.8528, dev

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 22.652656260877848 Accuracy: tensor(0.9103, device='cuda:0')
Batch Loss: 43.177033154293895 Accuracy: tensor(0.9147, device='cuda:0')
Batch Loss: 64.76420160196722 Accuracy: tensor(0.9154, device='cuda:0')
Batch Loss: 86.27851049415767 Accuracy: tensor(0.9145, device='cuda:0')
Batch Loss: 108.04564514197409 Accuracy: tensor(0.9149, device='cuda:0')
Batch Loss: 128.6030208300799 Accuracy: tensor(0.9148, device='cuda:0')
Batch Loss: 149.59452587924898 Accuracy: tensor(0.9146, device='cuda:0')
Batch Loss: 171.83655349723995 Accuracy: tensor(0.9138, device='cuda:0')
Batch Loss: 192.37909190915525 Accuracy: tensor(0.9140, device='cuda:0')
Batch Loss: 213.32929011248052 Accuracy: tensor(0.9140, device='cuda:0')
Batch Loss: 236.59419791959226 Accuracy: tensor(0.9134, device='cuda:0')
Batch Loss: 257.8486799579114 Accuracy: tensor(0.9139, device='cuda:0')
Batch Loss: 278.78227517567575 Accuracy: tensor(0.9145, device='cuda:0')
Batch Loss: 299.8629460949451 Accuracy: tensor(0.9144, 

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 17.70765708759427 Accuracy: tensor(0.9300, device='cuda:0')
Batch Loss: 36.6241379249841 Accuracy: tensor(0.9252, device='cuda:0')
Batch Loss: 56.51427700556815 Accuracy: tensor(0.9234, device='cuda:0')
Batch Loss: 75.87677404843271 Accuracy: tensor(0.9244, device='cuda:0')
Batch Loss: 94.25414116121829 Accuracy: tensor(0.9249, device='cuda:0')
Batch Loss: 112.4835603479296 Accuracy: tensor(0.9263, device='cuda:0')
Batch Loss: 130.19023353606462 Accuracy: tensor(0.9271, device='cuda:0')
Batch Loss: 146.32774511352181 Accuracy: tensor(0.9282, device='cuda:0')
Batch Loss: 163.46242844685912 Accuracy: tensor(0.9286, device='cuda:0')
Batch Loss: 181.56541807577014 Accuracy: tensor(0.9288, device='cuda:0')
Batch Loss: 199.8841311223805 Accuracy: tensor(0.9288, device='cuda:0')
Batch Loss: 216.69472427107394 Accuracy: tensor(0.9293, device='cuda:0')
Batch Loss: 234.98473595641553 Accuracy: tensor(0.9293, device='cuda:0')
Batch Loss: 252.51164081133902 Accuracy: tensor(0.9294, dev

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 13.888021349906921 Accuracy: tensor(0.9506, device='cuda:0')
Batch Loss: 28.90570930019021 Accuracy: tensor(0.9458, device='cuda:0')
Batch Loss: 43.55097882822156 Accuracy: tensor(0.9448, device='cuda:0')
Batch Loss: 57.199090868234634 Accuracy: tensor(0.9455, device='cuda:0')
Batch Loss: 71.4700954305008 Accuracy: tensor(0.9457, device='cuda:0')
Batch Loss: 87.61782959382981 Accuracy: tensor(0.9446, device='cuda:0')
Batch Loss: 102.66467922460288 Accuracy: tensor(0.9440, device='cuda:0')
Batch Loss: 116.61588096804917 Accuracy: tensor(0.9446, device='cuda:0')
Batch Loss: 131.90803046524525 Accuracy: tensor(0.9443, device='cuda:0')
Batch Loss: 145.11714367195964 Accuracy: tensor(0.9448, device='cuda:0')
Batch Loss: 159.21199762448668 Accuracy: tensor(0.9449, device='cuda:0')
Batch Loss: 173.97081557288766 Accuracy: tensor(0.9449, device='cuda:0')
Batch Loss: 186.84603766072541 Accuracy: tensor(0.9453, device='cuda:0')
Batch Loss: 199.7717631245032 Accuracy: tensor(0.9458, d

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 10.07603788562119 Accuracy: tensor(0.9625, device='cuda:0')
Batch Loss: 22.069059124216437 Accuracy: tensor(0.9578, device='cuda:0')
Batch Loss: 32.943648227490485 Accuracy: tensor(0.9581, device='cuda:0')
Batch Loss: 43.76415418740362 Accuracy: tensor(0.9592, device='cuda:0')
Batch Loss: 53.6546830595471 Accuracy: tensor(0.9603, device='cuda:0')
Batch Loss: 65.66424762876704 Accuracy: tensor(0.9592, device='cuda:0')
Batch Loss: 76.93986983085051 Accuracy: tensor(0.9592, device='cuda:0')
Batch Loss: 88.06278535211459 Accuracy: tensor(0.9589, device='cuda:0')
Batch Loss: 99.42056603170931 Accuracy: tensor(0.9588, device='cuda:0')
Batch Loss: 110.55963512510061 Accuracy: tensor(0.9588, device='cuda:0')
Batch Loss: 123.66828735452145 Accuracy: tensor(0.9582, device='cuda:0')
Batch Loss: 133.6351667912677 Accuracy: tensor(0.9589, device='cuda:0')
Batch Loss: 144.94654341507703 Accuracy: tensor(0.9589, device='cuda:0')
Batch Loss: 156.0432532466948 Accuracy: tensor(0.9592, devic

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 8.328623315319419 Accuracy: tensor(0.9716, device='cuda:0')
Batch Loss: 16.926647580228746 Accuracy: tensor(0.9709, device='cuda:0')
Batch Loss: 24.722456654999405 Accuracy: tensor(0.9720, device='cuda:0')
Batch Loss: 35.371826756279916 Accuracy: tensor(0.9705, device='cuda:0')
Batch Loss: 43.51849257852882 Accuracy: tensor(0.9705, device='cuda:0')
Batch Loss: 52.033147271722555 Accuracy: tensor(0.9702, device='cuda:0')
Batch Loss: 60.448123997077346 Accuracy: tensor(0.9694, device='cuda:0')
Batch Loss: 69.10093080019578 Accuracy: tensor(0.9700, device='cuda:0')
Batch Loss: 76.25240863952786 Accuracy: tensor(0.9706, device='cuda:0')
Batch Loss: 85.0838473495096 Accuracy: tensor(0.9703, device='cuda:0')
Batch Loss: 94.78902506548911 Accuracy: tensor(0.9697, device='cuda:0')
Batch Loss: 102.18787797400728 Accuracy: tensor(0.9701, device='cuda:0')
Batch Loss: 110.89624864142388 Accuracy: tensor(0.9697, device='cuda:0')
Batch Loss: 119.80029825260863 Accuracy: tensor(0.9696, de

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 4.580923170316964 Accuracy: tensor(0.9853, device='cuda:0')
Batch Loss: 10.371839098632336 Accuracy: tensor(0.9825, device='cuda:0')
Batch Loss: 17.809583419002593 Accuracy: tensor(0.9799, device='cuda:0')
Batch Loss: 24.936896388186142 Accuracy: tensor(0.9793, device='cuda:0')
Batch Loss: 31.085446709999815 Accuracy: tensor(0.9790, device='cuda:0')
Batch Loss: 37.55621574143879 Accuracy: tensor(0.9784, device='cuda:0')
Batch Loss: 44.3521341921296 Accuracy: tensor(0.9781, device='cuda:0')
Batch Loss: 51.51063423510641 Accuracy: tensor(0.9778, device='cuda:0')
Batch Loss: 58.54462549625896 Accuracy: tensor(0.9775, device='cuda:0')
Batch Loss: 64.0218079064507 Accuracy: tensor(0.9777, device='cuda:0')
Batch Loss: 70.35414918255992 Accuracy: tensor(0.9779, device='cuda:0')
Batch Loss: 76.79471617541276 Accuracy: tensor(0.9777, device='cuda:0')
Batch Loss: 83.55919947312213 Accuracy: tensor(0.9776, device='cuda:0')
Batch Loss: 90.86979257594794 Accuracy: tensor(0.9775, device=

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 4.447977642528713 Accuracy: tensor(0.9844, device='cuda:0')
Batch Loss: 9.00682754558511 Accuracy: tensor(0.9842, device='cuda:0')
Batch Loss: 13.715191145660356 Accuracy: tensor(0.9842, device='cuda:0')
Batch Loss: 19.144155727233738 Accuracy: tensor(0.9837, device='cuda:0')
Batch Loss: 24.106410957872868 Accuracy: tensor(0.9837, device='cuda:0')
Batch Loss: 29.90081956004724 Accuracy: tensor(0.9832, device='cuda:0')
Batch Loss: 35.04149288800545 Accuracy: tensor(0.9828, device='cuda:0')
Batch Loss: 41.12811310798861 Accuracy: tensor(0.9825, device='cuda:0')
Batch Loss: 46.18368917820044 Accuracy: tensor(0.9826, device='cuda:0')
Batch Loss: 52.238532338058576 Accuracy: tensor(0.9824, device='cuda:0')
Batch Loss: 57.959606748772785 Accuracy: tensor(0.9823, device='cuda:0')
Batch Loss: 65.79385544010438 Accuracy: tensor(0.9819, device='cuda:0')
Batch Loss: 72.02024322119541 Accuracy: tensor(0.9816, device='cuda:0')
Batch Loss: 77.04648622660898 Accuracy: tensor(0.9817, devic

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 3.515591413830407 Accuracy: tensor(0.9887, device='cuda:0')
Batch Loss: 7.971199957537465 Accuracy: tensor(0.9869, device='cuda:0')
Batch Loss: 12.316192389582284 Accuracy: tensor(0.9870, device='cuda:0')
Batch Loss: 17.456570142996497 Accuracy: tensor(0.9858, device='cuda:0')
Batch Loss: 23.06314617942553 Accuracy: tensor(0.9846, device='cuda:0')
Batch Loss: 28.815022771363147 Accuracy: tensor(0.9836, device='cuda:0')
Batch Loss: 32.71832462132443 Accuracy: tensor(0.9839, device='cuda:0')
Batch Loss: 37.371937516960315 Accuracy: tensor(0.9842, device='cuda:0')
Batch Loss: 41.889728095033206 Accuracy: tensor(0.9843, device='cuda:0')
Batch Loss: 45.723280367208645 Accuracy: tensor(0.9845, device='cuda:0')
Batch Loss: 50.85735861817375 Accuracy: tensor(0.9843, device='cuda:0')
Batch Loss: 56.58150635752827 Accuracy: tensor(0.9842, device='cuda:0')
Batch Loss: 60.80019112699665 Accuracy: tensor(0.9845, device='cuda:0')
Batch Loss: 66.46675070596393 Accuracy: tensor(0.9843, dev

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 3.571237144060433 Accuracy: tensor(0.9875, device='cuda:0')
Batch Loss: 8.282500049099326 Accuracy: tensor(0.9866, device='cuda:0')
Batch Loss: 11.363942027790472 Accuracy: tensor(0.9874, device='cuda:0')
Batch Loss: 15.286352470633574 Accuracy: tensor(0.9872, device='cuda:0')
Batch Loss: 20.377018545405008 Accuracy: tensor(0.9863, device='cuda:0')
Batch Loss: 24.480745369568467 Accuracy: tensor(0.9864, device='cuda:0')
Batch Loss: 28.099346480099484 Accuracy: tensor(0.9866, device='cuda:0')
Batch Loss: 32.53561920445645 Accuracy: tensor(0.9864, device='cuda:0')
Batch Loss: 36.60980098793516 Accuracy: tensor(0.9864, device='cuda:0')
Batch Loss: 41.144536888983566 Accuracy: tensor(0.9864, device='cuda:0')
Batch Loss: 45.72415708139306 Accuracy: tensor(0.9862, device='cuda:0')
Batch Loss: 49.617062322970014 Accuracy: tensor(0.9863, device='cuda:0')
Batch Loss: 54.372718277794775 Accuracy: tensor(0.9862, device='cuda:0')
Batch Loss: 58.40797273739008 Accuracy: tensor(0.9864, d

In [17]:
losses, accuracies

([1313.905930813402,
  1016.8588439393789,
  813.3920396454632,
  667.9796953611076,
  522.9718890630174,
  406.79359528119676,
  319.97024575574324,
  267.95815305877477,
  229.17974565410987,
  194.44499128340976],
 [tensor(0.8794, device='cuda:0'),
  tensor(0.9099, device='cuda:0'),
  tensor(0.9304, device='cuda:0'),
  tensor(0.9443, device='cuda:0'),
  tensor(0.9582, device='cuda:0'),
  tensor(0.9683, device='cuda:0'),
  tensor(0.9755, device='cuda:0'),
  tensor(0.9802, device='cuda:0'),
  tensor(0.9832, device='cuda:0'),
  tensor(0.9859, device='cuda:0')])

 # Learn & Test Model
 ## Test Model
 

In [18]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.9025, device='cuda:0')


In [20]:
# 모델 저장하기
torch.save(model.state_dict(), "nsmc kobert.pt")

# Prediction (W ko_data.csv)

In [21]:
class NSMCDataset_ko(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep=',', encoding='CP949').dropna(axis=0) 
    # 중복제거
    #self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    
    

    # ko_data label 없음
    #y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=64,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [30]:
from google.colab import files
myfile = files.upload()

Saving ko_data.csv to ko_data (1).csv


In [31]:
test_dataset = NSMCDataset_ko("ko_data (1).csv")

                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


In [32]:
# 입력데이터 1개씩 predicted 값 생성을 위해 batch_size 1 로 설정

test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [33]:
model.eval()

test_preds = []

for input_ids_batch, attention_masks_batch in tqdm(test_loader):

  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)


  preds = y_pred.detach()
  test_preds.append(preds.cpu().numpy())


HBox(children=(FloatProgress(value=0.0, max=11187.0), HTML(value='')))






In [34]:
import numpy as np

outputs = []
for _ in test_preds:
    # argmax를 사용해서 가장 높은 확률로 예측한 class 반환
    predicted_class_indices=np.argmax(_, axis=1).tolist()
    outputs.append(predicted_class_indices)

result = np.concatenate(outputs)

In [35]:
len(result)

11187

In [37]:
import pandas as pd

ko_data = pd.read_csv('ko_data (1).csv', delimiter=',', encoding='CP949')



In [40]:
outfile_df = pd.DataFrame()

outfile_df['Id'] = ko_data['Id']
outfile_df['Predicted'] = result

In [43]:
outfile_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Submission5.csv", index=False)