#### CUDA 확인

In [1]:
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [1]:
# Check whether a CUDA GPU is available and pick the device used for training.

import torch
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
device = torch.device('cuda:0' if USE_CUDA else 'cpu')
print('학습을 진행하는 기기:',device)
# torch.cuda.get_device_name() raises when no CUDA device is present, so it is only
# queried when a GPU is available; on CPU-only machines the name falls back to 'cpu'.
name = torch.cuda.get_device_name(0) if USE_CUDA else 'cpu'
print('name', name)

True
학습을 진행하는 기기: cuda:0
name NVIDIA GeForce RTX 3060 Ti


## Experiment options

In [2]:
## Experiment options
from easydict import EasyDict
import torch

# Every tunable setting lives in one attribute-style dict so that later cells
# can read it as `opt.<name>`.
opt = EasyDict({
    'dataset_series': 'company',
    'dataset_domain': '',
    'subtask': 'sub1',        # sub1: sentence, sub2: document (full review); only sub1 is used
    'task': 'category',       # category, term
    'num_classes': 3,         # negative, positive, neutral, (+ conflict)
    'max_length': 200,
    'model_name': 'kobert',   # model_name: {bert_base, kobert}
    'pos': False,             # not used
    'lastid': False,          # not used
    'top_k': 3,               # how many top-k attention-score words to use
    'valset_ratio': 0.125,
    'batch_size': 16,
    'num_layers': 6,          # only used by bert_intermediate: how many intermediate layers to use
    'num_epochs': 12,
    'runs': 5,
    'seed': 42,
    'log_step': 100,
    'patience': 5,
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
})

print(opt.device)

cuda


## Load Dataset

In [3]:
import pandas as pd

# Resolve the train/test CSV paths for the selected dataset series.
# Only 'company' is wired up in this notebook. Any other value must fail loudly here:
# otherwise `path` / `path_test` would be undefined (NameError) or, worse, silently
# reuse values left over from an earlier kernel state.
if opt.dataset_series == 'company':
    path = 'dataset/{}_train.csv'.format(opt.dataset_series)
    path_test = 'dataset/{}_test.csv'.format(opt.dataset_series)
else:
    raise ValueError(
        "Unsupported opt.dataset_series {!r}; expected 'company'".format(opt.dataset_series)
    )

# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the CSVs
# were saved with their index. `index_col=0` would drop it -- confirm that downstream
# code does not depend on that column before changing this.
df_train = pd.read_csv(path)
df_test = pd.read_csv(path_test)

print('length of train set: {:,}'.format(len(df_train)))
print('length of test set: {:,}'.format(len(df_test)))

length of train set: 2,000
length of test set: 500


In [4]:
# Display the training frame to inspect its columns (sentence, term, category, polarity).
df_train

Unnamed: 0,sentence,term,category,polarity
0,높은 시장 점유율과 인지도,인지도,커리어,positive
1,대외적으로 젊고 혁신적인 이미지,이미지,커리어,positive
2,서비스들 배끼기에만 급급하고 스스로 혁신 하고자 하는 의지나 창의성이 전혀 없는 안...,베끼기,사내문화,negative
3,최고의 아이티 기업으로 폭풍성장을 경험할 수 있음,폭풍성장,커리어,positive
4,글로벌도 경험 가능,글로벌,커리어,positive
...,...,...,...,...
1995,업무가 많아 연장근무를 하면 한 만큼 급여를 지급하는 회사,급여,급여,positive
1996,하고싶은거 능력만 있으면 펼칠수있음 그리고 잘하면 인정도 받음,능력,커리어,positive
1997,사내 문화가 유연하고 소통이 잘 되는 편,문화,사내문화,positive
1998,업무 범위가 넓어서 역량 개발에 도움이 됨,역량,커리어,positive


In [5]:
# With no path argument, to_csv() returns the CSV text as a string; nothing is written to disk.
# NOTE(review): `trained` is not referenced by any other cell visible here -- confirm it is still needed.
trained = df_train.to_csv()

In [None]:
from data_utils import clean_sentence, preprocess

# Run the same `preprocess` cleaning function over both splits and rebind the
# frames to the cleaned results (train first, then test).
df_train, df_test = (
    clean_sentence(df=frame, clean_func=preprocess)
    for frame in (df_train, df_test)
)

In [7]:
# Load the KoBERT tokenizer from the 'monologg/kobert' checkpoint.
from kobert_tokenizer import KoBertTokenizer

# NOTE(review): this call emits a tokenizer-class mismatch warning (the checkpoint declares
# 'BertTokenizer' while the call is made from 'KoBertTokenizer'); confirm that the resulting
# tokenization is the intended one.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [8]:
from data_utils import Category_Classification_Dataset as Dataset

# Both splits are wrapped with identical settings; only the source frame differs.
dataset_kwargs = dict(tokenizer=tokenizer, opt=opt, pos_encoding=False)

trainset = Dataset(df=df_train, **dataset_kwargs)
testset = Dataset(df=df_test, **dataset_kwargs)

category: True
2,000 samples in this dataset
category: True
500 samples in this dataset


In [10]:
# Split the data into train / validation / test sets.
from data_utils import custom_random_split as rs

# `opt.valset_ratio` and `opt.seed` are forwarded to the project helper.
# Presumably the validation set is carved out of `trainset` (the reported ratio is
# 1750 : 250 : 500) and `testset` is passed through -- confirm in data_utils.
train_set, val_set, test_set = rs(dataset=trainset, testset=testset,
                                  val_ratio=opt.valset_ratio, random_seed=opt.seed)


Ratio of datasets 1750 : 250 : 500


In [11]:
from torch.utils.data import DataLoader

# All three loaders share the batch size; only the training loader reshuffles
# its samples, while validation and test keep a fixed order.
loader_kwargs = {'batch_size': opt.batch_size}

train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_set, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

## Model

The model combines the top-k attention-score words with some additional tokens and then pools them. Options:

- top-k: 3, 4
- additional tokens: [SEP_1], [SEP_2], both [SEP], [CLS], pair words (aspect words)
- pooling: 'mean' or 'bi-gru'

In [12]:
# Show which model the next cell will build.
opt.model_name

'kobert'

In [13]:
# NOTE(review): the wildcard import is kept because other cells may rely on names it
# exports; prefer an explicit `from models.kobert import KoBERT` once that is confirmed.
from models.kobert import *

# Build the classifier selected by opt.model_name.
# Only 'kobert' is implemented in this notebook. Any other value must fail loudly here:
# otherwise `model` would be undefined (NameError in the next cell) or silently reuse
# a model left over from an earlier kernel state.
if opt.model_name == 'kobert':
    model = KoBERT(opt=opt, embed_dim=768, fc_hid_dim=128, top_k=opt.top_k, att_head='all', att_pooling='mean')
else:
    raise ValueError(
        "Unsupported opt.model_name {!r}; this notebook only builds 'kobert'".format(opt.model_name)
    )

In [14]:
# Report the model's parameter counts (the helper prints total and trainable counts).
from models.parameters import get_parameters
# `params` is later handed to the optimizer; presumably it holds the trainable
# parameters -- confirm in models/parameters.py.
total, params = get_parameters(model)

92,189,187 total parameters in this model
92,189,187 trainable parameters in this model


## Train

In [15]:
import torch.nn as nn
import torch.optim as optim
from custom_trainer import *

# Optimisation setup: AdamW over the parameters collected above, cross-entropy loss.
optimizer = optim.AdamW(params, lr=2e-5)
# The step scheduler can't be used for multiple runs, so it is built here but
# not handed to `runs` (which receives scheduler=False below).
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.8)
criterion = nn.CrossEntropyLoss()

# Group the three data loaders so the call below stays short.
loaders = dict(train_loader=train_loader, val_loader=val_loader, test_loader=test_loader)

result_dict, best_path = runs(
    trainer=trainer,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=False,
    opt=opt,
    **loaders,
)

>>>>> RUN NUMBER: 01 <<<<<
   global step: 100 | train loss: 0.531, train_acc: 73.88%
Epoch: 01 | Val Loss: 0.028 | Val Acc: 83.20%
   global step: 200 | train loss: 0.272, train_acc: 89.79%
Epoch: 02 | Val Loss: 0.018 | Val Acc: 89.20%
   global step: 300 | train loss: 0.134, train_acc: 95.78%
>> saved: state_dict/kobert_company_preprocess5_epoch_3_val_acc_84.8%
Epoch: 03 | Val Loss: 0.026 | Val Acc: 84.80%
   global step: 400 | train loss: 0.097, train_acc: 96.70%
>> saved: state_dict/kobert_company_preprocess5_epoch_4_val_acc_89.2%
Epoch: 04 | Val Loss: 0.020 | Val Acc: 89.20%
   global step: 500 | train loss: 0.066, train_acc: 98.23%
Epoch: 05 | Val Loss: 0.028 | Val Acc: 88.00%
   global step: 600 | train loss: 0.043, train_acc: 99.00%
Epoch: 06 | Val Loss: 0.039 | Val Acc: 84.40%
   global step: 700 | train loss: 0.055, train_acc: 98.44%
Epoch: 07 | Val Loss: 0.026 | Val Acc: 86.40%
   global step: 800 | train loss: 0.032, train_acc: 99.38%
Epoch: 08 | Val Loss: 0.023 | Val Acc: 