In [1]:
!pip install torch --index-url https://download.pytorch.org/whl/cu117
!pip install transformers
!pip install torchmetrics
!pip install optuna
!pip install koeda
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Looking in indexes: https://download.pytorch.org/whl/cu117
[0mmecab-ko is already installed
mecab-ko-dic is already installed
mecab-python is already installed
Done.


In [41]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from koeda import EDA
from konlpy.tag import Mecab
from sklearn.preprocessing import LabelEncoder


import pandas as pd
import numpy as np
import optuna
import joblib
import random
import time
import datetime
import gc

## GPU 확인
import os

In [48]:

class CustomDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_len, test_csv='output.csv'):
        self.data = pd.read_csv(csv_file)
        self.test = pd.read_csv(test_csv)
        self.test = self.test[['text', 'class']]

        self.test = self.test.fillna(value=' ')
        self.test = self.test.dropna()

        self.__data_augmentation()
        
        self.tokenizer = tokenizer
        self.max_len = max_len
        
        self.le = LabelEncoder()
        self.le.fit(self.data['class'])
        self.data['class'] = self.le.transform(self.data['class'])

        # Tokenization
        self.data['conversation'] = "[CLS] " + self.data['conversation'] + " [SEP]"
        tokenized_texts = [self.tokenizer.tokenize(s) for s in self.data['conversation']]

        # Convert tokens to IDs, pad sequences and create attention masks
        input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
        input_ids = pad_sequences(input_ids, maxlen=self.max_len, dtype='long', truncating='post', padding='post')
        attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

        # Split into training and validation sets
        train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, self.data['class'].values, random_state=42, test_size=0.1)
        train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)

        # Convert to tensors
        self.train_inputs = torch.tensor(train_inputs)
        self.train_labels = torch.tensor(train_labels)
        self.train_masks = torch.tensor(train_masks)
        self.validation_inputs = torch.tensor(validation_inputs)
        self.validation_labels = torch.tensor(validation_labels)
        self.validation_masks = torch.tensor(validation_masks)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'train_inputs': self.train_inputs[idx],
            'train_labels': self.train_labels[idx],
            'train_masks': self.train_masks[idx],
            'validation_inputs': self.validation_inputs[idx],
            'validation_labels': self.validation_labels[idx],
            'validation_masks': self.validation_masks[idx],
        }

    def get_dataloader(self, batch_size):
        train_data = TensorDataset(self.train_inputs, self.train_masks, self.train_labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

        validation_data = TensorDataset(self.validation_inputs, self.validation_masks, self.validation_labels)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

        return train_dataloader, validation_dataloader

    def __data_augmentation(self):
        augmenter = EDA(morpheme_analyzer=Mecab(), alpha_sr=0.3, alpha_ri=0.3, alpha_rs=0.3, prob_rd=0.3)
        p = (0.4, 0.4, 0.4, 0.4)
        
        # 랜덤하게 행 선택 (예: 전체 행의 20%를 선택)
        random_indices = np.random.choice(self.data.index, size=int(len(self.data) * 0.3), replace=False)

        # 선택된 행에 대해 Random swap 함수 적용
        augmented_rows = self.data.loc[random_indices, 'conversation'].apply(lambda text: augmenter(text, p, 1))

        # 증강된 데이터를 복사하고, 'text' 열에 증강된 텍스트를 삽입
        new_rows = self.data.loc[random_indices].copy()
        new_rows['conversation'] = augmented_rows

        self.data = pd.concat([self.data, new_rows])
        


In [4]:
#GPU 체크 및 할당
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3090


In [5]:
# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base', do_lower_case=False)

# BERT Model
model = BertForSequenceClassification.from_pretrained('kykim/bert-kor-base', num_labels=4)
model.cuda()

# model.to(device)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
model.num_parameters()

118300420

In [49]:
# Dataset
dataset = CustomDataset("train.csv", tokenizer, max_len=500)

In [8]:
def train(model, train_dataloader, optimizer, scheduler, t0):
    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)

        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    return total_loss

In [9]:
def val(model, validation_dataloader):
    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)

        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # 그래디언트 계산 안함
        with torch.no_grad():
            # Forward 수행
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    return eval_accuracy, nb_eval_steps

In [10]:
# accuracy 와 시간 표시함수 정의
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))


def train_bert(trial):
    cfg = {
        'epochs': trial.suggest_int('epochs', 3, 5, 1), # 4,
        'lr': trial.suggest_loguniform('lr', 1e-5, 2e-5), #2e-5,
        'eps': 1e-8,
        'batch_size': trial.suggest_categorical('batch_size',[8, 16, 32]), # 32,
        'seed_val':42,
    }
    global model
    global tokenizer
    global dataset

    # DataLoader
    train_dataloader, validation_dataloader = dataset.get_dataloader(batch_size=cfg['batch_size'])

    # transformers에서 제공하는 옵티마이저 중 AdamW를 사용
    # 총 훈련 스텝은 이터레이션 * 에폭 수로 설정
    # lr 스케쥴러도 transformers에서 제공하는것을 사용
    print('schedule start')
    #옵티마이저 설정
    optimizer = AdamW(model.parameters(),
                      lr=cfg['lr'], # 학습률
                      eps=cfg['eps'] # 0으로 나누는 것을 방지하기 위한 epsilon 값
                    )



    # 총 훈련 스텝
    total_steps = len(train_dataloader) * cfg['epochs']

    # lr 조금씩 감소시키는 스케줄러
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)


    # gradient update는 명시적으로 하지 않고 위에서 로드한 optimizer를 활용
    # 재현을 위해 랜덤시드 고정

    random.seed(cfg['seed_val'])
    np.random.seed(cfg['seed_val'])
    torch.manual_seed(cfg['seed_val'])
    torch.cuda.manual_seed_all(cfg['seed_val'])

    # 그래디언트 초기화
    model.zero_grad()

    # 에폭만큼 반복
    for epoch_i in range(0, cfg['epochs']):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, cfg['epochs']))
        print('Training...')

        # 시작 시간 설정
        t0 = time.time()

        # 학습 함수
        total_loss = train(model, train_dataloader, optimizer, scheduler, t0)

        # 평균 로스 계산
        avg_train_loss = total_loss / len(train_dataloader)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        # ===================================================================== #
        print("")
        print("Running Validation...")

        #시작 시간 설정
        t0 = time.time()

        # 평가 함수 실행
        eval_accuracy, nb_eval_steps = val(model, validation_dataloader)

        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))



In [11]:
# 컴퓨터 사양에 자신이 없다면 실행하지 말 것
study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize')

# https://optuna.readthedocs.io/en/stable/faq.html#how-do-i-avoid-running-out-of-memory-oom-when-optimizing-studies
study.optimize(train_bert, n_trials=10, gc_after_trial=True)
joblib.dump(study, './bert_optuna.pkl')

[I 2023-07-10 19:12:33,113] A new study created in memory with name: no-name-71746e60-6ec4-4849-9bb0-8e94a85e49c2


schedule start

Training...

  Average training loss: 0.58
  Training epcoh took: 0:01:32

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:03

Training...

  Average training loss: 0.25
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.89
  Validation took: 0:00:03

Training...

  Average training loss: 0.16
  Training epcoh took: 0:01:31

Running Validation...


[W 2023-07-10 19:17:16,529] Trial 0 failed with parameters: {'epochs': 3, 'lr': 1.0092620562972524e-05, 'batch_size': 16} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:17:16,529] Trial 0 failed with value None.


  Accuracy: 0.91
  Validation took: 0:00:03
schedule start

Training...
  Batch   500  of    578.    Elapsed: 0:01:25.

  Average training loss: 0.28
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:26.

  Average training loss: 0.11
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:25.

  Average training loss: 0.04
  Training epcoh took: 0:01:39

Running Validation...


[W 2023-07-10 19:22:23,625] Trial 1 failed with parameters: {'epochs': 3, 'lr': 1.8395894392247034e-05, 'batch_size': 8} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:22:23,625] Trial 1 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:04
schedule start

Training...

  Average training loss: 0.05
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:03

Training...

  Average training loss: 0.02
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:27

Running Validation...


[W 2023-07-10 19:26:52,993] Trial 2 failed with parameters: {'epochs': 3, 'lr': 1.845997183098764e-05, 'batch_size': 32} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:26:52,994] Trial 2 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:03
schedule start

Training...
  Batch   500  of    578.    Elapsed: 0:01:26.

  Average training loss: 0.06
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:25.

  Average training loss: 0.02
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:26.

  Average training loss: 0.01
  Training epcoh took: 0:01:39

Running Validation...


[W 2023-07-10 19:32:00,232] Trial 3 failed with parameters: {'epochs': 3, 'lr': 1.3966651345707101e-05, 'batch_size': 8} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:32:00,233] Trial 3 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:04
schedule start

Training...

  Average training loss: 0.04
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:03

Training...

  Average training loss: 0.02
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...


[W 2023-07-10 19:38:16,744] Trial 4 failed with parameters: {'epochs': 4, 'lr': 1.4068825606435588e-05, 'batch_size': 16} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:38:16,745] Trial 4 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:03
schedule start

Training...

  Average training loss: 0.05
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...


[W 2023-07-10 19:44:33,048] Trial 5 failed with parameters: {'epochs': 4, 'lr': 1.6834129012468502e-05, 'batch_size': 16} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:44:33,049] Trial 5 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:03
schedule start

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...


[W 2023-07-10 19:49:15,276] Trial 6 failed with parameters: {'epochs': 3, 'lr': 1.0164449333108146e-05, 'batch_size': 16} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:49:15,277] Trial 6 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:03
schedule start

Training...
  Batch   500  of    578.    Elapsed: 0:01:25.

  Average training loss: 0.04
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:25.

  Average training loss: 0.01
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:26.

  Average training loss: 0.00
  Training epcoh took: 0:01:39

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:04

Training...
  Batch   500  of    578.    Elapsed: 0:01:26.

  Average training loss: 0.00
  Training epcoh took: 0:01:39

Running Validation...


[W 2023-07-10 19:56:04,786] Trial 7 failed with parameters: {'epochs': 4, 'lr': 1.3224444147881717e-05, 'batch_size': 8} because of the following error: The value None could not be cast to float..
[W 2023-07-10 19:56:04,787] Trial 7 failed with value None.


  Accuracy: 0.93
  Validation took: 0:00:04
schedule start

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:26

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:26

Running Validation...


[W 2023-07-10 20:03:31,997] Trial 8 failed with parameters: {'epochs': 5, 'lr': 1.5203784162009396e-05, 'batch_size': 32} because of the following error: The value None could not be cast to float..
[W 2023-07-10 20:03:31,998] Trial 8 failed with value None.


  Accuracy: 0.94
  Validation took: 0:00:03
schedule start

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training...

  Average training loss: 0.05
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.01
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training...

  Average training loss: 0.00
  Training epcoh took: 0:01:31

Running Validation...


[W 2023-07-10 20:11:22,080] Trial 9 failed with parameters: {'epochs': 5, 'lr': 1.843052246404542e-05, 'batch_size': 16} because of the following error: The value None could not be cast to float..
[W 2023-07-10 20:11:22,081] Trial 9 failed with value None.


  Accuracy: 0.94
  Validation took: 0:00:03


['./bert_optuna.pkl']

In [15]:

study = joblib.load('./bert_optuna.pkl')
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.head(5)

Unnamed: 0,number,value,duration,params_batch_size,params_epochs,params_lr
0,0,,0 days 00:04:43.415155,16,3,1e-05
1,1,,0 days 00:05:06.951276,8,3,1.8e-05
2,2,,0 days 00:04:29.221624,32,3,1.8e-05
3,3,,0 days 00:05:07.093242,8,3,1.4e-05
4,4,,0 days 00:06:16.367611,16,4,1.4e-05


In [17]:
# 최적의 파라미터로 함수 재정의
def train_bert():
    cfg = {
        'epochs': 3, # 4,
        'lr': 0.00001, #2e-5,
        'eps': 1e-8,
        'batch_size': 16, # 32,
        'seed_val':42,
    }
    global model
    global tokenizer
    global dataset

    # DataLoader
    train_dataloader, validation_dataloader = dataset.get_dataloader(batch_size=cfg['batch_size'])

    # transformers에서 제공하는 옵티마이저 중 AdamW를 사용
    # 총 훈련 스텝은 이터레이션 * 에폭 수로 설정
    # lr 스케쥴러도 transformers에서 제공하는것을 사용
    print('schedule start')
    #옵티마이저 설정
    optimizer = AdamW(model.parameters(),
                      lr=cfg['lr'], # 학습률
                      eps=cfg['eps'] # 0으로 나누는 것을 방지하기 위한 epsilon 값
                    )



    # 총 훈련 스텝
    total_steps = len(train_dataloader) * cfg['epochs']

    # lr 조금씩 감소시키는 스케줄러
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)


    # gradient update는 명시적으로 하지 않고 위에서 로드한 optimizer를 활용
    # 재현을 위해 랜덤시드 고정

    random.seed(cfg['seed_val'])
    np.random.seed(cfg['seed_val'])
    torch.manual_seed(cfg['seed_val'])
    torch.cuda.manual_seed_all(cfg['seed_val'])

    # 그래디언트 초기화
    model.zero_grad()

    # 에폭만큼 반복
    for epoch_i in range(0, cfg['epochs']):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, cfg['epochs']))
        print('Training...')

        # 시작 시간 설정
        t0 = time.time()

        # 학습 함수
        total_loss = train(model, train_dataloader, optimizer, scheduler, t0)

        # 평균 로스 계산
        avg_train_loss = total_loss / len(train_dataloader)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        # ===================================================================== #
        print("")
        print("Running Validation...")

        #시작 시간 설정
        t0 = time.time()

        # 평가 함수 실행
        eval_accuracy, nb_eval_steps = val(model, validation_dataloader)

        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

In [50]:
## 기본 학습 루틴
print('train start')

# 학습 총 시간 계산을 위한 datetime
sstart = datetime.datetime.now()

train_bert()

print("")
print("Training complete!")
print("")

print('\n\n', datetime.datetime.now() - sstart)

train start
schedule start

Training...





  Average training loss: 0.77
  Training epcoh took: 0:01:30

Running Validation...
  Accuracy: 0.93
  Validation took: 0:00:03

Training...

  Average training loss: 0.04
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training...

  Average training loss: 0.02
  Training epcoh took: 0:01:31

Running Validation...
  Accuracy: 0.94
  Validation took: 0:00:03

Training complete!



 0:04:42.265108


In [51]:
# 테스트셋 생성
test = pd.read_csv('output.csv')
test = test[['text', 'class']]

test = test.fillna(value=' ')
test = test.dropna()

sentences = test['text']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
test['class'] = dataset.le.transform(test['class'])
labels = test['class'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=128, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

In [58]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

# 시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# F1 Score 초기화
preds = []
true = []

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

    # F1 Score를 위해 예측값과 실제값을 저장
    preds.append(np.argmax(logits, axis=1))
    true.append(label_ids)

# Accuracy 출력
print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

# F1 Score, Precision, Recall 출력
# Predictions와 labels이 nested lists로 저장되므로 flatten하여 계산
preds = [p for sublist in preds for p in sublist]
true = [t for sublist in true for t in sublist]
print("F1 Score: {0:.2f}".format(f1_score(true, preds, average='weighted')))
print("Precision: {0:.2f}".format(precision_score(true, preds, average='weighted')))
print("Recall: {0:.2f}".format(recall_score(true, preds, average='weighted')))

# ROC AUC
num_classes = len(np.unique(true)) # 클래스 수 계산
true_bin = label_binarize(true, classes=list(range(num_classes))) # 실제 레이블 이진화
preds_bin = label_binarize(preds, classes=list(range(num_classes))) # 예측 레이블 이진화
roc_auc = roc_auc_score(true_bin, preds_bin, average='weighted') # 각 클래스에 대한 ROC AUC를 계산하고 가중 평균
print("ROC AUC: {0:.2f}".format(roc_auc))

print("Test took: {:}".format(format_time(time.time() - t0)))



Accuracy: 0.87
F1 Score: 0.87
Precision: 0.87
Recall: 0.87
ROC AUC: 0.91
Test took: 0:00:01
