In [1]:
# 사용 라이브러리 임포트

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score
import wandb
from wandb.keras import WandbCallback

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device

In [2]:
# 사전에 다운로드 받은 경로를 통해 데이터 불러오기
filepath_train = os.getenv('HOME')+'/aiffel/dktc/data/train.csv'

train = pd.read_csv(filepath_train)

train.head()

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


데이터 전처리 및 라벨링

In [3]:
# 텍스트 클렌징 함수
def clean_text(text):
    # 불필요한 특수 문자, 숫자 제거 (한글, 영문, 공백 제외)
    text = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z\s]", "", text)
    text = re.sub(r"\n", " ", text)
    return text

train['conversation'] = train['conversation'].apply(clean_text)

train.loc[(train['class'] == "협박 대화"), 'label'] = 0  # 협박 대화 => 0
train.loc[(train['class'] == "갈취 대화"), 'label'] = 1  # 갈취 대화 => 1
train.loc[(train['class'] == "직장 내 괴롭힘 대화"), 'label'] = 2  # 직장 내 괴롭힘 대화 => 2
train.loc[(train['class'] == "기타 괴롭힘 대화"), 'label'] = 3  # 기타 괴롭힘 대화 => 3
train.loc[(train['class'] == "일반 대화"), 'label'] = 4  # 일반 대화 => 4

data_list = []
for content, label in zip(train['conversation'], train['label'])  :
    temp = []
    temp.append(content)
    temp.append(str(int(label)))

    data_list.append(temp)

train.drop(columns = 'idx', inplace = True)
dataset_train, dataset_test = train_test_split(data_list, test_size = 0.2, random_state = 42)


Bert 데이터셋 생성

In [5]:
## Setting model parameters
max_len = 130     # 65~200,  패딩이 많아서 줄여봐도 좋을 듯
batch_size = 50     #넘으면 안돌아감 
warmup_ratio = 0.2     #  0.1 로 하니까 안 좋음
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5
dr_rate = 0.5


#bert 모델, vocab 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
    

#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
#BERTDataset 클래스 이용, TensorDataset으로 만들어주기
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

#배치 및 데이터로더 설정
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=4)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=4)

using cached model. /aiffel/aiffel/dktc/src/.cache/kobert_v1.zip
using cached model. /aiffel/aiffel/dktc/src/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /aiffel/aiffel/dktc/src/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


모델 정의

In [7]:
# Bert 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=5,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
    


model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
!pip install wandb==0.16.0

모델 학습 및 시각화

In [7]:
import wandb
from wandb.keras import WandbCallback

https://wandb.ai/settings 접속
Danger Zone에서 키 생성 혹은 복사

In [8]:
# 본인키 복사

wandb.login(key = "9ee7086efc9eab486f064f510a83d43bc3ea91aa")

[34m[1mwandb[0m: Currently logged in as: [33msjjky27[0m ([33msjjky27-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /aiffel/.netrc


True

Sweep 설정 [https://docs.wandb.ai/guides/sweeps/define-sweep-configuration]

In [31]:
"""
파라미터 설정

- validation f1 score를 최대화하는 방향으로 하이퍼 파라미터 튜닝 (metrics 파라미터 세팅으로 변경할 수 있음)
- 각 파라미터의 범위는 리스트로 지정 가능 또는 분포를 통해 지정할 수 있음
- early_terminate의 경우 사전에 정해진 조건에 달할 경우 튜닝을 멈추는 

"""


# wandb parameter setting

WANDB_ID = "sjjky27"  # 본인 wandb 계정 ID 넣어주기

PROJ_NAME = "kobert_hyperparameter_tuning"

sweep_config = {
    "name": "sweep_test_nlp",
    "metric": {"name": "val_f1", "goal": "maximize"},
    "method": "random",
    "parameters": {
        "learning_rate": {
            "values": [5e-5]
        },
        "epochs": {
            "values": [5, 10, 15]
#             "distribution": "int_uniform",
#             "min": 5,
#             "max": 6
        },
        "batch_size": {
            "values": [50]
        },
        "max_len": {
            "values": [65, 100, 130]
        }
    },
     "early_terminate": {
        "type": "hyperband",
        "eta": 2,
        "min_iter":2
     }
}


# 성능 지표 계산 함수
def calc_metrics(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    pred_labels = max_indices.cpu().numpy()
    true_labels = Y.cpu().numpy()
    
    accuracy = (pred_labels == true_labels).sum() / len(true_labels)
    f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
    precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
    
    return accuracy, f1, precision, recall



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:

# train 함수 정의
def train():
    
    wandb.init()  # Weights & Biases 초기화
    config = wandb.config  # 설정 값 불러오기
    

    # 학습 루프
    for epoch in range(config.epochs):
        train_acc = 0.0
        train_f1, train_precision, train_recall = 0.0, 0.0, 0.0  # 성능 지표 초기화
        test_acc, val_f1, val_precision, val_recall = 0.0, 0.0, 0.0, 0.0  # Validation 성능 지표 초기화
        
        # 학습모드 전환
        model.train()
        
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
            optimizer.zero_grad()

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length.to(device)
            label = label.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            # 성능 지표 계산
            batch_acc, batch_f1, batch_precision, batch_recall = calc_metrics(out, label)
            train_acc += batch_acc
            train_f1 += batch_f1
            train_precision += batch_precision
            train_recall += batch_recall

            if batch_id % 100 == 0:
                print(f"Epoch {epoch + 1}, Batch {batch_id + 1}, Loss {loss.item()}, Accuracy {train_acc / (batch_id + 1)}")

        # 검증 모드 전환
        model.eval()
        val_acc = 0.0
        
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                valid_length = valid_length.to(device)
                label = label.long().to(device)

                out = model(token_ids, valid_length, segment_ids)
                batch_acc, batch_f1, batch_precision, batch_recall = calc_metrics(out, label)
                test_acc += batch_acc
                val_f1 += batch_f1
                val_precision += batch_precision
                val_recall += batch_recall
            print(f"Epoch {epoch+1}: Validation Accuracy {test_acc/len(test_dataloader)}, F1{val_f1/len(test_dataloader)}, Precision {val_precision/len(test_dataloader)}, Recall {val_recall/len(test_dataloader)}")

        # Wandb에 학습 결과 기록
        wandb.log({
            "epoch": epoch + 1,
            "train_accuracy": train_acc / len(train_dataloader),
            "val_accuracy": val_acc / len(test_dataloader),
            "train_f1": train_f1 / len(train_dataloader),
            "val_f1": val_f1/len(test_dataloader),
            "loss": loss.item()
        })

# Sweep 실행
sweep_id = wandb.sweep(sweep_config, project=PROJ_NAME)
wandb.agent(sweep_id, train)

0,1
epoch,▁
loss,▁
train_accuracy,▁
train_f1,▁
val_accuracy,▁
val_f1,▁

0,1
epoch,1.0
loss,0.03129
train_accuracy,0.94687
train_f1,0.94736
val_accuracy,0.0
val_f1,0.87374


Create sweep with ID: v1kpfd2p
Sweep URL: https://wandb.ai/sjjky27-personal/kobert_hyperparameter_tuning/sweeps/v1kpfd2p


[34m[1mwandb[0m: Agent Starting Run: 0jsbmm0c with config:
[34m[1mwandb[0m: 	batch_size: 50
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	max_len: 65


Epoch 1, Batch 1, Loss 0.068119116127491, Accuracy 0.98
Epoch 1: Validation Accuracy 0.8796875000000001, F10.8795726725774117, Precision 0.8898211927230406, Recall 0.8796875000000001
Epoch 2, Batch 1, Loss 0.035277169197797775, Accuracy 0.98
Epoch 2: Validation Accuracy 0.8671875000000001, F10.8678119734206196, Precision 0.8795278573550633, Recall 0.8671875000000001
Epoch 3, Batch 1, Loss 0.012058161199092865, Accuracy 1.0
