In [1]:
import json
import os
from datasets import Dataset, DatasetDict
import torch
import pandas as pd
import numpy as np
from numba import cuda
from sklearn.model_selection import train_test_split # train test 를 나누기 위한 라이브러리
from sklearn.metrics import accuracy_score # 정확도 계산 라이브러리
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GPU 메모리 초기화
def gpu_clear():
    device = cuda.get_current_device(); 
    device.reset()
    !nvidia-smi

In [3]:
# Fix the random seeds for reproducibility
import random
import torch.backends.cudnn as cudnn

def random_seed(seed_num):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed_num: Integer seed applied to every generator.
    """
    # Python and NumPy global generators
    random.seed(seed_num)
    np.random.seed(seed_num)

    # PyTorch: CPU generator, current CUDA device, then all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed_num)

    # Turn off the cuDNN autotuner and request deterministic kernels
    cudnn.deterministic = True
    cudnn.benchmark = False
    
random_seed(42)

In [4]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data load for Fine-tuning

In [5]:
def load_and_combine_json_files(folder_path):
    """Recursively read every ``*.json`` file under ``folder_path`` into one list.

    Directories and files are visited in sorted name order so the combined
    list has the same order on every run; ``os.walk`` on its own yields
    entries in an arbitrary, filesystem-dependent order, which would make
    later seeded sampling non-reproducible.

    Args:
        folder_path: Root directory to search (sub-directories included).

    Returns:
        list: Records gathered from all files. A file whose top-level JSON
        value is an object is added as one record; any other top-level value
        is extended element-wise.

    Raises:
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
        OSError: If a file cannot be opened.
    """
    all_data = []

    # Walk the folder and all of its sub-folders looking for JSON files
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # sorting in place steers the order os.walk descends in
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)

            if isinstance(json_data, dict):
                # extend() on a dict would add only its keys; keep it as one record
                all_data.append(json_data)
            else:
                all_data.extend(json_data)

    return all_data

In [6]:
# train_path = '../data/train'
# test_path = '../data/validation'

# train_datas = load_and_combine_json_files(train_path)
# test_datas = load_and_combine_json_files(test_path)

## Data Process

In [7]:
# train = pd.DataFrame(train_datas)[['RawText', 'GeneralPolarity']]
# train.dropna(subset=['GeneralPolarity'], inplace=True)
# train['GeneralPolarity'] = train['GeneralPolarity'].astype(int)
# train['GeneralPolarity'] = train['GeneralPolarity'].map({0: 0, 1: 1, -1: 2})
# train.rename(columns={'RawText': 'text', 'GeneralPolarity':'label'}, inplace=True)
# index = train[train['label'] == 1].index
# index = np.random.choice(index, size=int(len(index) * 0.7), replace=False)  # sample without replacement so exactly 70% of the label-1 rows are dropped
# train.drop(index=index, inplace=True)
# train.reset_index(drop=True, inplace=True)

# test = pd.DataFrame(test_datas)[['RawText', 'GeneralPolarity']]
# test.dropna(subset=['GeneralPolarity'], inplace=True)
# test['GeneralPolarity'] = test['GeneralPolarity'].astype(int)
# test['GeneralPolarity'] = test['GeneralPolarity'].map({0: 0, 1: 1, -1: 2})
# test.rename(columns={'RawText': 'text', 'GeneralPolarity':'label'}, inplace=True)
# test.reset_index(drop=True, inplace=True)

In [8]:
# val, test = train_test_split(test, test_size = .5, random_state = 42)

In [9]:
# val.reset_index(drop=True, inplace=True) # index 재정렬
# test.reset_index(drop=True, inplace=True) # index 재정렬

In [10]:
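# train_dataset = Dataset.from_pandas(train[['text', 'label']])
# valid_dataset = Dataset.from_pandas(val[['text', 'label']])  # was `test`, which made the validation and test sets identical
# test_dataset = Dataset.from_pandas(test[['text', 'label']])
# NOTE: dataset files saved with the old `test` line must be regenerated for the test split to be held out.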

## Load HuggingFace Model

In [11]:
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torch import optim
from transformers import get_scheduler
from tqdm.auto import tqdm

In [12]:
# Load the pretrained "klue/bert-base" checkpoint with a 3-class
# sequence-classification head, plus the tokenizer from the same checkpoint.
# NOTE(review): `model` is instantiated again from the same checkpoint in a
# later cell before training, so this first load looks redundant — confirm.
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        Whatever the tokenizer returns when called with truncation enabled
        and ``padding="max_length"``.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

In [14]:
# train_dataset = train_dataset.map(tokenize_function, batched=True).shuffle(seed=42)
# valid_dataset = valid_dataset.map(tokenize_function, batched=True).shuffle(seed=42)
# test_dataset = test_dataset.map(tokenize_function, batched=True).shuffle(seed=42)

In [15]:
# # 데이터셋 저장
# torch.save(train_dataset, 'dataset/train_dataset.pth')
# torch.save(valid_dataset, 'dataset/valid_dataset.pth')
# torch.save(test_dataset, 'dataset/test_dataset.pth')

In [16]:
# Load the previously saved (already tokenized) datasets.
# The files are read back as full Python objects (`.shuffle()` / `.select()`
# are called on them below), not as plain tensors, so `weights_only=False` is
# required: PyTorch >= 2.6 defaults to `weights_only=True`, which refuses to
# unpickle arbitrary objects.
# SECURITY: unpickling can execute arbitrary code — only load files that you
# produced yourself.
train_dataset = torch.load('dataset/train_dataset.pth', weights_only=False)
valid_dataset = torch.load('dataset/valid_dataset.pth', weights_only=False)
test_dataset = torch.load('dataset/test_dataset.pth', weights_only=False)

In [17]:
train_num = 50000
test_val_num = 10000

# Never request more rows than the saved datasets contain: `select(range(n))`
# fails when n exceeds the dataset length. The clamped values are kept in
# `train_num` / `test_val_num` so anything that reports them stays accurate.
train_num = min(train_num, len(train_dataset))
test_val_num = min(test_val_num, len(valid_dataset), len(test_dataset))

# Training on everything takes too long, and the data is stored grouped by
# category, so shuffle before selecting to get a varied subset.
small_train_dataset = train_dataset.shuffle(seed=42).select(range(train_num))
small_valid_dataset = valid_dataset.shuffle(seed=42).select(range(test_val_num))
small_test_dataset = test_dataset.shuffle(seed=42).select(range(test_val_num))

In [18]:
def _to_model_inputs(dataset):
    """Drop the raw text, rename `label` to `labels`, and switch to torch format."""
    prepared = dataset.remove_columns(["text"]).rename_column("label", "labels")
    prepared.set_format("torch")  # rows are returned as tensors from here on
    return prepared


# The same preparation is applied to each of the three splits
small_train_dataset = _to_model_inputs(small_train_dataset)
small_valid_dataset = _to_model_inputs(small_valid_dataset)
small_test_dataset = _to_model_inputs(small_test_dataset)

In [19]:
# Batches of 16 for every split; only the training loader reshuffles its data
train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(dataset=small_valid_dataset, batch_size=16)
test_dataloader = DataLoader(dataset=small_test_dataset, batch_size=16)

## Parameters

In [20]:
# Hyper-parameters read by the fine-tuning cell:
#   lr       - optimizer learning rate
#   epoch    - maximum number of training epochs
#   patience - early-stopping patience
hyPara = dict(lr=1e-6, epoch=10, patience=2)

## Custom training and evaluation loops

In [21]:
# Training step, evaluation step, and the epoch loop (training_loop)
def training(model, dataloader, train_dataset, optimizer, scheduler, device, epoch, num_epochs):
    """Run one training epoch and return the model with its epoch metrics.

    Args:
        model: Model called with ``labels``, ``input_ids``, ``token_type_ids``
            and ``attention_mask``; its output is indexed by ``'loss'`` and
            ``'logits'``.
        dataloader: Iterable of batches holding those four keys.
        train_dataset: Only its length is used, as the accuracy denominator.
        optimizer: Stepped once per batch.
        scheduler: Stepped once per batch, right after the optimizer.
        device: Device every batch tensor is moved to.
        epoch: Zero-based epoch index (progress-bar text only).
        num_epochs: Total number of epochs (progress-bar text only).

    Returns:
        tuple: ``(model, mean per-batch loss, correct predictions / len(train_dataset))``.
    """
    model.train()  # enable training mode
    loss_sum = 0.0
    n_correct = 0

    progress = tqdm(dataloader)
    for batch in progress:
        target = batch['labels'].to(device)

        # Forward pass
        output = model(
            labels=target,
            input_ids=batch['input_ids'].to(device),
            token_type_ids=batch['token_type_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        loss = output['loss']

        # Backward pass and parameter / learning-rate update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate loss and the number of correct predictions;
        # torch.max over dim 1 yields (values, indices) and the indices are the predicted classes
        batch_loss = loss.item()
        loss_sum += batch_loss
        predicted = torch.max(output['logits'], 1).indices
        n_correct += (predicted == target).sum().item()

        # Text shown next to the tqdm progress bar
        progress.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {batch_loss:.4f}")

    # Epoch-level averages
    return model, loss_sum / len(dataloader), n_correct / len(train_dataset)

def evaluation(model, dataloader, val_dataset, device, epoch, num_epochs):
    """Evaluate the model on one pass over ``dataloader`` without gradients.

    Args:
        model: Model called with ``labels``, ``input_ids``, ``token_type_ids``
            and ``attention_mask``; its output is indexed by ``'loss'`` and
            ``'logits'``.
        dataloader: Iterable of batches holding those four keys.
        val_dataset: Only its length is used, as the accuracy denominator.
        device: Device every batch tensor is moved to.
        epoch: Zero-based epoch index (progress-bar text only).
        num_epochs: Total number of epochs (progress-bar text only).

    Returns:
        tuple: ``(model, mean per-batch loss, correct predictions / len(val_dataset))``.
    """
    model.eval()  # enable evaluation mode
    loss_sum = 0.0
    n_correct = 0

    with torch.no_grad():  # no gradient tracking during evaluation
        progress = tqdm(dataloader)
        for batch in progress:
            target = batch['labels'].to(device)

            # Forward pass
            output = model(
                labels=target,
                input_ids=batch['input_ids'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )

            # Accumulate loss and the number of correct predictions;
            # torch.max over dim 1 yields (values, indices) and the indices are the predicted classes
            batch_loss = output['loss'].item()
            loss_sum += batch_loss
            predicted = torch.max(output['logits'], 1).indices
            n_correct += (predicted == target).sum().item()

            # Text shown next to the tqdm progress bar
            progress.set_description(f"Epoch [{epoch+1}/{num_epochs}], Valid Loss: {batch_loss:.4f}")

    return model, loss_sum / len(dataloader), n_correct / len(val_dataset)


def training_loop(model, train_dataloader, valid_dataloader, train_dataset, val_dataset, optimizer, scheduler, device, num_epochs, patience, model_path):
    """Train with early stopping, log each epoch to wandb, and checkpoint the best model.

    After every epoch the metrics are logged to a wandb run. Whenever the
    validation loss improves, the notebook-level ``tokenizer`` and the model
    are saved to ``model_path``. Training stops once the validation loss has
    failed to improve for ``patience`` consecutive epochs.

    Args:
        model: Model to fine-tune.
        train_dataloader: Batches for ``training``.
        valid_dataloader: Batches for ``evaluation``.
        train_dataset: Training dataset; its length is the accuracy
            denominator and is logged as ``train_num``.
        val_dataset: Validation dataset; its length is the accuracy
            denominator and is logged as ``valid_num``.
        optimizer: Optimizer whose first parameter group supplies the logged
            learning rates.
        scheduler: Learning-rate scheduler passed through to ``training``.
        device: Device used for training and evaluation.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs that ends training.
        model_path: Directory the best checkpoint is written to.

    Returns:
        tuple: ``(model, valid_max_accuracy)``. ``model`` holds the weights
        after the last executed epoch, which is not necessarily the best
        checkpoint; reload from ``model_path`` to get the best one.
    """
    best_valid_loss = float('inf')  # best validation loss seen so far
    early_stop_counter = 0  # consecutive epochs without improvement
    valid_max_accuracy = -1

    run = wandb.init(project = 'ko-bert-sentiment02')

    # try/finally so the wandb run is closed even if an epoch raises or the
    # cell is interrupted part-way through training
    try:
        for epoch in range(num_epochs):
            model, train_loss, train_accuracy = training(model, train_dataloader, train_dataset, optimizer, scheduler, device, epoch, num_epochs)
            model, valid_loss, valid_accuracy = evaluation(model, valid_dataloader, val_dataset, device, epoch, num_epochs)

            param_group = optimizer.param_groups[0]
            monitoring_value = {
                # sizes come from the datasets actually passed in rather than notebook globals
                'train_num': len(train_dataset),
                'valid_num': len(val_dataset),
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'valid_loss': valid_loss,
                'valid_accuracy': valid_accuracy,
                # 'initial_lr' exists only once a scheduler has been attached to the optimizer
                'lr': param_group.get('initial_lr', param_group['lr']),
                'lr2': param_group['lr'],
            }
            run.log(monitoring_value, step=epoch)

            if valid_accuracy > valid_max_accuracy:
                valid_max_accuracy = valid_accuracy

            if valid_loss < best_valid_loss:
                # Validation loss improved: save a checkpoint and reset the counter
                best_valid_loss = valid_loss
                tokenizer.save_pretrained(model_path)
                model.save_pretrained(model_path)
                early_stop_counter = 0
            else:
                # Validation loss rose or stayed the same
                early_stop_counter += 1

            print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}")

            # Stop once the counter reaches the configured patience
            if early_stop_counter >= patience:
                print("Early stopping")
                break
    finally:
        run.finish()

    return model, valid_max_accuracy

In [22]:
# Re-instantiate the 3-class classifier from the pretrained "klue/bert-base"
# checkpoint, replacing whatever `model` was bound to before this cell.
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Full fine-tuning: all of the model's parameters are handed to the optimizer
model.to(device)
num_epochs = hyPara['epoch']
model_path = 'my_model01/'  # directory training_loop saves the best checkpoint to
lr = hyPara['lr']
patience = hyPara['patience']

optimizer = optim.RMSprop(model.parameters(), lr=lr, alpha=0.9)

# training() steps the scheduler once per batch, so the cosine schedule is
# sized as epochs * batches-per-epoch steps, with no warm-up.
num_training_steps = num_epochs * len(train_dataloader)
scheduler = get_scheduler(
    name='cosine', optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps
)

# Returns the model after the last executed epoch and the best validation accuracy seen
model, valid_max_accuracy = training_loop(model, train_dataloader, valid_dataloader, small_train_dataset, small_valid_dataset, optimizer, scheduler, device, num_epochs, patience, model_path)
print('Valid max accuracy : ', valid_max_accuracy)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdudcjs2779[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch [1/10], Train Loss: 1.0788:   0%|          | 7/3125 [00:09<59:23,  1.14s/it]  

In [None]:
def test_evaluation(model, dataloader, device):
    model.eval()
    total_preds = []
    total_labels = []
    total_probs = []
    valid_loss = 0.0
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            labels = batch['labels'].to(device)
            input_ = batch['input_ids'].to(device)
            token_type = batch['token_type_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            output = model(labels=labels,
                        input_ids=input_,
                        token_type_ids=token_type,
                        attention_mask=mask)
            
            loss = output['loss']
            
            # 손실과 정확도 계산
            valid_loss += loss.item()
            
            # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
            _, predicted = torch.max(output['logits'], 1)

            total_preds.extend(predicted.detach().cpu().tolist())
            total_labels.extend(labels.tolist())
            total_probs.append(output['logits'].detach().cpu().numpy())
            
    total_preds = np.array(total_preds)
    total_labels = np.array(total_labels)
    total_probs = np.concatenate(total_probs, axis= 0)
    acc = accuracy_score(total_labels, total_preds)
    valid_loss = valid_loss / len(dataloader)
    
    return acc, valid_loss
    

In [None]:
# Reload the checkpoint that training_loop saved to `model_path` and score it
# on the test dataloader.
# NOTE(review): the commented-out dataset construction earlier in this notebook
# builds both valid_dataset and test_dataset from the same `test` frame — confirm
# the saved test split is really held out from the data used for early stopping.
# tok = AutoTokenizer.from_pretrained(model_path)
mod = AutoModelForSequenceClassification.from_pretrained(model_path)
model = mod.to(device)
acc, loss = test_evaluation(model, test_dataloader, device)
print(f"Full fine tuning model accuracy : {acc}, loss: {loss:.4f}")

100%|██████████| 313/313 [01:41<00:00,  3.07it/s]


Full fine tuning model accuracy : 0.892, loss: 0.2726
