In [1]:
import json
import os
from datasets import Dataset, DatasetDict
import torch
import pandas as pd
import numpy as np
from numba import cuda
from sklearn.model_selection import train_test_split # train test 를 나누기 위한 라이브러리
from sklearn.metrics import accuracy_score # 정확도 계산 라이브러리

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GPU 메모리 초기화
def gpu_clear():
    device = cuda.get_current_device(); 
    device.reset()
    !nvidia-smi

In [3]:
# seed 고정
import random
import torch.backends.cudnn as cudnn

def random_seed(seed_num):
    torch.manual_seed(seed_num)
    torch.cuda.manual_seed(seed_num)
    torch.cuda.manual_seed_all(seed_num)
    np.random.seed(seed_num)
    cudnn.benchmark = False
    cudnn.deterministic = True
    random.seed(seed_num)
random_seed(42)

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Data load for Fine-tuning

In [5]:
def load_and_combine_json_files(folder_path):
    all_data = []

    # 폴더와 하위 폴더를 순회하면서 JSON 파일 찾기
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name.endswith('.json'):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)
                    all_data.extend(json_data)  # 리스트를 확장하여 데이터 추가

    return all_data

In [6]:
train_path = '../data/train'
test_path = '../data/validation'

train_datas = load_and_combine_json_files(train_path)
test_datas = load_and_combine_json_files(test_path)

## Data Process

In [7]:
train = pd.DataFrame(train_datas)[['RawText', 'GeneralPolarity']]
train.dropna(subset=['GeneralPolarity'], inplace=True)
train['GeneralPolarity'] = train['GeneralPolarity'].astype(int)
train['GeneralPolarity'] = train['GeneralPolarity'].map({0: 0, 1: 1, -1: 2})
train.rename(columns={'RawText': 'text', 'GeneralPolarity':'label'}, inplace=True)
train.reset_index(drop=True, inplace=True)

test = pd.DataFrame(test_datas)[['RawText', 'GeneralPolarity']]
test.dropna(subset=['GeneralPolarity'], inplace=True)
test['GeneralPolarity'] = test['GeneralPolarity'].astype(int)
test['GeneralPolarity'] = test['GeneralPolarity'].map({0: 0, 1: 1, -1: 2})
test.rename(columns={'RawText': 'text', 'GeneralPolarity':'label'}, inplace=True)
test.reset_index(drop=True, inplace=True)

In [8]:
val, test = train_test_split(test, test_size = .5, random_state = 42)

In [9]:
val.reset_index(drop=True, inplace=True) # index 재정렬
test.reset_index(drop=True, inplace=True) # index 재정렬

In [10]:
train_dataset = Dataset.from_pandas(train[['text', 'label']])
valid_dataset = Dataset.from_pandas(test[['text', 'label']])
test_dataset = Dataset.from_pandas(test[['text', 'label']])

## Load HuggingFace Model

In [11]:
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torch import optim
from transformers import get_scheduler
from tqdm.auto import tqdm

In [12]:
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [14]:
train_dataset = train_dataset.map(tokenize_function, batched=True).shuffle(seed=42)
valid_dataset = valid_dataset.map(tokenize_function, batched=True).shuffle(seed=42)
test_dataset = test_dataset.map(tokenize_function, batched=True).shuffle(seed=42)

Map:   0%|          | 0/183432 [00:00<?, ? examples/s]

Map: 100%|██████████| 183432/183432 [01:05<00:00, 2816.35 examples/s]
Map: 100%|██████████| 12435/12435 [00:04<00:00, 3013.52 examples/s]
Map: 100%|██████████| 12435/12435 [00:04<00:00, 3033.10 examples/s]


In [15]:
# 학습이 너무 오래걸리기 때문에 + 데이터가 분류별로 나눠져있어 다양한 데이터를 사용하기 위함
train_dataset = train_dataset.shuffle(seed=42).select(range(500))
valid_dataset = valid_dataset.shuffle(seed=42).select(range(200))
test_dataset = test_dataset.shuffle(seed=42).select(range(200))

In [16]:
train_dataset = train_dataset.remove_columns(["text"])
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format("torch") # 텐서로 변환

valid_dataset = valid_dataset.remove_columns(["text"])
valid_dataset = valid_dataset.rename_column("label", "labels")
valid_dataset.set_format("torch") # 텐서로 변환

test_dataset = test_dataset.remove_columns(["text"])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format("torch") # 텐서로 변환

In [17]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
valid_dataloader = DataLoader(valid_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

## Parameters

In [18]:
hyPara = {
    "lr": 5e-5,
    "epoch": 3,
}

## Tutorial code

In [19]:
optimizer = optim.AdamW(model.parameters(), lr=hyPara['lr'])

In [20]:
num_epochs = hyPara['epoch']
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [21]:
# gpu_clear()

In [22]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [23]:

progress_bar = tqdm(range(num_training_steps))


''' 
train_dataloader는 for문을 돌려서 뽑아보면 dict 타입임
batch라는 dict를 for을 돌려 key, value로 하나씩 뽑으면서 v를 gpu에 올리고난뒤 model에서 연산
'''

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

progress_bar.close() 

100%|██████████| 189/189 [01:31<00:00,  2.05it/s]


In [24]:
import evaluate

progress_bar = tqdm(range(len(test_dataloader)))

metric = evaluate.load("accuracy")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)
    
progress_bar.close() 

metric.compute()

100%|██████████| 25/25 [00:05<00:00,  4.68it/s]


{'accuracy': 0.855}

## Custom

In [29]:
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# training 코드, evaluation 코드, training_loop 코드
def training(model, dataloader, train_dataset, optimizer, device, epoch, num_epochs):
    model.train()  # 모델을 학습 모드로 설정
    train_loss = 0.0
    train_accuracy = 0

    tbar = tqdm(dataloader)
    for batch in tbar:
        labels = batch['labels'].to(device)
        input_ = batch['input_ids'].to(device)
        token_type = batch['token_type_ids'].to(device)
        mask = batch['attention_mask'].to(device)

        # 순전파
        output = model(labels=labels,
                    input_ids=input_,
                    token_type_ids=token_type,
                    attention_mask=mask)

        loss = output['loss'] # 얘 확인

        # 역전파 및 가중치 업데이트
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 손실과 정확도 계산
        train_loss += loss.item()
        # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
        _, predicted = torch.max(output['logits'], 1)
        train_accuracy += (predicted == labels).sum().item()

        # tqdm의 진행바에 표시될 설명 텍스트를 설정
        tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}")

    # 에폭별 학습 결과 출력
    train_loss = train_loss / len(dataloader)
    train_accuracy = train_accuracy / len(train_dataset)

    return model, train_loss, train_accuracy

def evaluation(model, dataloader, val_dataset, device, epoch, num_epochs):
    model.eval()  # 모델을 평가 모드로 설정
    valid_accuracy = 0

    with torch.no_grad(): # model의 업데이트 막기
        tbar = tqdm(dataloader)
        for batch in tbar:
            labels = batch['labels'].to(device)
            input_ = batch['input_ids'].to(device)
            token_type = batch['token_type_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # 순전파
            output = model(labels=labels,
                        input_ids=input_,
                        token_type_ids=token_type,
                        attention_mask=mask)

            # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
            _, predicted = torch.max(output['logits'], 1)
            valid_accuracy += (predicted == labels).sum().item()

            # tqdm의 진행바에 표시될 설명 텍스트를 설정
            tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}]")

    valid_accuracy = valid_accuracy / len(val_dataset)

    return model, valid_accuracy


def training_loop(model, train_dataloader, valid_dataloader, train_dataset, val_dataset, optimizer, device, num_epochs, model_path):
    best_valid_loss = float('inf')  # 가장 좋은 validation loss를 저장
    valid_max_accuracy = -1

    for epoch in range(num_epochs):
        model, train_loss, train_accuracy = training(model, train_dataloader, train_dataset, optimizer, device, epoch, num_epochs)
        model, valid_accuracy = evaluation(model, valid_dataloader, val_dataset, device, epoch, num_epochs)

        if valid_accuracy > valid_max_accuracy:
            valid_max_accuracy = valid_accuracy
            tokenizer.save_pretrained(model_path)
            model.save_pretrained(model_path)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Valid Accuracy: {valid_accuracy:.4f}")

    return model, valid_max_accuracy

In [31]:
# 모델 전체 fine tuning
model.to(device)
num_epochs = hyPara['epoch']
model_path = 'my_model01/'
lr = hyPara['lr']
optimizer = optim.Adam(model.parameters(), lr=lr)
model, valid_max_accuracy = training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, optimizer, device, num_epochs, model_path)
print('Valid max accuracy : ', valid_max_accuracy)

Epoch [1/3], Train Loss: 0.7832: 100%|██████████| 63/63 [00:31<00:00,  2.03it/s]
Epoch [1/3]: 100%|██████████| 25/25 [00:04<00:00,  6.13it/s]


Epoch [1/3], Train Loss: 0.5616, Train Accuracy: 0.7620, Valid Accuracy: 0.8400


Epoch [2/3], Train Loss: 0.0607: 100%|██████████| 63/63 [00:30<00:00,  2.04it/s]
Epoch [2/3]: 100%|██████████| 25/25 [00:04<00:00,  6.15it/s]


Epoch [2/3], Train Loss: 0.2764, Train Accuracy: 0.8960, Valid Accuracy: 0.8600


Epoch [3/3], Train Loss: 0.0287: 100%|██████████| 63/63 [00:30<00:00,  2.05it/s]
Epoch [3/3]: 100%|██████████| 25/25 [00:04<00:00,  6.15it/s]

Epoch [3/3], Train Loss: 0.1261, Train Accuracy: 0.9540, Valid Accuracy: 0.8000
Valid max accuracy :  0.86





In [32]:
def test_evaluation(model, dataloader, device):
    model.eval()
    total_preds = []
    total_labels = []
    total_probs = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            labels = batch['labels'].to(device)
            input_ = batch['input_ids'].to(device)
            token_type = batch['token_type_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            output = model(labels=labels,
                        input_ids=input_,
                        token_type_ids=token_type,
                        attention_mask=mask)


            # torch.max에서 dim 인자에 값을 추가할 경우, 해당 dimension에서 최댓값과 최댓값에 해당하는 인덱스를 반환
            _, predicted = torch.max(output['logits'], 1)

            total_preds.extend(predicted.detach().cpu().tolist())
            total_labels.extend(labels.tolist())
            total_probs.append(output['logits'].detach().cpu().numpy())

    total_preds = np.array(total_preds)
    total_labels = np.array(total_labels)
    total_probs = np.concatenate(total_probs, axis= 0)
    acc = accuracy_score(total_labels, total_preds)
    return acc
    

In [33]:
# tok = AutoTokenizer.from_pretrained(model_path)
mod = AutoModelForSequenceClassification.from_pretrained(model_path)
model = model.to(device)
acc = test_evaluation(model, test_dataloader, device)
print("Full fine tuning model accuracy : ",acc)

100%|██████████| 25/25 [00:04<00:00,  6.16it/s]

Full fine tuning model accuracy :  0.8



