# HuggingFace 커스텀 프로젝트 만들기

## 환경 설정

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [36]:
try:
    import numpy, pandas, matplotlib, sklearn, transformers
except:
    !pip install numpy pandas matplotlib scikit-learn
    !pip install transformers
    !pip install transformers[torch]
    !pip install datasets
    !pip install transformers[onnx]

In [3]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizerFast, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    AutoModel, 
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## 데이터 전처리 및 준비

In [4]:
# 데이터 로딩
df = pd.read_csv("ratings_train.txt", delimiter='\t')

# 빈값 제거
df = df.dropna()

# 중복 제거
df.drop_duplicates(subset=['document'], inplace=True)

# 혹시나 하는 인덱스 초기화
df = df.reset_index(drop=True)

# 한글만 남기기
df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

클래스 종류 체크

In [5]:
df.groupby(by=['label']).count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73342,73342
1,72840,72840


In [6]:
# 데이터 전처리
sentences = df['document'].to_list()
labels = df['label'].to_list()

## 데이터셋 준비

In [7]:
# Split Train and Validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2, random_state=0)

In [8]:
# Load Tokenizer
# tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

# Tokenizing
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
# Define PyTorch Datasets
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

## 모델 준비

In [11]:
# Define PyTorch model
# model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2)
# model = AutoModel.from_pretrained("klue/bert-base")
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base")


Downloading model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # ensure model is on the right device

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## 학습 환경 구성

In [13]:
# https://stackoverflow.com/questions/67457480/how-to-get-the-accuracy-per-epoch-or-step-for-the-huggingface-transformers-train

from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Bucketing과 Dynamic Padding 적용한 Data Collator
# 미리 구현한다.
from transformers import DataCollatorWithPadding

def collate_batch(examples):
    # Sort by length (descending)
    examples.sort(key=lambda x: len(x['input_ids']), reverse=True)

    # Group by length
    batches = []
    batch = []
    prev_length = len(examples[0]['input_ids'])
    for example in examples:
        length = len(example['input_ids'])
        if length != prev_length:
            batches.append(batch)
            batch = []
        batch.append(example)
        prev_length = length
    batches.append(batch)

    # Apply dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer)
    return [data_collator(batch) for batch in batches]

In [26]:
# Define TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=64, # 16
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,     # best model을 저장
    metric_for_best_model='accuracy', # best model을 평가하는 기준을 정의
    evaluation_strategy='epoch',      # Set evaluation to occur every 'steps'
    save_strategy='epoch',            # Set saving to occur every 'steps'
    # eval_steps=500,                   # Set the evaluation/saving steps
    fp16=True,
    # fsdp='auto_wrap',
    # ddp_backend='nccl',
    dataloader_num_workers=4,
    # group_by_length=True,
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,      # 위에서 정의한 compute_metrics 함수를 Trainer에 전달
    # data_collator=collate_batch,
)

In [28]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1361,0.241892,0.900298
2,0.1398,0.262861,0.901392
3,0.0682,0.376744,0.902316


TrainOutput(global_step=5484, training_loss=0.1224857451357518, metrics={'train_runtime': 987.5756, 'train_samples_per_second': 355.249, 'train_steps_per_second': 5.553, 'total_flos': 2.56012041586446e+16, 'train_loss': 0.1224857451357518, 'epoch': 3.0})

## 모델 테스트

In [29]:
# 데이터 로딩
test = pd.read_csv("ratings_test.txt", delimiter='\t')
test = test.dropna()
test.drop_duplicates(subset=['document'], inplace=True)
test = test.reset_index(drop=True)
test['document'] = test['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

# 데이터 전처리
test_texts = test['document'].to_list()
test_labels = test['label'].to_list()

test_encodings = tokenizer(test_texts, truncation=True, padding=True)

test_dataset = Dataset(test_encodings, test_labels)

In [30]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.3817775249481201,
 'eval_accuracy': 0.9009296743088472,
 'eval_runtime': 30.537,
 'eval_samples_per_second': 1609.75,
 'eval_steps_per_second': 25.183,
 'epoch': 3.0}

In [None]:
!pip install transformers[onnx]

## Bucketing 적용

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base")
model.to(device)  # ensure model is on the right device

In [None]:
# Define TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=64, # 16
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,     # best model을 저장
    metric_for_best_model='accuracy', # best model을 평가하는 기준을 정의
    evaluation_strategy='epoch',      # Set evaluation to occur every 'steps'
    save_strategy='epoch',            # Set saving to occur every 'steps'
    # eval_steps=500,                   # Set the evaluation/saving steps
    fp16=True,
    # fsdp='auto_wrap',
    # ddp_backend='nccl',
    dataloader_num_workers=4,
    group_by_length=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,      # 위에서 정의한 compute_metrics 함수를 Trainer에 전달
    data_collator=collate_batch,
)

In [None]:
# Train the model
trainer.train()

In [None]:
trainer.evaluate(test_dataset)