# 1- Import libraries

In [1]:
import tensorflow
import numpy
import transformers
import datasets
import pandas as pd

print(tensorflow.__version__)
print(numpy.__version__)
print(transformers.__version__)
print(datasets.__version__)



2.15.1
1.26.3
4.38.2
2.18.0


# 2- Loading dataset

In [2]:
import datasets
from datasets import load_dataset

nsmc = datasets.load_dataset("nsmc")

In [3]:
print(nsmc)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [4]:
# display sample data in nsmc dataset
print(nsmc['train'][0])
print(nsmc['test'][22])

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}
{'id': '9727606', 'document': '꽤 재밌게 본 영화였다!', 'label': 1}


# 3- Tokenizer and Model

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 모델 정의 (2개의 레이블을 가진 분류 문제에 적합한 구조)
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4- Tokenizer

In [17]:
from datasets import DatasetDict

# 훈련 데이터셋에서 처음 10,000개를 검증 데이터셋으로 분리
nsmc_val = nsmc['train'].select(range(10000))
nsmc_train = nsmc['train'].select(range(10000, nsmc['train'].num_rows))

# 새로운 데이터셋 구조 생성
split_dataset = DatasetDict({
    'train': nsmc_train,
    'validation': nsmc_val,
    'test': nsmc['test']
})

# 토크나이즈 함수 정의
def tokenize_function(examples):
    return tokenizer(examples['document'], padding=True, truncation=True, max_length=30)

# 데이터셋에 토크나이즈 함수 적용
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

# 결과 출력
print("토크나이즈된 데이터셋:", tokenized_datasets)

Map:   0%|          | 0/140000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

토크나이즈된 데이터셋: DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 140000
    })
    validation: Dataset({
        features: ['id', 'document', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'document', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [18]:
# display sample data in nsmc dataset
print(tokenized_datasets['train'][100])
print(tokenized_datasets['validation'][100])
print(tokenized_datasets['test'][100])

{'id': '7424174', 'document': '완전 재밌음!!!!!! 극본 탄탄 워낙에 원작이 좋으니 ㅠㅠ 게다가 연극배우들의 호연연출도 너무 좋아요 종편의 좋은예날림편집도 아니고 생방송으로 찍는 것도 아니고진짜 예상외 고퀄리티', 'label': 1, 'input_ids': [2, 5124, 7478, 2053, 5, 5, 5, 5, 5, 5, 18632, 9110, 7020, 2170, 9990, 2052, 1560, 5882, 6516, 5560, 5909, 9736, 2031, 2079, 1926, 2156, 2156, 2102, 2119, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'id': '10044377', 'document': '신카이 마코토의 작화와,미유와 하나카나가 연기를 잘해줘서 더대박이였다.', 'label': 1, 'input_ids': [2, 1327, 25219, 1036, 2258, 2386, 2079, 1518, 2267, 2522, 16, 1107, 2298, 2522, 3657, 2127, 16570, 4483, 2138, 1521, 2097, 2810, 2112, 831, 2104, 12756, 2507, 2062, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# 5- TrainingArguments, Trainer, Train

In [19]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric
import numpy as np

metric = load_metric('accuracy')

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

output_dir = '/Users/kenny_jung/aiffel/data/nsmc_transformer'

# TrainingArguments 설정: 학습 관련 설정 지정
training_args = TrainingArguments(
    output_dir=output_dir,          # 결과(모델 체크포인트 등) 저장 경로
    num_train_epochs=3,              # 에폭 수
    per_device_train_batch_size=64,   # 디바이스 당 훈련 배치 크기
    per_device_eval_batch_size=64,    # 디바이스 당 평가 배치 크기
    warmup_steps=500,                # 학습률 웜업에 사용되는 학습 스텝 수
    weight_decay=0.01,               # 가중치 감쇠
    evaluation_strategy='epoch',     # 평가 전략 (각 에폭마다 평가)
    logging_dir='./logs',            # 로깅 파일 저장 경로
    # fp16=True,  # fp16 활성화
    # gradient_accumulation_steps=2,  # 그래디언트 축적
    # dataloader_num_workers=8,  # 데이터 로딩에 사용할 프로세스 수
)

# Trainer 생성: 모델, 학습 설정, 데이터셋 지정
trainer = Trainer(
    model=model,                       # 모델 객체
    args=training_args,                # 학습 설정
    train_dataset=tokenized_datasets['train'],  # 훈련 데이터셋
    eval_dataset=tokenized_datasets['validation'],  # 평가 데이터셋
    compute_metrics=compute_metrics,   # 메트릭 계산 함수
)

trainer.train()


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6564 [00:00<?, ?it/s]

{'loss': 0.2607, 'grad_norm': 4.429640769958496, 'learning_rate': 5e-05, 'epoch': 0.23}
{'loss': 0.307, 'grad_norm': 2.5545895099639893, 'learning_rate': 4.587730870712401e-05, 'epoch': 0.46}
{'loss': 0.2773, 'grad_norm': 4.389836311340332, 'learning_rate': 4.1754617414248024e-05, 'epoch': 0.69}
{'loss': 0.2739, 'grad_norm': 2.9800498485565186, 'learning_rate': 3.7631926121372034e-05, 'epoch': 0.91}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.2649264931678772, 'eval_accuracy': 0.8906, 'eval_runtime': 13.323, 'eval_samples_per_second': 750.582, 'eval_steps_per_second': 11.784, 'epoch': 1.0}
{'loss': 0.2137, 'grad_norm': 2.730454683303833, 'learning_rate': 3.3509234828496045e-05, 'epoch': 1.14}
{'loss': 0.1857, 'grad_norm': 3.0903496742248535, 'learning_rate': 2.9386543535620053e-05, 'epoch': 1.37}
{'loss': 0.1796, 'grad_norm': 5.273860454559326, 'learning_rate': 2.5263852242744063e-05, 'epoch': 1.6}
{'loss': 0.1845, 'grad_norm': 2.561957597732544, 'learning_rate': 2.1141160949868074e-05, 'epoch': 1.83}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.2690999507904053, 'eval_accuracy': 0.8996, 'eval_runtime': 11.7602, 'eval_samples_per_second': 850.324, 'eval_steps_per_second': 13.35, 'epoch': 2.0}
{'loss': 0.1626, 'grad_norm': 2.925692558288574, 'learning_rate': 1.7018469656992088e-05, 'epoch': 2.06}
{'loss': 0.0993, 'grad_norm': 4.088841915130615, 'learning_rate': 1.2895778364116095e-05, 'epoch': 2.29}
{'loss': 0.0972, 'grad_norm': 6.135788917541504, 'learning_rate': 8.773087071240106e-06, 'epoch': 2.51}
{'loss': 0.0949, 'grad_norm': 4.127559661865234, 'learning_rate': 4.650395778364117e-06, 'epoch': 2.74}
{'loss': 0.0947, 'grad_norm': 4.724252700805664, 'learning_rate': 5.277044854881267e-07, 'epoch': 2.97}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.3486613631248474, 'eval_accuracy': 0.8979, 'eval_runtime': 11.7282, 'eval_samples_per_second': 852.648, 'eval_steps_per_second': 13.387, 'epoch': 3.0}
{'train_runtime': 2414.2819, 'train_samples_per_second': 173.965, 'train_steps_per_second': 2.719, 'train_loss': 0.18605261143934865, 'epoch': 3.0}


TrainOutput(global_step=6564, training_loss=0.18605261143934865, metrics={'train_runtime': 2414.2819, 'train_samples_per_second': 173.965, 'train_steps_per_second': 2.719, 'train_loss': 0.18605261143934865, 'epoch': 3.0})

# 6- Evaluation

In [20]:
# 테스트 데이터셋을 사용하여 모델 평가
test_result = trainer.evaluate(tokenized_datasets['test'])

# 평가 결과 출력
print(test_result)

  0%|          | 0/782 [00:00<?, ?it/s]

{'eval_loss': 0.34507355093955994, 'eval_accuracy': 0.89782, 'eval_runtime': 68.6014, 'eval_samples_per_second': 728.848, 'eval_steps_per_second': 11.399, 'epoch': 3.0}


# 7- Bucketing, Dynamic Padding

In [22]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# Data Collator 설정
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    group_by_length=True,  # Bucketing 활성화
)

# Trainer 객체 생성 및 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,   # 메트릭 계산 함수
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6564 [00:00<?, ?it/s]

Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.036, 'grad_norm': 5.498893737792969, 'learning_rate': 5e-05, 'epoch': 0.23}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0542, 'grad_norm': 0.17856967449188232, 'learning_rate': 4.587730870712401e-05, 'epoch': 0.46}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0565, 'grad_norm': 1.2704957723617554, 'learning_rate': 4.1754617414248024e-05, 'epoch': 0.69}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0563, 'grad_norm': 3.0632383823394775, 'learning_rate': 3.7631926121372034e-05, 'epoch': 0.91}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.5306587815284729, 'eval_accuracy': 0.8854, 'eval_runtime': 12.3566, 'eval_samples_per_second': 809.285, 'eval_steps_per_second': 12.706, 'epoch': 1.0}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0487, 'grad_norm': 0.5476275682449341, 'learning_rate': 3.3509234828496045e-05, 'epoch': 1.14}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0478, 'grad_norm': 6.9563374519348145, 'learning_rate': 2.9386543535620053e-05, 'epoch': 1.37}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0441, 'grad_norm': 14.539857864379883, 'learning_rate': 2.5263852242744063e-05, 'epoch': 1.6}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-4000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0452, 'grad_norm': 1.9192602634429932, 'learning_rate': 2.1141160949868074e-05, 'epoch': 1.83}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.548985481262207, 'eval_accuracy': 0.8929, 'eval_runtime': 12.4179, 'eval_samples_per_second': 805.286, 'eval_steps_per_second': 12.643, 'epoch': 2.0}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-4500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0407, 'grad_norm': 0.4755144715309143, 'learning_rate': 1.7018469656992088e-05, 'epoch': 2.06}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-5000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0303, 'grad_norm': 4.084561347961426, 'learning_rate': 1.2895778364116095e-05, 'epoch': 2.29}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-5500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0275, 'grad_norm': 11.744807243347168, 'learning_rate': 8.773087071240106e-06, 'epoch': 2.51}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-6000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0387, 'grad_norm': 0.3738650381565094, 'learning_rate': 4.650395778364117e-06, 'epoch': 2.74}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/nsmc_transformer/checkpoint-6500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0368, 'grad_norm': 16.536075592041016, 'learning_rate': 5.277044854881267e-07, 'epoch': 2.97}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.600199818611145, 'eval_accuracy': 0.8934, 'eval_runtime': 12.6781, 'eval_samples_per_second': 788.759, 'eval_steps_per_second': 12.384, 'epoch': 3.0}
{'train_runtime': 2464.8313, 'train_samples_per_second': 170.397, 'train_steps_per_second': 2.663, 'train_loss': 0.04326476039572651, 'epoch': 3.0}


TrainOutput(global_step=6564, training_loss=0.04326476039572651, metrics={'train_runtime': 2464.8313, 'train_samples_per_second': 170.397, 'train_steps_per_second': 2.663, 'train_loss': 0.04326476039572651, 'epoch': 3.0})

# 8- Evaluation

In [23]:
# 테스트 데이터셋을 사용하여 모델 평가
test_result = trainer.evaluate(tokenized_datasets['test'])

# 평가 결과 출력
print(test_result)

  0%|          | 0/782 [00:00<?, ?it/s]

{'eval_loss': 0.5967464447021484, 'eval_accuracy': 0.89354, 'eval_runtime': 62.7376, 'eval_samples_per_second': 796.97, 'eval_steps_per_second': 12.465, 'epoch': 3.0}
