# Run on Google Colab

In [None]:
import json
import random

# Read the dictionary from the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('data/text_data/output_text.json', 'r', encoding='utf-8') as file:
    data_loaded = json.load(file)

# Shuffle the entries with a fixed seed so that the train/test split built from
# this ordering is the same on every run (important when training is resumed
# from a checkpoint: a different shuffle would move held-out rows into training).
SHUFFLE_SEED = 42
items = list(data_loaded.items())
random.Random(SHUFFLE_SEED).shuffle(items)

# Convert the shuffled (key, value) pairs back into a dict; dicts preserve
# insertion order, so iteration follows the shuffled order.
data_loaded_shuffled = dict(items)


In [None]:
# Split into training and test sets.
# Every value of `data_loaded_shuffled` is indexed as [0] -> model input and
# [1] -> target. The values are materialised once instead of rebuilding the
# same list four times, and the cut-off is named instead of repeated.
TRAIN_SIZE = 5000  # the first TRAIN_SIZE shuffled pairs train; the rest is held out
pairs = list(data_loaded_shuffled.values())
train_x = [pair[0] for pair in pairs[:TRAIN_SIZE]]
train_y = [pair[1] for pair in pairs[:TRAIN_SIZE]]
test_x = [pair[0] for pair in pairs[TRAIN_SIZE:]]
test_y = [pair[1] for pair in pairs[TRAIN_SIZE:]]

In [None]:
# Run configuration shared by the cells below.
batch_size, train_epochs = 16, 1000   # per-device batch size; epoch budget handed to TrainingArguments
name_folder = "skt_kogpt2_base_v2"    # sub-folder name interpolated into the output/log paths
server_port = "4561"                  # kept as a string; not referenced in the visible cells

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

# Load the pretrained model ('skt/kogpt2-base-v2').
model_name = "skt/kogpt2-base-v2"
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load the tokenizer from the same checkpoint id as the model (reusing
# `model_name` so the two cannot drift apart), declaring the special tokens
# explicitly.
# NOTE(review): bos_token and eos_token are both '</s>' here, so the two share
# a single id.
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    model_name,
    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Preprocessing function for causal-LM fine-tuning
def preprocess_function(examples, max_length=128):
    """Turn a batch of (input_text, target_text) pairs into causal-LM training rows.

    A decoder-only model such as GPT-2 shifts ``labels`` against ``input_ids``
    internally, so both must describe the *same* token sequence. Tokenizing the
    input and the target separately and using one as ``input_ids`` and the other
    as ``labels`` (the encoder-decoder recipe) leaves them misaligned. This
    function therefore builds a single sequence per example::

        input_ids = prompt tokens + target tokens + EOS + padding
        labels    = -100 ...      + target tokens + EOS + -100 ...

    Prompt and padding positions are set to -100 so that the loss is computed
    only on the target continuation (including its closing EOS).

    Args:
        examples: Batch mapping with the keys ``'input_text'`` and
            ``'target_text'``, each a list of strings of equal length.
        max_length: Fixed length of every produced row. When prompt plus target
            do not fit, the target (capped at ``max_length - 1`` tokens plus
            EOS) is kept and the end of the prompt is cut to the remaining room,
            so every row keeps at least one supervised token.

    Returns:
        dict with the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; each value is a list with one ``max_length``-long list of
        ints per example.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Tokenize without padding/special tokens; the layout is assembled by hand.
    prompt_batch = tokenizer(examples['input_text'], add_special_tokens=False)['input_ids']
    target_batch = tokenizer(examples['target_text'], add_special_tokens=False)['input_ids']
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id

    model_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for prompt_ids, target_ids in zip(prompt_batch, target_batch):
        # Reserve one slot for EOS so the model learns where the answer ends.
        target_ids = list(target_ids)[:max_length - 1] + [eos_id]
        # The prompt only gets whatever room the target leaves.
        prompt_ids = list(prompt_ids)[:max_length - len(target_ids)]
        n_real = len(prompt_ids) + len(target_ids)
        n_pad = max_length - n_real

        model_inputs['input_ids'].append(prompt_ids + target_ids + [pad_id] * n_pad)
        model_inputs['attention_mask'].append([1] * n_real + [0] * n_pad)
        # -100 is the index ignored by the model's cross-entropy loss.
        model_inputs['labels'].append([-100] * len(prompt_ids) + target_ids + [-100] * n_pad)
    return model_inputs

In [None]:
# Build the train/test datasets, then tokenize each of them.
dataset_train, dataset_test = (
    Dataset.from_dict({'input_text': inputs, 'target_text': targets})
    for inputs, targets in ((train_x, train_y), (test_x, test_y))
)

# Batched map: preprocess_function receives a dict of column lists per batch.
tokenized_train_datasets, tokenized_test_datasets = (
    split.map(preprocess_function, batched=True)
    for split in (dataset_train, dataset_test)
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/994 [00:00<?, ? examples/s]

In [None]:
# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,  # training batch size per device
    per_device_eval_batch_size=batch_size,   # evaluation batch size per device
    output_dir=f'../content/drive/MyDrive/project_3_git/data/t5/{name_folder}',         # directory for the model and checkpoints
    num_train_epochs=train_epochs,           # number of training epochs
    logging_dir=f'../content/drive/MyDrive/project_3_git/data/t5/{name_folder}/logs',   # directory for the TensorBoard logs
    logging_steps=100,                       # log every 100 steps
    report_to='tensorboard',                 # report metrics to TensorBoard
    load_best_model_at_end=True,             # reload the best checkpoint once training ends
    metric_for_best_model='eval_loss',       # state explicitly what "best" means ...
    greater_is_better=False,                 # ... a lower evaluation loss is better
    eval_strategy='epoch',                   # evaluate after every epoch
    save_strategy='epoch',                   # save a checkpoint after every epoch
    # One checkpoint is written per epoch; without a cap, a long run keeps every
    # one of them in the output directory. With load_best_model_at_end=True the
    # best checkpoint is always retained alongside the most recent one.
    save_total_limit=2,
    # resume_from_checkpoint=True            # continue a previous run
)

In [None]:
# Create the Trainer object
# EarlyStoppingCallback is imported from transformers.trainer_callback (not transformers.trainer_utils).
from transformers.trainer_callback import EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_test_datasets,
    # Stop once the evaluation metric has failed to improve for 3 consecutive
    # evaluations (one evaluation per epoch). A patience larger than the epoch
    # budget can never trigger, which turns the callback into a no-op.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Train the model; the returned value is shown as the cell output.
trainer.train() # to continue a previous run, pass resume_from_checkpoint=checkpoint_dir

Epoch,Training Loss,Validation Loss
1,0.6403,0.72245
2,0.6257,0.72698
3,0.6044,0.739847
4,0.5894,0.75197
5,0.5847,0.758069
6,0.5749,0.760812
7,0.5925,0.754041
8,0.5873,0.756513
9,0.5681,0.764588
10,0.5756,0.773743


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3443, training_loss=0.5876596962127996, metrics={'train_runtime': 539.8001, 'train_samples_per_second': 926.269, 'train_steps_per_second': 57.984, 'total_flos': 3592765440000000.0, 'train_loss': 0.5876596962127996, 'epoch': 11.0})

In [None]:
# Persist the model and the tokenizer to their respective directories.
tokenizer_save_path = '../content/drive/MyDrive/project_3_git/data/tokenizer'
model_save_path = '../content/drive/MyDrive/project_3_git/data/t5'

# Model first, then tokenizer; both expose save_pretrained(directory).
for artifact, destination in ((model, model_save_path), (tokenizer, tokenizer_save_path)):
    artifact.save_pretrained(destination)

# Report where each artifact was written (messages unchanged).
print(f"모델이 '{model_save_path}'에 저장되었습니다.")
print(f"토크나이저가 '{tokenizer_save_path}'에 저장되었습니다.")