In [1]:
# 기본 작업 경로 설정

import os
notebook_path = os.path.abspath("last project/temp/gpt2/here")
notebook_dir = os.path.dirname(notebook_path)
os.chdir(notebook_dir)

# 현재 작업 디렉토리 출력
print("Current working directory: ", os.getcwd())

Current working directory:  /mnt/e/py_data/last project/temp/gpt2


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Step 1: Tokenizer 및 모델 준비
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Step 2: 데이터셋 준비
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
    return dataset

train_dataset = load_dataset('test.txt', tokenizer)

# Step 3: Data collator 준비
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
# Step 4: TrainingArguments 설정
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=10,
    save_steps=1000000,
    save_total_limit=2,
    report_to='tensorboard',  # TensorBoard로 로깅
    logging_dir='./logs',  # 로그 파일이 저장될 디렉토리
    logging_steps=10  # 로깅 간격
)

# Step 5: Trainer 설정 및 학습 시작
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

In [9]:
# Step 6: 모델 저장
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 저장된 모델 및 토크나이저 불러오기
model_path = './fine_tuned_gpt2'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# 평가 모드로 변경
model.eval()

def generate_text(prompt, model, tokenizer, max_length=128, num_return_sequences=1):
    # 입력 텍스트를 토큰화
    inputs = tokenizer.encode(prompt, return_tensors='pt')

    # 생성 인자를 설정하여 모델이 텍스트를 생성
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=1.9,
        do_sample=True,
        early_stopping=True
    )

    # 생성된 텍스트를 디코딩
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    return generated_texts

# 예시: "prompt"에 원하는 문장을 넣어서 결과를 확인
prompt = "오늘"
generated_texts = generate_text(prompt, model, tokenizer)
    
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i+1}:")
    print(text)
    print()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1:
오늘 하루도 사랑과 기쁨요.
잉상 밝게 진심으로 행복한 순간들을 만껏 좋은 일쳴세어 원합니다.

