In [None]:
# 프로젝트 : Transformer 기반 한국어 대화 생성

In [None]:
# 1) 환경 설정
# !pip install pandas numpy torch transformers
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Pytorch Version : {torch.__version__}, Device : {device}')

Pytorch Version : 2.7.1+cu118, Device : cuda


In [None]:
# import torch
# print("CUDA 사용 가능:", torch.cuda.is_available())
# print("GPU 이름:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "GPU 없음")

CUDA 사용 가능: True
GPU 이름: NVIDIA GeForce RTX 3060 Laptop GPU


In [None]:
# 2) 데이터셋 로딩 및 전처리
# ChatbotData.csv 다운로드
url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
df = pd.read_csv(url)

# 간단한 전처리
df = df.dropna()
df = df.sample(frac=1).reset_index(drop=True)  # 셔플

# 토크나이저 설정 (KoGPT2 기반)
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2")
kogpt_model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2").to(device)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
kogpt_model.resize_token_embeddings(len(tokenizer))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(51201, 768)

In [34]:
# 3) Dataset 클래스 정의
class ChatDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=40):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        q = self.df.iloc[idx]["Q"]
        a = self.df.iloc[idx]["A"]
        q_ids = tokenizer.encode(q, max_length=self.max_len, padding='max_length', truncation=True)
        a_ids = tokenizer.encode(a, max_length=self.max_len, padding='max_length', truncation=True)
        return torch.tensor(q_ids), torch.tensor(a_ids)

    def __len__(self):
        return len(self.df)

In [35]:
# 4) Transformer 모델 정의
class TransformerChatbot(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).permute(1, 0, 2)  # [seq_len, batch, d_model]
        tgt = self.embedding(tgt).permute(1, 0, 2)
        output = self.transformer(src, tgt)
        output = self.fc_out(output)
        return output.permute(1, 0, 2)  # [batch, seq_len, vocab_size]

In [36]:
# 5) 학습 진행
dataset = ChatDataset(df, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

vocab_size = len(tokenizer)
model = TransformerChatbot(vocab_size=vocab_size).to(device)
model.embedding = nn.Embedding(len(tokenizer), 512).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

for epoch in range(30):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = loss_fn(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(loader):.4f}")




Epoch 1 Loss: 6.1420
Epoch 2 Loss: 4.0712
Epoch 3 Loss: 3.0455
Epoch 4 Loss: 2.2801
Epoch 5 Loss: 1.6892
Epoch 6 Loss: 1.2364
Epoch 7 Loss: 0.8854
Epoch 8 Loss: 0.6199
Epoch 9 Loss: 0.4307
Epoch 10 Loss: 0.2974
Epoch 11 Loss: 0.2053
Epoch 12 Loss: 0.1430
Epoch 13 Loss: 0.1035
Epoch 14 Loss: 0.0842
Epoch 15 Loss: 0.0744
Epoch 16 Loss: 0.0721
Epoch 17 Loss: 0.0633
Epoch 18 Loss: 0.0590
Epoch 19 Loss: 0.0550
Epoch 20 Loss: 0.0526
Epoch 21 Loss: 0.0541
Epoch 22 Loss: 0.0524
Epoch 23 Loss: 0.0457
Epoch 24 Loss: 0.0417
Epoch 25 Loss: 0.0422
Epoch 26 Loss: 0.0415
Epoch 27 Loss: 0.0430
Epoch 28 Loss: 0.0430
Epoch 29 Loss: 0.0398
Epoch 30 Loss: 0.0369


In [None]:
# 6) 대화 생성 함수

def kogpt_generate(text, max_len=50):
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = kogpt_model.generate(
            input_ids,
            max_length=max_len,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.8,
            temperature=0.9,
            top_k=50,
            top_p=0.95,
            do_sample=True
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
# 7) 테스트

print(kogpt_generate("오늘 기분이 어때요?"))


오늘 기분이 어때요?"
"아뇨. 그럼 제가 대신 마무리를 잘 하겠습니다. 저는 내일 다시 뵙게 될 겁니다. 고맙습니다, 선생님."
오랜만에 만난 남매는 서로 부러운 눈빛
