In [None]:
# =============================
# 0. 라이브러리 설치
# =============================
!pip install transformers sentencepiece pandas torch --quiet

# =============================
# 1. 데이터셋 로드
# =============================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Load the question/answer pairs; the code below reads the "Q" and "A" columns.
data = pd.read_csv("~/work/transformer_chatbot/data/ChatbotData.csv")
print(data.head(), flush=True)

# Drop rows with a missing question or answer before converting to text:
# astype(str) would otherwise turn NaN into the literal string "nan" and that
# string would be used as training text.
pairs = data.dropna(subset=["Q", "A"])
questions = pairs["Q"].astype(str).tolist()
answers = pairs["A"].astype(str).tolist()

# =============================
# 2. 토크나이저 (SentencePiece 기반)
# =============================
from transformers import PreTrainedTokenizerFast

# Special tokens handed to the tokenizer when it is loaded from the checkpoint.
special_tokens = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2", **special_tokens
)

# =============================
# 3. 데이터셋 클래스
# =============================
class ChatDataset(Dataset):
    """Question/answer pairs laid out for causal language-model fine-tuning.

    Every example is ONE token sequence, ``question <eos> answer <eos>``,
    right-padded to ``max_len``. A causal LM computes its loss by comparing
    the prediction at position ``t`` with the label at position ``t + 1`` of
    the same sequence, so ``labels`` must be aligned with ``input_ids``
    rather than being a separately tokenized answer.

    Label positions that belong to the question (including its ``<eos>``) or
    to padding are set to ``IGNORE_INDEX`` so that only the answer tokens and
    the closing ``<eos>`` are scored.

    Args:
        questions: sequence of question strings.
        answers: sequence of answer strings, same length as ``questions``.
        tokenizer: object providing ``encode(text, add_special_tokens=False)``
            (returning a list of token ids), ``eos_token_id`` and
            ``pad_token_id``.
        max_len: fixed length of every returned tensor (at least 2, to hold
            the two ``<eos>`` separators).

    Raises:
        ValueError: if the two lists differ in length or ``max_len < 2``.
    """

    # Label value used here to mark positions that must not be scored.
    IGNORE_INDEX = -100

    def __init__(self, questions, answers, tokenizer, max_len=64):
        if len(questions) != len(answers):
            raise ValueError(
                f"questions ({len(questions)}) and answers ({len(answers)}) "
                "must have the same length"
            )
        if max_len < 2:
            raise ValueError("max_len must be at least 2")
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one pair.

        All three values are 1-D ``torch.long`` tensors of length ``max_len``.
        """
        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id

        # Special tokens are inserted by hand below, so ask for raw ids only.
        q_ids = list(
            self.tokenizer.encode(self.questions[idx], add_special_tokens=False)
        )
        a_ids = list(
            self.tokenizer.encode(self.answers[idx], add_special_tokens=False)
        )

        # Two slots are reserved for the <eos> separators. The question may
        # use whatever the answer does not need, but is always allowed at
        # least half of the remaining room; the answer gets the rest.
        room = self.max_len - 2
        q_ids = q_ids[: max(room - len(a_ids), room // 2)]
        a_ids = a_ids[: room - len(q_ids)]

        prompt = q_ids + [eos_id]
        reply = a_ids + [eos_id]
        n_pad = self.max_len - len(prompt) - len(reply)

        input_ids = prompt + reply + [pad_id] * n_pad
        attention_mask = [1] * (len(prompt) + len(reply)) + [0] * n_pad
        labels = (
            [self.IGNORE_INDEX] * len(prompt)
            + reply
            + [self.IGNORE_INDEX] * n_pad
        )

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

# Wrap the Q/A pairs in a dataset and serve them in shuffled batches of 32.
train_dataset = ChatDataset(
    questions=questions,
    answers=answers,
    tokenizer=tokenizer,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)

# =============================
# 4. 모델 정의
# =============================
from transformers import GPT2LMHeadModel
from torch.optim import AdamW

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained weights and resize the embedding matrix to the
# tokenizer's current vocabulary size.
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# =============================
# 5. 학습 루프 (loss + token accuracy)
# =============================
# Label value that the Hugging Face causal-LM loss skips (see gr_refs docs).
IGNORE_INDEX = -100

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Padding must not contribute to the loss: replace any pad id that is
        # still present in the labels with the ignore index.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, IGNORE_INDEX)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # =============================
        # Token-level accuracy
        # =============================
        with torch.no_grad():
            # The logit at position t predicts the token at position t + 1,
            # so drop the last prediction and the first label before
            # comparing (the same shift the model applies for its loss).
            preds = outputs.logits[:, :-1, :].argmax(dim=-1)
            targets = labels[:, 1:]
            mask = targets.ne(IGNORE_INDEX)
            total_correct += ((preds == targets) & mask).sum().item()
            total_tokens += mask.sum().item()

    # Guard the averages against an empty loader / no scored tokens.
    avg_loss = total_loss / max(len(train_loader), 1)
    avg_acc = total_correct / max(total_tokens, 1)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Token Accuracy: {avg_acc:.4f}", flush=True)

# =============================
# 6. 챗봇 테스트 함수
# =============================
def chat(model, tokenizer, text, max_len=50):
    """Generate a sampled reply to ``text`` and return only the reply.

    The prompt fed to the model is ``text`` followed by the tokenizer's EOS
    token; sampling uses top-k / top-p with a repetition penalty.

    Args:
        model: causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: tokenizer exposing ``encode``, ``decode``, ``eos_token``,
            ``eos_token_id`` and ``pad_token_id``.
        text: the user's utterance.
        max_len: maximum number of newly generated tokens. The prompt length
            is not counted, so a long prompt cannot use up the reply budget.

    Returns:
        The decoded reply with the prompt removed, special tokens skipped and
        surrounding whitespace stripped.
    """
    model.eval()

    # Place the prompt on the model's own device instead of relying on a
    # module-level ``device`` variable.
    input_ids = tokenizer.encode(
        text + tokenizer.eos_token, return_tensors="pt"
    ).to(model.device)
    # A single, unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_len,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )

    # generate() returns prompt + continuation; keep only the continuation so
    # the caller does not get the question echoed back.
    reply_ids = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

# =============================
# 7. 챗봇 테스트
# =============================
# Print the model's reply to each of two sample prompts.
for prompt in ("안녕?", "오늘 기분 어때?"):
    print(chat(model, tokenizer, prompt))


                 Q            A  label
0           12시 땡!   하루가 또 가네요.      0
1      1지망 학교 떨어졌어    위로해 드립니다.      0
2     3박4일 놀러가고 싶다  여행은 언제나 좋죠.      0
3  3박4일 정도 놀러가고 싶다  여행은 언제나 좋죠.      0
4          PPL 심하네   눈살이 찌푸려지죠.      0


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1 | Loss: 0.8487 | Token Accuracy: 0.0105
