In [None]:
!pip install transformers datasets wandb focal-loss -q
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# ✅ KoBigBird-based binary classifier for affinity (likability) change (utterance + context + MaxPool + BCE)

import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import wandb
from tqdm import tqdm
import numpy as np

# ✅ Constants
MODEL_NAME = "monologg/kobigbird-bert-base"  # Hugging Face checkpoint id used for both the tokenizer and the encoder
MAX_LEN = 4096  # tokenizer max_length: every sample is truncated/padded to this many tokens
BATCH_SIZE = 4  # batch size for both the train and test DataLoader
EPOCHS = 10  # number of full passes over the training loader
LR = 2e-5  # learning rate passed to AdamW

# ✅ Dataset (input_text → split into utterance + context, then concatenated)
class DialogueDataset(Dataset):
    """Torch dataset that turns dialogue records into fixed-length model inputs.

    Each record is a mapping that may contain ``"target_utterance"`` and
    ``"dialogue_history"`` and must contain ``"label_male"``. The utterance and
    the history are joined as ``utterance + " [SEP] " + history`` and tokenized
    with truncation and padding to ``max_len`` tokens.

    Args:
        data: Indexable collection of records (``len()`` and ``[]`` are used).
        tokenizer: Callable with the Hugging Face tokenizer call signature,
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Target sequence length. When ``None`` (the default) the
            module-level ``MAX_LEN`` constant is used, as before.
    """

    def __init__(self, data, tokenizer, max_len=None):
        self.data = data
        self.tokenizer = tokenizer
        # Resolve the default lazily so MAX_LEN is only consulted when the
        # caller did not pass an explicit length.
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Coerce a record field to a string so concatenation cannot fail."""
        if value is None:
            # A JSON ``null`` would otherwise raise TypeError on ``+``.
            return ""
        if isinstance(value, (list, tuple)):
            # NOTE(review): assumes a list-valued history holds one entry per
            # turn and that joining with spaces is acceptable - confirm against the data.
            return " ".join(str(turn) for turn in value)
        return str(value)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``label`` for one record.

        Raises:
            KeyError: If the record has no ``"label_male"`` field.
        """
        item = self.data[idx]
        utterance = self._as_text(item.get("target_utterance", ""))  # main utterance
        context = self._as_text(item.get("dialogue_history", ""))    # surrounding context
        combined = utterance + " [SEP] " + context                   # utterance + context

        inputs = self.tokenizer(
            combined,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples itself.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Binarize: scores strictly above 0.5 are the positive class.
        label = 1.0 if float(item["label_male"]) > 0.5 else 0.0
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.float)
        }

# ✅ Model with [CLS] + max-pooling and BCE output
class KoBigBirdBinaryClassifier(nn.Module):
    """Encoder plus a linear head that emits one binary logit per sequence.

    The head consumes the concatenation of the first-token ([CLS]) hidden state
    and a max-pool over the sequence axis of the last hidden state, so its input
    width is ``2 * hidden_size``. The output is a raw logit (no sigmoid).

    Args:
        model_name: Checkpoint id passed to ``AutoModel.from_pretrained``.
            When ``None`` (the default) the module-level ``MODEL_NAME`` is used.
    """

    def __init__(self, model_name=None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME if model_name is None else model_name)
        hidden = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden * 2, 1)  # binary output

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(B,)`` for a batch of token ids.

        Args:
            input_ids: Token ids, indexed as ``(batch, sequence)``.
            attention_mask: Same shape as ``input_ids``; non-zero marks real
                tokens, zero marks padding.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        cls_emb = hidden_states[:, 0, :]                                 # [CLS]

        # Exclude padded positions from the max: push them to the lowest
        # representable value so they can never win. Without this, hidden
        # states of pad tokens leak into the pooled features.
        # A row whose mask is all zeros would pool to that lowest value.
        is_padding = attention_mask.unsqueeze(-1) == 0
        lowest = torch.finfo(hidden_states.dtype).min
        pooled = hidden_states.masked_fill(is_padding, lowest).max(dim=1).values  # masked max pooling

        combined = torch.cat([cls_emb, pooled], dim=-1)
        return self.classifier(combined).squeeze(-1)  # (B,) → Binary logit

# ✅ Reproducibility
# The split below was already seeded; seed torch as well so the classifier-head
# initialization and the DataLoader shuffle order are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# ✅ Load data
# One JSON object per non-blank line (JSON Lines).
with open("dialogues_human.jsonl", "r", encoding="utf-8") as f:
    all_data = [json.loads(line) for line in f if line.strip()]

# 80/20 train/test split with a fixed seed.
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=SEED)

# ✅ Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = DialogueDataset(train_data, tokenizer)
test_dataset = DialogueDataset(test_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)  # evaluation order stays fixed

# ✅ Initialize
model = KoBigBirdBinaryClassifier().cuda()  # requires a CUDA device
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()  # expects raw logits; applies the sigmoid internally

# Record the hyperparameters on the run so results can be traced back to their settings.
wandb.init(
    project="huggingface",
    name="kobigbird-binary-utterance-context",
    config={
        "model_name": MODEL_NAME,
        "max_len": MAX_LEN,
        "batch_size": BATCH_SIZE,
        "epochs": EPOCHS,
        "lr": LR,
        "seed": SEED,
    },
)
wandb.watch(model, log_freq=100)

# ✅ Training
# Plain supervised loop: one optimizer step per batch, loss logged to W&B every step.
model.train()
for epoch in range(EPOCHS):
    for batch in tqdm(train_loader):
        # Move the three tensors of the batch onto the GPU in a fixed order.
        input_ids, attention_mask, labels = (
            batch[field].cuda() for field in ("input_ids", "attention_mask", "label")
        )

        # Clear stale gradients, then forward → loss → backward → update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        wandb.log({"train_loss": loss.item()})

# ✅ Evaluation
model.eval()  # disable dropout for inference
preds, labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        label = batch['label'].cuda()

        logits = model(input_ids, attention_mask)
        prob = torch.sigmoid(logits)
        pred = (prob > 0.5).long()  # decision threshold of 0.5 on the probability

        # Collect plain Python ints so the metric functions get homogeneous lists.
        preds.extend(pred.cpu().tolist())
        labels.extend(label.cpu().long().tolist())

acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average="binary")
# Binary F1 only scores the positive class, so a model that predicts "1" for
# everything still looks good; macro-F1 averages both classes and exposes that.
macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)
# Pin the label order so the matrix is always 2x2, even if a class is absent.
conf_mat = confusion_matrix(labels, preds, labels=[0, 1])
print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")
print(f"Macro-F1: {macro_f1:.4f}")
print("Confusion Matrix:\n", conf_mat)

wandb.log({"accuracy": acc, "f1": f1, "macro_f1": macro_f1})

# ✅ Save model & upload as artifact
torch.save(model.state_dict(), "kobigbird_binary_model.pt")
artifact = wandb.Artifact("kobigbird-binary-model", type="model")
artifact.add_file("kobigbird_binary_model.pt")
wandb.log_artifact(artifact)

# Close the run explicitly: in a notebook the process keeps running, so without
# this the run stays open until the next wandb.init() or a kernel shutdown.
wandb.finish()


0,1
accuracy,▁
macro_f1,▁
train_loss,▇▅▄▆▄▇▆▇▅▅▅▃▄▅▄▂█▂▂▂▂▃▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.57143
macro_f1,0.36574
train_loss,0.0299


100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]
100%|██████████| 28/28 [00:39<00:00,  1.40s/it]


Accuracy: 0.6071, F1: 0.7556
Confusion Matrix:
 [[ 0 11]
 [ 0 17]]


<Artifact kobigbird-binary-model>

1번째 https://wandb.ai/cres4205-sangmyung-university/huggingface/runs/9ypgun68  

Accuracy: 0.6071, Macro-F1: 0.2519