# Stage 0 - external emotion pre-training

In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)
import torch
import evaluate
import numpy as np
import os

# Backbone checkpoint used for the Stage 0 emotion classifier.
MODEL_NAME = "roberta-base"

# Local output directory; the encoder-only weights are written to
# SAVE_ENCODER_PATH once training has finished.
SAVE_DIR = "./goemo_out"
os.makedirs(SAVE_DIR, exist_ok=True)
SAVE_ENCODER_PATH = os.path.join(SAVE_DIR, "pretrained_emotion_encoder.pt")

# 1) Load GoEmotions
goemo = load_dataset("go_emotions")   # train / validation / test

# Number of classes, read from the label feature's name list
# (prints 28 in the recorded run).
num_emotions = len(goemo["train"].features["labels"].feature.names)
print("num_emotions:", num_emotions)

# 2) tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 3) Preprocessing: collapse to a single label + tokenize
def preprocess(batch):
    """Tokenize a batch of texts and reduce each label list to one class id.

    Args:
        batch: Mapping with a ``"text"`` list and a ``"labels"`` list of
            per-example label lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        integer per example.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Keep only the first label when several are present (representative
    # emotion); an empty label list falls back to class 0.
    encoded["labels"] = [
        labels[0] if len(labels) > 0 else 0
        for labels in batch["labels"]
    ]
    return encoded

# Apply preprocess() to every split in batches and drop all original columns,
# so only the tokenizer outputs and the single-label column remain.
goemo_proc = goemo.map(
    preprocess,
    batched=True,
    remove_columns=goemo["train"].column_names,  # drops text, labels, id, etc.
)

# Return the listed columns as torch tensors when rows are indexed.
goemo_proc.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Sanity check: dataset summary and the first processed training example.
print(goemo_proc)
print(goemo_proc["train"][0])

# 4) Model: multi-class classification (CrossEntropyLoss)
model_emo = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_emotions,
)

# 5) metric (plain accuracy)
acc_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element sequence of class logits and reference labels.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    result = acc_metric.compute(predictions=predicted, references=references)
    return {"accuracy": result["accuracy"]}

# 6) TrainingArguments
# Evaluate and checkpoint once per epoch; after training, reload the
# checkpoint with the highest validation accuracy.
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    num_train_epochs=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=200,
    report_to="none",
)

# 7) Trainer
# `processing_class` replaces the deprecated `tokenizer` argument; passing
# `tokenizer=` is what triggers the deprecation warning on this call.
trainer_goemo = Trainer(
    model=model_emo,
    args=training_args,
    train_dataset=goemo_proc["train"],
    eval_dataset=goemo_proc["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 8) Train
trainer_goemo.train()

# 9) Save only the encoder (the `.roberta` submodule of the roberta-base
# classifier); the classification head is not written to this file.
torch.save(model_emo.roberta.state_dict(), SAVE_ENCODER_PATH)
print("✅ Saved pretrained emotion encoder to:", SAVE_ENCODER_PATH)


num_emotions: 28
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})
{'labels': tensor(27), 'input_ids': tensor([   0, 2387, 5548,  689,   16,  932,   38,  399,   75,   33,    7, 7142,
        2185,    4,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_goemo = Trainer(
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4886,1.397162,0.578879
2,1.2734,1.344146,0.592518


  arr = np.array(obj)


✅ Saved pretrained emotion encoder to: ./goemo_out\pretrained_emotion_encoder.pt


# Stage 1 — Multi-View Dataset 구축 (context-only + ctx+rsp)

In [2]:
from datasets import load_from_disk
from transformers import AutoTokenizer
import torch

# ===== Path settings =====
RAW_ED_PATH = "empathy_dataset/hf_raw_with_emo"  # change to wherever you saved the dataset
SAVE_ENCODER_PATH = "./goemo_out/pretrained_emotion_encoder.pt"
MODEL_NAME = "roberta-base"

# 1) Load the ED dataset from disk
ds = load_from_disk(RAW_ED_PATH)
print(ds)

# Check: train[0] must contain 'context', 'response', 'label', 'emotion_id', etc.
print(ds["train"][0])

# 2) Build the multi-view text fields
def build_multiview(example):
    """Add the two text views used downstream to ``example`` and return it.

    ``input_ctx`` holds the context only (emotion-classification view);
    ``input_full`` holds context plus response (empathy-regression view).
    The mapping is modified in place.
    """
    context_text = example["context"]
    response_text = example["response"]

    # Context-only view.
    ctx_view = "CTX: " + context_text
    # Context + response view.
    full_view = f"CTX: {context_text} [SEP] RSP: {response_text}"

    example["input_ctx"] = ctx_view
    example["input_full"] = full_view
    return example

# Add the input_ctx / input_full text columns to every example.
ds_mv = ds.map(build_multiview)

# 3) Prepare the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Both views are truncated/padded to this many tokens in tokenize_views().
MAX_LEN = 256

def tokenize_views(batch):
    """Tokenize both text views of a batch and attach the two targets.

    Args:
        batch: Mapping with ``"input_ctx"``, ``"input_full"``, ``"label"``
            and ``"emotion_id"`` columns.

    Returns:
        Dict with ids and attention masks for each view, plus ``"labels"``
        and ``"emotion_labels"``.
    """
    def _encode(texts):
        # Identical settings for both views: truncate and pad to MAX_LEN.
        return tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
        )

    ctx_encoding = _encode(batch["input_ctx"])
    full_encoding = _encode(batch["input_full"])

    features = {
        "input_ids_ctx": ctx_encoding["input_ids"],
        "attention_mask_ctx": ctx_encoding["attention_mask"],
        "input_ids_full": full_encoding["input_ids"],
        "attention_mask_full": full_encoding["attention_mask"],
    }
    # Empathy-regression target: label (= final_hybrid)
    features["labels"] = batch["label"]
    # Emotion-classification target: emotion_id
    features["emotion_labels"] = batch["emotion_id"]
    return features

# 4) Drop the original columns and keep only the multi-view tensors
cols_to_remove = ds_mv["train"].column_names
ds_tokenized = ds_mv.map(
    tokenize_views,
    batched=True,
    remove_columns=cols_to_remove,
)

# 5) Set the torch tensor format on every split
for split in ds_tokenized.keys():
    ds_tokenized[split].set_format(
        type="torch",
        columns=[
            "input_ids_ctx", "attention_mask_ctx",
            "input_ids_full", "attention_mask_full",
            "labels", "emotion_labels",
        ],
    )

# Sanity check: dataset summary and the first tokenized training example.
print(ds_tokenized)
print(ds_tokenized["train"][0])


DatasetDict({
    train: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 5076
    })
    validation: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 627
    })
    test: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 669
    })
})
{'context': "I felt guilty when I was driving home one night and a person tried to fly into my lane, and didn't see me. I honked and they swerved back into their lane, slammed on their brakes, and hit the water cones. [SEP] Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off 

# 공유 encoder + multi-view cascade 모델 정의
- Emotion branch: input_ctx만 encoder에 넣고 emotion logits 계산
- Empathy branch: input_full을 encoder에 넣고 CLS + emotion_logits를 concat해서 최종 공감 점수 회귀
- pre-trained encoder로 초기화: Stage 0에서 만든 pretrained_emotion_encoder.pt

In [3]:
import torch.nn as nn
from transformers import AutoModel

# Derive the number of emotion classes from the largest emotion_id in the
# train split (assumes ids are 0-based and that the train split contains the
# highest id — TODO confirm). Prints 32 in the recorded run.
num_emotions = int(max(ds["train"]["emotion_id"])) + 1
print("num_emotions:", num_emotions)

class MultiViewCascadeModel(nn.Module):
    """Shared-encoder model with an emotion head and an empathy regression head.

    One encoder processes two views of each example:

    * the context-only view feeds ``emo_head`` to produce emotion logits;
    * the context+response view yields a first-token vector that is
      concatenated with those emotion logits and fed to ``reg_head`` to
      produce one scalar score per example.

    Args:
        encoder_name: Name or path passed to ``AutoModel.from_pretrained``.
        num_emotions: Number of emotion classes (width of the emotion logits).
        pretrained_encoder_path: Optional path to an encoder ``state_dict``
            written with ``torch.save``; loaded non-strictly into the encoder.
        lambda_emo: Weight of the emotion cross-entropy term in the total
            loss. Defaults to 0.1.
    """

    def __init__(
        self,
        encoder_name,
        num_emotions,
        pretrained_encoder_path=None,
        lambda_emo=0.1,
    ):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(encoder_name)
        hidden = self.encoder.config.hidden_size
        self.lambda_emo = lambda_emo

        # Optionally overwrite the encoder with the Stage 0 weights.
        if pretrained_encoder_path is not None:
            print(">> Loading pretrained encoder from:", pretrained_encoder_path)
            # weights_only=True restricts unpickling to tensors and plain
            # containers, which is all a state_dict holds, and avoids
            # executing arbitrary pickled objects from the file.
            state = torch.load(
                pretrained_encoder_path,
                map_location="cpu",
                weights_only=True,
            )
            # strict=False tolerates key differences; report them instead of
            # discarding the result so a wrong checkpoint does not go unnoticed.
            load_result = self.encoder.load_state_dict(state, strict=False)
            if load_result.missing_keys:
                print(
                    ">> Encoder keys missing from checkpoint (left as initialized):",
                    load_result.missing_keys,
                )
            if load_result.unexpected_keys:
                print(
                    ">> Checkpoint keys not used by encoder (ignored):",
                    load_result.unexpected_keys,
                )

        # Emotion head (context-only view)
        self.emo_head = nn.Linear(hidden, num_emotions)

        # Empathy regression head (first-token vector of the full view
        # concatenated with the emotion logits)
        self.reg_head = nn.Sequential(
            nn.Linear(hidden + num_emotions, hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden, 1),
        )

        self.loss_reg = nn.SmoothL1Loss()
        self.loss_emo = nn.CrossEntropyLoss()

    def forward(
        self,
        input_ids_ctx=None,
        attention_mask_ctx=None,
        input_ids_full=None,
        attention_mask_full=None,
        labels=None,
        emotion_labels=None,
        **kwargs,
    ):
        """Run both views through the shared encoder.

        Args:
            input_ids_ctx, attention_mask_ctx: Tokenized context-only view.
            input_ids_full, attention_mask_full: Tokenized context+response view.
            labels: Optional regression targets (cast to float).
            emotion_labels: Optional emotion class ids (cast to long).
            **kwargs: Accepted and ignored.

        Returns:
            Dict with ``"loss"`` (``None`` unless both label tensors are
            given) and ``"logits"``: the regression score in column 0
            followed by the detached emotion logits.
        """
        # 1) Emotion branch: context-only
        out_ctx = self.encoder(
            input_ids=input_ids_ctx,
            attention_mask=attention_mask_ctx,
        )
        cls_ctx = out_ctx.last_hidden_state[:, 0]          # [B, H]
        emo_logits = self.emo_head(cls_ctx)                # [B, C]

        # 2) Empathy branch: context + response
        out_full = self.encoder(
            input_ids=input_ids_full,
            attention_mask=attention_mask_full,
        )
        cls_full = out_full.last_hidden_state[:, 0]        # [B, H]

        # Concatenate [CLS_full ; emo_logits] and regress. The emotion logits
        # are not detached here, so the regression loss also back-propagates
        # through the emotion head.
        feat = torch.cat([cls_full, emo_logits], dim=-1)   # [B, H + C]
        reg_score = self.reg_head(feat).squeeze(-1)        # [B]

        loss = None
        if labels is not None and emotion_labels is not None:
            # Empathy regression loss
            loss_r = self.loss_reg(reg_score, labels.float())
            # Emotion classification loss
            loss_e = self.loss_emo(emo_logits, emotion_labels.long())
            loss = loss_r + self.lambda_emo * loss_e

        # Pack both outputs into one [B, 1 + C] tensor so the Trainer hands a
        # single array to compute_metrics, which splits it again.
        logits_concat = torch.cat(
            [reg_score.unsqueeze(-1), emo_logits.detach()],
            dim=-1,
        )  # [B, 1 + C]

        return {
            "loss": loss,
            "logits": logits_concat,
        }


num_emotions: 32


# 3. Trainer용 Metrics (MAE + Spearman + emotion accuracy)

In [4]:
import numpy as np
import evaluate
from scipy.stats import spearmanr

# Metric objects loaded via `evaluate`. NOTE(review): compute_metrics_multi
# in this cell computes MAE and accuracy directly with NumPy and does not
# reference either object.
mae_metric = evaluate.load("mae")
acc_metric = evaluate.load("accuracy")

def compute_metrics_multi(eval_pred):
    """Compute MAE, Spearman correlation and emotion accuracy.

    Args:
        eval_pred: ``(logits, labels)`` where ``logits`` is an array of shape
            ``(N, 1 + C)`` (column 0 is the regression prediction, the rest
            are emotion logits) and ``labels`` is a pair
            ``(regression targets, emotion class ids)``.

    Returns:
        Dict with ``"eval_mae"``, ``"eval_spearman"`` and
        ``"eval_emotion_accuracy"`` as plain Python floats.
        ``"eval_spearman"`` is 0.0 when the correlation is undefined
        (for example, constant predictions).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    # Flatten the targets so that an (N, 1) array cannot silently broadcast
    # against the (N,) predictions into an (N, N) matrix.
    reg_true = np.asarray(labels[0], dtype=np.float64).reshape(-1)
    emo_true = np.asarray(labels[1]).reshape(-1)

    reg_pred = logits[:, 0].astype(np.float64)      # predicted score, (N,)
    emo_pred = np.argmax(logits[:, 1:], axis=-1)    # predicted class, (N,)

    mae = float(np.mean(np.abs(reg_pred - reg_true)))

    # spearmanr reports an undefined correlation as NaN rather than None, so
    # test for finiteness. Index 0 is the statistic in every SciPy version.
    corr = spearmanr(reg_true, reg_pred)[0]
    spearman_corr = float(corr) if corr is not None and np.isfinite(corr) else 0.0

    emo_acc = float((emo_pred == emo_true).mean())

    return {
        "eval_mae": mae,
        "eval_spearman": spearman_corr,
        "eval_emotion_accuracy": emo_acc,
    }


# 4. TrainingArguments & Trainer설정 + 학습&평가

In [5]:
from transformers import TrainingArguments, Trainer

# Directory for Stage 1 checkpoints.
output_dir = "./mv_cascade_ed_out"

# Evaluate and checkpoint once per epoch; after training, reload the
# checkpoint with the lowest validation MAE.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",   # choose the best checkpoint by empathy-regression MAE
    greater_is_better=False,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
    # Tell the Trainer to pass both label columns to the model / metrics
    label_names=["labels", "emotion_labels"],
)

# NOTE: set_seed, torch, MODEL_NAME, num_emotions and SAVE_ENCODER_PATH are
# not defined in this cell; they come from earlier cells of the notebook.
set_seed(17)

model = MultiViewCascadeModel(
    encoder_name=MODEL_NAME,
    num_emotions=num_emotions,
    pretrained_encoder_path=SAVE_ENCODER_PATH,   # encoder produced in Stage 0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tokenized["train"],
    eval_dataset=ds_tokenized["validation"],
    compute_metrics=compute_metrics_multi,
)

# Train
train_result = trainer.train()
print("== Train result ==")
print(train_result)

print("Best checkpoint:", trainer.state.best_model_checkpoint)

# Validation performance
val_metrics = trainer.evaluate(ds_tokenized["validation"])
print("== Validation metrics ==")
for k, v in val_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# Test performance. No metric prefix is passed, so the test-split numbers are
# also printed under "eval_" keys (see the recorded output).
test_metrics = trainer.evaluate(ds_tokenized["test"])
print("== Test metrics ==")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state = torch.load(pretrained_encoder_path, map_location="cpu")


>> Loading pretrained encoder from: ./goemo_out/pretrained_emotion_encoder.pt


Epoch,Training Loss,Validation Loss,Mae,Spearman,Emotion Accuracy
1,0.6809,0.629307,0.728914,0.209364,0.248804
2,0.4077,0.367283,0.403557,0.744814,0.411483
3,0.2101,0.31562,0.309449,0.787719,0.45933
4,0.1693,0.314966,0.30126,0.779292,0.492823


== Train result ==
TrainOutput(global_step=2540, training_loss=0.39646767631290464, metrics={'train_runtime': 705.0332, 'train_samples_per_second': 28.799, 'train_steps_per_second': 3.603, 'total_flos': 0.0, 'train_loss': 0.39646767631290464, 'epoch': 4.0})
Best checkpoint: ./mv_cascade_ed_out\checkpoint-2540


== Validation metrics ==
eval_mae: 0.3013
eval_spearman: 0.7793
eval_emotion_accuracy: 0.4928
eval_loss: 0.3150
eval_runtime: 4.5012
eval_samples_per_second: 139.2950
eval_steps_per_second: 8.8860
epoch: 4.0000
== Test metrics ==
eval_mae: 0.3318
eval_spearman: 0.7541
eval_emotion_accuracy: 0.4081
eval_loss: 0.3696
eval_runtime: 2.1090
eval_samples_per_second: 317.2070
eval_steps_per_second: 19.9140
epoch: 4.0000


## MultiView 토크나이즈 데이터셋 재생성 + 저장

In [6]:
from datasets import load_from_disk
from transformers import AutoTokenizer

RAW_ED_PATH = "empathy_dataset/hf_raw_with_emo"  # same path as before
MODEL_NAME = "roberta-base"
# Both views are truncated/padded to this many tokens in tokenize_views().
MAX_LEN = 256

# 1) Load the raw ED dataset from disk
ds = load_from_disk(RAW_ED_PATH)
print(ds)

# 2) Build the multi-view text fields
def build_multiview(example):
    """Attach the context-only and context+response text views in place.

    Returns the same mapping with ``input_ctx`` and ``input_full`` added.
    """
    context_text = example["context"]
    response_text = example["response"]
    new_fields = (
        ("input_ctx", "CTX: " + context_text),
        ("input_full", f"CTX: {context_text} [SEP] RSP: {response_text}"),
    )
    for field_name, field_text in new_fields:
        example[field_name] = field_text
    return example

# Add the input_ctx / input_full text columns to every example.
ds_mv = ds.map(build_multiview)

# 3) Tokenize
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_views(batch):
    """Tokenize the context-only and full views and carry over both targets.

    Args:
        batch: Mapping with ``"input_ctx"``, ``"input_full"``, ``"label"``
            and ``"emotion_id"`` columns.

    Returns:
        Dict of ids / attention masks per view plus ``"labels"`` and
        ``"emotion_labels"``.
    """
    # Both views share the same truncation / padding settings.
    shared_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    ctx_encoding = tokenizer(batch["input_ctx"], **shared_kwargs)
    full_encoding = tokenizer(batch["input_full"], **shared_kwargs)

    return {
        "input_ids_ctx": ctx_encoding["input_ids"],
        "attention_mask_ctx": ctx_encoding["attention_mask"],
        "input_ids_full": full_encoding["input_ids"],
        "attention_mask_full": full_encoding["attention_mask"],
        "labels": batch["label"],
        "emotion_labels": batch["emotion_id"],
    }

# Tokenize every split and drop all pre-existing columns so that only the
# fields returned by tokenize_views remain.
cols_to_remove = ds_mv["train"].column_names
ds_tokenized = ds_mv.map(
    tokenize_views,
    batched=True,
    remove_columns=cols_to_remove,
)

# 4) torch format on every split
for split in ds_tokenized.keys():
    ds_tokenized[split].set_format(
        type="torch",
        columns=[
            "input_ids_ctx", "attention_mask_ctx",
            "input_ids_full", "attention_mask_full",
            "labels", "emotion_labels",
        ],
    )

# Sanity check: dataset summary and the first tokenized training example.
print(ds_tokenized)
print(ds_tokenized["train"][0])

# 5) This time, make sure to save the tokenized dataset to disk
SAVE_MULTI_PATH = "./empathy_dataset/hf_tokenized_multiview"
ds_tokenized.save_to_disk(SAVE_MULTI_PATH)
print("Saved to:", SAVE_MULTI_PATH)


DatasetDict({
    train: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 5076
    })
    validation: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 627
    })
    test: Dataset({
        features: ['context', 'response', 'label', 'conv_id', 'emotion', 'type', 'final_hybrid', 'llm_15', 'rule_15', 'seed_15', 'emotion_id'],
        num_rows: 669
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids_ctx', 'attention_mask_ctx', 'input_ids_full', 'attention_mask_full', 'labels', 'emotion_labels'],
        num_rows: 5076
    })
    validation: Dataset({
        features: ['input_ids_ctx', 'attention_mask_ctx', 'input_ids_full', 'attention_mask_full', 'labels', 'emotion_labels'],
        num_rows: 627
    })
    test: Dataset({


Saving the dataset (0/1 shards):   0%|          | 0/5076 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/627 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/669 [00:00<?, ? examples/s]

Saved to: ./empathy_dataset/hf_tokenized_multiview
