In [None]:
!pip install transformers snorkel scikit-learn tqdm

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.2.0->snorkel)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.2.0->snorkel)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.2.0->snorkel)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.2.0->snorkel)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.2.0->snorkel)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinu

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csgraph
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.stats import entropy
from tqdm import tqdm
import json
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import eigs

In [None]:
# ---- Configuration ----
# Module-level knobs read by the graph-construction, label-model and
# propagation functions defined in the cells below.
GAMMA_THRESHOLD = 0.1      # γ-filtering threshold: edges with weight <= γ are not added to the graph
C_HOPS = 1                 # c-hop distance (not referenced by the functions in the cells shown here)
SPARSE_EPSILON = 0.05      # ℓ₀-sampling error parameter; default `epsilon` of spectral_sparsify_l0
CENTERS = [0.0, 0.5, 1.0]   # three peak centers; one anchor node is created per center
ANCHOR_SCALE = 1.0         # scaling factor for peak anchor edge weights
EPSILON = 1e-4             # convergence tolerance for Δ (norm of the change between propagation iterates)
MAX_ITER = 100             # maximum iterations of the propagation loop
DEV_RATIO = 0.3            # fraction for LabelModel dev set

# ---- Modes and Ranges ----
ALPHA_MODE = 'sample'       # 'sample', 'variance', or 'hybrid' (not referenced by the functions in the cells shown here)
MU_MODE = 'constant'        # 'spectral', 'constant', or 'boundary'; selects how label_propagation picks μ
MU_VALUE = 0.5              # used if MU_MODE == 'constant'
MU_BOUNDARY = [0.01, 0.1, 1.0, 10.0]  # 'boundary' mode averages the first and last entries
# The three INCLUDE_* flags are globals; run_ablation reassigns them while it runs.
INCLUDE_DONGLE = True       # add two dongle nodes (label 0 / label 1) per labeler to the graph
INCLUDE_ANCHOR = True       # add one anchor node per entry of CENTERS to the graph
INCLUDE_WEAK = True         # new: include weak-prior nodes
ALPHA_CLAMP_MAX = 10.0      # new: max α_ij clamp

# Labeler name -> column index; train_label_model uses its length as the number of labelers.
labeler_id_map = {"labeler_1": 0, "labeler_2": 1, "labeler_3": 2}

In [None]:
# ---- Preprocessing ----
def _label_to_float(value):
    """Convert a raw dialogue label to float; a missing or null label becomes NaN."""
    return np.nan if value is None else float(value)


def preprocess_utterances(data):
    """Flatten dialogue records into per-utterance dicts with priors.

    Parameters
    ----------
    data : iterable of dict
        Each item must provide 'dialogue_id' and 'input_text' (newline-separated
        "speaker: text" lines) and may provide 'label_male' / 'label_female'.
        A label that is absent or JSON null is treated as missing.

    Returns
    -------
    utterances : list of dict
        One dict per line containing a ':' with keys 'dialogue_id',
        'utterance_id' ("<dialogue_id>_<line index>"), 'gender', 'text',
        'weak_prior', 'prior' and 'id' (position in this list).
    hard_idxs : list of int
        Indices of utterances whose weak prior is exactly 0.0 or 1.0.
    weak_idxs : list of int
        Indices of all remaining utterances.
    """
    utterances, hard_idxs, weak_idxs = [], [], []
    for item in data:
        did = item['dialogue_id']
        # JSON null would make a bare float(...) raise TypeError, so map it to NaN.
        lm = _label_to_float(item.get('label_male'))
        lf = _label_to_float(item.get('label_female'))
        for i, line in enumerate(item['input_text'].split("\n")):
            if ':' not in line:
                continue  # not a "speaker: text" line
            spk, txt = line.split(':', 1)
            spk = spk.strip()
            if spk == '남':
                gender = 'male'
            elif spk == '여':
                gender = 'female'
            else:
                gender = 'unknown'

            # The prior comes from the label of the gender opposite to the speaker.
            if gender == 'male' and not np.isnan(lf):
                base_prior = lf
            elif gender == 'female' and not np.isnan(lm):
                base_prior = lm
            else:
                base_prior = 0.5

            # With weak priors disabled every utterance falls back to the neutral 0.5.
            weak_prior = base_prior if INCLUDE_WEAK else 0.5
            # Only an exact 0.0 / 1.0 counts as a hard seed.
            hard = weak_prior if weak_prior in (0.0, 1.0) else None

            idx = len(utterances)
            (hard_idxs if hard is not None else weak_idxs).append(idx)
            utterances.append({
                'dialogue_id': did,
                'utterance_id': f"{did}_{i}",
                'gender': gender,
                'text': f"{spk}: {txt.strip()}",
                'weak_prior': weak_prior,
                'prior': hard,
                'id': idx
            })
    return utterances, hard_idxs, weak_idxs

In [None]:
# ---- ℓ₀-Based Spectral Sparsification (Appendix D) ----
def spectral_sparsify_l0(W, epsilon=SPARSE_EPSILON):
    """Randomly sparsify a square weighted adjacency matrix.

    Each entry is kept with probability ``clip(epsilon * W, 0, 1)`` and, when
    kept, divided by that probability. The result is symmetrised with an
    element-wise maximum against its transpose.

    The uniform draw is shared between (i, j) and (j, i). Drawing the two
    directions independently and then taking the maximum would keep an edge
    with probability close to 2p while still dividing by p, inflating the
    expected weight.

    NOTE: a later cell defines this same name again; under top-to-bottom
    execution that later definition is the one callers bind to.

    Parameters
    ----------
    W : ndarray, shape (N, N)
        Non-negative edge weights.
    epsilon : float
        Multiplier that turns a weight into a keep-probability.

    Returns
    -------
    ndarray, shape (N, N)
        Symmetric sparsified weights. Uses the global NumPy random state.
    """
    prob = np.clip(epsilon * W, 0.0, 1.0)
    draws = np.random.rand(*W.shape)
    # Mirror the upper triangle (incl. diagonal) so each undirected pair uses one draw.
    draws = np.triu(draws) + np.triu(draws, 1).T
    mask = draws < prob
    # The lower bound on prob only guards the division; masked-out entries become 0.
    W_sparse = np.where(mask, W / np.clip(prob, 1e-6, None), 0.0)
    return np.maximum(W_sparse, W_sparse.T)

In [None]:
# ---- Train LabelModel + Compute α_ij (Section 4.1, 4.2) ----
def train_label_model(utterances):
    """Fit a Snorkel LabelModel on hard seeds and derive per-vote weights α_ij.

    Parameters
    ----------
    utterances : list of dict
        Utterance dicts carrying a 'prior' key; a prior equal to 0 or 1 marks
        a hard seed, anything else (including None) is treated as abstain.

    Returns
    -------
    alpha_ij : ndarray, shape (n, m)
        Inverse-variance weight for every (utterance, labeler) vote that was
        used for training, clamped to [0, ALPHA_CLAMP_MAX]; 0 where the
        labeler abstained or the seed was held out.
    probs : ndarray
        Output of ``LabelModel.predict_proba`` on the full vote matrix;
        column 1 is read as P(label = 1).
    L : ndarray of int, shape (n, m)
        Full vote matrix (-1 = abstain), including held-out seeds.
    """
    # Kept as a local import so the notebook's import cell does not require snorkel.
    from snorkel.labeling.model import LabelModel

    n, m = len(utterances), len(labeler_id_map)

    # Build vote matrix L: seeds vote their hard prior on every labeler, others abstain (-1).
    L = np.full((n, m), -1, dtype=int)
    for i, u in enumerate(utterances):
        if u['prior'] in (0, 1):
            L[i, :] = int(u['prior'])

    # Hold out a DEV_RATIO share of the seeds by blanking their votes in the training matrix.
    seeds = np.where(L[:, 0] >= 0)[0]
    perm = np.random.permutation(seeds)
    split = int(len(perm) * (1.0 - DEV_RATIO))
    L_train = L.copy()
    L_train[perm[split:], :] = -1

    # Fit Snorkel LabelModel
    lm = LabelModel(cardinality=2, verbose=False)
    lm.fit(L_train=L_train, n_epochs=200, seed=42)
    probs = lm.predict_proba(L=L)

    # Per-sample inverse variance of the Bernoulli posterior, applied to every training vote.
    inv_var = 1.0 / (probs[:, 1] * (1.0 - probs[:, 1]) + 1e-6)
    alpha_ij = np.where(L_train >= 0, inv_var[:, None], 0.0)
    # p(1-p) <= 0.25, so the inverse variance is always >= ~4: clamping at 1.0
    # would flatten every weight to exactly 1. Clamp at the configured maximum.
    alpha_ij = np.clip(alpha_ij, 0.0, ALPHA_CLAMP_MAX)
    return alpha_ij, probs, L

In [None]:
# ---- Graph and embedding helpers ----
def spectral_sparsify_l0(W, epsilon=SPARSE_EPSILON):
    """Randomly sparsify a square weighted adjacency matrix.

    Each entry is kept with probability ``clip(epsilon * W, 0, 1)`` and, when
    kept, divided by that probability. The result is symmetrised with an
    element-wise maximum against its transpose.

    The uniform draw is shared between (i, j) and (j, i). Drawing the two
    directions independently and then taking the maximum would keep an edge
    with probability close to 2p while still dividing by p, inflating the
    expected weight.

    NOTE: this cell re-defines a function of the same name from an earlier
    cell; being later in the notebook, this is the definition in effect
    under top-to-bottom execution.

    Parameters
    ----------
    W : ndarray, shape (N, N)
        Non-negative edge weights.
    epsilon : float
        Multiplier that turns a weight into a keep-probability.

    Returns
    -------
    ndarray, shape (N, N)
        Symmetric sparsified weights. Uses the global NumPy random state.
    """
    prob = np.clip(epsilon * W, 0.0, 1.0)
    draws = np.random.rand(*W.shape)
    # Mirror the upper triangle (incl. diagonal) so each undirected pair uses one draw.
    draws = np.triu(draws) + np.triu(draws, 1).T
    mask = draws < prob
    # The lower bound on prob only guards the division; masked-out entries become 0.
    W_sparse = np.where(mask, W / np.clip(prob, 1e-6, None), 0.0)
    return np.maximum(W_sparse, W_sparse.T)

In [None]:
def build_graph_with_dongles(embs, utts, alpha_ij, L, probs, k=50):
    """
    Assemble the propagation graph and return its row-normalised adjacency.

    Node layout: utterances occupy indices [0, n); when INCLUDE_DONGLE is set,
    labeler j owns nodes n + 2j (label 0) and n + 2j + 1 (label 1); when
    INCLUDE_ANCHOR is set, one node per entry of CENTERS follows.

    embs    : (n × d) array of utterance embeddings
    utts    : list of utterance dicts (must include 'weak_prior')
    alpha_ij: (n × m) array of per-sample×per-labeler weights
    L       : (n × m) array of hard seed labels (−1 for abstain)
    probs   : (n × 2) array of LabelModel soft probabilities
    k       : number of nearest neighbours kept per utterance

    Returns the sparsified adjacency with every non-empty row scaled to sum to 1.
    """
    num_utts = embs.shape[0]
    num_labelers = alpha_ij.shape[1]

    # Cosine similarity shifted and scaled by (x + 1) / 2.
    similarity = (cosine_similarity(embs) + 1.0) / 2.0

    dongle_count = 2 * num_labelers if INCLUDE_DONGLE else 0
    anchor_count = len(CENTERS) if INCLUDE_ANCHOR else 0
    total_nodes = num_utts + dongle_count + anchor_count
    W = np.zeros((total_nodes, total_nodes))

    # Utterance↔utterance edges: top-k most similar columns per row, kept only above γ.
    # Processed row by row so that later rows overwrite earlier ones, as before.
    neighbours = np.argsort(-similarity, axis=1)[:, :k]
    for src, row in enumerate(neighbours):
        for dst in row:
            strength = similarity[src, dst]
            if strength > GAMMA_THRESHOLD:
                W[src, dst] = strength
                W[dst, src] = strength

    # Utterance↔dongle edges: α-weighted vote mass split between the label-0 and label-1 dongles.
    if INCLUDE_DONGLE:
        hard_votes = L[:num_utts, :num_labelers]
        soft_votes = probs[:num_utts, 1][:, None]
        # A labeler that abstained contributes the LabelModel probability instead.
        votes = np.where(hard_votes >= 0, hard_votes, soft_votes)
        weights = alpha_ij[:num_utts, :]
        toward_zero = weights * (1.0 - votes)
        toward_one = weights * votes
        for lab in range(num_labelers):
            node0 = num_utts + 2 * lab
            node1 = node0 + 1
            W[:num_utts, node0] = toward_zero[:, lab]
            W[node0, :num_utts] = toward_zero[:, lab]
            W[:num_utts, node1] = toward_one[:, lab]
            W[node1, :num_utts] = toward_one[:, lab]

    # Utterance↔anchor edges: weight decays linearly with |weak_prior − center|.
    if INCLUDE_ANCHOR:
        first_anchor = num_utts + dongle_count
        priors = np.array([utts[i]['weak_prior'] for i in range(num_utts)], dtype=float)
        for offset, center in enumerate(CENTERS):
            node = first_anchor + offset
            pull = np.maximum(0.0, ANCHOR_SCALE * (1.0 - np.abs(priors - center)))
            rows = np.nonzero(pull > GAMMA_THRESHOLD)[0]
            W[rows, node] = pull[rows]
            W[node, rows] = pull[rows]

    # Sparsify, then normalise rows; all-zero rows are left as zeros.
    W = spectral_sparsify_l0(W)
    degree = W.sum(axis=1, keepdims=True)
    degree[degree == 0] = 1.0
    return W / degree


In [None]:
# (1) Builder for a row-stochastic k-NN graph
def build_graph_row_stochastic(sim, k=50, gamma=GAMMA_THRESHOLD):
    """Turn a similarity matrix into a row-normalised k-NN adjacency.

    For each row the k largest similarities are kept, entries below ``gamma``
    are zeroed, the matrix is symmetrised with an element-wise maximum against
    its transpose, and every non-empty row is divided by its sum.
    """
    count = sim.shape[0]
    top_cols = np.argsort(-sim, axis=1)[:, :k]
    row_ids = np.arange(count)[:, None]

    adjacency = np.zeros_like(sim)
    adjacency[row_ids, top_cols] = sim[row_ids, top_cols]
    adjacency[adjacency < gamma] = 0
    adjacency = np.maximum(adjacency, adjacency.T)

    # Rows with no edges keep a divisor of 1 so they stay all-zero.
    degree = adjacency.sum(axis=1, keepdims=True)
    degree[degree == 0] = 1
    return adjacency / degree

def get_embeddings(utterances, batch_size=8, max_length=256):
    """Embed each utterance with the first-token vector of KoBigBird.

    The tokenizer and model are loaded on every call. The model runs on CUDA
    when available and otherwise on CPU.

    Parameters
    ----------
    utterances : list of dict
        Utterance dicts; only the 'text' key is read.
    batch_size : int
        Number of utterances encoded per forward pass.
    max_length : int
        Truncation length passed to the tokenizer.

    Returns
    -------
    ndarray, shape (len(utterances), hidden_size)
        First-position hidden state of the last layer for every utterance.
    """
    model_name = "monologg/kobigbird-bert-base"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    if not utterances:
        # np.vstack([]) raises, so return an empty matrix of the right width instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embs = []
    for start in range(0, len(utterances), batch_size):
        batch = utterances[start:start + batch_size]
        # Pass raw text: the tokenizer inserts the special tokens itself, so
        # wrapping the text in "[CLS] ... [SEP]" by hand would duplicate them.
        texts = [u['text'] for u in batch]
        inputs = tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            out = model(**inputs).last_hidden_state[:, 0, :].cpu().numpy()
        embs.append(out)
    return np.vstack(embs)

In [None]:
# ---- Label Propagation (Section 3.2, Eqn 6) ----
def label_propagation(embs, utts, alpha_ij, probs, L):
    """Propagate soft labels over the utterance/dongle/anchor graph.

    Iterates ``F <- (1 - μ) · P F + μ · Y`` until the change between iterates
    drops below EPSILON or MAX_ITER iterations have run.

    Parameters
    ----------
    embs : ndarray
        Utterance embeddings, forwarded to build_graph_with_dongles.
    utts : list of dict
        Utterance dicts; 'prior' and 'id' are read here.
    alpha_ij : ndarray, shape (n, m)
        Per-sample × per-labeler weights; its second dimension gives m.
    probs : ndarray
        LabelModel probabilities; column 1 seeds the non-hard utterances.
    L : ndarray
        Vote matrix, forwarded to build_graph_with_dongles.

    Returns
    -------
    ndarray, shape (n,)
        Propagated scores of the utterance nodes only.
    """
    P = build_graph_with_dongles(embs, utts, alpha_ij, L, probs)

    # Compute μ
    if MU_MODE == 'spectral':
        lap = np.eye(P.shape[0]) - P
        vals, _ = eigs(lap, k=2, which='SM')
        # Sort explicitly rather than relying on the solver's return order.
        second_smallest = np.sort(np.real(vals))[1]
        mu = float(second_smallest / (1 + second_smallest))
    elif MU_MODE == 'boundary':
        # Midpoint of the lowest and highest entries of the boundary list.
        # NOTE(review): with MU_BOUNDARY = [0.01, ..., 10.0] this is 5.005, which
        # makes (1 - μ) negative — confirm this is the intended rule.
        low, high = MU_BOUNDARY[0], MU_BOUNDARY[-1]
        mu = (low + high) / 2.0
    else:  # constant
        mu = MU_VALUE

    n, m = len(utts), alpha_ij.shape[1]

    # Targets for utterance nodes: the hard prior when present, else the LabelModel probability.
    Y = [
        float(u['prior']) if u['prior'] in (0, 1) else probs[u['id'], 1]
        for u in utts
    ]
    # Targets for dongle nodes: 0 and 1 for each labeler, in graph order.
    if INCLUDE_DONGLE:
        Y.extend([0.0, 1.0] * m)
    # Targets for anchor nodes: the center value itself.
    if INCLUDE_ANCHOR:
        Y.extend(float(center) for center in CENTERS)
    # Y now has one entry per graph node, i.e. len(Y) == P.shape[0].
    Y = np.array(Y, dtype=float)

    # Propagation
    F = Y.copy()
    for _ in range(MAX_ITER):
        F_new = (1 - mu) * (P @ F) + mu * Y
        converged = np.linalg.norm(F_new - F) < EPSILON
        # Adopt the new iterate before testing for exit so the converged
        # value, not the one before it, is what gets returned.
        F = F_new
        if converged:
            break
    return F[:n]

In [None]:
# ---- Joint Loss (Section 5) ----
def joint_loss(f_soft, y_hard, alpha_ij, L, probs):
    """Hard-label MSE plus α-weighted squared disagreement with labeler votes.

    Parameters
    ----------
    f_soft : ndarray, shape (n,)
        Predicted soft labels.
    y_hard : ndarray, shape (n,)
        Hard labels with NaN where none is available.
    alpha_ij : ndarray, shape (n, m)
        Per-sample × per-labeler weights.
    L : ndarray, shape (n, m)
        Vote matrix; entries < 0 mean abstain.
    probs : ndarray, shape (n, 2)
        Soft probabilities; column 1 stands in for abstained votes.

    Returns
    -------
    float
        ``mean((f - y)^2 over labelled rows) + sum(α_ij (f_i - h_ij)^2) / (n·m)``.
        The second term is 0 when there are no (sample, labeler) pairs.
    """
    n, m = alpha_ij.shape
    mask = ~np.isnan(y_hard)
    loss_h = np.mean((f_soft[mask] - y_hard[mask]) ** 2) if mask.any() else 0.0
    if n == 0 or m == 0:
        # No pairs to average over; avoids dividing by n*m == 0.
        return loss_h

    votes = L[:n, :m]
    # An abstained vote is replaced by the soft probability of label 1.
    targets = np.where(votes >= 0, votes, probs[:n, 1][:, None])
    residual = np.asarray(f_soft, dtype=float)[:n, None] - targets
    loss_s = np.sum(alpha_ij * residual ** 2)
    return loss_h + loss_s / (n * m)

In [None]:
# ---- Ablation Evaluation ----
def run_ablation(input_path, ground_truth_path):
    """Evaluate every on/off combination of the dongle, anchor and weak-prior flags.

    Parameters
    ----------
    input_path : str
        JSONL file of dialogue records accepted by preprocess_utterances.
    ground_truth_path : str
        JSON file holding a list of items with 'utterance_id' and 'soft_label'.

    Returns
    -------
    dict
        Maps "dongle=..,anchor=..,weak=.." to {"MSE", "MAE", "R2"}. The same
        dict is also printed as JSON.

    Raises
    ------
    KeyError
        If an utterance produced from ``input_path`` has no ground-truth entry.

    The module-level INCLUDE_DONGLE / INCLUDE_ANCHOR / INCLUDE_WEAK flags are
    reassigned while the ablation runs and restored afterwards.
    """
    global INCLUDE_DONGLE, INCLUDE_ANCHOR, INCLUDE_WEAK

    with open(ground_truth_path, 'r', encoding='utf-8') as fh:
        gt = {item['utterance_id']: item['soft_label'] for item in json.load(fh)}
    # The input does not depend on the flags, so read it once; skip blank lines.
    with open(input_path, 'r', encoding='utf-8') as fh:
        data = [json.loads(line) for line in fh if line.strip()]

    saved_flags = (INCLUDE_DONGLE, INCLUDE_ANCHOR, INCLUDE_WEAK)
    results = {}
    embs = None
    try:
        for dong in (True, False):
            for anc in (True, False):
                for weak in (True, False):
                    INCLUDE_DONGLE, INCLUDE_ANCHOR, INCLUDE_WEAK = dong, anc, weak
                    utts, _hard_idxs, _weak_idxs = preprocess_utterances(data)
                    if embs is None:
                        # Embeddings read only the utterance text, which the
                        # flags do not alter, so one computation serves all runs.
                        embs = get_embeddings(utts)
                    # train_label_model returns (alpha_ij, probs, L) and
                    # label_propagation takes (embs, utts, alpha_ij, probs, L).
                    alpha_ij, probs, L = train_label_model(utts)
                    soft = label_propagation(embs, utts, alpha_ij, probs, L)

                    y_pred = [soft[i] for i in range(len(utts))]
                    y_true = [gt[u['utterance_id']] for u in utts]
                    key = f"dongle={dong},anchor={anc},weak={weak}"
                    results[key] = {
                        "MSE": mean_squared_error(y_true, y_pred),
                        "MAE": mean_absolute_error(y_true, y_pred),
                        "R2": r2_score(y_true, y_pred),
                    }
    finally:
        # Do not leave the notebook's global configuration on the last combination tried.
        INCLUDE_DONGLE, INCLUDE_ANCHOR, INCLUDE_WEAK = saved_flags

    print(json.dumps(results, ensure_ascii=False, indent=2))
    return results


In [None]:
def save_soft_labels_dual(utterances, soft_m, soft_f, out_path):
    """Write one JSON line per utterance with both soft labels.

    utterances : list of dicts providing 'dialogue_id', 'utterance_id' and 'text'
    soft_m     : scores paired positionally with ``utterances`` (male label)
    soft_f     : scores paired positionally with ``utterances`` (female label)
    out_path   : destination file, overwritten, UTF-8 encoded

    Scores are rounded to 4 decimals; an utterance id with no paired score is
    written as 0.5.
    """
    with open(out_path, "w", encoding="utf-8") as sink:
        # Index both score sequences by utterance id.
        by_id_male = dict(
            (utt['utterance_id'], float(score)) for utt, score in zip(utterances, soft_m)
        )
        by_id_female = dict(
            (utt['utterance_id'], float(score)) for utt, score in zip(utterances, soft_f)
        )
        for utt in utterances:
            uid = utt['utterance_id']
            payload = json.dumps(
                {
                    "dialogue_id": utt["dialogue_id"],
                    "utterance_id": utt["utterance_id"],
                    "text": utt["text"],
                    "soft_label_male": round(by_id_male.get(uid, 0.5), 4),
                    "soft_label_female": round(by_id_female.get(uid, 0.5), 4),
                },
                ensure_ascii=False,
            )
            sink.write(payload + "\n")


In [None]:
def end_to_end_dual(input_path, output_path, ground_truth_path=None):
    """Run the full pipeline and write male/female soft labels to a JSONL file.

    Parameters
    ----------
    input_path : str
        JSONL file of dialogue records accepted by preprocess_utterances.
    output_path : str
        Destination JSONL written by save_soft_labels_dual.
    ground_truth_path : str, optional
        When given, run_ablation is executed after the labels are saved.
    """
    # 1) Load the raw JSONL (blank lines are skipped; the file is closed afterwards).
    with open(input_path, 'r', encoding='utf-8') as fh:
        raw = [json.loads(line) for line in fh if line.strip()]

    # 2) Preprocess into the utterance list (priors included).
    utts, _hard_idxs, _weak_idxs = preprocess_utterances(raw)

    # Embeddings depend only on the utterance text, so compute them once and
    # share them between both passes instead of reloading the model twice.
    embs = get_embeddings(utts)

    # NOTE(review): both passes below receive exactly the same inputs, so the
    # "male" and "female" outputs differ only through the unseeded random draws
    # in train_label_model and spectral_sparsify_l0 — confirm this is intended.

    # 3) Male-affinity propagation
    alpha_m, probs_m, L_m = train_label_model(utts)
    soft_m = label_propagation(embs, utts, alpha_m, probs_m, L_m)

    # 4) Female-affinity propagation
    alpha_f, probs_f, L_f = train_label_model(utts)
    soft_f = label_propagation(embs, utts, alpha_f, probs_f, L_f)

    # 5) Save both soft labels side by side.
    save_soft_labels_dual(utts, soft_m, soft_f, output_path)

    # 6) Optional evaluation when a ground-truth file is supplied.
    if ground_truth_path:
        run_ablation(input_path, ground_truth_path)
    print(f"Dual soft-labels saved to {output_path}")


In [None]:
# Run the pipeline on dialogues_human.jsonl and write the dual soft labels to output.jsonl.
# No ground_truth_path is passed, so the optional ablation step is skipped.
end_to_end_dual("dialogues_human.jsonl", "output.jsonl")

100%|██████████| 200/200 [00:00<00:00, 829.64epoch/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/492k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/458M [00:00<?, ?B/s]

Attention type 'block_sparse' is not possible if sequence_length: 40 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 200/200 [00:00<00:00, 909.84epoch/s]
Attention type 'block_sparse' is not possible if sequence_length: 40 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Dual soft-labels saved to output.jsonl


In [None]:
import json
import numpy as np

def evaluate_non_abstain_quality(
    human_path: str,
    pseudo_path: str,
    eps_abstain: float = 0.001
):
    """Report coverage and hard-seed accuracy of pseudo soft labels.

    Parameters
    ----------
    human_path : str
        JSONL of dialogue records ('dialogue_id', 'input_text', optional
        'label_male' / 'label_female'). A speaker's own-gender label that is
        exactly 0.0 or 1.0 makes the utterance a hard seed.
    pseudo_path : str
        JSONL of records with 'utterance_id', 'speaker' and
        'soft_label_male' / 'soft_label_female'.
    eps_abstain : float
        A soft label within this distance of 0.5 counts as an abstain.

    Returns
    -------
    dict
        {"coverage": percent of non-abstained records,
         "na_accuracy": percent of non-abstained seeds predicted correctly at
         the 0.5 threshold}. Either value is NaN when its denominator is empty.

    NOTE(review): seeds here use the speaker's own-gender label, whereas
    preprocess_utterances assigns the opposite gender's label as the prior —
    confirm which convention is intended.
    """
    def _to_float(value):
        # A missing or JSON-null label is treated as "no label".
        return np.nan if value is None else float(value)

    # 1) Map utterance_id -> hard prior (0/1) from the human-labelled file.
    prior_map = {}
    with open(human_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL
            item = json.loads(line)
            did = item['dialogue_id']
            labels = {
                'male': _to_float(item.get('label_male')),
                'female': _to_float(item.get('label_female')),
            }
            for i, ut in enumerate(item['input_text'].split('\n')):
                if ':' not in ut:
                    continue
                spk = ut.split(':', 1)[0].strip()
                gender = {'남': 'male', '여': 'female'}.get(spk)
                if gender is None:
                    continue
                base = labels[gender]
                # NaN never equals 0.0 or 1.0, so unlabelled utterances are skipped here.
                if base in (0.0, 1.0):
                    prior_map[f"{did}_{i}"] = int(base)

    # 2) Collect the speaker-matched soft label of every pseudo-labelled record, in file order.
    utterance_ids = []
    soft_vals = []
    with open(pseudo_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            utterance_ids.append(rec['utterance_id'])
            speaker = rec.get('speaker')
            if speaker in ('male', 'female'):
                soft_vals.append(rec.get(f'soft_label_{speaker}', np.nan))
            else:
                soft_vals.append(np.nan)

    utterance_ids = np.array(utterance_ids)
    soft_vals = np.array(soft_vals, dtype=float)

    # 3) Total record count and non-abstain mask.
    total = len(soft_vals)
    non_abstain = ~np.isnan(soft_vals) & (np.abs(soft_vals - 0.5) >= eps_abstain)
    coverage = non_abstain.sum() / total * 100.0 if total else float('nan')

    # 4) Non-abstain accuracy (@0.5), restricted to hard seeds.
    # An explicit bool dtype keeps the mask combinable with `&` when there are no records.
    is_seed = np.array([uid in prior_map for uid in utterance_ids], dtype=bool)
    eval_mask = is_seed & non_abstain
    seed_count = int(is_seed.sum())
    eval_count = int(eval_mask.sum())

    if eval_count > 0:
        y_true = np.array([prior_map[uid] for uid in utterance_ids[eval_mask]], dtype=int)
        y_pred = (soft_vals[eval_mask] >= 0.5).astype(int)
        na_acc = (y_pred == y_true).mean() * 100.0
    else:
        na_acc = float('nan')
    seed_pct = eval_count / seed_count * 100 if seed_count else float('nan')

    # 5) Print the report.
    print(f"Total utterances               : {total}")
    print(f"Non‐Abstain coverage           : {coverage:.1f}%")
    print(f"Hard‐seed count                : {seed_count}")
    print(f"Evaluated seeds (non‐abstain)  : {eval_count} "
          f"({seed_pct:.1f}% of seeds)")
    print(f"Non‐Abstain Accuracy (@0.5)    : {na_acc:.1f}%")

    return {"coverage": coverage, "na_accuracy": na_acc}

# Usage example
# NOTE(review): this reads dialogues_with_soft_labels.jsonl, not the output.jsonl
# written by the end_to_end_dual call in the earlier cell — confirm the intended input.
res = evaluate_non_abstain_quality(
    human_path="dialogues_human.jsonl",
    pseudo_path="dialogues_with_soft_labels.jsonl",
    eps_abstain=0.001
)


Total utterances               : 5440
Non‐Abstain coverage           : 99.8%
Hard‐seed count                : 3892
Evaluated seeds (non‐abstain)  : 3890 (99.9% of seeds)
Non‐Abstain Accuracy (@0.5)    : 100.0%
