In [5]:
import json
import csv
import pandas as pd

def json_to_df(data):
    """
    将指定格式的 JSON 文件转换为 CSV 文件
    - CSV 第一列为 id（1, 2, 3...）
    - 第二列为 domain，固定为 "political"
    - 随后每两列分别为 roundN_predicted 和 roundN_actual，对应每个 round 的 predicted 与 actual 值
    """
    """
    将指定格式的 JSON 文件转换为一个 pandas DataFrame。
    
    转换规则：
    - 第一列：id（从 1 开始的编号）
    - 第二列：domain（固定填 "political"）
    - 随后每两列为每个 round 的 predicted 与 actual（例如：round1_predicted, round1_actual, round2_predicted, round2_actual, ...）
    """
    """
    将指定格式的 JSON 文件转换为 pandas DataFrame，并自动识别 domain。
    
    转换规则：
    - 第一列 "id": 从1开始的全局编号（合并所有 domain）。
    - 第二列 "domain": 根据 JSON 文件顶层 key 自动识别。
    - 之后每两列依次为各 round 的 predicted 与 actual 数值，列名格式：roundX_predicted, roundX_actual
    - 如果某个数据点缺少某 round，则填充为 None。
    """
    # 计算所有数据点中最大的 round 数量
    global_max_rounds = 0
    for domain, datapoints in data.items():
        for datapoint in datapoints:
            num_rounds = len(datapoint)
            if num_rounds > global_max_rounds:
                global_max_rounds = num_rounds

    # 构造 DataFrame 的列名
    columns = ['id', 'domain']
    for round_no in range(1, global_max_rounds + 1):
        columns.append(f"round{round_no}_predicted")
        columns.append(f"round{round_no}_actual")

    rows = []
    global_id = 1
    # 遍历每个 domain 和对应的数据点
    for domain, datapoints in data.items():
        for datapoint in datapoints:
            # 按 "round" 字段排序，确保顺序正确
            rounds_sorted = sorted(datapoint, key=lambda x: x.get("round", 0))
            # 构造一个映射：round 编号 -> (predicted, actual)
            round_data = { item.get("round"): (item.get("predicted"), item.get("actual")) for item in rounds_sorted }
            
            # 构造行数据：id 和 domain 固定在前两列
            row = [global_id, domain]
            # 根据全局最大 round 数补齐所有 round 数据
            for r in range(1, global_max_rounds + 1):
                if r in round_data:
                    row.extend(list(round_data[r]))
                else:
                    row.extend([None, None])
            rows.append(row)
            global_id += 1

    return pd.DataFrame(rows, columns=columns)

In [6]:
import json

# 定义 JSON 文件路径
json_file_path = "data/sample.json"  # 请替换成你实际的文件路径

# 打开并读取 JSON 文件
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)


In [9]:
sample = json_to_df(data)
sample

Unnamed: 0,id,domain,round1_predicted,round1_actual,round2_predicted,round2_actual,round3_predicted,round3_actual,round4_predicted,round4_actual,round5_predicted,round5_actual
0,1,politics,True,True,True,True,False,True,True,False,False,False
1,2,politics,True,True,True,False,False,False,False,True,False,False
2,3,politics,True,True,True,True,False,True,True,False,False,False


In [10]:
import math

def calculate_hybrid_loss(rounds, beta=1.0, lam=0.5):
    """
    Calculates the hybrid loss function combining Weighted Binary Cross-Entropy (WBCE) and Continuity-Weighted Loss (CWL).
    
    Parameters:
      rounds: A list of dictionaries where each dictionary represents one round. Each dictionary should have:
              - "round": the round number (integer)
              - "predicted": predicted probability (float) or boolean (which will be converted to 0/1)
              - "actual": ground truth label (boolean or 0/1)
      beta:   Exponential depth sensitivity parameter (β).
      lam:    Balance weight (λ) between immediate accuracy (WBCE) and long-term continuity (CWL), λ ∈ [0, 1].
    
    Returns:
      loss: The computed hybrid loss value.
      
    The hybrid loss LH is defined as:
        LH = (1 - lam) * sum_{i=1}^{N} (wi * LBCE(yi, ˆpi)) + lam * LCWL,
    where:
        LBCE(yi, ˆpi) = - yi·log(ˆpi) - (1 - yi)·log(1 - ˆpi)
        wi = (i / N)^beta
        LCWL = 1 - (Cmax / N)   with Cmax being the longest contiguous correct prediction sequence.
    """
    epsilon = 1e-8  # Small value to avoid taking log(0)
    
    # Sort rounds based on the "round" key to ensure the correct sequential order.
    rounds_sorted = sorted(rounds, key=lambda x: x.get("round", 0))
    N = len(rounds_sorted)
    
    weighted_bce = 0.0
    
    # Compute the weighted binary cross-entropy for each round.
    for idx, round_data in enumerate(rounds_sorted, start=1):
        # Obtain the ground truth and convert it to a float (0.0 or 1.0)
        y_val = round_data.get("actual")
        y = 1.0 if y_val in [True, 1] else 0.0
        
        # Get the prediction; if it's boolean, convert to 0.0 or 1.0; otherwise, cast to float.
        pred = round_data.get("predicted")
        if isinstance(pred, bool):
            p = 1.0 if pred else 0.0
        else:
            p = float(pred)
        
        # Clamp p to avoid log(0), keeping it within [epsilon, 1 - epsilon].
        p = min(max(p, epsilon), 1 - epsilon)
        
        # Calculate Binary Cross-Entropy for the round.
        bce = - (y * math.log(p) + (1 - y) * math.log(1 - p))
        
        # Calculate the depth-based weight: wi = (i / N)^beta.
        weight = (idx / N) ** beta
        
        weighted_bce += weight * bce

    # Compute the Continuity-Weighted Loss (CWL)
    # Identify the longest contiguous sequence of rounds where the binarized prediction matches the ground truth.
    longest_correct_seq = 0
    current_seq = 0
    for round_data in rounds_sorted:
        # Binarize the predicted value (threshold at 0.5).
        pred = round_data.get("predicted")
        if isinstance(pred, bool):
            pred_bin = 1 if pred else 0
        else:
            pred_bin = 1 if float(pred) >= 0.5 else 0
        
        # Binarize the ground truth.
        actual = round_data.get("actual")
        y_bin = 1 if actual in [True, 1] else 0
        
        if pred_bin == y_bin:
            current_seq += 1
            longest_correct_seq = max(longest_correct_seq, current_seq)
        else:
            current_seq = 0

    # Calculate CWL as 1 - (Cmax / N).
    cwl = 1 - (longest_correct_seq / N)
    
    # Combine the two parts to calculate the final hybrid loss.
    loss = (1 - lam) * weighted_bce + lam * cwl
    
    return loss

In [11]:
# Set hyperparameters beta and lambda as desired.
beta = 1.0  # Depth sensitivity parameter.
lam = 0.5   # Weight balancing WBCE and CWL.

In [None]:
example_rounds = [
        {"round": 1, "predicted": True,  "actual": False},
        {"round": 2, "predicted": True,  "actual": True},
        {"round": 3, "predicted": False, "actual": True},
        {"round": 4, "predicted": True,  "actual": False},
        {"round": 5, "predicted": False, "actual": False}
    ]

example_rounds = [
        {"round": 1, "pi": 0.85,  "yi": 1},
        {"round": 2, "pi": 0.92,  "yi": 1},
        {"round": 3, "pi": 0.21, "yi": 1},
        {"round": 4, "pi": 0.79,  "yi": 0},
        {"round": 5, "pi": 0.01, "yi": 0}
    ]

example_converted = {}

loss_value = calculate_hybrid_loss(example_rounds, beta=beta, lam=lam)

In [13]:
loss_value

13.194476526756755

In [28]:
import numpy as np

def weighted_binary_cross_entropy(y_true, y_pred, beta=0.8):
    """
    Computes depth-weighted binary cross-entropy loss.
    Args:
        y_true: List/array of true labels (0 or 1)
        y_pred: List/array of predicted probabilities (0 to 1)
        beta: Controls the steepness of round-based weighting
    Returns:
        WBCE loss (float)
    """
    N = len(y_true)
    wbce = 0.0

    for i in range(N):
        weight = ((i + 1) / N) ** beta  # i+1 to avoid zero division
        y = y_true[i]
        p = np.clip(y_pred[i], 1e-7, 1 - 1e-7)  # avoid log(0)
        loss = - (y * np.log(p) + (1 - y) * np.log(1 - p))
        wbce += weight * loss

    return wbce


def continuity_score(y_true, y_pred):
    """
    Computes the longest contiguous correct subsequence.
    Args:
        y_true: List of true labels
        y_pred: List of predicted binary labels (0 or 1)
    Returns:
        Continuity-weighted loss (float)
    """
    N = len(y_true)
    max_contig = curr = 0

    for yt, yp in zip(y_true, y_pred):
        if yt == yp:
            curr += 1
            max_contig = max(max_contig, curr)
        else:
            curr = 0

    return 1 - (max_contig / N)


def hybrid_loss(y_true, y_pred_probs, y_pred_labels, beta=1.5, lambda_coef=0.2):
    """
    Computes the hybrid loss: (1 - lambda) * WBCE + lambda * CWL
    Args:
        y_true: Ground truth binary labels
        y_pred_probs: Predicted probabilities (for WBCE)
        y_pred_labels: Predicted binary labels (for continuity scoring)
        beta: WBCE depth exponent
        lambda_coef: balance between WBCE and CWL
    Returns:
        Hybrid loss (float)
    """
    wbce = weighted_binary_cross_entropy(y_true, y_pred_probs, beta)
    cwl = continuity_score(y_true, y_pred_labels)
    return (1 - lambda_coef) * wbce + lambda_coef * cwl

In [29]:
y_true = [1, 1, 0, 1, 1]
y_pred_probs = [0.9, 0.95, 0.1, 0, 0]
y_pred_labels = [1, 1, 0, 0, 0]

loss = hybrid_loss(y_true, y_pred_probs, y_pred_labels)
print(f"Hybrid Loss: {loss:.4f}")

Hybrid Loss: 22.2581


In [30]:
y_true = [0, 0, 1, 0, 0]
y_pred_probs = [0.9, 0.95, 0.1, 0, 0]
y_pred_labels = [1, 1, 0, 0, 0]

loss = hybrid_loss(y_true, y_pred_probs, y_pred_labels)
print(f"Hybrid Loss: {loss:.4f}")

Hybrid Loss: 1.7472
