In [None]:
# data formatting scripts
import sys
# Create multiple bootstrapped 
import plotly.graph_objects as go
import sys
from pathlib import Path

from collections import defaultdict
import json, math, glob
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
# import plotly as px
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

parent_dir = str(Path().resolve().parent)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
import estimators as est

In [None]:

def fill_missing_example_ids(df):
    """
    Pad every (model, step) combination out to the full set of example_ids.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: model, example_id, step, pass1, benchmark_id

    Returns:
    --------
    pd.DataFrame
        One row per (model, step, example_id) combination (plus any duplicate
        rows already present in df). Columns are model, step, example_id
        followed by the remaining columns of df. Rows that had to be added get
        pass1=0 and a benchmark_id copied from another row of the same
        (model, step) group; all other columns are left as NaN for those rows.
    """
    key_cols = ['model', 'step', 'example_id']

    # Nothing to pad: return an empty frame with the same column order the
    # merge below would produce, instead of failing inside merge().
    if df.empty:
        other_cols = [c for c in df.columns if c not in key_cols]
        return df[key_cols + other_cols].copy()

    # Union of all example_ids, and all observed (model, step) pairs
    all_example_ids = np.asarray(df['example_id'].unique())
    model_step_combos = df[['model', 'step']].drop_duplicates().reset_index(drop=True)

    # Cartesian product of (model, step) pairs x example_ids, built without a
    # Python-level loop. Row order: for each pair, every example_id in order.
    n_ids = len(all_example_ids)
    n_combos = len(model_step_combos)
    complete_df = model_step_combos.iloc[
        np.repeat(np.arange(n_combos), n_ids)
    ].reset_index(drop=True)
    complete_df['example_id'] = np.tile(all_example_ids, n_combos)

    # Left join keeps every grid row; combinations absent from df get NaN
    result = complete_df.merge(df, on=key_cols, how='left')

    # A missing example counts as a failure
    result['pass1'] = result['pass1'].fillna(0)

    # Fill benchmark_id strictly within each (model, step) group. Both
    # directions are grouped so a group with no benchmark_id at all stays
    # NaN rather than inheriting the value of a neighbouring group.
    group_cols = ['model', 'step']
    result['benchmark_id'] = result.groupby(group_cols)['benchmark_id'].ffill()
    result['benchmark_id'] = result.groupby(group_cols)['benchmark_id'].bfill()

    return result

def bootstrap_sample_df_clean(df, sample_pred=True, random_state=None):
    """
    Draw one bootstrap replicate of df by resampling example_ids with replacement.

    All rows of a sampled example_id (across every model and step) are kept
    together, so an example_id drawn k times contributes its rows k times.
    No bootstrap index column is added.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: model, example_id, step, pass1, benchmark_id.
        When sample_pred is True a `count` column is also required.
    sample_pred : bool, optional
        If True (default), additionally resample the predictions: for every
        sampled row, `correct` is drawn from Binomial(count, pass1) and pass1
        is replaced by correct / count.
    random_state : int, optional
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Bootstrapped sample with a fresh 0..n-1 index. Has the columns of the
        input, plus `correct` when sample_pred is True. An empty input yields
        an empty copy of the input.
    """
    rng = np.random.RandomState(random_state)

    # pd.concat([]) raises, so handle the no-data case explicitly
    if df.empty:
        return df.copy()

    # Get all unique example_ids
    all_example_ids = df['example_id'].unique()
    n_examples = len(all_example_ids)

    # Draw bootstrap sample of example_ids (with replacement)
    bootstrapped_example_ids = rng.choice(all_example_ids, size=n_examples, replace=True)

    # Row positions of every example_id, computed once so each draw is a
    # lookup rather than a scan of the whole frame.
    rows_by_example = df.groupby('example_id', sort=False).indices

    bootstrap_dfs = []
    for example_id in bootstrapped_example_ids:
        positions = rows_by_example.get(example_id)
        if positions is not None:
            example_rows = df.iloc[positions].copy()
        else:
            # Key not found in the lookup (e.g. NaN ids): fall back to the
            # plain equality filter.
            example_rows = df[df['example_id'] == example_id].copy()

        if sample_pred:
            # Resample the outcome of each row from a binomial distribution.
            # Drawn per sampled example, in sampling order, so the random
            # stream for a given random_state is stable.
            example_rows['correct'] = rng.binomial(example_rows['count'], example_rows['pass1'])
            example_rows['pass1'] = example_rows['correct'] / example_rows['count']
        bootstrap_dfs.append(example_rows)

    # Concatenate all bootstrapped samples
    return pd.concat(bootstrap_dfs, ignore_index=True)

def compute_model_diff(df):
    """
    Compute the pass1 difference (B - A) between models 'B' and 'A'.

    Rows of the two models are paired on (step, example_id); pairs that exist
    for only one of the models are dropped.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: step, model, pass1, example_id

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns: step, model (always 'diff'), pass1_diff
    """
    def _rows_for(label):
        # Rows of one model, with pass1 renamed to pass1_<label>
        subset = df.loc[df['model'] == label, ['step', 'pass1', 'example_id']]
        return subset.rename(columns={'pass1': f'pass1_{label}'})

    # Inner join: only (step, example_id) pairs present for both models survive
    paired = _rows_for('A').merge(_rows_for('B'), on=['step', 'example_id'], how='inner')

    return paired[['step']].assign(
        model='diff',
        pass1_diff=paired['pass1_B'] - paired['pass1_A'],
    )


def aggregate_steps(df, stepsize, groupsize):
    """
    Aggregate steps together with overlapping windows.

    For every window [start, start + groupsize - 1], `correct` and `count`
    are summed per (model_pref, example_id, benchmark_id) and pass1 is
    recomputed as correct / count.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe; the columns used are: model_pref, step, example_id,
        benchmark_id, correct, count
    stepsize : int
        Step increment between consecutive groups (e.g., 10 means groups start
        at min_step, min_step + 10, min_step + 20, ...). Must be positive.
    groupsize : int
        Size of each group/window (e.g., 30 means each group spans 30 steps)

    Returns:
    --------
    pd.DataFrame
        Aggregated dataframe with columns: model, step, example_id, pass1,
        benchmark_id, model_pref, correct, count, step_group_end.
        `step` is the window start, `step_group_end` the inclusive window end
        and `model` is "<model_pref>_<start>-<end>". The same columns are
        returned (with no rows) when there is nothing to aggregate.

    Raises:
    -------
    ValueError
        If stepsize is not positive (the window start would never advance).

    Example:
    --------
    aggregate_steps(df, stepsize=10, groupsize=30)
    Creates groups: [0-29], [10-39], [20-49], [30-59], ...
    """
    output_columns = ['model', 'step', 'example_id', 'pass1', 'benchmark_id',
                      'model_pref', 'correct', 'count', 'step_group_end']

    if stepsize <= 0:
        raise ValueError(f"stepsize must be positive, got {stepsize}")

    if len(df) == 0:
        return pd.DataFrame(columns=output_columns)

    # Find the range of steps in the data
    min_step = df['step'].min()
    max_step = df['step'].max()

    # Window starts advance by stepsize from min_step. The first start that
    # lies beyond the last position where a full window still fits is kept
    # (so the tail of the data is covered) and then generation stops.
    last_full_start = max_step - groupsize + 1
    group_starts = []
    current_start = min_step
    while current_start <= max_step:
        group_starts.append(current_start)
        if current_start > last_full_start:
            break
        current_start += stepsize

    group_keys = ['model_pref', 'step_group', 'step_group_end', 'example_id', 'benchmark_id']
    all_groups = []

    for group_start in group_starts:
        group_end = group_start + groupsize - 1

        # Rows whose step falls inside this window (bounds inclusive)
        in_window = (df['step'] >= group_start) & (df['step'] <= group_end)
        if not in_window.any():
            continue

        group_df = df.loc[in_window].assign(step_group=group_start, step_group_end=group_end)

        # Pool the outcomes of all steps in the window
        agg_group = group_df.groupby(group_keys).agg({
            'correct': 'sum',
            'count': 'sum'
        }).reset_index()

        # Calculate pass1 as correct/count
        agg_group['pass1'] = agg_group['correct'] / agg_group['count']

        # Create new model name
        agg_group['model'] = (agg_group['model_pref'] + '_' +
                              agg_group['step_group'].astype(str) + '-' +
                              agg_group['step_group_end'].astype(str))

        all_groups.append(agg_group)

    if not all_groups:
        return pd.DataFrame(columns=output_columns)

    # Combine all groups; the window start plays the role of `step`
    agg_df = pd.concat(all_groups, ignore_index=True).rename(columns={'step_group': 'step'})

    return agg_df[output_columns]


# The JSONL loaded below was produced once from the raw frame with:
# df_f = df.copy()
# df_f = fill_missing_example_ids(df_f)
# df_f["model_pref"] = df_f["model"]
# df_f["model"] = df_f.apply(lambda x: x["model"] + "-" + str(x["step"]), axis=1)
# df_f["correct"] = df_f.apply(lambda x: int(x["pass1"]), axis=1)
# df_f["count"] = 1
# df_f.to_json(f'../data/train_curve/swebench-verified.jsonl', orient='records', lines=True)

# Load the per-example records and keep an untouched copy in `df`
df = pd.read_json('../data/train_curve/swebench-verified.jsonl', orient='records', lines=True)
df_f = df.copy()
display(df_f)

# Pool steps into overlapping 50-step windows that advance by 10 steps,
# then persist the windowed frame
df_gr = aggregate_steps(df_f, stepsize=10, groupsize=50)
display(df_gr)
df_gr.to_json('../data/train_curve/swebench-verified-group-50.jsonl', orient='records', lines=True)

# Same pipeline for the swebench-pro data (disabled):
# df_pro = pd.read_json(f'../data/train_curve/swebench-pro.jsonl', orient='records', lines=True)
# df_f = df_pro.copy()
# display(df_f)
# df_gr = aggregate_steps(df_f, groupsize=50, stepsize=10)
# display(df_gr)
# df_gr.to_json(f'../data/train_curve/swebench-pro-group-50.jsonl', orient='records', lines=True)

# Mean pass1 and number of rows per (step, model)
df_tc_plot1 = (
    df_f.groupby(["step", "model"])
    .agg({"pass1": "mean", "example_id": "count"})
    .reset_index()
)
display(df_tc_plot1)


In [None]:
# Working copy of df_f (the frame loaded above), shown for inspection.
# The commented lines are the alternative of rebuilding it from the raw `df`.

# df2 = df.copy()
# df2 = fill_missing_example_ids(df2)
df0 = df_f.copy()
display(df0)

def _show_pair_diagnostics(merged, pA, pB):
    """Display two small diagnostic figures for one step: a jittered scatter of
    pass1_a vs pass1_b, and both pass rates ordered by decreasing pA."""
    n_points = len(pA)

    fig = go.Figure()
    # Uniform jitter in [0, 0.1) so overlapping points stay visible
    fig.add_trace(go.Scatter(x=merged["pass1_a"] + 0.1*np.random.rand(n_points),
                             y=merged["pass1_b"] + 0.1*np.random.rand(n_points),
                             mode='markers'))
    fig.update_layout(width=400, height=400)
    display(fig)

    sorted_indices = np.argsort(pA.flatten())[::-1]
    pA_sorted = pA[sorted_indices]
    pB_sorted = pB[sorted_indices]

    fig = go.Figure()
    fig.add_scatter(x=np.arange(len(pA_sorted)), y=pA_sorted.flatten(), name="pA")
    fig.add_scatter(x=np.arange(len(pB_sorted)), y=pB_sorted.flatten(), name="pB", mode='markers')
    fig.update_layout(xaxis_title="Index (sorted by pA)", yaxis_title="Probability")
    fig.update_layout(width=400, height=400)
    display(fig)


def apply_estimator_by_step(df, estimator, model_a='A', model_b='B', verbose=False):
    """
    For each step, pair the pass rates of model A and model B by example_id,
    call the estimator on them and derive standard errors and z-scores.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with columns: model, step, example_id, pass1, count
    estimator : callable
        If "unbiased" is part of estimator.__name__ it is called as
        estimator(pA, pB, kA, kB), otherwise as estimator(pA, pB). pA and pB
        are (n, 1) arrays of pass rates; kA and kB are the `count` values of
        the first paired row. The returned object must provide .to_dict()
        with the keys "var(A-B)", "var(E(A-B))" and "E(var(A-B))".
    model_a : str
        Name/identifier for model A (default: 'A')
    model_b : str
        Name/identifier for model B (default: 'B')
    verbose : bool
        If True, diagnostic figures are displayed for a random ~25% of steps.

    Returns:
    --------
    pd.DataFrame
        One row per step for which both models have rows (other steps are
        skipped). Columns: everything in the estimator's dict, followed by
        step, SE, SE_x, SE_pred, SE_direct, SE_k, mean_A, mean_B, ks, diff,
        z, z_x, z_k.

    Raises:
    -------
    AssertionError
        If, at a step where both models are present, either model does not
        have exactly one row per unique example_id of df.
    """
    steps = sorted(df['step'].unique())
    # Number of distinct examples; the denominator of every standard error.
    # Deliberately never reassigned so the results do not depend on `verbose`.
    N = len(df['example_id'].unique())
    inv_n = 1 / N if N else float('nan')

    wants_counts = "unbiased" in estimator.__name__

    # Store estimator results for each step
    estimator_results = []

    for step in steps:
        step_data = df[df['step'] == step]

        model_a_data = step_data[step_data['model'] == model_a]
        model_b_data = step_data[step_data['model'] == model_b]

        # Skip steps where either model is absent
        if len(model_a_data) == 0 or len(model_b_data) == 0:
            continue
        assert len(model_a_data) == N and len(model_b_data) == N, (
            f"step {step}: expected {N} rows per model, got "
            f"{len(model_a_data)} and {len(model_b_data)}"
        )

        # Pair the two models example by example
        merged = pd.merge(model_a_data, model_b_data, on="example_id", suffixes=["_a", "_b"])

        # Column vectors of pass rates; counts are read from the first row
        # only, i.e. they are taken to be the same for every example
        pA = merged['pass1_a'].to_numpy()[:, None]
        pB = merged['pass1_b'].to_numpy()[:, None]
        kA = merged['count_a'].iloc[0]
        kB = merged['count_b'].iloc[0]

        if verbose and np.random.rand() < 0.25:
            _show_pair_diagnostics(merged, pA, pB)

        if wants_counts:
            est_result = estimator(pA, pB, kA, kB).to_dict()
        else:
            est_result = estimator(pA, pB).to_dict()

        diff = pB.mean() - pA.mean()

        est_result['step'] = step
        est_result['SE'] = np.sqrt(inv_n * est_result["var(A-B)"])
        est_result['SE_x'] = np.sqrt(inv_n * est_result["var(E(A-B))"])
        est_result['SE_pred'] = np.sqrt(inv_n * est_result["E(var(A-B))"])
        est_result['SE_direct'] = np.sqrt(inv_n * np.var(pA-pB))
        # NOTE(review): uses kA only and divides the total var(A-B) by K —
        # confirm this is the intended formula.
        K = kA
        est_result['SE_k'] = np.sqrt(inv_n * (est_result["var(A-B)"] / K + est_result["var(E(A-B))"]))
        est_result['mean_A'] = pA.mean()
        est_result['mean_B'] = pB.mean()
        est_result['ks'] = (kA, kB)
        est_result['diff'] = diff
        est_result['z'] = diff / est_result['SE']
        est_result['z_x'] = diff / est_result['SE_x']
        # NOTE(review): z_k is normalised by SE_direct, not SE_k — confirm.
        est_result['z_k'] = diff / est_result['SE_direct']
        estimator_results.append(est_result)

    return pd.DataFrame(estimator_results)
    
df0["model"] = df0["model_pref"]
# df1 = apply_estimator_by_step(df0, est.Paired.from_bernoulli_prob_unbiasedK)

# Windowed aggregate (50-step windows advancing by 10), with `model` set to
# the plain model name so both estimators can select rows by it.
df_gr = aggregate_steps(df_f, groupsize=50, stepsize=10)
display(df_gr)
df_gr2 = df_gr.copy()
df_gr2["model"] = df_gr2["model_pref"]

# At this point model_pref still holds the original names; the rename to
# 'A'/'B' only happens in the plotting cell. Both calls therefore pass the
# names explicitly: relying on the 'A'/'B' defaults selects no rows on a
# fresh Restart & Run All and yields an empty frame.
df_grSE = apply_estimator_by_step(
    df_gr2,
    est.Paired.from_bernoulli_prob_unbiasedK,
    model_a="cwm-baseline",
    model_b="cwm-selfplay",
    verbose=True,
)
display(df_grSE)
display(df_grSE.describe())

df_grSE = apply_estimator_by_step(
    df_gr2,
    est.Paired.from_bernoulli_prob,
    model_a="cwm-baseline",
    model_b="cwm-selfplay",
    verbose=False,
)
display(df_grSE)


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

BOOTSTRAP_COUNT = 5 
PAIRED_BOOTSTRAP_COUNT = 10
INITIAL = 0.41
# Bootstrap replicate i is drawn with random_state=BOOTSTRAP_SEED + i in every
# panel, so the figure is reproducible and the panels show the same resamples.
BOOTSTRAP_SEED = 0

# model_pref needs to be A and B for compute_model_diff.
# NOTE: this renames in place, so df_f / df_gr no longer contain the
# 'cwm-*' names after this cell has run.
df_f['model_pref'] = df_f['model_pref'].str.replace('cwm-baseline', 'A').replace('cwm-selfplay', 'B')
df_gr['model_pref'] = df_gr['model_pref'].str.replace('cwm-baseline', 'A').replace('cwm-selfplay', 'B')


def _mean_pass1_by_step(frame):
    """Mean pass1 and number of rows per (step, model)."""
    return frame.groupby(by=["step", "model"]).aggregate(
        {"pass1": "mean", "example_id": "count"}
    ).reset_index()


def _add_diff_panel(fig, frame, col, sample_pred, show_bootstrap_legend):
    """Draw the B-A difference curve of PAIRED_BOOTSTRAP_COUNT - 1 bootstrap
    replicates of `frame` plus the curve of `frame` itself into subplot column
    `col`, print how many plotted differences are negative, and add a y=0 line."""
    n_negative = 0
    n_total = 0
    for i in range(PAIRED_BOOTSTRAP_COUNT):
        # The last iteration plots the original (non-resampled) data
        is_original = i == PAIRED_BOOTSTRAP_COUNT - 1
        sample = frame
        if not is_original:
            sample = bootstrap_sample_df_clean(
                frame, sample_pred=sample_pred, random_state=BOOTSTRAP_SEED + i
            )

        # NOTE: compute_model_diff joins on (step, example_id), and here
        # example_id holds the row count of each (step, model) group, so a
        # step is only kept when both models have the same number of rows.
        df_diff = compute_model_diff(_mean_pass1_by_step(sample))
        n_total += len(df_diff)
        n_negative += int((df_diff["pass1_diff"] < 0).sum())

        if not is_original:
            # Bootstrap samples - faint lines with markers
            in_legend = show_bootstrap_legend and i == 1  # one legend entry only
            fig.add_trace(
                go.Scatter(
                    x=df_diff["step"],
                    y=df_diff["pass1_diff"],
                    mode='lines+markers',
                    marker=dict(color='purple', size=4, opacity=0.5),
                    name=f'B-A (bootstrap)' if i == 1 else None,
                    legendgroup="group3",
                    opacity=0.3,
                    showlegend=in_legend
                ),
                row=1, col=col
            )
        else:
            # Original data - solid line
            fig.add_trace(
                go.Scatter(
                    x=df_diff["step"],
                    y=df_diff["pass1_diff"],
                    mode='lines',
                    line=dict(color='red', width=2),
                    marker=dict(size=6, symbol='triangle-up'),
                    legendgroup="group3",
                    opacity=1,
                    showlegend=False
                ),
                row=1, col=col
            )

    print(f"wrong signs, plot {col}: {n_negative}/{n_total}")
    # Horizontal reference line at y=0 for the difference plot
    fig.add_hline(y=0, line_dash="solid", line_color="blue", row=1, col=col)


# Create subplot figure with 1 row and 4 columns
fig = make_subplots(
    rows=1, cols=4,
    subplot_titles=("Original training curve", "Unpaired bootstrap", "Paired bootstrap", "Paired, averaged bootstrap"),
    horizontal_spacing=0.04
)

# ============================================
# SUBPLOT 1: Basic Model A vs B comparison
# ============================================
df0_plot1 = df_f.copy()
df0_plot1["model"] = df0_plot1["model_pref"]

df_tc_plot1 = _mean_pass1_by_step(df0_plot1)

# Get unique models
models = df_tc_plot1["model"].unique()
colors = px.colors.qualitative.Plotly

for idx, model in enumerate(models):
    df_model = df_tc_plot1[df_tc_plot1["model"] == model]
    # Optionally prepend the initial checkpoint:
    # new_row = {'step': 0, 'model': model, 'pass1': INITIAL, 'example_id': 500}
    # df_model = pd.concat([pd.DataFrame([new_row]), df_model], ignore_index=True)
    fig.add_trace(
        go.Scatter(
            x=df_model["step"],
            y=df_model["pass1"],
            mode='lines+markers',
            name=model,
            legendgroup="group1",
            line=dict(color=colors[idx % len(colors)]),
            showlegend=True
        ),
        row=1, col=1
    )

# ============================================
# SUBPLOT 2: Unpaired Version
# ============================================
df0_plot2 = df_f.copy()

for i in range(BOOTSTRAP_COUNT):
    dfboots = bootstrap_sample_df_clean(df0_plot2, random_state=BOOTSTRAP_SEED + i)
    dfboots["model"] = dfboots["model_pref"]

    # For unpaired, each model's curve is plotted on its own
    df_tc_plot2 = _mean_pass1_by_step(dfboots)

    # Reverse order so the first model is drawn last (on top)
    for idx, model in list(enumerate(models))[::-1]:
        df_model = df_tc_plot2[df_tc_plot2["model"] == model]
        fig.add_trace(
            go.Scatter(
                x=df_model["step"],
                y=df_model["pass1"],
                mode='lines',
                name=model,
                legendgroup="group2",
                line=dict(color=colors[idx % len(colors)]),
                opacity=0.5,
                showlegend=False  # Don't duplicate legend
            ),
            row=1, col=2
        )

# ============================================
# SUBPLOT 3: Paired bootstrap of the per-step difference B-A
# ============================================
df0_plot3 = df_f.copy()
df0_plot3["model"] = df0_plot3["model_pref"]
_add_diff_panel(fig, df0_plot3, col=3, sample_pred=True, show_bootstrap_legend=True)

# ============================================
# SUBPLOT 4: Same, on the step-window aggregated data
# ============================================
df0_plot4 = df_gr.copy()
display(df0_plot4)
df0_plot4["model"] = df0_plot4["model_pref"]
_add_diff_panel(fig, df0_plot4, col=4, sample_pred=False, show_bootstrap_legend=False)


In [None]:
# Axis ranges: pass-rate panels (columns 1-2) and difference panels (columns 3-4)
for col in (1, 2):
    fig.update_yaxes(range=[0.39, 0.55], row=1, col=col)
for col in (3, 4):
    fig.update_yaxes(range=[-0.03, 0.10], row=1, col=col)
for col in range(1, 5):
    fig.update_xaxes(range=[0, 500], row=1, col=col)

# Axis titles
for col, x_title in enumerate(["Step", "Step", "Step", "Start Step"], start=1):
    fig.update_xaxes(title_text=x_title, row=1, col=col)
fig.update_yaxes(title_text="Pass Rate", row=1, col=1)
# fig.update_yaxes(title_text="Pass Rate", row=1, col=2)
# fig.update_yaxes(title_text="B-A", row=1, col=3)
# fig.update_yaxes(title_text="E[B-A]", row=1, col=4)

fig.update_layout(
    height=450,
    width=1500,
    showlegend=True,
    # title_text="Model Performance Analysis",
    title_x=0.5,
    legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02
    )
)

# Display the figure
fig.show()

# Export an embeddable HTML fragment that loads plotly.js from the CDN.
# The options must be passed as keyword arguments: the first positional
# parameter of to_html is plotly's `config`, so passing the dict positionally
# would leave full_html / include_plotlyjs at their defaults.
PLOTLY_CONFIGS = dict(full_html=False, include_plotlyjs="cdn")
with open("./train_curve.html", "w", encoding="utf-8") as f:
    f.write(fig.to_html(**PLOTLY_CONFIGS))