In [1]:
from lcdb.db import LCDB
from lcdb.analysis.util import LearningCurveExtractor, merge_curves

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Candidate workflow class paths; one of them is selected by index below.
workflows = [
    "lcdb.workflow.sklearn.KNNWorkflow",
    "lcdb.workflow.sklearn.LibLinearWorkflow",
    "lcdb.workflow.sklearn.LibSVMWorkflow",
    "lcdb.workflow.sklearn.TreesEnsembleWorkflow"
]

# Dataset and workflow analysed by the rest of the notebook
# (both names are also used in the file names of the saved figures).
openmlid = 6
workflow = workflows[1]

# retrieve learning curve objects (error rate on the train and validation folds)
lcdb = LCDB()
df = lcdb.query(
    openmlids=[openmlid],
    workflows=[workflow],
    return_generator=False,
    processors={
        "learning_curve": LearningCurveExtractor(
            metrics=["error_rate"],
            folds=["train", "val"]
        )
    },
    show_progress=True
)

# group by configurations: every column prefixed with "p:" is used as a group key
config_cols = [c for c in df.columns if c.startswith("p:")]

# dropna=False is required here: by default pandas discards every row whose
# group key contains NaN, which would silently remove configurations that have
# a missing value in any "p:" column before their curves are merged.
df = df.groupby(config_cols, dropna=False).agg({"learning_curve": merge_curves})

 16%|████████████████████████▋                                                                                                                                 | 4/25 [00:06<00:36,  1.74s/it]

In [None]:
# Use the longest anchor schedule found among all curves as the common target
# for padding. Taking the schedule of the first row only is correct when that
# row happens to be the longest; max(..., key=len) returns the first longest
# schedule, so the result is unchanged whenever all schedules have equal length.
max_schedule = max((curve.anchors_size for curve in df["learning_curve"]), key=len)
print(max_schedule)

In [None]:
# Pad each learning curve (in place) to the common anchor schedule.
df["learning_curve"].apply(
    lambda curve: curve.pad_anchors_size(max_schedule, inplace=True)
)

In [None]:
# Show the grouped frame: one row per group of "p:" columns, holding the merged learning curve.
df

# Plot of learning curves

In [None]:
# One validation learning curve (mean +/- std over all seeds) per row of df,
# plus a dashed line at the best mean score reached at the last anchor.
fig, ax = plt.subplots(figsize=(16, 5))
best_score = np.inf
for curve in df["learning_curve"]:
    schedule = curve.anchors_size

    # validation error rate; for iteration-wise curves only the last iteration is used
    if curve.is_iteration_wise_curve:
        val_errors = curve.values[0, 1, :, :, :, :, -1]
    else:
        val_errors = curve.values[0, 1]

    # nothing to draw for curves without a single observation
    if np.all(np.isnan(val_errors)):
        continue

    # atleast_1d keeps mu/std indexable even if squeeze() collapses a
    # single-anchor schedule to a 0-d array
    mu = np.atleast_1d(np.nanmean(val_errors, axis=(0, 1, 2)).squeeze())
    std = np.atleast_1d(np.nanstd(val_errors, axis=(0, 1, 2)).squeeze())
    ax.plot(schedule, mu)
    ax.fill_between(schedule, mu - std, mu + std, alpha=0.1)
    if not np.isnan(mu[-1]) and mu[-1] < best_score:
        best_score = mu[-1]

print(best_score)
# best_score is still inf when no curve had a finite final mean; a reference
# line at infinity is meaningless, so only draw it for a finite score
if np.isfinite(best_score):
    ax.axhline(best_score, color="black", linestyle="--")
ax.set_title("Validation learning curves per hyperparameter configuration")
ax.set_xlabel("Sample size (anchor)")
ax.set_ylabel("Validation error rate")
plt.show()

In [None]:
def learning_curve_crossing_evaluation(df, output_dir="plots"):
    """Analyse how often the learning curves of different configurations cross.

    For every ordered pair of rows (A, B) of ``df`` the difference
    ``A - B`` of ``values[0, 1]`` is computed for every seed combination and
    every anchor. From these differences the function draws

    1. heatmaps of P(A starts higher than B), P(A starts higher than B and
       ends lower) and the corresponding conditional probability,
    2. the gap of every configuration to the configuration that is best at
       the last anchor, along the schedule,
    3. box plots of the final gap ("regret") incurred when the configuration
       is selected by its mean score at each anchor,
    4. one heatmap of the mean pairwise difference per anchor.

    Figures 1 and 2/3 are saved as PDF into ``output_dir``; the file names use
    the notebook-level variables ``openmlid`` and ``workflow``.

    Args:
        df: DataFrame with a ``"learning_curve"`` column. Every entry must
            expose ``anchors_size``, ``test_seeds``, ``val_seeds``,
            ``workflow_seeds`` and ``values``. All curves are expected to share
            the schedule and seeds of the first row.
        output_dir: Directory for the saved figures; created if missing.

    Raises:
        ValueError: If ``df`` contains no learning curve.

    NOTE(review): ``values[0, 1]`` is taken to be the validation error rate
    with axes (test seed, val seed, workflow seed, anchor), matching the shape
    allocated for the differences. Unlike the plotting cell of this notebook,
    iteration-wise curves are not reduced to their last iteration here --
    confirm that this is intended.
    """
    import os
    import warnings

    learning_curves = list(df["learning_curve"])
    if not learning_curves:
        raise ValueError("df must contain at least one learning curve")

    # savefig does not create missing directories
    os.makedirs(output_dir, exist_ok=True)

    reference = learning_curves[0]
    schedule = reference.anchors_size
    n_configs = len(learning_curves)
    seed_axes = (2, 3, 4)  # axes of `diffs` that enumerate the seeds

    # slice every curve once instead of once per pair
    curves = [lc.values[0, 1] for lc in learning_curves]

    # compute diffs at different anchors: diffs[a, b] = curve_a - curve_b
    diffs = np.zeros((
        n_configs,
        n_configs,
        len(reference.test_seeds),
        len(reference.val_seeds),
        len(reference.workflow_seeds),
        len(schedule),
    ))
    for a, curve_a in enumerate(curves):
        for b, curve_b in enumerate(curves):
            diffs[a, b] = curve_a - curve_b

    # define start index
    start_index = 0
    diffs_start = diffs[..., start_index]
    diffs_end = diffs[..., -1]

    # Only seed combinations observed at both ends are counted. Comparing NaN
    # yields False, so without this mask a missing value would be counted as
    # "A does not start higher" at the start and as "A ends lower" at the end.
    observed = ~np.isnan(diffs_start) & ~np.isnan(diffs_end)
    starts_higher = observed & (diffs_start > 0)
    starts_higher_ends_lower = starts_higher & (diffs_end <= 0)
    n_observed = observed.sum(axis=seed_axes)

    # pairs without any observed seed combination become NaN (0 / 0)
    with np.errstate(invalid="ignore", divide="ignore"):
        # compute P(A starts higher than B)
        probs_a_starts_higher_than_b = starts_higher.sum(axis=seed_axes) / n_observed

        # compute P(A starts higher than B and ends lower)
        probs_a_starts_higher_than_b_and_ends_lower_than_b = (
            starts_higher_ends_lower.sum(axis=seed_axes) / n_observed
        )

    # figure for probability of crossing
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))
    sb.heatmap(probs_a_starts_higher_than_b, cmap="Greens", ax=axs[0])
    sb.heatmap(probs_a_starts_higher_than_b_and_ends_lower_than_b, cmap="Greens", ax=axs[1])
    sb.heatmap(
        # the lower bound on the denominator maps "A never starts higher" to 0
        probs_a_starts_higher_than_b_and_ends_lower_than_b / np.maximum(10**-10, probs_a_starts_higher_than_b),
        cmap="Greens",
        ax=axs[2],
    )
    axs[0].set_title(r"$\mathbb{P}$(A starts higher than B)")
    axs[1].set_title(r"$\mathbb{P}$(A starts higher than B and ends lower)")
    axs[2].set_title(r"$\mathbb{P}$(A ends lower than B | A starts higher than B)")
    for ax in axs:
        # heatmap rows (first index, config A) are drawn along the y-axis,
        # columns (second index, config B) along the x-axis
        ax.set_xlabel("hp config B")
        ax.set_ylabel("hp config A")
    fig.savefig(os.path.join(output_dir, f"lc_crossing_{openmlid}_{workflow}.pdf"), bbox_inches="tight")
    plt.show()

    # NaN-aware statistics over the seeds, computed once and shared by the gap
    # plot and the per-anchor heatmaps. All-NaN slices yield NaN; the
    # accompanying RuntimeWarning is expected and therefore silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        mean_diffs = np.nanmean(diffs, axis=seed_axes)
        std_diffs = np.nanstd(diffs, axis=seed_axes)

    def mean_or_inf(scores):
        # configurations without any observation must never win the argmin
        return np.nanmean(scores) if np.count_nonzero(~np.isnan(scores)) > 0 else np.inf

    # gaps to the finally best (at the respective budget)
    idx_of_best = np.argmin([mean_or_inf(curve[:, :, :, -1]) for curve in curves])
    fig, axs = plt.subplots(1, 2, figsize=(20, 4))
    ax = axs[0]
    for mu, std in zip(mean_diffs[:, idx_of_best], std_diffs[:, idx_of_best]):
        ax.plot(schedule, mu)
        ax.fill_between(schedule, mu - std, mu + std, alpha=0.1)
    ax.axhline(0, color="black", linestyle="--")
    ax.set_title("Gaps to finally best configuration along budget")
    ax.set_xlabel("Budget $b$ (sample size)")
    ax.set_ylabel("Gap in error rate to finally best config")
    ax.set_xscale("log")

    # distribution of final gaps based on selections by budgets
    gaps_by_budget = {}
    for i, budget in enumerate(schedule):
        idx_of_best_at_budget = np.argmin([mean_or_inf(curve[:, :, :, i]) for curve in curves])
        # final-anchor gap between the config picked at this budget and the finally best one
        final_gaps = diffs[idx_of_best_at_budget, idx_of_best, :, :, :, -1].flatten()
        gaps_by_budget[budget] = final_gaps[~np.isnan(final_gaps)]
    ax = axs[1]
    ax.boxplot([gaps_by_budget[b] for b in schedule])
    ax.axhline(0, color="black", linestyle="--")
    ax.set_xticklabels(schedule, rotation=90)
    ax.set_title("Regret at full dataset size when picking best config at budget $b$")
    ax.set_xlabel("Budget $b$ (sample size)")
    ax.set_ylabel("Regrets at full dataset size (across seeds)")
    fig.savefig(os.path.join(output_dir, f"regrets_{openmlid}_{workflow}.pdf"), bbox_inches="tight")
    plt.show()

    # figures for diffs: mean pairwise difference, one heatmap per anchor
    for i, budget in enumerate(schedule):
        fig, ax = plt.subplots()
        sb.heatmap(mean_diffs[:, :, i], ax=ax, vmin=-1, vmax=1, cmap="seismic")
        ax.set_title(f"Mean error rate difference (A - B) at budget {budget}")
        ax.set_xlabel("hp config B")
        ax.set_ylabel("hp config A")
        plt.show()
        # one figure per anchor: release it so the figures do not accumulate
        plt.close(fig)
    
# Run the crossing analysis on the grouped learning curves.
learning_curve_crossing_evaluation(df=df)