In [1]:
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Ids from the "openmlid" column of the LCDB dataset listing; passed further down
# as the expected datasets for the classification results.
datasets_classification = [
    openmlid
    for openmlid in pd.read_csv("../../../../lcdb/python/lcdb/datasets.csv")["openmlid"].values
]

In [3]:
def prepare_result_dataset(df_results_bare, slope_window=1000):
    """Condense the raw result table into one summary row per finished run.

    Only rows whose ``status`` equals ``"done"`` are kept. The ``scores`` cell
    of each kept row is parsed as JSON into a list of records; entries 1, 2, 3,
    4 and the last entry of every record are read (presumably one record per
    tree, judging by the output column names -- confirm against the producer).

    Parameters
    ----------
    df_results_bare : pandas.DataFrame
        Must provide the columns ``status``, ``scores``, ``openmlid`` and
        ``seed``.
    slope_window : int, default 1000
        Maximum number of trailing records used for the memory slope. Runs
        with a shorter history use all of their records.

    Returns
    -------
    pandas.DataFrame
        Columns ``openmlid``, ``seed``, ``memory_per_tree`` (least-squares
        slope of the last record entry over the record position, rounded to 6
        decimals; NaN when fewer than two records are available),
        ``train_time_per_tree_in_ms`` / ``dist_time_per_tree_in_ms`` (means of
        record entries 1 / 2, rounded to 1 decimal) and ``scores_oob`` /
        ``scores_val`` (lists of record entries 3 / 4, rounded to 6 decimals).
        The row index is the one of the kept input rows.
    """

    df_results = df_results_bare[df_results_bare["status"] == "done"]

    # decompose score field
    times_train = []
    times_dist = []
    scores_train = []
    scores_val = []
    memory = []
    for info in tqdm(df_results["scores"]):
        history = json.loads(info)
        times_train.append([t[1] for t in history])
        times_dist.append([t[2] for t in history])
        # .tolist() stores plain Python floats, so the lists serialise as
        # "[0.5, ...]" rather than "[np.float64(0.5), ...]" under NumPy >= 2.
        scores_train.append(np.round([t[3] for t in history], 6).tolist())
        scores_val.append(np.round([t[4] for t in history], 6).tolist())
        memory.append([t[-1] for t in history])

    # compute new result frame
    slopes = []
    for memory_hist in memory:
        # Never ask for more points than the run actually recorded; np.cov
        # needs both inputs to have the same length.
        window = min(slope_window, len(memory_hist))
        if window < 2:
            # a slope is undefined for fewer than two points
            slopes.append(np.nan)
            continue
        cov = np.cov(range(window), memory_hist[-window:])
        # cov(x, y) / var(x) is the ordinary-least-squares slope of y over x
        slopes.append(np.round(cov[1, 0] / cov[0, 0], 6))

    return pd.DataFrame({
        "openmlid": df_results["openmlid"],
        "seed": df_results["seed"],
        "memory_per_tree": slopes,
        "train_time_per_tree_in_ms": [np.round(np.mean(e), 1) for e in times_train],
        "dist_time_per_tree_in_ms": [np.round(np.mean(e), 1) for e in times_dist],
        "scores_oob": scores_train,
        "scores_val": scores_val
    })
    
def plot_result_availability(df_results, expteded_datasets = None, n_seeds = 10):
    """Draw a dataset-by-seed grid showing how many result rows are available.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Must provide the columns ``openmlid`` and ``seed``; both are cast to
        ``int`` before comparison.
    expteded_datasets : sequence of int, optional
        Dataset ids to show, one grid row each and in the given order. When
        omitted, the sorted unique ``openmlid`` values of ``df_results`` are
        used.
    n_seeds : int, default 10
        Number of grid columns; column ``j`` counts the rows whose seed is ``j``.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """

    if expteded_datasets is None:
        expteded_datasets = sorted(list(pd.unique(df_results["openmlid"])))

    # cast once up front instead of once per (dataset, seed) pair
    openmlids = df_results["openmlid"].astype(int).to_numpy()
    seeds = df_results["seed"].astype(int).to_numpy()

    # compute matrix of what we have available
    n_datasets = len(expteded_datasets)
    Z = np.zeros((n_datasets, n_seeds))
    for i, openmlid in enumerate(expteded_datasets):
        seeds_of_dataset = seeds[openmlids == openmlid]
        for j in range(n_seeds):
            Z[i, j] = np.count_nonzero(seeds_of_dataset == j)

    # show availability
    fig, ax = plt.subplots(figsize=(10, n_datasets * 0.4))
    # counts are negated, so larger counts map to the low end of the colormap
    ax.imshow(-Z, cmap="coolwarm")
    ax.set_yticks(range(n_datasets))
    ax.set_yticklabels(expteded_datasets)
    # minor ticks sit on the cell borders; the white minor grid separates the cells
    ax.set_xticks(np.arange(-.5, n_seeds, 1), minor=True)
    ax.set_yticks(np.arange(-.5, n_datasets, 1), minor=True)
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
    plt.show()

In [None]:
# For each task type: build the cleaned result table, plot which dataset/seed
# combinations are present, and write the table to a ";"-separated CSV file.
# Only the "regression" entry is processed; any other key is skipped.
for key, expected_datasets in (
    ("classification", datasets_classification),
    ("regression", None),
):
    if key == "regression":
        print(f"Preparing results for {key}.")
        df_results = prepare_result_dataset(
            pd.read_csv(f"results_{key}_bare.csv", sep=";")
        )
        plot_result_availability(df_results, expected_datasets)
        file = f"results_{key}_base.csv"
        print(f"Results for {key} ready, storing them in {file}.")
        df_results.to_csv(file, sep=";", index=False)
        print(f"Results for {key} stored in {file}.")

Preparing results for regression.


  0%|          | 0/550 [00:00<?, ?it/s]