In [None]:
import numpy as np
import pandas as pd
import scipy.stats

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

In [None]:
# Forest variants as (key used in the column names of the results, label used in figures and tables).
# Both lists are kept in the same order so they can be zipped or indexed in parallel.
ALGOS, ALGO_NAMES = map(list, zip(("rf", "RF"), ("lda", "LDA RF"), ("pca", "PCA RF")))

# Total CPU Time

In [None]:
# Load the experiment results; every later cell reads from this frame.
df_comparison = pd.read_csv(filepath_or_buffer="comparison.csv")

In [None]:
# Add up the `traintime_total_max` column and report it in hours and days.
# NOTE(review): the division by 3600 implies the column is in seconds — confirm.
total_traintime = sum(df_comparison["traintime_total_max"])
cpu_hours = total_traintime / 3600
cpu_days = cpu_hours / 24
for amount, unit in ((cpu_hours, "hours"), (cpu_days, "days")):
    print(amount, unit)

In [None]:
%matplotlib inline
fig, ax = plt.subplots(1, 3, figsize=(10,3),  gridspec_kw={'width_ratios': [1.2, 1, 1]})
ct = pd.crosstab(df_comparison["best_choice_test"], df_comparison.rename(columns={"choice": "SORF Choice"})["SORF Choice"])

permutation = [1, 2, 0]

# plot comparison of RF against LDA RF and PCA RF
df_rfcomparison = df_comparison.groupby("openmlid")["rf_test", "lda_test", "pca_test"].mean()
ax[0].scatter(df_rfcomparison["rf_test"], df_rfcomparison["lda_test"], color="C0", s=5)
ax[0].scatter(df_rfcomparison["rf_test"], df_rfcomparison["pca_test"], color="C1", s=5)
ax[0].plot([0.2,1], [0.2,1], linewidth=1, linestyle="--", color="black")
ax[0].grid()
ax[0].set_xlabel("Accuracy of standard RF")
ax[0].set_ylabel("Accuracy of\nLDA RF (blue) and PCA RF(orange)")

ct = ct.rename(columns={a: n for a, n in zip(ALGOS, ALGO_NAMES)})
ct.plot(kind='bar', stacked=True, rot=0, ax=ax[1])
ct_normalized = ct.values / np.sum(ct.values)
sns.heatmap(ct_normalized, annot=True, ax = ax[2], vmax=0.33, cmap="Greens")
for a in ax[1:]:
    a.set_xlabel("Best Choice")
ax[1].set_xticklabels([ALGO_NAMES[i] for i in permutation])
ax[1].set_title("SORF and Best Choices\n(in absolute numbers)")

ax[2].set_ylabel("SORF Choice")
ax[2].set_xticklabels([ALGO_NAMES[i] for i in permutation])
ax[2].set_yticklabels([ALGO_NAMES[i] for i in permutation])
ax[2].set_title("Confusion Matrix of SORF")
fig.tight_layout()
fig.savefig("plots/confusion.pdf", bbox_inches='tight')

In [None]:
# Four-panel figure written to plots/computations.pdf:
#   axes[0]  mean ratio of actual to maximum training time per tree type, by chosen forest type,
#   axes[1]  mean number of trees grown per tree type, by chosen forest type,
#   axes[2]  box plots of the per-dataset mean `time_compression`,
#   axes[3]  box plots of the per-dataset mean `tree_compression`.
fig, axes = plt.subplots(1, 4, figsize=(10, 2.5),  gridspec_kw={'width_ratios': [1.5, 1.5, 1, 1]})

# One row per chosen forest type, one column per tree type, plus a final "Total" column
# holding the average of the three ratios.
Z = []
for a_choice in ALGOS:
    df_selection = df_comparison[df_comparison["choice"] == a_choice]
    Z.append([np.mean(df_selection["traintime_" + a_tree + "_act"] / df_selection["traintime_" + a_tree + "_max"]) for a_tree in ALGOS])
Z = np.array(Z)
Z = np.column_stack([Z, np.sum(Z, axis=1) / len(ALGOS)])

ax = axes[0]
sns.heatmap(Z, annot=True, ax = ax, cmap="Reds")
ax.set_xticklabels(ALGO_NAMES + ["Total"])
ax.set_yticklabels(ALGO_NAMES)
ax.set_ylabel("Chosen Forest Type")
ax.set_xlabel("Training time spent per tree type\n(relative to maximum possible)")

# Same layout for the (rounded) mean tree counts; here "Total" is the row sum.
Z = []
for a_choice in ALGOS:
    df_selection = df_comparison[df_comparison["choice"] == a_choice]
    Z.append([int(np.round(np.mean(df_selection["trees_" + a_tree]))) for a_tree in ALGOS])
Z = np.array(Z,dtype=int)
Z = np.column_stack([Z, np.sum(Z, axis=1)])
ax = axes[1]
sns.heatmap(Z, annot=True, ax = ax, cmap="Reds", fmt='g')
ax.set_xticklabels(ALGO_NAMES + ["Total"])
ax.set_yticklabels(ALGO_NAMES)
ax.set_ylabel("Chosen forest type")
ax.set_xlabel("Numbers of trees grown per tree type")


def _dataset_means_by_choice(column):
    """Return per-dataset means of `column`: one list per entry of ALGOS (runs with that choice) plus one list over all runs."""
    subsets = [df_comparison[df_comparison["choice"] == a] for a in ALGOS] + [df_comparison]
    return [[np.mean(g[column]) for _, g in subset.groupby("openmlid")] for subset in subsets]


compressions_in_time = _dataset_means_by_choice("time_compression")
compressions_in_num_trees = _dataset_means_by_choice("tree_compression")

for a, values in zip(axes[2:], [compressions_in_time, compressions_in_num_trees]):
    a.boxplot(values)
    a.set_ylim([0, 1])
    a.set_xticklabels(ALGO_NAMES + ["Total"], rotation=90)
    a.axhline(0.5, linestyle="dotted", color="black", linewidth=1)
    a.axhline(0.25, linestyle="dotted", color="black", linewidth=1)
    # Boxes are drawn at x = 1 .. len(ALGOS) + 1, so the divider between the per-choice boxes
    # and "Total" sits at len(ALGOS) + 0.5 (the former 4.5 coincided with the right frame).
    a.axvline(len(ALGOS) + 0.5, color="black", linewidth=1)
axes[2].set_ylabel("Time Compression")
axes[3].set_ylabel("Tree Compression")

fig.tight_layout()
fig.savefig("plots/computations.pdf", bbox_inches='tight')
plt.show()

In [None]:
def _differs_significantly(perf_a, perf_b, alpha=0.05):
    """Tell whether a paired Wilcoxon signed-rank test separates two score samples at level `alpha`.

    Identical samples are reported as not significant without running the test.
    """
    if np.linalg.norm(perf_a - perf_b) == 0:
        return False
    return scipy.stats.wilcoxon(perf_a, perf_b).pvalue < alpha


def get_performance_figure_and_table_for_paper(df_comparison):
    """Create the accuracy figure and the LaTeX-formatted accuracy table.

    Parameters
    ----------
    df_comparison : pandas.DataFrame
        Needs the columns "openmlid", "rf_test", "lda_test", "pca_test" and
        "selfopt_test" (the latter is shown as SORF / Self-Optimized RF).

    Returns
    -------
    ((fig, (ax1, ax2)), table)
        `fig` holds a scatter plot (each forest type against SORF, mean accuracy
        per dataset) and box plots of the accuracy differences. `table` has one
        row per dataset with "mean$\\pm$std" strings (in percent); the best
        entries are wrapped in \\textbf, entries that do not differ significantly
        from the best one in \\underline.

    Side effects: prints the mean of the last ("Oracle") box-plot series.
    """
    algo_names = ["rf", "lda", "pca"]
    candidates = algo_names + ["selfopt"]  # prefixes of the "<name>_test" columns; SORF comes last

    scores = []
    imps = []
    rows = []
    significances_with_sorf = {a: [] for a in algo_names}

    for openmlid, df_dataset in df_comparison.groupby("openmlid"):

        # extract the scores of the algorithms on this dataset
        data_base = [df_dataset[name + "_test"] for name in candidates]
        perf_selfopt = data_base[-1]
        for comp, perf_comp in zip(algo_names, data_base):
            significances_with_sorf[comp].append(_differs_significantly(perf_comp, perf_selfopt))

        # compute mean scores of the algorithms on dataset
        scores_on_dataset = [np.mean(v) for v in data_base]
        scores.append(scores_on_dataset)
        imps.append([scores_on_dataset[-1] - v for v in scores_on_dataset])
        best_score = max(scores_on_dataset)
        best_indices = [i for i, s in enumerate(scores_on_dataset) if s == best_score]

        # determine, for EVERY column, whether it differs significantly from the best one
        # (one flag per column; a single shared flag would apply one comparison to all of them)
        perf_best = data_base[np.argmax(scores_on_dataset)]
        significant_vs_best = [_differs_significantly(v, perf_best) for v in data_base]

        # format entries
        formatted_vals = [
            f"{np.round(100 * np.mean(v), 2)}$\\pm${np.round(100 * np.std(v), 1)}"
            for v in data_base
        ]
        for i, val in enumerate(formatted_vals):
            if i in best_indices:
                formatted_vals[i] = "\\textbf{" + val + "}"
            elif not significant_vs_best[i]:
                formatted_vals[i] = "\\underline{" + val + "}"

        rows.append([openmlid] + formatted_vals)

    scores = np.array(scores)
    imps = np.array(imps)
    for a in algo_names:
        significances_with_sorf[a] = np.array(significances_with_sorf[a])

    colors = {
        "rf": "C2",
        "lda": "C0",
        "pca": "C1"
    }

    # create figure
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 3),  gridspec_kw={'width_ratios': [1, 1]})

    # filled markers: significant difference to SORF; hollow markers: not significant
    for i, a in enumerate(algo_names):
        is_significant = significances_with_sorf[a]
        ax1.scatter(scores[is_significant, i], scores[is_significant, -1], s=20, color=colors[a])
        ax1.scatter(scores[~is_significant, i], scores[~is_significant, -1], s=20, facecolors="None", color=colors[a])

    ax1.plot([0.2,1], [0.2,1], linestyle="dotted", color="black")
    ax1.grid()
    # colour names follow `colors` above: C2 = green, C0 = blue, C1 = orange in the default cycle
    ax1.set_xlabel("Accuracy of Standard RF (green), LDA RF (blue), and PCA RF (orange).\nBullets show significant and circle statistically not significant differences.")
    ax1.set_ylabel("Accuracy of Self-Optimized RF")

    # box-plot series: all datasets per forest type, then only the significant ones,
    # then the per-dataset minimum over all columns ("Oracle")
    imp_concated = [imps[:, i] for i in range(len(algo_names))]
    imp_concated += [imps[significances_with_sorf[algo], i] for i, algo in enumerate(algo_names)]
    imp_concated.append(np.min(imps, axis=1))

    ax2.boxplot(imp_concated, vert=False)
    ax2.axvline(-0.01, linestyle="dotted", color="red", linewidth=1)
    ax2.axvline(0, linestyle="--", color="black", linewidth=1)
    ax2.axvline(0.01, linestyle="dotted", color="black", linewidth=1)
    ax2.axvline(0.03, linestyle="dotted", color="black", linewidth=1)
    ax2.set_xlabel("Accuracy improvement achieved by SORF compared to ...")
    ax2.set_yticklabels(ALGO_NAMES + [name + " (sign.)" for name in ALGO_NAMES] + ["Oracle"])
    fig.tight_layout()

    print(np.mean(imp_concated[-1]))

    # return the axes created here (not a same-named variable from the notebook's global scope)
    return (fig, (ax1, ax2)), pd.DataFrame(rows, columns=["openmlid"] + ALGO_NAMES + ["SORF"])
    
# Build the accuracy figure and table, store the figure as PDF and show it.
(fig, ax), df_results = get_performance_figure_and_table_for_paper(df_comparison=df_comparison)
fig.savefig(fname="plots/performance.pdf", bbox_inches="tight")
plt.show()

In [None]:
# LaTeX source of the first 48 table rows; escape=False keeps the LaTeX markup inside the cells intact.
print(df_results.iloc[:48].to_latex(escape=False, index=False))

In [None]:
# LaTeX source of the last 48 table rows; escape=False keeps the LaTeX markup inside the cells intact.
print(df_results.iloc[-48:].to_latex(escape=False, index=False))

# Datasets With Properties

In [None]:
import openml
from tqdm.notebook import tqdm

# Collect one row of basic properties per dataset, read from the OpenML dataset objects.
datasets = [int(i) for i in sorted(pd.unique(df_results["openmlid"]))]
rows = []
for openmlid in tqdm(datasets):

    # load dataset object
    dataset = openml.datasets.get_dataset(openmlid)

    num_instances = int(dataset.qualities["NumberOfInstances"])
    num_features = len(dataset.features) - 1  # presumably drops the target attribute — TODO confirm
    num_features_numeric = int(dataset.qualities["NumberOfNumericFeatures"])
    num_classes = int(dataset.qualities["NumberOfClasses"])
    #min_class_percentage = str(int(dataset.qualities["MinorityClassPercentage"])) + "\%"
    #maj_class_percentage = str(int(dataset.qualities["MajorityClassPercentage"])) + "\%"
    # "\\%" is a LaTeX-escaped percent sign; the backslash is doubled so Python does not
    # treat "\%" as an (invalid) escape sequence.
    missing_values = str(int(dataset.qualities["PercentageOfMissingValues"])) + "\\%"

    # names are cut to 20 characters and underscores escaped for LaTeX
    row = [openmlid, dataset.name[:20].replace("_", "\\_"), num_instances, num_features, num_features_numeric, num_classes, missing_values]
    rows.append(row)
dfDatasets = pd.DataFrame(rows, columns=["openmlid", "name", "instances", "features", "numeric features", "classes", "\\% missing"]).sort_values("openmlid")

In [None]:
# LaTeX source of the first 48 rows of the dataset overview (selected columns only).
print(dfDatasets.loc[:, ["openmlid", "name", "instances", "features", "classes"]].iloc[:48].to_latex(escape=False, index=False))

In [None]:
# LaTeX source of the last 48 rows of the dataset overview (selected columns only).
print(dfDatasets.loc[:, ["openmlid", "name", "instances", "features", "classes"]].iloc[-48:].to_latex(escape=False, index=False))

# Tables for Runtimes and Trained Trees

In [None]:
# Show the available column names of the results frame (used by the table functions below).
df_comparison.columns

In [None]:
def get_runtime_table_for_paper(df_comparison):
    """Build the training-time table with one row per dataset.

    Parameters
    ----------
    df_comparison : pandas.DataFrame
        Needs the columns "openmlid", "traintime_rf_max", "traintime_lda_max",
        "traintime_pca_max" and "traintime_total_act".

    Returns
    -------
    pandas.DataFrame
        Columns "openmlid", the three labels in ALGO_NAMES and "SORF". The three
        middle columns summarize the respective "traintime_<algo>_max" column, the
        "SORF" column summarizes "traintime_total_act". Every cell is the string
        "<mean>$\\pm$<std>" with both numbers rounded to integers.
    """
    algo_names = ["rf", "lda", "pca"]

    rows = []
    for openmlid, df_dataset in df_comparison.groupby("openmlid"):
        # per-algorithm maximum train times, followed by the actual total train time
        data_base = [df_dataset["traintime_" + comp + "_max"] for comp in algo_names]
        data_base.append(df_dataset["traintime_total_act"])

        # format entries (the backslash is doubled: "\p" is not a valid Python escape sequence)
        formatted_vals = [f"{int(np.round(np.mean(v)))}$\\pm${int(np.round(np.std(v)))}" for v in data_base]
        rows.append([openmlid] + formatted_vals)

    return pd.DataFrame(rows, columns=["openmlid"] + ALGO_NAMES + ["SORF"])
    
# Replace df_results with the training-time table and display it.
df_results = get_runtime_table_for_paper(df_comparison=df_comparison)
df_results

In [None]:
# LaTeX source of the first 48 table rows; escape=False keeps the "$\pm$" markup intact.
print(df_results.iloc[:48].to_latex(escape=False, index=False))

In [None]:
# LaTeX source of the last 48 table rows; escape=False keeps the "$\pm$" markup intact.
print(df_results.iloc[-48:].to_latex(escape=False, index=False))

In [None]:
def get_tree_table_for_paper(df_comparison):
    """Build the table of grown trees with one row per dataset.

    Parameters
    ----------
    df_comparison : pandas.DataFrame
        Needs the columns "openmlid", "trees_rf", "trees_lda" and "trees_pca".

    Returns
    -------
    pandas.DataFrame
        Columns "openmlid", "Standard Trees", "LDA Trees", "PCA Trees" and "Total",
        where "Total" summarizes the row-wise sum of the three tree counts. Every
        cell is the string "<mean>$\\pm$<std>" with both numbers rounded to integers.
    """
    algo_names = ["rf", "lda", "pca"]

    rows = []
    for openmlid, df_dataset in df_comparison.groupby("openmlid"):
        # tree counts per tree type, followed by their per-run total
        data_base = [df_dataset["trees_" + comp] for comp in algo_names]
        data_base.append(df_dataset["trees_rf"] + df_dataset["trees_lda"] + df_dataset["trees_pca"])

        # format entries (the backslash is doubled: "\p" is not a valid Python escape sequence)
        formatted_vals = [f"{int(np.round(np.mean(v)))}$\\pm${int(np.round(np.std(v)))}" for v in data_base]
        rows.append([openmlid] + formatted_vals)

    return pd.DataFrame(rows, columns=["openmlid", "Standard Trees", "LDA Trees", "PCA Trees", "Total"])
    
# Replace df_results with the tree-count table and display it.
df_results = get_tree_table_for_paper(df_comparison=df_comparison)
df_results

In [None]:
# LaTeX source of the first 48 table rows; escape=False keeps the "$\pm$" markup intact.
print(df_results.iloc[:48].to_latex(escape=False, index=False))

In [None]:
# LaTeX source of the last 48 table rows; escape=False keeps the "$\pm$" markup intact.
print(df_results.iloc[-48:].to_latex(escape=False, index=False))