In [1]:
import glob
import os
import warnings

import altair as alt
import pandas as pd
from configs_algorithms import EVAL_METRIC, RANDOMSTATE
from configs_best_models import BEST_MODELS
from configs_data import DATASETS_CONFIG, RESULT_PATH
from df_model import DataModel

In [2]:
# ignore unproblematic parameter warnings
warnings.filterwarnings("ignore")

In [3]:
%%script true # skip cell since already stored

# store results of best automl models for each dataset
performance = {}
for data_config in DATASETS_CONFIG:
    data_name = data_config["dataset"]
    data_model = DataModel(data_config, RANDOMSTATE)
    best_models = BEST_MODELS[data_name]

    for model_name, model in best_models.items():
        algorithm_name = model.__class__.__name__
        eval_metric = data_model.evaluate_model(model, EVAL_METRIC, test=True)
        performance[(data_name, model_name)] = [algorithm_name, eval_metric]

df = pd.DataFrame.from_dict(performance, orient="columns").T.reset_index()
df.columns = ["dataset", "automl", "algorithm", "metric"]
out_path = os.path.join(RESULT_PATH, "All_AutoML_comparisons.csv")
df.to_csv(out_path, sep=";", index=False)

In [4]:
# Load every stored "*_comparisons.csv" result table from RESULT_PATH.
# glob returns matches in arbitrary order, so the paths are sorted to make the
# position of each table in automl_results reproducible across runs and machines.
csvfiles = sorted(glob.glob(os.path.join(RESULT_PATH, "*_comparisons.csv")))
automl_results = []

for csvfile in csvfiles:
    # read with ";" as field separator, matching how this notebook writes its results
    df = pd.read_csv(csvfile, delimiter=";")
    automl_results.append(df)

In [5]:
# Work with the first loaded comparison table. The cell that writes the CSV is
# skipped by default, so fail with an explicit message when nothing was loaded
# instead of an opaque "list index out of range".
if not automl_results:
    raise FileNotFoundError(f"no '*_comparisons.csv' result files found in {RESULT_PATH!r}")
automl_comparisons = automl_results[0]
display(automl_comparisons)

Unnamed: 0,dataset,automl,algorithm,metric
0,AutoMpg,Custom,MLPRegressor,3.015948
1,AutoMpg,AutoSklearn,RandomForestRegressor,2.165327
2,AutoMpg,TPOT,MLPRegressor,2.310145
3,CommunitiesCrime,Custom,SVR,0.130829
4,CommunitiesCrime,AutoSklearn,RandomForestRegressor,0.133296
5,CommunitiesCrime,TPOT,MLPRegressor,0.13356
6,MiamiHousing,Custom,RandomForestRegressor,105647.27413
7,MiamiHousing,AutoSklearn,RandomForestRegressor,113547.855281
8,MiamiHousing,TPOT,RandomForestRegressor,106019.31048
9,BikeSharing,Custom,MLPRegressor,38.032322


In [6]:
# shorten the algorithm labels by removing every "Regressor" occurrence
# (e.g. "MLPRegressor" -> "MLP")
automl_comparisons["algorithm"] = automl_comparisons["algorithm"].replace(
    regex=r"Regressor",
    value="",
)

In [7]:
# turn snake_case column names into readable titles for the plot axes
def to_readable(s):
    """Return ``s`` with underscores replaced by spaces and each word capitalized.

    The string is lower-cased first, split on "_", and every resulting part
    (including empty ones from consecutive underscores) is capitalized before
    the parts are joined with single spaces.
    """
    return " ".join(part.capitalize() for part in s.lower().split("_"))


# axis/legend title for every column of the comparison table;
# the "metric" column is explicitly titled "RMSE"
map_axis = dict(
    zip(automl_comparisons.columns, map(to_readable, automl_comparisons.columns)),
    metric="RMSE",
)

In [8]:
# build a heatmap of one result column over two categorical columns
def create_heatmap(df, x, y, color, norm=True):
    """Return an Altair rect-mark heatmap of ``color`` over ``x`` and ``y``.

    Args:
        df: DataFrame holding the columns named by ``x``, ``y`` and ``color``.
        x: Column encoded on the x axis.
        y: Column encoded on the y axis; also the group within which
            ``color`` is min-max normalized when ``norm`` is true.
        color: Column that determines the cell color and is shown as tooltip.
        norm: If true, color by ``color`` scaled to [0, 1] within each ``y``
            group (a group whose values are all equal maps to 0); if false,
            color by the raw ``color`` values and skip the normalization
            transforms entirely.

    Returns:
        An ``alt.Chart`` with width and height set to 300.

    Axis and legend titles are looked up in the module-level ``map_axis``,
    so ``x``, ``y`` and ``color`` must be keys of that mapping.
    """
    chart = alt.Chart(df)
    color_field = color

    if norm:
        # Per-group min/max are joined onto every row, then used for min-max scaling.
        # The ternary avoids 0/0 (NaN color) when a group has a single distinct value,
        # and bracket access keeps the expression valid for non-identifier column names.
        chart = chart.transform_joinaggregate(
            max_col=f"max({color})", min_col=f"min({color})", groupby=[y]
        ).transform_calculate(
            norm_col=(
                "datum.max_col == datum.min_col ? 0 : "
                f"(datum['{color}'] - datum.min_col) / (datum.max_col - datum.min_col)"
            )
        )
        color_field = "norm_col:Q"

    return (
        chart.mark_rect()
        .encode(
            x=alt.X(x, title=map_axis[x]),
            y=alt.Y(y, title=map_axis[y]),
            color=alt.Color(
                color_field,
                title=["normalized" if norm else "", map_axis[color]],
            ),
            tooltip=[color],
        )
        .properties(width=300, height=300)
    )

In [9]:
# RMSE of each AutoML framework, min-max normalized within every dataset
create_heatmap(automl_comparisons, x="automl", y="dataset", color="metric", norm=True)

In [10]:
# algorithm of each framework's best model per dataset (raw values, no normalization)
create_heatmap(automl_comparisons, x="automl", y="dataset", color="algorithm", norm=False)