In [1]:
import itertools
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow

In [2]:
# MLflow tracking server to query. Honour the MLFLOW_TRACKING_URI environment
# variable when it is set, so the notebook can be pointed at another server
# without editing this cell; otherwise fall back to the address the notebook
# was originally written against.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://10.20.20.101:8010")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [3]:
# Every (model, split) combination maps to one MLflow experiment named
# "au20-<model>-<split>". Models vary slowest, splits fastest.
MODEL_NAMES = ("lightgbm", "xgboost", "extra_trees", "stacking")
SPLIT_METHODS = ("random", "kennard_stone", "kmeans")

experiment_names = [
    f"au20-{model}-{split}"
    for model in MODEL_NAMES
    for split in SPLIT_METHODS
]
experiment_names

['au20-lightgbm-random',
 'au20-lightgbm-kennard_stone',
 'au20-lightgbm-kmeans',
 'au20-xgboost-random',
 'au20-xgboost-kennard_stone',
 'au20-xgboost-kmeans',
 'au20-extra_trees-random',
 'au20-extra_trees-kennard_stone',
 'au20-extra_trees-kmeans',
 'au20-stacking-random',
 'au20-stacking-kennard_stone',
 'au20-stacking-kmeans']

In [4]:
# For each experiment, pick the run with the highest validation R^2 and record
# its validation/test metrics. One dict per experiment is appended to `results`.

# Metric names (without the "metrics." prefix used by mlflow.search_runs),
# listed in the order they should appear as columns downstream.
metric_names = ["test_rmse", "test_r2", "val_rmse", "test_mae", "val_mae", "val_r2"]

results = []

for experiment_name in experiment_names:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on `.experiment_id`.
    if experiment is None:
        raise ValueError(f"MLflow experiment not found: {experiment_name!r}")

    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    # An experiment with no runs (or none that logged val_r2) has no
    # "metrics.val_r2" column to rank by, so there is no best run to select.
    if runs.empty or "metrics.val_r2" not in runs.columns:
        raise ValueError(
            f"MLflow experiment {experiment_name!r} has no runs with a val_r2 metric"
        )

    # Best run = highest validation R^2 (NaN values sort last by default).
    best_run = (
        runs.sort_values(by="metrics.val_r2", ascending=False)
        .iloc[0, :]
        .to_dict()
    )

    record = {"experiment_name": experiment_name}
    for metric_name in metric_names:
        record[metric_name] = best_run[f"metrics.{metric_name}"]
    results.append(record)

In [5]:
# Build the summary table: one row per experiment, with the experiment name
# ("au20-<model>-<split>") broken out into model and split-method columns.
df_results = pd.DataFrame(results)

# Split each experiment name on "-" once; element 1 is the model, element 2
# the split method.
name_parts = df_results["experiment_name"].str.split("-")

df_results = (
    df_results
    .assign(
        model_name=name_parts.str[1],
        split_method=name_parts.str[2],
    )
    .drop(columns=["experiment_name"])
    .sort_values(["split_method", "model_name"])
)

df_results.round(4)

Unnamed: 0,test_rmse,test_r2,val_rmse,test_mae,val_mae,val_r2,model_name,split_method
7,0.4104,0.9801,0.8104,0.3095,0.5698,0.9186,extra_trees,kennard_stone
1,0.4154,0.9796,0.8428,0.3135,0.5656,0.9108,lightgbm,kennard_stone
10,0.4573,0.9752,0.7388,0.3695,0.5302,0.9316,stacking,kennard_stone
4,0.3651,0.9842,0.7619,0.2764,0.5297,0.9277,xgboost,kennard_stone
8,0.8135,0.9218,0.7718,0.6009,0.5153,0.9243,extra_trees,kmeans
2,0.765,0.9308,0.7677,0.5534,0.4873,0.9245,lightgbm,kmeans
11,0.6989,0.9423,0.6996,0.5406,0.479,0.9374,stacking,kmeans
5,0.7405,0.9352,0.7092,0.564,0.4407,0.9362,xgboost,kmeans
6,0.5894,0.9591,0.7643,0.4414,0.5087,0.9281,extra_trees,random
0,0.5677,0.9621,0.7485,0.4085,0.4817,0.9299,lightgbm,random


In [7]:
# Persist the summary table. The DataFrame index only holds the row positions
# from before sorting, so it is left out of the file. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
df_results.to_csv("../data/regression-performance.csv", index=False)