# Study ML Results

## Load Data

Load libraries:

In [151]:
import json
import pathlib
import pandas as pd
from typing import Dict, Any

Define constants:

In [152]:
# Directory that holds the experiment outputs, relative to this notebook.
RESULTS_PATH = "../../results/"
# Sub-directory of RESULTS_PATH for the run to analyze
# (the name looks like a YYYYMMDD_HHMM timestamp).
DATE_EXPERIMENTS = "20240604_1409"

# One JSON results file per model.
LINEAR_REGRESSION_FILENAME = "linear_regression.json"
# Backward-compatible alias: the constant was originally misspelled and other
# cells still reference the misspelled name.
LINEAR_REGRESSION_FILENAMME = LINEAR_REGRESSION_FILENAME
DECISION_TREE_FILENAME = "decision_tree_regressor.json"
KNN_FILENAME = "knn_regressor.json"
XGBOOST_SINGLE_OUTPUT_FILENAME = "xgboost_single_output_regressor.json"
XGBOOST_MULTI_OUTPUT_FILENAME = "xgboost_multi_output_regressor.json"

Load data:

In [153]:
experiments_path = pathlib.Path(RESULTS_PATH) / DATE_EXPERIMENTS

def load_experiment_results(file_path: pathlib.Path) -> Dict[str, Any]:
    """
    Load JSON data from a file.

    The file is decoded explicitly as UTF-8 so that the result does not
    depend on the locale's default encoding.

    Args:
        file_path (pathlib.Path): Path to the JSON file.

    Returns:
        Dict[str, Any]: The loaded JSON data.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with file_path.open("r", encoding="utf-8") as file:
        return json.load(file)


linear_regression_results = load_experiment_results(
    experiments_path / LINEAR_REGRESSION_FILENAMME
)
decision_tree_results = load_experiment_results(
    experiments_path / DECISION_TREE_FILENAME
)
knn_results = load_experiment_results(experiments_path / KNN_FILENAME)
xgboost_single_output_results = load_experiment_results(
    experiments_path / XGBOOST_SINGLE_OUTPUT_FILENAME
)
xgboost_multi_output_results = load_experiment_results(
    experiments_path / XGBOOST_MULTI_OUTPUT_FILENAME
)

Convert to dataframe:

In [154]:
def create_metrics_dataframe(results: Dict[str, Any]) -> pd.DataFrame:
    """
    Create a DataFrame with single-level columns from experiment results.

    Every entry of ``results["dataset_metrics"]`` becomes one row, indexed by
    the dataset name. For every target in the entry's ``"metrics"`` mapping,
    each metric is stored in a column named ``{PREFIX}_{metric}``, where
    ``PREFIX`` is the target name reduced to its uppercase letters.

    Four descriptive columns are derived from the dataset name:

    * ``dummy``, ``shifts``, ``adj``: True when the name contains that word.
    * ``type``: 0 if the name contains "basic", 1 if it contains "ist" but
      not "no_ist", 2 if it contains "no_ist", 3 if it contains "complete",
      and None if none of these match. The rules are applied in that order,
      so a later match overrides an earlier one.

    Args:
        results (Dict[str, Any]): Dictionary containing the experiment results.

    Returns:
        pd.DataFrame: A DataFrame with single-level columns (formatted as
        {colname}_{metric}) plus the descriptive columns, with all columns
        sorted alphabetically.

    Raises:
        KeyError: If "dataset_metrics" or a dataset's "metrics" key is missing.
    """
    rows = []
    index = []
    for dataset_name, dataset_metrics in results["dataset_metrics"].items():
        index.append(dataset_name)
        row = {}
        for col_name, metrics in dataset_metrics["metrics"].items():
            # Keep only the uppercase letters of the target name as prefix.
            col_name_processed = "".join(filter(str.isupper, col_name))
            for metric_name, metric_value in metrics.items():
                row[f"{col_name_processed}_{metric_name}"] = metric_value
        rows.append(row)

    df = pd.DataFrame(rows, index=index)

    def name_contains(substring):
        # Literal (non-regex) substring test against every dataset name.
        return df.index.str.contains(substring, regex=False)

    # Boolean flags describing which feature groups the dataset includes.
    df["dummy"] = name_contains("dummy")
    df["shifts"] = name_contains("shifts")
    df["adj"] = name_contains("adj")

    # Categorical dataset type; the assignments below run in order, so a
    # later rule wins when a name matches several of them.
    df["type"] = None
    df.loc[name_contains("basic"), "type"] = 0
    df.loc[name_contains("ist") & ~name_contains("no_ist"), "type"] = 1
    df.loc[name_contains("no_ist"), "type"] = 2
    df.loc[name_contains("complete"), "type"] = 3

    # Sort the columns alphabetically (no columns are added or removed).
    return df.reindex(columns=sorted(df.columns))

In [155]:
linear_regression_df = create_metrics_dataframe(linear_regression_results)
decision_tree_df = create_metrics_dataframe(decision_tree_results)
knn_df = create_metrics_dataframe(knn_results)
xgboost_single_output_df = create_metrics_dataframe(xgboost_single_output_results)
xgboost_multi_output_df = create_metrics_dataframe(xgboost_multi_output_results)

Add an auxiliary function to highlight the best value in each metric column (maximum $R^2$, minimum RMSE):

In [156]:
def highlight_max_min(data: pd.DataFrame) -> pd.DataFrame:
    """
    Highlight the maximum value in each column for R^2 and the minimum value in each column for RMSE.

    Args:
        data (pd.DataFrame): DataFrame to highlight.

    Returns:
        pd.DataFrame: DataFrame with highlighted values.
    """
    def highlight_values(column):
        is_r2 = "R^2" in column.name
        is_rmse = "RMSE" in column.name
        if not is_r2 and not is_rmse:
            return ['' for _ in column]
        if is_r2:
            highlight_value = column.max()
        if is_rmse:
            highlight_value = column.min()
            
        return ['background: yellow' if v == highlight_value else 'background: #185ed7' if v >= 0.2 and v != highlight_value and is_r2 else '' for v in column]
    
    return data.style.apply(highlight_values, axis=0)

In [157]:
def count_max_min_highlights_and_podiums(
    data: pd.DataFrame, r2_threshold: float = 0.2, podium_size: int = 3
) -> pd.DataFrame:
    """
    Count the number of maximum R^2 and minimum RMSE for each dataset and sort the results.
    Additionally, count the top values for R^2 and the lowest values for RMSE.

    Columns whose name contains "R^2" are treated as R^2 columns; otherwise
    columns whose name contains "RMSE" are treated as RMSE columns; all other
    columns are ignored. Column labels are converted with ``str`` before
    matching, so non-string labels are ignored rather than raising.

    Ties are not broken: every row equal to a column's best value is counted
    as a winner, and every row whose value appears among the podium values is
    counted as a podium finish.

    Args:
        data (pd.DataFrame): DataFrame containing the metrics, one row per dataset.
        r2_threshold (float): An R^2 value counts as valid when it is strictly
            greater than this threshold. Defaults to 0.2.
        podium_size (int): Number of best values per column that form the
            podium. Defaults to 3.

    Returns:
        pd.DataFrame: DataFrame indexed like ``data`` with the columns
        max_R2_count, min_RMSE_count, R2_podium_count, RMSE_podium_count,
        R2_valid, total_count and total_podiums, sorted in descending order by
        total_count and then total_podiums.
    """
    counter_columns = [
        "max_R2_count",
        "min_RMSE_count",
        "R2_podium_count",
        "RMSE_podium_count",
        "R2_valid",
    ]
    # All counters start at integer zero; boolean masks are added per column.
    count_df = pd.DataFrame(0, index=data.index, columns=counter_columns)

    for column in data.columns:
        name = str(column)
        values = data[column]
        if "R^2" in name:
            count_df["max_R2_count"] += values == values.max()
            podium_values = values.nlargest(podium_size).values
            count_df["R2_podium_count"] += values.isin(podium_values)
            count_df["R2_valid"] += values > r2_threshold
        elif "RMSE" in name:
            count_df["min_RMSE_count"] += values == values.min()
            podium_values = values.nsmallest(podium_size).values
            count_df["RMSE_podium_count"] += values.isin(podium_values)

    count_df["total_count"] = count_df["max_R2_count"] + count_df["min_RMSE_count"]
    count_df["total_podiums"] = (
        count_df["R2_podium_count"] + count_df["RMSE_podium_count"]
    )
    return count_df.sort_values(by=["total_count", "total_podiums"], ascending=False)

## Linear Regression

See the results:

In [158]:
linear_regression_highlighted_df = highlight_max_min(linear_regression_df)
linear_regression_highlighted_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,13.108765,0.0,13.812827,-57.485107,4.958795,-1.254258,6.306477,-1.711654,8.550081,-0.645683,1.422144,-0.924748,4.603915,-1.703169,5.488559,0.236516,1.213344,0.705913,False,False,False,0
basic_adj,2.035616,0.0,1.297133,0.484215,1.80456,0.701502,1.459422,0.854782,1.861985,0.921948,0.44167,0.814349,1.030653,0.864556,1.363938,0.952841,0.974992,0.810137,True,False,False,0
basic_dummy,182736950622.51627,0.0,73588623502.0171,-4.0309592455734126e+20,12257766942.734632,-5.179241365336898e+19,2417367951.512489,-3.019351162221708e+17,46685551196.79006,-2.831777156154457e+19,50760978738.98072,-1.751492697754193e+21,104601051337.83832,-2.958896840166712e+21,470782865043.05176,-4.70336668142542e+21,9446285977.9979,-4.417424379517617e+19,False,True,False,0
basic_dummy_adj,87298158024.0491,0.0,39574803442.54073,-1.1657592984849837e+20,10636482396.26872,-3.899042461799264e+19,112732235997.89204,-6.567054881299886e+20,194092315495.11108,-4.894329059653088e+20,26749279590.767857,-4.864731316890412e+20,223940549212.04477,-1.3560448984217628e+22,202204112189.93723,-8.67821667418416e+20,204146626.142018,-2.063208383185509e+16,True,True,False,0
basic_shifts_3,7.961561,0.0,14.963321,-67.633497,6.154034,-2.47193,15.824807,-16.074089,8.135234,-0.489862,1.058868,-0.067015,3.345187,-0.427116,8.999292,-1.052579,2.104707,0.115107,False,False,True,0
basic_shifts_3_adj,2.760815,0.0,2.267505,-0.576144,1.530232,0.785359,3.407564,0.208325,2.339258,0.876806,0.474149,0.786041,1.395379,0.751732,2.292908,0.866726,0.980738,0.807892,True,False,True,0
basic_shifts_3_dummy,1336944254206.2537,0.0,246504682830.0316,-8.021077796401549e+20,284967203579.6426,-1.4908917848873764e+22,433520344645.5184,-2.4565830827850018e+22,65064275067.514885,-1.0903251036545294e+20,35190464598.4118,-1.7026063198935525e+20,1402380368074.5603,-6.055347821984632e+23,1041743676446.2268,-4.284334336299106e+22,285326441.180413,0.0,False,True,True,0
basic_shifts_3_dummy_adj,5.129867,0.0,4.771934,0.699434,1.540647,0.564298,3.470314,-0.574495,6.474384,-0.079592,1.723516,0.591646,2.052015,-0.296283,2.763613,0.698453,0.083506,0.0,True,True,True,0
complete,13.661693,0.0,13.727114,-56.761525,4.904627,-1.205277,6.270167,-1.680519,8.74755,-0.722577,1.411996,-0.897378,4.669197,-1.780372,5.463109,0.24358,1.210587,0.707248,False,False,False,3
complete_adj,2.108698,0.0,1.304098,0.478661,1.809431,0.699888,1.463062,0.854057,1.862828,0.921877,0.440045,0.815713,1.029153,0.864949,1.363043,0.952903,0.971739,0.811401,True,False,False,3


See the best datasets:

In [159]:
linear_regression_counts = count_max_min_highlights_and_podiums(linear_regression_df)
linear_regression_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
ist_adj,4,4,6,5,8,8,11
basic_adj,2,2,4,5,8,4,9
complete_dummy_adj,3,0,3,1,8,3,4
complete_adj,2,0,6,7,8,2,13
no_ist_dummy_adj,1,1,3,1,8,2,4
basic_shifts_3_adj,2,0,2,1,7,2,3
complete_shifts_3_adj,1,1,2,1,7,2,3
complete_shifts_3_dummy_adj,1,1,1,1,4,2,2
no_ist_adj,1,0,3,3,8,1,6
basic_shifts_3_dummy_adj,1,0,2,0,4,1,2


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values (a value counts as valid when $R^2 > 0.2$):

In [160]:
linear_regression_invalid_datasets = linear_regression_counts[
    linear_regression_counts["R2_valid"] < 4
].index.to_list()
linear_regression_invalid_datasets

['basic',
 'basic_dummy',
 'basic_dummy_adj',
 'basic_shifts_3',
 'basic_shifts_3_dummy',
 'complete',
 'complete_dummy',
 'complete_shifts_3',
 'complete_shifts_3_dummy',
 'ist',
 'ist_dummy',
 'ist_dummy_adj',
 'ist_shifts_3',
 'ist_shifts_3_dummy',
 'no_ist',
 'no_ist_dummy',
 'no_ist_shifts_3',
 'no_ist_shifts_3_dummy']

In [161]:
# Datasets with more than 4 valid R^2 values.
# NOTE(review): datasets with exactly 4 valid values are excluded here and are
# also excluded by the `< 4` "invalid" filter -- confirm the gap is intended.
linear_regression_best_datasets = linear_regression_counts.loc[
    linear_regression_counts["R2_valid"] > 4
].index.tolist()
linear_regression_best_datasets

['ist_adj',
 'basic_adj',
 'complete_dummy_adj',
 'complete_adj',
 'no_ist_dummy_adj',
 'basic_shifts_3_adj',
 'complete_shifts_3_adj',
 'no_ist_adj',
 'ist_shifts_3_adj',
 'no_ist_shifts_3_adj']

In [162]:
linear_regression_counts = linear_regression_df.merge(
    linear_regression_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [163]:
linear_regression_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby(
    "type"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,12,19,2.625
1,15,20,2.625
2,9,18,3.75
3,13,26,3.875


Analyze by presence of dummy variables:

In [164]:
linear_regression_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby(
    "dummy"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,29,57,4.1875
True,20,26,2.25


Analyze by presence of shifted variables:

In [165]:
linear_regression_counts[
    ["shifts", "total_count", "total_podiums", "R2_valid"]
].groupby("shifts").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,30,57,3.75
True,19,26,2.6875


Analyze by presence of adjacent variables:

In [166]:
linear_regression_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby(
    "adj"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,16,16,0.75
True,33,67,5.6875


## K-Nearest Neighbors Regression

See the results:

In [167]:
knn_highlighted_df = highlight_max_min(knn_df)
knn_highlighted_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,17.273929,0.0,5.270261,-7.51421,5.645406,-1.92174,3.950214,-0.063903,2.728067,0.832461,1.7086,-1.778227,2.923085,-0.089686,4.265914,0.538781,3.39158,-1.297798,False,False,False,0
basic_adj,8.069779,0.0,4.876885,-6.290961,5.298776,-1.573654,3.680839,0.076254,3.005672,0.796616,1.43521,-0.960342,2.65078,0.10405,4.793271,0.417578,2.982721,-0.776905,True,False,False,0
basic_dummy,20.895409,0.0,5.219388,-1.027805,2.675369,-1.467238,5.361412,-0.485205,8.951638,-0.041116,1.668545,-0.892448,2.111984,-0.206254,7.709621,-0.261344,2.972565,-3.37431,False,True,False,0
basic_dummy_adj,7.553551,0.0,3.880084,-0.120608,2.087247,-0.501446,5.46884,-0.545485,6.254861,0.49171,3.949191,-9.60354,2.641881,-0.887275,8.151653,-0.410399,2.985107,-3.411421,True,True,False,0
basic_shifts_3,19.622448,0.0,14.094688,-59.896323,5.674494,-1.951926,6.554042,-1.928729,3.014144,0.795481,1.729849,-1.847757,3.241059,-0.339654,3.126815,0.752208,4.418884,-2.900614,False,False,True,0
basic_shifts_3_adj,13.700863,0.0,8.868485,-23.110075,5.816569,-2.101221,6.317843,-1.721427,3.430169,0.735111,1.693118,-1.728196,3.237711,-0.336635,2.647027,0.822381,4.300245,-2.69339,True,False,True,0
basic_shifts_3_dummy,23.140274,0.0,14.979679,-1.962016,3.053741,-0.712067,8.591873,-8.649131,12.205487,-2.836905,4.853735,-2.239041,4.807005,-6.114701,3.152311,0.607699,0.083705,0.0,False,True,True,0
basic_shifts_3_dummy_adj,10.575473,0.0,15.845156,-2.313938,4.478,-2.680881,7.716684,-6.785117,7.830433,-0.57919,5.137705,-2.628645,4.552259,-5.379586,3.391989,0.545735,0.111753,0.0,True,True,True,0
complete,13.456629,0.0,6.828834,-13.294642,4.00798,-0.47266,2.571578,0.549121,6.473118,0.056737,0.945909,0.148499,2.004282,0.487686,3.820354,0.630095,4.125613,-2.400045,False,False,False,3
complete_adj,8.23479,0.0,6.175269,-10.689917,4.460253,-0.823551,2.426557,0.598542,4.816595,0.477707,0.720247,0.506299,1.994035,0.493007,3.780908,0.637618,3.921997,-2.072226,True,False,False,3


In [168]:
knn_counts = count_max_min_highlights_and_podiums(knn_df)
knn_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
complete_adj,4,3,4,3,5,7,7
complete_dummy,2,1,3,2,1,3,5
complete_dummy_adj,2,1,3,2,1,3,5
basic,2,1,2,1,2,3,3
complete_shifts_3_dummy,2,1,2,1,1,3,3
ist_shifts_3_adj,2,1,2,1,2,3,3
complete_shifts_3_dummy_adj,2,0,2,1,1,2,3
no_ist_shifts_3_dummy,2,0,2,1,1,2,3
basic_dummy_adj,1,1,1,1,1,2,2
basic_shifts_3_dummy,2,0,2,0,1,2,2


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values (a value counts as valid when $R^2 > 0.2$):

In [169]:
knn_invalid_datasets = knn_counts[
    knn_counts["R2_valid"] < 4
].index.to_list()
knn_invalid_datasets

['complete_dummy',
 'complete_dummy_adj',
 'basic',
 'complete_shifts_3_dummy',
 'ist_shifts_3_adj',
 'complete_shifts_3_dummy_adj',
 'no_ist_shifts_3_dummy',
 'basic_dummy_adj',
 'basic_shifts_3_dummy',
 'basic_shifts_3_dummy_adj',
 'ist_shifts_3_dummy',
 'ist_shifts_3_dummy_adj',
 'no_ist_shifts_3_dummy_adj',
 'complete',
 'ist_shifts_3',
 'basic_shifts_3_adj',
 'ist',
 'no_ist_dummy',
 'no_ist_dummy_adj',
 'basic_adj',
 'ist_dummy_adj',
 'basic_dummy',
 'basic_shifts_3',
 'complete_shifts_3',
 'complete_shifts_3_adj',
 'ist_adj',
 'ist_dummy',
 'no_ist',
 'no_ist_shifts_3',
 'no_ist_shifts_3_adj']

In [170]:
# Datasets with more than 4 valid R^2 values.
# NOTE(review): datasets with exactly 4 valid values are excluded here and are
# also excluded by the `< 4` "invalid" filter -- confirm the gap is intended.
knn_best_datasets = knn_counts.loc[knn_counts["R2_valid"] > 4].index.tolist()
knn_best_datasets

['complete_adj', 'no_ist_adj']

In [171]:
knn_counts = knn_df.merge(
    knn_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [172]:
knn_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby("type").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,16,1.375
1,12,19,1.625
2,10,21,2.25
3,21,32,2.25


Analyze by presence of dummy variables:

In [173]:
knn_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby("dummy").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,26,47,2.875
True,30,41,0.875


Analyze by presence of shifted variables:

In [174]:
knn_counts[["shifts", "total_count", "total_podiums", "R2_valid"]].groupby(
    "shifts"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,29,53,2.0
True,27,35,1.75


Analyze by presence of adjacent variables:

In [175]:
knn_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby("adj").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,25,42,1.6875
True,31,46,2.0625


## Decision Tree Regression

See the results:

In [176]:
decision_tree_highlighted_df = highlight_max_min(decision_tree_df)
decision_tree_highlighted_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,5.983002,0.0,5.201733,-7.294233,4.682818,-1.010323,6.183077,-1.606574,5.829618,0.234957,1.108808,-0.170036,2.447745,0.235899,7.589988,-0.460042,3.508544,-1.459017,False,False,False,0
basic_adj,8.358506,0.0,5.297528,-7.602925,5.089108,-1.374009,4.945702,-0.667688,6.354182,0.091023,1.385036,-0.825673,3.335527,-0.418618,7.878373,-0.573428,2.758687,-0.52,True,False,False,0
basic_dummy,18.570479,0.0,3.823646,-0.088284,1.907491,-0.254205,6.681338,-1.306508,12.21052,-0.937146,1.64535,-0.8402,4.185753,-3.738107,6.87559,-0.0032,2.772237,-2.804588,False,True,False,0
basic_dummy_adj,36.292524,0.0,4.822965,-0.731409,5.101918,-7.970752,6.992871,-1.526885,20.543102,-4.482878,1.643818,-0.837139,2.854881,-1.203864,11.181889,-1.653877,2.997683,-3.44867,True,True,False,0
basic_shifts_3,7.935193,0.0,3.619578,-3.016021,4.40375,-0.777856,4.134108,-0.165264,8.820768,-0.751535,1.394397,-0.850375,3.458709,-0.525621,4.762696,0.425105,2.969567,-0.761546,False,False,True,0
basic_shifts_3_adj,5.342414,0.0,2.90204,-1.581703,4.264176,-0.666745,3.594235,0.119211,9.245836,-0.924536,1.088694,-0.128009,3.470699,-0.535926,5.309253,0.285437,2.98238,-0.776498,True,False,True,0
basic_shifts_3_dummy,14.532169,0.0,14.230866,-1.673284,3.247638,-0.936384,8.498529,-8.440609,12.766745,-3.197892,4.559064,-1.857693,6.265241,-11.086013,4.854382,0.069686,0.002513,0.0,False,True,True,0
basic_shifts_3_dummy_adj,20.543716,0.0,13.084694,-1.259844,2.409279,-0.06551,5.834363,-3.450314,12.905119,-3.289304,4.767317,-2.124311,4.364949,-4.865392,5.14142,-0.043679,0.002481,0.0,True,True,True,0
complete,15.087023,0.0,5.895352,-9.653679,5.096292,-1.381003,4.64713,-0.472415,4.849657,0.470546,1.438922,-0.970431,2.983448,-0.135156,6.835912,-0.184339,2.449228,-0.198302,False,False,False,3
complete_adj,7.452532,0.0,3.147023,-2.035984,4.047673,-0.501792,3.954034,-0.065957,6.750506,-0.025903,2.385216,-4.414477,3.461888,-0.528138,4.907409,0.38951,3.934515,-2.091869,True,False,False,3


In [177]:
decision_tree_counts = count_max_min_highlights_and_podiums(decision_tree_df)
decision_tree_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
basic_shifts_3_adj,3,2,3,3,1,5,6
no_ist_shifts_3_dummy,3,1,4,2,1,4,6
basic_dummy,2,1,2,1,0,3,3
complete,2,1,2,1,1,3,3
no_ist_shifts_3,2,1,2,1,1,3,3
basic_shifts_3_dummy_adj,2,0,3,1,0,2,4
complete_shifts_3,2,0,2,2,1,2,4
no_ist_shifts_3_adj,1,1,2,2,0,2,4
basic_shifts_3_dummy,2,0,2,1,0,2,3
complete_shifts_3_dummy_adj,2,0,2,1,1,2,3


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values (a value counts as valid when $R^2 > 0.2$):

In [178]:
decision_tree_invalid_datasets = decision_tree_counts[
    decision_tree_counts["R2_valid"] < 4
].index.to_list()
decision_tree_invalid_datasets

['basic_shifts_3_adj',
 'no_ist_shifts_3_dummy',
 'basic_dummy',
 'complete',
 'no_ist_shifts_3',
 'basic_shifts_3_dummy_adj',
 'complete_shifts_3',
 'no_ist_shifts_3_adj',
 'basic_shifts_3_dummy',
 'complete_shifts_3_dummy_adj',
 'ist_dummy_adj',
 'ist_shifts_3_dummy_adj',
 'no_ist_shifts_3_dummy_adj',
 'complete_shifts_3_dummy',
 'ist_shifts_3_dummy',
 'no_ist_adj',
 'basic',
 'ist_adj',
 'complete_adj',
 'no_ist',
 'basic_shifts_3',
 'ist_dummy',
 'ist_shifts_3',
 'ist_shifts_3_adj',
 'no_ist_dummy',
 'no_ist_dummy_adj',
 'basic_adj',
 'basic_dummy_adj',
 'complete_dummy',
 'complete_dummy_adj',
 'complete_shifts_3_adj',
 'ist']

In [179]:
# Datasets with more than 4 valid R^2 values.
# NOTE(review): datasets with exactly 4 valid values are excluded here and are
# also excluded by the `< 4` "invalid" filter -- confirm the gap is intended.
decision_tree_best_datasets = decision_tree_counts.loc[
    decision_tree_counts["R2_valid"] > 4
].index.tolist()
decision_tree_best_datasets

[]

In [180]:
decision_tree_counts = decision_tree_df.merge(
    decision_tree_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [181]:
decision_tree_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby(
    "type"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16,26,0.5
1,11,19,0.375
2,16,25,0.5
3,13,18,0.5


Analyze by presence of dummy variables:

In [182]:
decision_tree_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby(
    "dummy"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,27,47,0.6875
True,29,41,0.25


Analyze by presence of shifted variables:

In [183]:
decision_tree_counts[["shifts", "total_count", "total_podiums", "R2_valid"]].groupby(
    "shifts"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,22,38,0.375
True,34,50,0.5625


Analyze by presence of adjacent variables:

In [184]:
decision_tree_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby(
    "adj"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,29,45,0.625
True,27,43,0.3125


## XGBoost Single-Output Regression

See the results:

In [185]:
xgboost_single_output_highlighted_df = highlight_max_min(xgboost_single_output_df)
xgboost_single_output_highlighted_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,5.77851,0.0,5.605069,-8.630351,4.55036,-0.898204,3.872283,-0.022339,3.273822,0.758723,0.933075,0.171448,1.8214,0.576913,3.658543,0.660766,1.506321,0.546745,False,False,False,0
basic_adj,0.87847,0.0,1.962937,-0.18117,1.837219,0.6906,1.726324,0.796809,2.545937,0.854075,0.618944,0.635411,1.405696,0.748047,2.27292,0.869039,1.425858,0.593939,True,False,False,0
basic_dummy,6.973349,0.0,4.045393,-0.218172,4.762538,-6.818445,6.064964,-0.900573,15.472059,-2.110215,5.121569,-16.830123,3.597576,-2.500078,12.509509,-2.320841,4.288959,-8.106483,False,True,False,0
basic_dummy_adj,1.442228,0.0,1.937727,0.720516,1.390817,0.333344,2.808871,0.592303,4.996068,0.67571,1.059833,0.236323,1.219797,0.597669,2.766423,0.837562,1.265578,0.207067,True,True,False,0
basic_shifts_3,8.771249,0.0,3.069299,-1.887744,4.403223,-0.77743,3.885481,-0.02932,5.044191,0.427219,1.984397,-2.747521,1.698914,0.631904,3.323659,0.720027,1.57254,0.506018,False,False,True,0
basic_shifts_3_adj,1.314947,0.0,1.973572,-0.194004,1.854025,0.684913,2.028744,0.719383,3.349825,0.747374,0.680231,0.559634,1.563446,0.688325,1.75865,0.921597,1.709627,0.41623,True,False,True,0
basic_shifts_3_dummy,8.387128,0.0,15.976511,-2.369351,2.757153,-0.395655,3.12745,-0.278478,12.206569,-2.837585,2.73666,-0.029689,2.893776,-1.578322,6.233435,-0.533969,4.96067,0.0,False,True,True,0
basic_shifts_3_dummy_adj,7.755222,0.0,10.337865,-0.41063,1.919563,0.323624,2.270796,0.325845,7.358902,-0.394726,3.728862,-0.911431,1.581824,0.229709,2.596807,0.733756,0.554728,0.0,True,True,True,0
complete,5.769476,0.0,5.073814,-6.891311,4.304056,-0.698271,3.739526,0.046559,3.933774,0.651643,1.160628,-0.281955,1.855339,0.561,3.860012,0.622375,1.588096,0.496197,False,False,False,3
complete_adj,0.861656,0.0,1.952748,-0.168939,1.797499,0.703833,1.667435,0.810436,2.49464,0.859896,0.618397,0.636055,1.40464,0.748426,2.249663,0.871705,1.384361,0.61723,True,False,False,3


In [186]:
xgboost_single_output_counts = count_max_min_highlights_and_podiums(xgboost_single_output_df)
xgboost_single_output_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
complete_adj,4,3,7,4,7,7,11
ist_shifts_3_adj,2,2,2,2,7,4,4
no_ist_adj,3,0,7,4,7,3,11
ist_adj,2,1,3,2,7,3,5
ist_dummy_adj,2,1,2,3,7,3,5
basic_dummy_adj,1,1,1,1,8,2,2
complete_shifts_3_dummy_adj,1,1,1,1,4,2,2
basic_adj,1,0,5,2,7,1,7
no_ist_dummy_adj,1,0,2,2,7,1,4
basic_shifts_3_adj,1,0,2,1,7,1,3


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values (a value counts as valid when $R^2 > 0.2$):

In [187]:
xgboost_single_output_invalid_datasets = xgboost_single_output_counts[
    xgboost_single_output_counts["R2_valid"] < 4
].index.to_list()
xgboost_single_output_invalid_datasets

['ist_shifts_3_dummy_adj',
 'basic_dummy',
 'basic_shifts_3_dummy',
 'complete_dummy',
 'complete_shifts_3_dummy',
 'ist_dummy',
 'ist_shifts_3_dummy',
 'no_ist_dummy',
 'no_ist_shifts_3_dummy',
 'no_ist_shifts_3_dummy_adj']

In [188]:
# Datasets with more than 4 valid R^2 values.
# NOTE(review): datasets with exactly 4 valid values are excluded here and are
# also excluded by the `< 4` "invalid" filter -- confirm the gap is intended.
xgboost_single_output_best_datasets = xgboost_single_output_counts.loc[
    xgboost_single_output_counts["R2_valid"] > 4
].index.tolist()
xgboost_single_output_best_datasets

['complete_adj',
 'ist_shifts_3_adj',
 'no_ist_adj',
 'ist_adj',
 'ist_dummy_adj',
 'basic_dummy_adj',
 'basic_adj',
 'no_ist_dummy_adj',
 'basic_shifts_3_adj',
 'complete_dummy_adj',
 'no_ist_shifts_3_adj',
 'complete_shifts_3_adj']

In [189]:
xgboost_single_output_counts = xgboost_single_output_df.merge(
    xgboost_single_output_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [190]:
xgboost_single_output_counts[
    ["type", "total_count", "total_podiums", "R2_valid"]
].groupby("type").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9,18,4.25
1,15,20,4.0
2,10,23,4.0
3,15,22,4.125


Analyze by presence of dummy variables:

In [191]:
xgboost_single_output_counts[
    ["dummy", "total_count", "total_podiums", "R2_valid"]
].groupby("dummy").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,29,54,5.5
True,20,29,2.6875


Analyze by presence of shifted variables:

In [192]:
xgboost_single_output_counts[
    ["shifts", "total_count", "total_podiums", "R2_valid"]
].groupby("shifts").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,29,56,4.5625
True,20,27,3.625


Analyze by presence of adjacent variables:

In [193]:
xgboost_single_output_counts[
    ["adj", "total_count", "total_podiums", "R2_valid"]
].groupby("adj").agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,16,16,2.0
True,33,67,6.1875


## XGBoost Multi-Output Regression

See the results:

In [194]:
xgboost_multi_output_highlighted_df = highlight_max_min(xgboost_multi_output_df)
xgboost_multi_output_highlighted_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,6.912863,0.0,3.965014,-3.81914,3.893145,-0.389481,2.706174,0.500688,5.114751,0.411082,1.824544,-2.168074,2.176135,0.396065,2.766259,0.806059,1.98706,0.211268,False,False,False,0
basic_adj,1.804412,0.0,6.293718,-11.14267,2.757898,0.302803,2.322399,0.632267,2.709542,0.834718,2.049461,-2.997424,2.453194,0.232638,2.654518,0.821374,2.393367,-0.144083,True,False,False,0
basic_dummy,6.978979,0.0,4.010917,-0.197497,4.815888,-6.99459,6.087027,-0.914426,15.456491,-2.103959,5.129501,-16.885397,3.615487,-2.535016,12.571362,-2.353762,4.292633,-8.122092,False,True,False,0
basic_dummy_adj,3.44383,0.0,2.903251,0.372606,2.84256,-1.784718,5.542597,-0.587454,7.519377,0.265418,3.986608,-9.805424,1.77942,0.143819,8.737487,-0.620405,1.252809,0.222987,True,True,False,0
basic_shifts_3,6.904218,0.0,4.29715,-4.660323,3.841069,-0.352557,4.13516,-0.165857,6.937369,-0.083416,1.358566,-0.756502,2.617992,0.125912,5.27176,0.29564,3.125693,-0.951643,False,False,True,0
basic_shifts_3_adj,3.037084,0.0,8.281557,-20.024403,2.746335,0.308637,2.775309,0.474852,5.441625,0.333361,2.629305,-5.57935,2.589708,0.144858,4.279635,0.535712,2.963255,-0.753787,True,False,True,0
basic_shifts_3_dummy,8.388105,0.0,16.005813,-2.381722,2.362189,-0.024438,2.387287,0.255059,10.727091,-1.963703,3.030764,-0.262899,2.892235,-1.575577,5.179332,-0.059032,3.120069,0.0,False,True,True,0
basic_shifts_3_dummy_adj,8.341634,0.0,14.193432,-1.659048,2.446374,-0.098574,2.039424,0.456226,12.24273,-2.860285,3.047085,-0.276366,3.903053,-3.689727,4.049446,0.352571,3.187998,0.0,True,True,True,0
complete,6.747306,0.0,4.079625,-4.101766,3.901088,-0.395156,2.649296,0.521456,4.916937,0.455754,1.683067,-1.695813,2.037605,0.470509,2.755683,0.807539,2.008371,0.194259,False,False,False,3
complete_adj,1.643768,0.0,6.339746,-11.320926,2.798203,0.282276,2.145148,0.686257,2.544213,0.854273,1.891844,-2.406212,2.406441,0.261608,2.747918,0.808583,2.479356,-0.227769,True,False,False,3


In [195]:
xgboost_multi_output_counts = count_max_min_highlights_and_podiums(xgboost_multi_output_df)
xgboost_multi_output_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
basic_dummy_adj,3,2,3,2,3,5,5
complete_adj,3,2,3,2,5,5,5
basic_adj,2,1,3,2,5,3,5
no_ist_dummy_adj,1,1,2,3,3,2,5
basic_shifts_3_dummy,1,1,2,1,1,2,3
no_ist_shifts_3_dummy_adj,2,0,2,1,2,2,3
complete,2,0,2,0,4,2,2
complete_shifts_3,1,1,1,1,2,2,2
ist_shifts_3_dummy_adj,1,1,1,1,2,2,2
no_ist_shifts_3_adj,2,0,2,0,4,2,2


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values (a value counts as valid when $R^2 > 0.2$):

In [196]:
xgboost_multi_output_invalid_datasets = xgboost_multi_output_counts[
    xgboost_multi_output_counts["R2_valid"] < 4
].index.to_list()
xgboost_multi_output_invalid_datasets

['basic_dummy_adj',
 'no_ist_dummy_adj',
 'basic_shifts_3_dummy',
 'no_ist_shifts_3_dummy_adj',
 'complete_shifts_3',
 'ist_shifts_3_dummy_adj',
 'ist_dummy_adj',
 'basic_shifts_3_dummy_adj',
 'complete_dummy_adj',
 'complete_shifts_3_dummy',
 'complete_shifts_3_dummy_adj',
 'ist_shifts_3',
 'ist_shifts_3_dummy',
 'no_ist_shifts_3',
 'basic_dummy',
 'basic_shifts_3',
 'complete_dummy',
 'ist_dummy',
 'no_ist_dummy',
 'no_ist_shifts_3_dummy']

In [197]:
# Names of the datasets whose R2_valid count is strictly above 4.
xgboost_multi_output_best_datasets = (
    xgboost_multi_output_counts
    .loc[lambda counts: counts["R2_valid"].gt(4)]
    .index
    .tolist()
)
xgboost_multi_output_best_datasets

['complete_adj', 'basic_adj', 'no_ist_adj', 'ist_adj', 'basic']

In [198]:
# Attach the per-dataset counts to the metrics table, matching rows on the
# index of both frames.
xgboost_multi_output_counts = pd.merge(
    xgboost_multi_output_df,
    xgboost_multi_output_counts,
    left_index=True,
    right_index=True,
)

Analyze by dataset's completeness:

In [199]:
# Per "type" group: total highlights, total podiums and mean valid-R^2 count.
xgboost_multi_output_counts.groupby("type")[
    ["total_count", "total_podiums", "R2_valid"]
].agg(
    {
        "total_count": "sum",
        "total_podiums": "sum",
        "R2_valid": "mean",
    }
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,20,2.625
1,9,20,2.625
2,11,25,2.5
3,14,18,2.5


Analyze by presence of dummy variables:

In [200]:
# Per "dummy" group: total highlights, total podiums and mean valid-R^2 count.
xgboost_multi_output_counts.groupby("dummy")[
    ["total_count", "total_podiums", "R2_valid"]
].agg(
    {
        "total_count": "sum",
        "total_podiums": "sum",
        "R2_valid": "mean",
    }
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,25,45,3.75
True,24,38,1.375


Analyze by number of shifted variables:

In [201]:
# Per "shifts" group: total highlights, total podiums and mean valid-R^2 count.
xgboost_multi_output_counts.groupby("shifts")[
    ["total_count", "total_podiums", "R2_valid"]
].agg(
    {
        "total_count": "sum",
        "total_podiums": "sum",
        "R2_valid": "mean",
    }
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,28,52,3.0
True,21,31,2.125


Analyze by presence of adjacent variables:

In [202]:
# Per "adj" group: total highlights, total podiums and mean valid-R^2 count.
xgboost_multi_output_counts.groupby("adj")[
    ["total_count", "total_podiums", "R2_valid"]
].agg(
    {
        "total_count": "sum",
        "total_podiums": "sum",
        "R2_valid": "mean",
    }
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,19,30,1.6875
True,30,53,3.4375


In [203]:
# Keep only the datasets that use neither dummy nor shifted variables.
xgboost_multi_output_filtered_df = xgboost_multi_output_df.loc[
    ~(xgboost_multi_output_df["dummy"] | xgboost_multi_output_df["shifts"])
]
xgboost_multi_output_resumed_df = highlight_max_min(
    xgboost_multi_output_filtered_df
)
xgboost_multi_output_resumed_df

Unnamed: 0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,6.912863,0.0,3.965014,-3.81914,3.893145,-0.389481,2.706174,0.500688,5.114751,0.411082,1.824544,-2.168074,2.176135,0.396065,2.766259,0.806059,1.98706,0.211268,False,False,False,0
basic_adj,1.804412,0.0,6.293718,-11.14267,2.757898,0.302803,2.322399,0.632267,2.709542,0.834718,2.049461,-2.997424,2.453194,0.232638,2.654518,0.821374,2.393367,-0.144083,True,False,False,0
complete,6.747306,0.0,4.079625,-4.101766,3.901088,-0.395156,2.649296,0.521456,4.916937,0.455754,1.683067,-1.695813,2.037605,0.470509,2.755683,0.807539,2.008371,0.194259,False,False,False,3
complete_adj,1.643768,0.0,6.339746,-11.320926,2.798203,0.282276,2.145148,0.686257,2.544213,0.854273,1.891844,-2.406212,2.406441,0.261608,2.747918,0.808583,2.479356,-0.227769,True,False,False,3
ist,6.819112,0.0,4.061365,-4.056201,3.883551,-0.38264,2.664062,0.516107,5.029999,0.430437,1.70378,-1.762573,2.064604,0.456384,2.740233,0.809692,2.00537,0.196665,False,False,False,1
ist_adj,1.709108,0.0,6.393748,-11.531718,2.822666,0.269672,2.22039,0.663862,2.784576,0.825437,1.975504,-2.714128,2.478794,0.216539,2.702664,0.814835,2.464913,-0.213506,True,False,False,1
no_ist,6.837161,0.0,4.052825,-4.034959,3.836103,-0.349062,2.605601,0.537111,5.359188,0.353447,1.779435,-2.01336,2.094839,0.440346,2.691774,0.816363,2.0272,0.17908,False,False,False,2
no_ist_adj,1.695294,0.0,6.390995,-11.520931,2.770819,0.296255,2.2181,0.664555,2.6618,0.840491,1.967444,-2.683884,2.414236,0.256817,2.674611,0.818659,2.436174,-0.185374,True,False,False,2


## Identify Invalid Datasets

We define a dataset as invalid if it has fewer than 4 valid $R^2$ values in every one of the models (i.e. it appears in the intersection of the per-model invalid lists).

In [204]:
# Datasets flagged as invalid by every model: intersection of the per-model
# invalid lists.
invalid_datasets = set(linear_regression_invalid_datasets).intersection(
    knn_invalid_datasets,
    decision_tree_invalid_datasets,
    xgboost_single_output_invalid_datasets,
    xgboost_multi_output_invalid_datasets,
)
invalid_datasets

{'basic_dummy',
 'basic_shifts_3_dummy',
 'complete_dummy',
 'complete_shifts_3_dummy',
 'ist_dummy',
 'ist_shifts_3_dummy',
 'no_ist_dummy',
 'no_ist_shifts_3_dummy'}

## Identify Best Datasets

We define a dataset as one of the best if it has more than 4 valid $R^2$ values in at least one of the models (i.e. it appears in the union of the per-model best lists).

In [205]:
# Datasets rated "best" by at least one model: union of the per-model best
# lists.
best_datasets = set(linear_regression_best_datasets).union(
    knn_best_datasets,
    decision_tree_best_datasets,
    xgboost_single_output_best_datasets,
    xgboost_multi_output_best_datasets,
)
best_datasets

{'basic',
 'basic_adj',
 'basic_dummy_adj',
 'basic_shifts_3_adj',
 'complete_adj',
 'complete_dummy_adj',
 'complete_shifts_3_adj',
 'ist_adj',
 'ist_dummy_adj',
 'ist_shifts_3_adj',
 'no_ist_adj',
 'no_ist_dummy_adj',
 'no_ist_shifts_3_adj'}

In [206]:
# Datasets rated "best" by linear regression and by both XGBoost variants.
# The KNN and decision-tree lists are currently left out of the intersection:
#   knn_best_datasets
#   decision_tree_best_datasets
best_of_best_datasets = set(linear_regression_best_datasets).intersection(
    xgboost_single_output_best_datasets,
    xgboost_multi_output_best_datasets,
)
best_of_best_datasets

{'basic_adj', 'complete_adj', 'ist_adj', 'no_ist_adj'}

## Summary

In [207]:
def summary_dataframe(df):
    """
    Build a presentation-ready version of a per-dataset metrics table.

    The helper columns ``adj``, ``dummy``, ``shifts`` and ``type`` are dropped,
    party codes in the column names are replaced by their display names,
    dataset identifiers in the index are replaced by display labels, and the
    rows are arranged in a fixed presentation order.

    Labels are assigned by dataset identifier, not by row position, so the
    result is correct whatever the row order of the input is.

    Args:
        df (pd.DataFrame): Metrics table indexed by dataset identifier. Any
            subset of ``basic``, ``basic_adj``, ``complete``, ``complete_adj``,
            ``ist``, ``ist_adj``, ``no_ist`` and ``no_ist_adj`` is accepted.

    Returns:
        pd.DataFrame: A new DataFrame whose index is named ``Dataset``. The
        input DataFrame is not modified.

    Raises:
        ValueError: If the index contains a dataset identifier that has no
            display label.
        KeyError: If any of the helper columns to drop is missing.
    """
    # Party code -> display name used in the column headers.
    party_names = {
        "CDC": "Junts",
        "OTH": "Altres",
        "CS": "C's",
        "ECPGUANYEM": "Comuns",
    }
    # Dataset identifier -> display label used in the index.
    dataset_labels = {
        "basic": "Basic",
        "basic_adj": "Basic Adj.",
        "complete": "Complet",
        "complete_adj": "Complet Adj.",
        "ist": "IST",
        "ist_adj": "IST Adj.",
        "no_ist": "No IST",
        "no_ist_adj": "No IST Adj.",
    }
    # Presentation order of the rows.
    display_order = [
        "Basic",
        "No IST",
        "IST",
        "Complet",
        "Basic Adj.",
        "No IST Adj.",
        "IST Adj.",
        "Complet Adj.",
    ]

    # Fail loudly rather than silently dropping or mislabelling a row.
    unknown = [name for name in df.index if name not in dataset_labels]
    if unknown:
        raise ValueError(f"No display label defined for datasets: {unknown}")

    # ``drop`` returns a new frame, so the caller's DataFrame stays untouched.
    df = df.drop(columns=["adj", "dummy", "shifts", "type"])

    # Replace the party codes in the column names.
    df.columns = df.columns.str.replace(
        "|".join(party_names), lambda m: party_names[m.group(0)], regex=True
    )

    # Relabel by identifier, then order the rows that are actually present.
    df = df.rename(index=dataset_labels)
    df = df.reindex([label for label in display_order if label in df.index])

    df.index.name = "Dataset"

    return df

In [208]:
def _best_row(candidates, sort_by):
    """Return the top-ranked row of ``candidates``, or ``None`` if it has no rows."""
    if candidates.empty:
        return None
    return candidates.sort_values(
        by=sort_by, ascending=[False] * len(sort_by)
    ).iloc[0]


def _value_or_na(row, key):
    """Return ``row[key]``, or ``"N/A"`` when no row was selected."""
    return "N/A" if row is None else row[key]


def evaluate_datasets(df):
    """
    Pick the best-ranked row overall, per ``adj`` flag and per dataset ``type``.

    Rows are ranked by the number of ``*_R^2`` columns whose value exceeds 0.2
    (descending); ties are broken by the ``*_R^2`` columns themselves, in
    column order, each descending.

    Args:
        df (pd.DataFrame): Metrics table with one row per dataset/experiment.
            It must contain the columns ``adj`` and ``type``; every column
            whose name contains ``_R^2`` takes part in the ranking.

    Returns:
        pd.DataFrame: One row per category (``Best Overall``, ``Best Non-Adj``,
        ``Best Adj``, ``Best 0`` ... ``Best 3``) with the columns ``Category``,
        ``Dataset`` (index label of the winning row), ``Count_R2>0.2`` and one
        column per ``*_R^2`` input column. Categories without any candidate
        row, including every category when ``df`` is empty, are filled with
        ``"N/A"``. The input DataFrame is not modified.
    """
    threshold = 0.2
    count_column = "count_above_0.2"
    dataset_types = [0, 1, 2, 3]

    df = df.copy()

    # Number of R^2 columns (one per prediction target) above the threshold.
    r2_columns = df.filter(like="_R^2").columns.tolist()
    df[count_column] = df[r2_columns].gt(threshold).sum(axis=1)

    # Ranking key: count first, then the individual R^2 values as tie-breakers.
    sort_by = [count_column] + r2_columns

    # (category label, winning row or None), in the order of the output rows.
    winners = [
        ("Best Overall", _best_row(df, sort_by)),
        ("Best Non-Adj", _best_row(df[df["adj"].eq(False)], sort_by)),
        ("Best Adj", _best_row(df[df["adj"].eq(True)], sort_by)),
    ]
    winners += [
        (f"Best {type_}", _best_row(df[df["type"] == type_], sort_by))
        for type_ in dataset_types
    ]

    summary_data = {
        "Category": [label for label, _ in winners],
        "Dataset": ["N/A" if row is None else row.name for _, row in winners],
        "Count_R2>0.2": [_value_or_na(row, count_column) for _, row in winners],
    }
    for col in r2_columns:
        summary_data[col] = [_value_or_na(row, col) for _, row in winners]

    return pd.DataFrame(summary_data)

In [209]:
# Keep only the datasets that use neither dummy nor shifted variables.
linear_regression_filtered_df = linear_regression_df.loc[
    ~(linear_regression_df["dummy"] | linear_regression_df["shifts"])
]
# Styling via highlight_max_min is currently disabled for this table.
linear_regression_summary_df = summary_dataframe(df=linear_regression_filtered_df)
linear_regression_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,13.108765,0.0,13.812827,-57.485107,4.958795,-1.254258,6.306477,-1.711654,8.550081,-0.645683,1.422144,-0.924748,4.603915,-1.703169,5.488559,0.236516,1.213344,0.705913
No IST,13.64314,0.0,13.721157,-56.7114,4.923106,-1.221926,6.28185,-1.690518,8.746869,-0.722309,1.410472,-0.893285,4.677902,-1.790749,5.463862,0.243372,1.21046,0.70731
IST,13.559294,0.0,13.778561,-57.195293,4.887827,-1.190195,6.232639,-1.648529,8.648102,-0.683633,1.426564,-0.936731,4.609349,-1.709554,5.458752,0.244787,1.211121,0.70699
Complet,13.661693,0.0,13.727114,-56.761525,4.904627,-1.205277,6.270167,-1.680519,8.74755,-0.722577,1.411996,-0.897378,4.669197,-1.780372,5.463109,0.24358,1.210587,0.707248
Basic Adj.,2.035616,0.0,1.297133,0.484215,1.80456,0.701502,1.459422,0.854782,1.861985,0.921948,0.44167,0.814349,1.030653,0.864556,1.363938,0.952841,0.974992,0.810137
No IST Adj.,2.109802,0.0,1.307231,0.476153,1.81086,0.699414,1.482743,0.850104,1.872924,0.921028,0.440345,0.815461,1.029534,0.864849,1.364012,0.952836,0.97311,0.810869
IST Adj.,2.11424,0.0,1.293992,0.48671,1.80483,0.701412,1.457292,0.855205,1.864453,0.921741,0.440669,0.81519,1.024261,0.86623,1.351399,0.953704,0.973224,0.810825
Complet Adj.,2.108698,0.0,1.304098,0.478661,1.809431,0.699888,1.463062,0.854057,1.862828,0.921877,0.440045,0.815713,1.029153,0.864949,1.363043,0.952903,0.971739,0.811401


In [210]:
# Best dataset per category for linear regression.
linear_regression_results_df = evaluate_datasets(
    df=linear_regression_filtered_df
)
linear_regression_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,ist_adj,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
1,Best Non-Adj,no_ist,2,0.0,-56.7114,-1.221926,-1.690518,-0.722309,-0.893285,-1.790749,0.243372,0.70731
2,Best Adj,ist_adj,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
3,Best 0,basic_adj,8,0.0,0.484215,0.701502,0.854782,0.921948,0.814349,0.864556,0.952841,0.810137
4,Best 1,ist_adj,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
5,Best 2,no_ist_adj,8,0.0,0.476153,0.699414,0.850104,0.921028,0.815461,0.864849,0.952836,0.810869
6,Best 3,complete_adj,8,0.0,0.478661,0.699888,0.854057,0.921877,0.815713,0.864949,0.952903,0.811401


In [211]:
# Keep only the datasets that use neither dummy nor shifted variables.
knn_filtered_df = knn_df.loc[~(knn_df["dummy"] | knn_df["shifts"])]
# Styling via highlight_max_min is currently disabled for this table.
knn_summary_df = summary_dataframe(df=knn_filtered_df)
knn_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,17.273929,0.0,5.270261,-7.51421,5.645406,-1.92174,3.950214,-0.063903,2.728067,0.832461,1.7086,-1.778227,2.923085,-0.089686,4.265914,0.538781,3.39158,-1.297798
No IST,13.561278,0.0,6.895001,-13.572995,4.025701,-0.485711,2.600666,0.538863,6.453595,0.062418,0.959949,0.123033,2.018554,0.480364,3.86147,0.62209,4.119302,-2.389651
IST,18.784101,0.0,5.53805,-8.40143,5.631912,-1.907789,3.213358,0.29599,2.905402,0.809972,1.706866,-1.772589,2.840307,-0.028843,3.898957,0.614717,3.407206,-1.31902
Complet,13.456629,0.0,6.828834,-13.294642,4.00798,-0.47266,2.571578,0.549121,6.473118,0.056737,0.945909,0.148499,2.004282,0.487686,3.820354,0.630095,4.125613,-2.400045
Basic Adj.,8.069779,0.0,4.876885,-6.290961,5.298776,-1.573654,3.680839,0.076254,3.005672,0.796616,1.43521,-0.960342,2.65078,0.10405,4.793271,0.417578,2.982721,-0.776905
No IST Adj.,8.173918,0.0,6.169635,-10.668596,4.458847,-0.822402,2.461689,0.586833,4.829849,0.474829,0.721619,0.504416,2.001314,0.489299,3.807893,0.632427,3.916109,-2.063009
IST Adj.,9.081717,0.0,5.232008,-7.391439,4.728259,-1.049281,3.390604,0.216186,3.130899,0.779316,1.259305,-0.509256,2.768989,0.02236,4.977022,0.372067,2.886056,-0.663597
Complet Adj.,8.23479,0.0,6.175269,-10.689917,4.460253,-0.823551,2.426557,0.598542,4.816595,0.477707,0.720247,0.506299,1.994035,0.493007,3.780908,0.637618,3.921997,-2.072226


In [212]:
# Best dataset per category for KNN.
knn_results_df = evaluate_datasets(
    df=knn_filtered_df
)
knn_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,no_ist_adj,5,0.0,-10.668596,-0.822402,0.586833,0.474829,0.504416,0.489299,0.632427,-2.063009
1,Best Non-Adj,ist,3,0.0,-8.40143,-1.907789,0.29599,0.809972,-1.772589,-0.028843,0.614717,-1.31902
2,Best Adj,no_ist_adj,5,0.0,-10.668596,-0.822402,0.586833,0.474829,0.504416,0.489299,0.632427,-2.063009
3,Best 0,basic_adj,2,0.0,-6.290961,-1.573654,0.076254,0.796616,-0.960342,0.10405,0.417578,-0.776905
4,Best 1,ist_adj,3,0.0,-7.391439,-1.049281,0.216186,0.779316,-0.509256,0.02236,0.372067,-0.663597
5,Best 2,no_ist_adj,5,0.0,-10.668596,-0.822402,0.586833,0.474829,0.504416,0.489299,0.632427,-2.063009
6,Best 3,complete_adj,5,0.0,-10.689917,-0.823551,0.598542,0.477707,0.506299,0.493007,0.637618,-2.072226


In [213]:
# Keep only the datasets that use neither dummy nor shifted variables.
decision_tree_filtered_df = decision_tree_df.loc[
    ~(decision_tree_df["dummy"] | decision_tree_df["shifts"])
]
# Styling via highlight_max_min is currently disabled for this table.
decision_tree_summary_df = summary_dataframe(df=decision_tree_filtered_df)
decision_tree_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,5.983002,0.0,5.201733,-7.294233,4.682818,-1.010323,6.183077,-1.606574,5.829618,0.234957,1.108808,-0.170036,2.447745,0.235899,7.589988,-0.460042,3.508544,-1.459017
No IST,4.391446,0.0,6.36788,-11.429965,4.895411,-1.196998,5.764761,-1.265809,6.566478,0.029332,1.285584,-0.572852,2.489357,0.209698,9.0574,-1.079171,2.949165,-0.737423
IST,8.98463,0.0,6.874511,-13.48651,5.311679,-1.586514,5.683975,-1.202749,7.584762,-0.29506,1.461755,-1.033461,3.314173,-0.400777,9.135307,-1.115093,2.775397,-0.538714
Complet,15.087023,0.0,5.895352,-9.653679,5.096292,-1.381003,4.64713,-0.472415,4.849657,0.470546,1.438922,-0.970431,2.983448,-0.135156,6.835912,-0.184339,2.449228,-0.198302
Basic Adj.,8.358506,0.0,5.297528,-7.602925,5.089108,-1.374009,4.945702,-0.667688,6.354182,0.091023,1.385036,-0.825673,3.335527,-0.418618,7.878373,-0.573428,2.758687,-0.52
No IST Adj.,18.917779,0.0,2.321323,-0.65185,5.585867,-1.860093,5.92646,-1.394692,6.789033,-0.037647,1.672025,-1.660642,3.384939,-0.460959,6.081831,0.062346,3.794133,-1.875172
IST Adj.,3.305182,0.0,12.804547,-49.260649,3.883788,-0.382643,5.080432,-0.759788,5.19415,0.392617,1.934544,-2.561708,3.21006,-0.313902,6.101577,0.056248,4.582038,-3.193303
Complet Adj.,7.452532,0.0,3.147023,-2.035984,4.047673,-0.501792,3.954034,-0.065957,6.750506,-0.025903,2.385216,-4.414477,3.461888,-0.528138,4.907409,0.38951,3.934515,-2.091869


In [214]:
# Best dataset per category for the decision tree.
decision_tree_results_df = evaluate_datasets(
    df=decision_tree_filtered_df
)
decision_tree_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,basic,2,0.0,-7.294233,-1.010323,-1.606574,0.234957,-0.170036,0.235899,-0.460042,-1.459017
1,Best Non-Adj,basic,2,0.0,-7.294233,-1.010323,-1.606574,0.234957,-0.170036,0.235899,-0.460042,-1.459017
2,Best Adj,complete_adj,1,0.0,-2.035984,-0.501792,-0.065957,-0.025903,-4.414477,-0.528138,0.38951,-2.091869
3,Best 0,basic,2,0.0,-7.294233,-1.010323,-1.606574,0.234957,-0.170036,0.235899,-0.460042,-1.459017
4,Best 1,ist_adj,1,0.0,-49.260649,-0.382643,-0.759788,0.392617,-2.561708,-0.313902,0.056248,-3.193303
5,Best 2,no_ist,1,0.0,-11.429965,-1.196998,-1.265809,0.029332,-0.572852,0.209698,-1.079171,-0.737423
6,Best 3,complete_adj,1,0.0,-2.035984,-0.501792,-0.065957,-0.025903,-4.414477,-0.528138,0.38951,-2.091869


In [215]:
# Keep only the datasets that use neither dummy nor shifted variables.
xgboost_single_output_filtered_df = xgboost_single_output_df.loc[
    ~(xgboost_single_output_df["dummy"] | xgboost_single_output_df["shifts"])
]
# Styling via highlight_max_min is currently disabled for this table.
xgboost_single_output_summary_df = summary_dataframe(
    df=xgboost_single_output_filtered_df
)
xgboost_single_output_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,5.77851,0.0,5.605069,-8.630351,4.55036,-0.898204,3.872283,-0.022339,3.273822,0.758723,0.933075,0.171448,1.8214,0.576913,3.658543,0.660766,1.506321,0.546745
No IST,5.815251,0.0,5.33761,-7.733207,4.430247,-0.799315,3.788002,0.02168,3.639352,0.701837,1.01651,0.016645,1.812461,0.581056,3.552469,0.680152,1.530346,0.532171
IST,5.735107,0.0,5.260429,-7.482472,4.372141,-0.752425,3.730729,0.05104,3.661359,0.69822,1.081461,-0.113035,1.831133,0.57238,3.666678,0.659256,1.562662,0.512205
Complet,5.769476,0.0,5.073814,-6.891311,4.304056,-0.698271,3.739526,0.046559,3.933774,0.651643,1.160628,-0.281955,1.855339,0.561,3.860012,0.622375,1.588096,0.496197
Basic Adj.,0.87847,0.0,1.962937,-0.18117,1.837219,0.6906,1.726324,0.796809,2.545937,0.854075,0.618944,0.635411,1.405696,0.748047,2.27292,0.869039,1.425858,0.593939
No IST Adj.,0.867902,0.0,1.933969,-0.146565,1.824354,0.694918,1.673746,0.808998,2.517334,0.857336,0.62794,0.624736,1.392896,0.752615,2.248857,0.871797,1.379394,0.619972
IST Adj.,0.987831,0.0,1.989862,-0.213796,1.843644,0.688432,1.707704,0.801169,2.408742,0.869379,0.729476,0.493566,1.408213,0.747144,2.303612,0.865478,1.431351,0.590804
Complet Adj.,0.861656,0.0,1.952748,-0.168939,1.797499,0.703833,1.667435,0.810436,2.49464,0.859896,0.618397,0.636055,1.40464,0.748426,2.249663,0.871705,1.384361,0.61723


In [216]:
# Best dataset per category for single-output XGBoost.
xgboost_single_output_results_df = evaluate_datasets(
    df=xgboost_single_output_filtered_df
)
xgboost_single_output_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,no_ist_adj,7,0.0,-0.146565,0.694918,0.808998,0.857336,0.624736,0.752615,0.871797,0.619972
1,Best Non-Adj,complete,4,0.0,-6.891311,-0.698271,0.046559,0.651643,-0.281955,0.561,0.622375,0.496197
2,Best Adj,no_ist_adj,7,0.0,-0.146565,0.694918,0.808998,0.857336,0.624736,0.752615,0.871797,0.619972
3,Best 0,basic_adj,7,0.0,-0.18117,0.6906,0.796809,0.854075,0.635411,0.748047,0.869039,0.593939
4,Best 1,ist_adj,7,0.0,-0.213796,0.688432,0.801169,0.869379,0.493566,0.747144,0.865478,0.590804
5,Best 2,no_ist_adj,7,0.0,-0.146565,0.694918,0.808998,0.857336,0.624736,0.752615,0.871797,0.619972
6,Best 3,complete_adj,7,0.0,-0.168939,0.703833,0.810436,0.859896,0.636055,0.748426,0.871705,0.61723


In [217]:
# Keep only the datasets that use neither dummy nor shifted variables.
xgboost_multi_output_filtered_df = xgboost_multi_output_df.loc[
    ~(xgboost_multi_output_df["dummy"] | xgboost_multi_output_df["shifts"])
]
# Styling via highlight_max_min is currently disabled for this table.
xgboost_multi_output_summary_df = summary_dataframe(
    df=xgboost_multi_output_filtered_df
)
xgboost_multi_output_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,6.912863,0.0,3.965014,-3.81914,3.893145,-0.389481,2.706174,0.500688,5.114751,0.411082,1.824544,-2.168074,2.176135,0.396065,2.766259,0.806059,1.98706,0.211268
No IST,6.837161,0.0,4.052825,-4.034959,3.836103,-0.349062,2.605601,0.537111,5.359188,0.353447,1.779435,-2.01336,2.094839,0.440346,2.691774,0.816363,2.0272,0.17908
IST,6.819112,0.0,4.061365,-4.056201,3.883551,-0.38264,2.664062,0.516107,5.029999,0.430437,1.70378,-1.762573,2.064604,0.456384,2.740233,0.809692,2.00537,0.196665
Complet,6.747306,0.0,4.079625,-4.101766,3.901088,-0.395156,2.649296,0.521456,4.916937,0.455754,1.683067,-1.695813,2.037605,0.470509,2.755683,0.807539,2.008371,0.194259
Basic Adj.,1.804412,0.0,6.293718,-11.14267,2.757898,0.302803,2.322399,0.632267,2.709542,0.834718,2.049461,-2.997424,2.453194,0.232638,2.654518,0.821374,2.393367,-0.144083
No IST Adj.,1.695294,0.0,6.390995,-11.520931,2.770819,0.296255,2.2181,0.664555,2.6618,0.840491,1.967444,-2.683884,2.414236,0.256817,2.674611,0.818659,2.436174,-0.185374
IST Adj.,1.709108,0.0,6.393748,-11.531718,2.822666,0.269672,2.22039,0.663862,2.784576,0.825437,1.975504,-2.714128,2.478794,0.216539,2.702664,0.814835,2.464913,-0.213506
Complet Adj.,1.643768,0.0,6.339746,-11.320926,2.798203,0.282276,2.145148,0.686257,2.544213,0.854273,1.891844,-2.406212,2.406441,0.261608,2.747918,0.808583,2.479356,-0.227769


In [218]:
# Best dataset per category for multi-output XGBoost.
xgboost_multi_output_results_df = evaluate_datasets(
    df=xgboost_multi_output_filtered_df
)
xgboost_multi_output_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,basic,5,0.0,-3.81914,-0.389481,0.500688,0.411082,-2.168074,0.396065,0.806059,0.211268
1,Best Non-Adj,basic,5,0.0,-3.81914,-0.389481,0.500688,0.411082,-2.168074,0.396065,0.806059,0.211268
2,Best Adj,basic_adj,5,0.0,-11.14267,0.302803,0.632267,0.834718,-2.997424,0.232638,0.821374,-0.144083
3,Best 0,basic,5,0.0,-3.81914,-0.389481,0.500688,0.411082,-2.168074,0.396065,0.806059,0.211268
4,Best 1,ist_adj,5,0.0,-11.531718,0.269672,0.663862,0.825437,-2.714128,0.216539,0.814835,-0.213506
5,Best 2,no_ist_adj,5,0.0,-11.520931,0.296255,0.664555,0.840491,-2.683884,0.256817,0.818659,-0.185374
6,Best 3,complete_adj,5,0.0,-11.320926,0.282276,0.686257,0.854273,-2.406212,0.261608,0.808583,-0.227769


In [225]:
# (filtered metrics table, model display name) pairs to stack into one frame
dataframes = [
    (linear_regression_filtered_df, 'Linear Regression'),
    (knn_filtered_df, 'KNN'),
    (decision_tree_filtered_df, 'Decision Tree'),
    (xgboost_multi_output_filtered_df, 'XGBoost Multi-Output'),
    (xgboost_single_output_filtered_df, 'XGBoost Single-Output')
]

# Label each table with its model and dataset, and index it by
# "<dataset> <model>" so rows stay unique once the tables are stacked.
labelled_frames = []
for df, model in dataframes:
    # ``assign`` returns a new frame, so the original tables are not modified.
    df = df.assign(Model=model, Dataset=df.index)
    df['Experiment'] = df['Dataset'] + ' ' + df['Model']
    labelled_frames.append(df.set_index('Experiment'))

# Concatenate once: calling pd.concat inside the loop re-copies all previous
# rows on every iteration, and seeding it with an empty DataFrame relies on
# deprecated dtype handling for empty entries.
concatenated_df = pd.concat(labelled_frames)

concatenated_df

Unnamed: 0_level_0,CDC_RMSE,CDC_R^2,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,...,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type,Model,Dataset
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
basic Linear Regression,13.108765,0.0,13.812827,-57.485107,4.958795,-1.254258,6.306477,-1.711654,8.550081,-0.645683,...,5.488559,0.236516,1.213344,0.705913,False,False,False,0,Linear Regression,basic
basic_adj Linear Regression,2.035616,0.0,1.297133,0.484215,1.80456,0.701502,1.459422,0.854782,1.861985,0.921948,...,1.363938,0.952841,0.974992,0.810137,True,False,False,0,Linear Regression,basic_adj
complete Linear Regression,13.661693,0.0,13.727114,-56.761525,4.904627,-1.205277,6.270167,-1.680519,8.74755,-0.722577,...,5.463109,0.24358,1.210587,0.707248,False,False,False,3,Linear Regression,complete
complete_adj Linear Regression,2.108698,0.0,1.304098,0.478661,1.809431,0.699888,1.463062,0.854057,1.862828,0.921877,...,1.363043,0.952903,0.971739,0.811401,True,False,False,3,Linear Regression,complete_adj
ist Linear Regression,13.559294,0.0,13.778561,-57.195293,4.887827,-1.190195,6.232639,-1.648529,8.648102,-0.683633,...,5.458752,0.244787,1.211121,0.70699,False,False,False,1,Linear Regression,ist
ist_adj Linear Regression,2.11424,0.0,1.293992,0.48671,1.80483,0.701412,1.457292,0.855205,1.864453,0.921741,...,1.351399,0.953704,0.973224,0.810825,True,False,False,1,Linear Regression,ist_adj
no_ist Linear Regression,13.64314,0.0,13.721157,-56.7114,4.923106,-1.221926,6.28185,-1.690518,8.746869,-0.722309,...,5.463862,0.243372,1.21046,0.70731,False,False,False,2,Linear Regression,no_ist
no_ist_adj Linear Regression,2.109802,0.0,1.307231,0.476153,1.81086,0.699414,1.482743,0.850104,1.872924,0.921028,...,1.364012,0.952836,0.97311,0.810869,True,False,False,2,Linear Regression,no_ist_adj
basic KNN,17.273929,0.0,5.270261,-7.51421,5.645406,-1.92174,3.950214,-0.063903,2.728067,0.832461,...,4.265914,0.538781,3.39158,-1.297798,False,False,False,0,KNN,basic
basic_adj KNN,8.069779,0.0,4.876885,-6.290961,5.298776,-1.573654,3.680839,0.076254,3.005672,0.796616,...,4.793271,0.417578,2.982721,-0.776905,True,False,False,0,KNN,basic_adj


In [226]:
# Best dataset/model combination per category across all models.
results_df = evaluate_datasets(
    df=concatenated_df
)
results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CDC_R^2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,ist_adj Linear Regression,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
1,Best Non-Adj,basic XGBoost Multi-Output,5,0.0,-3.81914,-0.389481,0.500688,0.411082,-2.168074,0.396065,0.806059,0.211268
2,Best Adj,ist_adj Linear Regression,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
3,Best 0,basic_adj Linear Regression,8,0.0,0.484215,0.701502,0.854782,0.921948,0.814349,0.864556,0.952841,0.810137
4,Best 1,ist_adj Linear Regression,8,0.0,0.48671,0.701412,0.855205,0.921741,0.81519,0.86623,0.953704,0.810825
5,Best 2,no_ist_adj Linear Regression,8,0.0,0.476153,0.699414,0.850104,0.921028,0.815461,0.864849,0.952836,0.810869
6,Best 3,complete_adj Linear Regression,8,0.0,0.478661,0.699888,0.854057,0.921877,0.815713,0.864949,0.952903,0.811401
