# Study ML Results

## Load Data

Load libraries:

In [282]:
import json
import pathlib
import pandas as pd
from typing import Dict, Any

Define constants:

In [283]:
# Root directory of the stored experiment results (relative to the working directory).
RESULTS_PATH = "../../results/"
# Sub-directory of RESULTS_PATH that holds the experiment run analysed here.
DATE_EXPERIMENTS = "20240607_0952"

# File name of each model's results inside the run directory.
LINEAR_REGRESSION_FILENAME = "linear_regression.json"
# Backward-compatible alias: the constant was originally misspelled and other
# cells still refer to it under this name.
LINEAR_REGRESSION_FILENAMME = LINEAR_REGRESSION_FILENAME
DECISION_TREE_FILENAME = "decision_tree_regressor.json"
KNN_FILENAME = "knn_regressor.json"
XGBOOST_SINGLE_OUTPUT_FILENAME = "xgboost_single_output_regressor.json"
XGBOOST_MULTI_OUTPUT_FILENAME = "xgboost_multi_output_regressor.json"

Load data:

In [284]:
# Directory of the selected experiment run; every result file is read from here.
experiments_path = pathlib.Path(RESULTS_PATH) / DATE_EXPERIMENTS

def load_experiment_results(file_path: pathlib.Path) -> Dict[str, Any]:
    """
    Load the JSON document stored at ``file_path``.

    Args:
        file_path (pathlib.Path): Path to the JSON file.

    Returns:
        Dict[str, Any]: The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with file_path.open("r", encoding="utf-8") as file:
        return json.load(file)


# Read the stored results of every model from the experiment directory.
linear_regression_results = load_experiment_results(
    experiments_path.joinpath(LINEAR_REGRESSION_FILENAMME)
)
decision_tree_results = load_experiment_results(
    experiments_path.joinpath(DECISION_TREE_FILENAME)
)
knn_results = load_experiment_results(experiments_path.joinpath(KNN_FILENAME))
xgboost_single_output_results = load_experiment_results(
    experiments_path.joinpath(XGBOOST_SINGLE_OUTPUT_FILENAME)
)
xgboost_multi_output_results = load_experiment_results(
    experiments_path.joinpath(XGBOOST_MULTI_OUTPUT_FILENAME)
)

Convert to dataframe:

In [285]:
def create_metrics_dataframe(results: Dict[str, Any]) -> pd.DataFrame:
    """
    Flatten per-dataset experiment metrics into a DataFrame with single-level columns.

    Every entry of ``results["dataset_metrics"]`` becomes one row, indexed by the
    dataset name. Metric columns are named ``{target}_{metric}``, where
    ``target`` is made of the uppercase letters of the original target name.
    Four descriptive columns derived from the dataset name are appended:
    ``dummy``, ``shifts`` and ``adj`` (booleans) and ``type`` (0-3 or None).

    Args:
        results (Dict[str, Any]): Dictionary containing the experiment results.
            It must have a "dataset_metrics" key mapping each dataset name to a
            dictionary with a "metrics" key.

    Returns:
        pd.DataFrame: One row per dataset, columns sorted alphabetically.

    Raises:
        KeyError: If "dataset_metrics" or a dataset's "metrics" key is missing.
    """
    rows = []
    index = []

    for dataset_name, dataset_metrics in results["dataset_metrics"].items():
        index.append(dataset_name)
        row = {}
        for col_name, metrics in dataset_metrics["metrics"].items():
            # Keep only the uppercase letters of the target name as the prefix.
            col_name_processed = "".join(filter(str.isupper, col_name))
            for metric_name, metric_value in metrics.items():
                row[f"{col_name_processed}_{metric_name}"] = metric_value
        rows.append(row)

    df = pd.DataFrame(rows, index=index)

    # Boolean flags: True when the dataset name contains the respective string.
    df["dummy"] = df.index.str.contains("dummy")
    df["shifts"] = df.index.str.contains("shifts")
    df["adj"] = df.index.str.contains("adj")

    # Categorical dataset type, derived from the dataset name. Later rules
    # override earlier ones when several match:
    #   0 -> name contains "basic"
    #   1 -> name contains "ist" but not "no_ist"
    #   2 -> name contains "no_ist"
    #   3 -> name contains "complete"
    # Names matching none of the rules keep None.
    has_no_ist = df.index.str.contains("no_ist")
    df["type"] = None
    df.loc[df.index.str.contains("basic"), "type"] = 0
    df.loc[df.index.str.contains("ist") & ~has_no_ist, "type"] = 1
    df.loc[has_no_ist, "type"] = 2
    df.loc[df.index.str.contains("complete"), "type"] = 3

    # Sort the columns alphabetically (uppercase metric columns come first).
    return df.reindex(columns=sorted(df.columns))

In [286]:
# Build one metrics DataFrame per model from the loaded results.
(
    linear_regression_df,
    decision_tree_df,
    knn_df,
    xgboost_single_output_df,
    xgboost_multi_output_df,
) = map(
    create_metrics_dataframe,
    (
        linear_regression_results,
        decision_tree_results,
        knn_results,
        xgboost_single_output_results,
        xgboost_multi_output_results,
    ),
)

Add an auxiliary function to highlight the best values in each metric column:

In [287]:
def highlight_max_min(data: pd.DataFrame) -> pd.DataFrame:
    """
    Highlight the maximum value in each column for R^2 and the minimum value in each column for RMSE.

    Args:
        data (pd.DataFrame): DataFrame to highlight.

    Returns:
        pd.DataFrame: DataFrame with highlighted values.
    """
    def highlight_values(column):
        is_r2 = "R^2" in column.name
        is_rmse = "RMSE" in column.name
        if not is_r2 and not is_rmse:
            return ['' for _ in column]
        if is_r2:
            highlight_value = column.max()
        if is_rmse:
            highlight_value = column.min()
            
        return ['background: yellow' if v == highlight_value and is_r2 and v >= 0.2 else 'background: #185ed7' if v >= 0.2 and v != highlight_value and is_r2 else '' for v in column]
    
    return data.style.apply(highlight_values, axis=0)

In [288]:
def count_max_min_highlights_and_podiums(data: pd.DataFrame) -> pd.DataFrame:
    """
    Tally, per dataset (row), how often it achieves the best or a top-3 metric value.

    For every column whose name contains "R^2" the row(s) holding the maximum,
    the rows among the 3 largest values and the rows with a value strictly
    above 0.2 are counted. For every column whose name contains "RMSE" the
    row(s) holding the minimum and the rows among the 3 smallest values are
    counted. Other columns are ignored.

    Args:
        data (pd.DataFrame): DataFrame containing the metrics.

    Returns:
        pd.DataFrame: Counters per dataset (``max_R2_count``, ``min_RMSE_count``,
        ``R2_podium_count``, ``RMSE_podium_count``, ``R2_valid``, ``total_count``,
        ``total_podiums``), sorted by ``total_count`` and then ``total_podiums``,
        both descending.
    """
    counter_names = [
        "max_R2_count",
        "min_RMSE_count",
        "R2_podium_count",
        "RMSE_podium_count",
        "R2_valid",
    ]
    tallies = pd.DataFrame(0, index=data.index, columns=counter_names)

    for name in data.columns:
        series = data[name]
        if "R^2" in name:
            tallies["max_R2_count"] += series == series.max()
            tallies["R2_podium_count"] += series.isin(series.nlargest(3).values)
            tallies["R2_valid"] += series > 0.2
        elif "RMSE" in name:
            tallies["min_RMSE_count"] += series == series.min()
            tallies["RMSE_podium_count"] += series.isin(series.nsmallest(3).values)

    # Aggregate the wins and the podium finishes of both metrics.
    tallies["total_count"] = tallies["max_R2_count"] + tallies["min_RMSE_count"]
    tallies["total_podiums"] = (
        tallies["R2_podium_count"] + tallies["RMSE_podium_count"]
    )
    return tallies.sort_values(by=["total_count", "total_podiums"], ascending=False)

## Linear Regression

See the results:

In [289]:
linear_regression_highlighted_df = highlight_max_min(linear_regression_df)
linear_regression_highlighted_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,15.779397,-75.323971,4.412181,-0.78467,9.179665,-4.745322,10.829473,-1.640101,9.682021,-0.449893,0.148069,-0.264108,5.417407,-2.742843,5.181779,0.31948,1.127446,0.746079,False,False,False,0
basic_adj,1.325316,0.461558,1.787324,0.707177,1.43313,0.859967,1.920743,0.916944,1.989852,0.93876,0.114051,0.249897,1.085512,0.849753,1.329069,0.955222,0.970673,0.811815,True,False,False,0
complete,15.81133,-75.633201,4.410317,-0.783162,8.764135,-4.236956,11.082844,-1.765084,10.106541,-0.579825,0.131545,0.002296,5.633565,-3.047484,5.162273,0.324594,1.128052,0.745806,False,False,False,3
complete_adj,1.327721,0.459603,1.789184,0.706567,1.463872,0.853895,1.931093,0.916046,1.917787,0.943116,0.114872,0.239049,1.126381,0.838227,1.319179,0.955885,0.965903,0.81366,True,False,False,3
ist,15.779976,-75.329569,4.372709,-0.752881,8.783183,-4.259744,11.056287,-1.751848,10.061338,-0.565725,0.131427,0.004079,5.595098,-2.992398,5.154955,0.326508,1.125585,0.746916,False,False,False,1
ist_adj,1.315292,0.469673,1.786248,0.707529,1.452531,0.85615,1.935656,0.915649,1.944941,0.941493,0.115039,0.236839,1.109617,0.843006,1.317865,0.955973,0.967281,0.813128,True,False,False,1
no_ist,15.771049,-75.243239,4.412431,-0.784872,9.01255,-4.53804,11.041402,-1.744444,9.987281,-0.54276,0.13715,-0.084536,5.557009,-2.938227,5.19614,0.315703,1.128222,0.745729,False,False,False,2
no_ist_adj,1.342144,0.447798,1.790533,0.706124,1.497278,0.84715,1.944009,0.91492,1.924263,0.942731,0.115398,0.232076,1.127083,0.838025,1.318062,0.95596,0.968395,0.812697,True,False,False,2


See the best datasets:

In [290]:
linear_regression_counts = count_max_min_highlights_and_podiums(linear_regression_df)
linear_regression_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
basic_adj,4,4,6,6,9,8,12
ist_adj,3,3,9,9,9,6,18
complete_adj,2,2,9,9,9,4,18
no_ist_adj,0,0,3,3,9,0,6
basic,0,0,0,0,2,0,0
complete,0,0,0,0,2,0,0
ist,0,0,0,0,2,0,0
no_ist,0,0,0,0,2,0,0


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values:

In [291]:
linear_regression_invalid_datasets = linear_regression_counts[
    linear_regression_counts["R2_valid"] < 4
].index.to_list()
linear_regression_invalid_datasets

['basic', 'complete', 'ist', 'no_ist']

In [292]:
linear_regression_best_datasets = linear_regression_counts[
    linear_regression_counts["R2_valid"] > 4
].index.to_list()
linear_regression_best_datasets

['basic_adj', 'ist_adj', 'complete_adj', 'no_ist_adj']

In [293]:
linear_regression_counts = linear_regression_df.merge(
    linear_regression_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [294]:
linear_regression_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby(
    "type"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8,12,5.5
1,6,18,5.5
2,0,6,5.5
3,4,18,5.5


Analyze by presence of dummy variables:

In [295]:
linear_regression_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby(
    "dummy"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,5.5


Analyze by number of shifted variables:

In [296]:
linear_regression_counts[
    ["shifts", "total_count", "total_podiums", "R2_valid"]
].groupby("shifts").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,5.5


Analyze by presence of adjacent variables:

In [297]:
linear_regression_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby(
    "adj"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0,0,2.0
True,18,54,9.0


## K-Nearest Neighbors Regression

See the results:

In [298]:
knn_highlighted_df = highlight_max_min(knn_df)
knn_highlighted_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,5.90326,-9.682282,4.542381,-0.891552,4.555096,-0.414672,3.809411,0.673321,4.569198,0.677088,0.125551,0.091139,2.456761,0.23026,4.742536,0.429961,3.129825,-0.956806,False,False,False,0
basic_adj,4.494425,-5.192243,5.417902,-1.690675,3.626935,0.103112,2.700899,0.835771,3.177918,0.843801,0.119341,0.178691,2.320475,0.313422,4.11724,0.570279,3.156583,-0.990092,True,False,False,0
complete,5.869157,-9.559215,3.8801,-0.380184,2.224856,0.662507,5.657277,0.279522,6.226988,0.400264,0.170281,-0.671807,1.84678,0.56504,3.587216,0.673864,4.152088,-2.443823,False,False,False,3
complete_adj,4.637,-5.591343,4.502836,-0.858537,2.029772,0.719099,4.294645,0.58477,4.271278,0.717831,0.131719,-0.000517,1.812138,0.581284,3.349782,0.715549,3.95884,-2.130219,True,False,False,3
ist,6.28342,-11.102424,4.285248,-0.683461,3.476862,0.175794,3.33436,0.749717,4.645694,0.666185,0.12393,0.114466,2.41744,0.254702,4.300366,0.531301,3.191553,-1.034753,False,False,False,1
ist_adj,4.627819,-5.565267,5.2004,-1.478977,3.306673,0.25451,2.729611,0.832261,3.301615,0.831405,0.122368,0.136505,2.303414,0.32348,4.283623,0.534847,2.990601,-0.786306,True,False,False,1
no_ist,5.947137,-9.841667,3.862562,-0.367735,2.275099,0.647092,5.583878,0.298096,6.293924,0.387301,0.168751,-0.641895,1.845951,0.565431,3.635312,0.66506,4.141697,-2.426607,False,False,False,2
no_ist_adj,4.626827,-5.562452,4.510813,-0.865128,2.079378,0.705201,4.264679,0.590545,4.266759,0.718428,0.130873,0.0123,1.822407,0.576526,3.393498,0.708076,3.950731,-2.117408,True,False,False,2


In [299]:
knn_counts = count_max_min_highlights_and_podiums(knn_df)
knn_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
basic_adj,4,4,5,5,4,8,10
complete_adj,3,3,3,3,5,6,6
ist_adj,1,1,5,5,5,2,10
no_ist,1,1,2,2,5,2,4
no_ist_adj,0,0,5,5,5,0,10
complete,0,0,3,3,5,0,6
ist,0,0,3,3,4,0,6
basic,0,0,1,1,4,0,2


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values:

In [300]:
knn_invalid_datasets = knn_counts[
    knn_counts["R2_valid"] < 4
].index.to_list()
knn_invalid_datasets

[]

In [301]:
knn_best_datasets = knn_counts[
    knn_counts["R2_valid"] > 4
].index.to_list()
knn_best_datasets

['complete_adj', 'ist_adj', 'no_ist', 'no_ist_adj', 'complete']

In [302]:
knn_counts = knn_df.merge(
    knn_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [303]:
knn_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby("type").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8,12,4.0
1,2,16,4.5
2,2,14,5.0
3,6,12,5.0


Analyze by presence of dummy variables:

In [304]:
knn_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby("dummy").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,4.625


Analyze by number of shifted variables:

In [305]:
knn_counts[["shifts", "total_count", "total_podiums", "R2_valid"]].groupby(
    "shifts"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,4.625


Analyze by presence of adjacent variables:

In [306]:
knn_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby("adj").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2,18,4.5
True,16,36,4.75


## Decision Tree Regression

See the results:

In [307]:
decision_tree_highlighted_df = highlight_max_min(decision_tree_df)
decision_tree_highlighted_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,6.558479,-12.185192,4.439703,-0.807003,6.224744,-1.641823,4.98644,0.440259,7.047472,0.231807,0.152559,-0.341937,2.610875,0.130658,8.352714,-0.768228,2.959547,-0.749678,False,False,False,0
basic_adj,8.440767,-20.840543,4.719179,-1.041418,4.282063,-0.250159,6.117535,0.157468,5.75589,0.48759,0.164212,-0.555016,2.426852,0.249029,4.395062,0.510329,3.745934,-1.802587,True,False,False,0
complete,10.520813,-32.929601,4.509628,-0.864373,4.543579,-0.407527,4.960601,0.446045,5.949225,0.452575,0.188383,-1.046162,2.667437,0.092583,5.606658,0.203307,4.654623,-3.327894,False,False,False,3
complete_adj,6.397301,-11.54565,4.206873,-0.62225,3.745183,0.043676,6.190062,0.137372,6.598178,0.32665,0.135676,-0.061535,3.253559,-0.349752,6.621583,-0.111469,3.023541,-0.825873,True,False,False,3
ist,5.508329,-8.30079,4.860161,-1.165472,4.946183,-0.668019,5.081792,0.418647,6.623759,0.321401,0.164082,-0.552296,2.777788,0.015951,7.533521,-0.438398,3.223599,-1.075819,False,False,False,1
ist_adj,6.199269,-10.780957,4.600247,-0.93982,3.097255,0.345947,6.473279,0.05663,6.02418,0.438708,0.14015,-0.132686,2.761606,0.027567,5.550141,0.219124,4.189866,-2.50622,True,False,False,1
no_ist,5.330188,-7.708938,5.036845,-1.325779,6.289172,-1.696794,5.220571,0.386461,5.766946,0.485606,0.2028,-1.371318,2.816742,-0.011842,5.845079,0.134108,3.686253,-1.714425,False,False,False,2
no_ist_adj,6.446214,-11.73823,4.866303,-1.170687,5.083022,-0.761582,6.245287,0.121911,5.611423,0.512989,0.178126,-0.829701,3.513956,-0.574451,6.805887,-0.174203,4.108603,-2.37153,True,False,False,2


In [308]:
decision_tree_counts = count_max_min_highlights_and_podiums(decision_tree_df)
decision_tree_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
basic_adj,2,2,4,4,3,4,8
complete_adj,2,2,4,4,1,4,8
basic,1,1,5,5,2,2,10
complete,1,1,4,4,3,2,8
ist_adj,1,1,4,4,3,2,8
no_ist,1,1,2,2,2,2,4
no_ist_adj,1,1,1,1,1,2,2
ist,0,0,3,3,2,0,6


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values:

In [309]:
decision_tree_invalid_datasets = decision_tree_counts[
    decision_tree_counts["R2_valid"] < 4
].index.to_list()
decision_tree_invalid_datasets

['basic_adj',
 'complete_adj',
 'basic',
 'complete',
 'ist_adj',
 'no_ist',
 'no_ist_adj',
 'ist']

In [310]:
decision_tree_best_datasets = decision_tree_counts[
    decision_tree_counts["R2_valid"] > 4
].index.to_list()
decision_tree_best_datasets

[]

In [311]:
decision_tree_counts = decision_tree_df.merge(
    decision_tree_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [312]:
decision_tree_counts[["type", "total_count", "total_podiums", "R2_valid"]].groupby(
    "type"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6,18,2.5
1,2,14,2.5
2,4,6,1.5
3,6,16,2.0


Analyze by presence of dummy variables:

In [313]:
decision_tree_counts[["dummy", "total_count", "total_podiums", "R2_valid"]].groupby(
    "dummy"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,2.125


Analyze by number of shifted variables:

In [314]:
decision_tree_counts[["shifts", "total_count", "total_podiums", "R2_valid"]].groupby(
    "shifts"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,2.125


Analyze by presence of adjacent variables:

In [315]:
decision_tree_counts[["adj", "total_count", "total_podiums", "R2_valid"]].groupby(
    "adj"
).agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,6,28,2.25
True,12,26,2.0


## XGBoost Single-Output Regression

See the results:

In [316]:
xgboost_single_output_highlighted_df = highlight_max_min(xgboost_single_output_df)
xgboost_single_output_highlighted_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,6.604884,-12.372438,4.049565,-0.503377,5.881713,-1.358676,2.673913,0.839047,5.350012,0.557296,1.210934,-83.546494,1.748998,0.609881,3.724095,0.648501,1.428328,0.592466,False,False,False,0
basic_adj,2.471621,-0.87268,1.861966,0.682208,1.790849,0.781336,2.849646,0.817183,2.576074,0.897362,0.493514,-13.045089,1.32938,0.774662,2.28087,0.868121,1.354208,0.633723,True,False,False,0
complete,6.609354,-12.390545,4.055174,-0.507545,5.970642,-1.43054,2.674836,0.838935,5.296578,0.566095,1.20338,-82.494955,1.758439,0.605658,3.063865,0.762085,1.410832,0.602389,False,False,False,3
complete_adj,2.49422,-0.907082,1.818256,0.696954,1.725907,0.796907,2.860045,0.815847,2.488638,0.904211,0.395704,-8.029545,1.328076,0.775104,2.202618,0.877015,1.307065,0.658781,True,False,False,3
ist,6.622328,-12.443165,4.050509,-0.504078,5.960576,-1.422351,2.640426,0.843053,5.309212,0.564023,1.203383,-82.49541,1.753199,0.608005,3.114418,0.754169,1.412391,0.60151,False,False,False,1
ist_adj,2.481455,-0.887611,1.831924,0.692381,1.729279,0.796113,2.826726,0.820112,2.550959,0.899353,0.425479,-9.439555,1.324006,0.77648,2.215812,0.875537,1.317546,0.653287,True,False,False,1
no_ist,6.586429,-12.29781,4.048545,-0.50262,5.867871,-1.347588,2.704132,0.835388,5.350006,0.557297,1.21214,-83.714912,1.756331,0.606603,3.116825,0.753789,1.413467,0.600903,False,False,False,2
no_ist_adj,2.487225,-0.896399,1.843565,0.688459,1.725181,0.797078,2.861518,0.815657,2.492158,0.90394,0.410008,-8.694153,1.323071,0.776796,2.218847,0.875196,1.313546,0.655389,True,False,False,2


In [317]:
xgboost_single_output_counts = count_max_min_highlights_and_podiums(xgboost_single_output_df)
xgboost_single_output_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
complete_adj,5,5,7,7,7,10,14
no_ist_adj,2,2,8,8,7,4,16
basic_adj,1,1,1,1,7,2,2
ist,1,1,1,1,5,2,2
ist_adj,0,0,8,8,7,0,16
basic,0,0,1,1,5,0,2
complete,0,0,1,1,5,0,2
no_ist,0,0,0,0,5,0,0


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values:

In [318]:
xgboost_single_output_invalid_datasets = xgboost_single_output_counts[
    xgboost_single_output_counts["R2_valid"] < 4
].index.to_list()
xgboost_single_output_invalid_datasets

[]

In [319]:
xgboost_single_output_best_datasets = xgboost_single_output_counts[
    xgboost_single_output_counts["R2_valid"] > 4
].index.to_list()
xgboost_single_output_best_datasets

['complete_adj',
 'no_ist_adj',
 'basic_adj',
 'ist',
 'ist_adj',
 'basic',
 'complete',
 'no_ist']

In [320]:
xgboost_single_output_counts = xgboost_single_output_df.merge(
    xgboost_single_output_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [321]:
xgboost_single_output_counts[
    ["type", "total_count", "total_podiums", "R2_valid"]
].groupby("type").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,4,6.0
1,2,18,6.0
2,4,16,6.0
3,10,16,6.0


Analyze by presence of dummy variables:

In [322]:
xgboost_single_output_counts[
    ["dummy", "total_count", "total_podiums", "R2_valid"]
].groupby("dummy").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,6.0


Analyze by number of shifted variables:

In [323]:
xgboost_single_output_counts[
    ["shifts", "total_count", "total_podiums", "R2_valid"]
].groupby("shifts").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,6.0


Analyze by presence of adjacent variables:

In [324]:
xgboost_single_output_counts[
    ["adj", "total_count", "total_podiums", "R2_valid"]
].groupby("adj").agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2,6,5.0
True,16,48,7.0


## XGBoost Multi-Output Regression

See the results:

In [325]:
xgboost_multi_output_highlighted_df = highlight_max_min(xgboost_multi_output_df)
xgboost_multi_output_highlighted_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,4.399473,-4.933097,4.256328,-0.660815,3.672982,0.08019,4.343811,0.575236,5.177592,0.585371,1.61316,-149.040867,2.203686,0.380676,3.277475,0.727754,2.226568,0.009671,False,False,False,0
basic_adj,3.893913,-3.648064,3.875659,-0.376861,2.35293,0.622535,3.298682,0.755029,3.94627,0.759139,1.127348,-72.289385,2.246566,0.356461,4.002266,0.593944,2.432609,-0.181908,True,False,False,0
complete,4.417788,-4.982598,4.229398,-0.639866,3.55928,0.136256,4.367289,0.570632,5.05537,0.604715,1.617464,-149.842433,2.11195,0.431166,3.281483,0.727088,2.26961,-0.028988,False,False,False,3
complete_adj,3.952826,-3.789774,3.885351,-0.383756,2.173119,0.678022,3.36529,0.745036,3.773532,0.779764,1.060047,-63.800098,2.154922,0.407894,3.985297,0.59738,2.456536,-0.205272,True,False,False,3
ist,4.460168,-5.097932,4.246994,-0.653539,3.55332,0.139146,4.284774,0.586704,5.070928,0.602279,1.596398,-145.938866,2.124206,0.424544,3.306046,0.722987,2.243341,-0.005306,False,False,False,1
ist_adj,3.933207,-3.742346,3.857701,-0.364131,2.159933,0.681918,3.319534,0.751922,3.828611,0.773288,1.040543,-61.437418,2.187898,0.389634,4.019024,0.590537,2.502905,-0.251202,True,False,False,1
no_ist,4.46637,-5.114902,4.250221,-0.656053,3.618283,0.107382,4.201909,0.602535,5.047546,0.605938,1.57471,-141.973529,2.038958,0.469806,3.269386,0.729096,2.234209,0.002862,False,False,False,2
no_ist_adj,3.908467,-3.682874,3.963973,-0.440324,2.250831,0.654582,3.35549,0.746519,3.868922,0.768488,1.0547,-63.148017,2.243624,0.358146,4.054045,0.58337,2.513241,-0.261557,True,False,False,2


In [326]:
xgboost_multi_output_counts = count_max_min_highlights_and_podiums(xgboost_multi_output_df)
xgboost_multi_output_counts

Unnamed: 0,max_R2_count,min_RMSE_count,R2_podium_count,RMSE_podium_count,R2_valid,total_count,total_podiums
ist_adj,3,3,6,6,5,6,12
basic_adj,2,2,3,3,5,4,6
no_ist,2,2,3,3,4,4,6
complete_adj,1,1,4,4,5,2,8
basic,1,1,2,2,4,2,4
no_ist_adj,0,0,5,5,5,0,10
complete,0,0,2,2,4,0,4
ist,0,0,2,2,4,0,4


Let's get the names of those datasets that have fewer than 4 valid $R^2$ values:

In [327]:
xgboost_multi_output_invalid_datasets = xgboost_multi_output_counts[
    xgboost_multi_output_counts["R2_valid"] < 4
].index.to_list()
xgboost_multi_output_invalid_datasets

[]

In [328]:
xgboost_multi_output_best_datasets = xgboost_multi_output_counts[
    xgboost_multi_output_counts["R2_valid"] > 4
].index.to_list()
xgboost_multi_output_best_datasets

['ist_adj', 'basic_adj', 'complete_adj', 'no_ist_adj']

In [329]:
xgboost_multi_output_counts = xgboost_multi_output_df.merge(
    xgboost_multi_output_counts, left_index=True, right_index=True
)

Analyze by dataset's completeness:

In [330]:
# Aggregate wins, podiums and the mean number of valid R^2 values per dataset type.
xgboost_multi_output_counts[
    ["type", "total_count", "total_podiums", "R2_valid"]
].groupby("type").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6,10,4.5
1,6,16,4.5
2,4,16,4.5
3,2,12,4.5


Analyze by presence of dummy variables:

In [331]:
xgboost_multi_output_counts[
    ["dummy", "total_count", "total_podiums", "R2_valid"]
].groupby("dummy").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,4.5


Analyze by number of shifted variables:

In [332]:
xgboost_multi_output_counts[
    ["shifts", "total_count", "total_podiums", "R2_valid"]
].groupby("shifts").agg(
    {"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"}
)

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
shifts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,54,4.5


Analyze by presence of adjacent variables:

In [333]:
xgboost_multi_output_counts[
    ["adj", "total_count", "total_podiums", "R2_valid"]
].groupby("adj").agg({"total_count": "sum", "total_podiums": "sum", "R2_valid": "mean"})

Unnamed: 0_level_0,total_count,total_podiums,R2_valid
adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,6,18,4.0
True,12,36,5.0


In [334]:
# Keep only the datasets flagged neither as "dummy" nor as "shifts".
xgboost_multi_output_filtered_df = xgboost_multi_output_df.loc[
    ~(xgboost_multi_output_df["dummy"] | xgboost_multi_output_df["shifts"])
]
xgboost_multi_output_resumed_df = highlight_max_min(xgboost_multi_output_filtered_df)
xgboost_multi_output_resumed_df

Unnamed: 0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,OTH_RMSE,OTH_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type
basic,4.399473,-4.933097,4.256328,-0.660815,3.672982,0.08019,4.343811,0.575236,5.177592,0.585371,1.61316,-149.040867,2.203686,0.380676,3.277475,0.727754,2.226568,0.009671,False,False,False,0
basic_adj,3.893913,-3.648064,3.875659,-0.376861,2.35293,0.622535,3.298682,0.755029,3.94627,0.759139,1.127348,-72.289385,2.246566,0.356461,4.002266,0.593944,2.432609,-0.181908,True,False,False,0
complete,4.417788,-4.982598,4.229398,-0.639866,3.55928,0.136256,4.367289,0.570632,5.05537,0.604715,1.617464,-149.842433,2.11195,0.431166,3.281483,0.727088,2.26961,-0.028988,False,False,False,3
complete_adj,3.952826,-3.789774,3.885351,-0.383756,2.173119,0.678022,3.36529,0.745036,3.773532,0.779764,1.060047,-63.800098,2.154922,0.407894,3.985297,0.59738,2.456536,-0.205272,True,False,False,3
ist,4.460168,-5.097932,4.246994,-0.653539,3.55332,0.139146,4.284774,0.586704,5.070928,0.602279,1.596398,-145.938866,2.124206,0.424544,3.306046,0.722987,2.243341,-0.005306,False,False,False,1
ist_adj,3.933207,-3.742346,3.857701,-0.364131,2.159933,0.681918,3.319534,0.751922,3.828611,0.773288,1.040543,-61.437418,2.187898,0.389634,4.019024,0.590537,2.502905,-0.251202,True,False,False,1
no_ist,4.46637,-5.114902,4.250221,-0.656053,3.618283,0.107382,4.201909,0.602535,5.047546,0.605938,1.57471,-141.973529,2.038958,0.469806,3.269386,0.729096,2.234209,0.002862,False,False,False,2
no_ist_adj,3.908467,-3.682874,3.963973,-0.440324,2.250831,0.654582,3.35549,0.746519,3.868922,0.768488,1.0547,-63.148017,2.243624,0.358146,4.054045,0.58337,2.513241,-0.261557,True,False,False,2


## Identify Invalid Datasets

We define a dataset as invalid if it has fewer than 4 valid $R^2$ values in every model (i.e. it appears in the invalid list of all of the models).

In [335]:
# Intersection of the per-model lists: a dataset is kept only when it is
# listed as invalid for every model.
invalid_datasets = set.intersection(
    *(
        set(names)
        for names in (
            linear_regression_invalid_datasets,
            knn_invalid_datasets,
            decision_tree_invalid_datasets,
            xgboost_single_output_invalid_datasets,
            xgboost_multi_output_invalid_datasets,
        )
    )
)
invalid_datasets

set()

## Identify Best Datasets

We define a dataset as one of the best if it has more than 4 valid $R^2$ values in at least one model (i.e. it appears in the best list of any of the models).

In [336]:
# Union of the per-model lists: a dataset is kept when it is listed as best
# for at least one model.
best_datasets = set().union(
    linear_regression_best_datasets,
    knn_best_datasets,
    decision_tree_best_datasets,
    xgboost_single_output_best_datasets,
    xgboost_multi_output_best_datasets,
)
best_datasets

{'basic',
 'basic_adj',
 'complete',
 'complete_adj',
 'ist',
 'ist_adj',
 'no_ist',
 'no_ist_adj'}

In [337]:
# Datasets listed as best for linear regression and both XGBoost variants.
# NOTE(review): the kNN and decision-tree lists are left out of this
# intersection; the decision-tree list is empty in the recorded run, so
# including it would empty the result. Confirm the exclusion is intended.
best_of_best_datasets = set(linear_regression_best_datasets).intersection(
    xgboost_single_output_best_datasets,
    xgboost_multi_output_best_datasets,
)
best_of_best_datasets

{'basic_adj', 'complete_adj', 'ist_adj', 'no_ist_adj'}

## Summary

In [338]:
def summary_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a presentation-ready metrics table from a per-model metrics frame.

    The metadata columns are dropped, party codes in the column names are
    replaced by display names, the metric columns are put in a fixed order,
    and the dataset rows are relabelled and reordered.

    Rows are relabelled by dataset name rather than by position, so the
    result does not depend on the row order of the input.

    Args:
        df (pd.DataFrame): Metrics indexed by dataset name (``basic``,
            ``basic_adj``, ``complete``, ``complete_adj``, ``ist``,
            ``ist_adj``, ``no_ist``, ``no_ist_adj``) with
            ``{party}_RMSE`` / ``{party}_R^2`` columns plus the ``adj``,
            ``dummy``, ``shifts`` and ``type`` metadata columns.

    Returns:
        pd.DataFrame: A copy with renamed and ordered columns and an index
        named ``Dataset``. Only the datasets present in ``df`` are returned.

    Raises:
        ValueError: If the index contains a dataset name that has no
            display label.
        KeyError: If a metadata column or one of the expected metric
            columns is missing.
    """
    df = df.copy()

    # Drop the unwanted columns
    df = df.drop(columns=["adj", "dummy", "shifts", "type"])

    # Rename the party codes embedded in the column names
    replacements = {
        "CDC": "Junts",
        "OTH": "Altres",
        "CS": "C's",
        "ECPGUANYEM": "Comuns",
        "JXCATJUNTS": "Junts",
    }
    # The keys are plain alphanumeric codes, so they can be joined into an
    # alternation without escaping.
    df.columns = df.columns.str.replace(
        "|".join(replacements),
        lambda m: replacements[m.group(0)],
        regex=True,
    )

    # Sort the columns based on the specified order
    parties = ["Junts", "C's", "CUP", "Comuns", "ERC", "Altres", "PP", "PSC", "VOX"]
    column_order = [
        f"{party}_{metric}" for party in parties for metric in ("RMSE", "R^2")
    ]
    df = df[column_order]

    # Display labels, listed in the order in which rows must appear
    index_labels = {
        "basic": "Basic",
        "no_ist": "No IST",
        "ist": "IST",
        "complete": "Complet",
        "basic_adj": "Basic Adj.",
        "no_ist_adj": "No IST Adj.",
        "ist_adj": "IST Adj.",
        "complete_adj": "Complet Adj.",
    }
    unknown = [name for name in df.index if name not in index_labels]
    if unknown:
        raise ValueError(f"Unknown dataset names in index: {unknown}")

    # Relabel by name (not by position) so that row order cannot mislabel data
    df = df.rename(index=index_labels)

    # Sort the index based on the specified order, keeping only present rows
    order = [label for label in index_labels.values() if label in df.index]
    df = df.reindex(order)

    # Rename the index label
    df.index.name = "Dataset"

    return df

In [339]:
def evaluate_datasets(df: pd.DataFrame, threshold: float = 0.2) -> pd.DataFrame:
    """
    Select the best-ranked dataset overall, per ``adj`` flag and per ``type``.

    Rows are ranked, all in descending order, first by the number of
    ``*_R^2`` columns whose value is strictly above ``threshold`` and then
    by the individual ``*_R^2`` columns in their column order.

    Args:
        df (pd.DataFrame): One row per dataset (or experiment). Columns whose
            name contains ``_R^2`` are treated as scores; the ``adj`` and
            ``type`` columns are used to split the rows into categories.
        threshold (float): Score a ``*_R^2`` value must exceed to be counted.
            Defaults to 0.2.

    Returns:
        pd.DataFrame: Seven rows (``Best Overall``, ``Best Non-Adj``,
        ``Best Adj``, ``Best 0`` .. ``Best 3``) with the columns
        ``Category``, ``Dataset`` (index label of the winning row),
        ``Count_R2>{threshold}`` and one column per ``*_R^2`` column.
        Every field of a category that has no rows is ``"N/A"``.
    """
    dataset_types = [0, 1, 2, 3]
    count_column = f"count_above_{threshold}"

    def best_row(frame):
        """Return the top-ranked row of *frame*, or None if it has no rows."""
        if frame.empty:
            return None
        return frame.sort_values(by=sort_by, ascending=False).iloc[0]

    def field(row, key):
        """Return ``row[key]``, or "N/A" when the category had no winner."""
        return "N/A" if row is None else row[key]

    ranked = df.copy()

    # Get the R^2 columns and count, per row, how many exceed the threshold
    r2_columns = ranked.filter(like="_R^2").columns.tolist()
    ranked[count_column] = (ranked[r2_columns] > threshold).sum(axis=1)

    # Ranking criteria: count first, then each R^2 column as a tie-breaker
    sort_by = [count_column] + r2_columns

    # ``eq`` keeps the original ``== False`` / ``== True`` semantics, so rows
    # whose ``adj`` is neither value belong to neither group.
    categories = [
        ("Best Overall", ranked),
        ("Best Non-Adj", ranked[ranked["adj"].eq(False)]),
        ("Best Adj", ranked[ranked["adj"].eq(True)]),
    ] + [
        (f"Best {type_}", ranked[ranked["type"] == type_])
        for type_ in dataset_types
    ]
    winners = [best_row(frame) for _, frame in categories]

    # Creating a summary dataframe for results
    summary_data = {
        "Category": [label for label, _ in categories],
        "Dataset": ["N/A" if row is None else row.name for row in winners],
        f"Count_R2>{threshold}": [field(row, count_column) for row in winners],
    }
    for col in r2_columns:
        summary_data[col] = [field(row, col) for row in winners]

    return pd.DataFrame(summary_data)

In [340]:
# Keep the rows flagged neither as `dummy` nor as `shifts`, then build the
# display table for linear regression.
linear_regression_filtered_df = linear_regression_df.loc[
    ~(linear_regression_df["dummy"] | linear_regression_df["shifts"])
]
linear_regression_summary_df = summary_dataframe(df=linear_regression_filtered_df)
# NOTE: styling with highlight_max_min(...) is currently disabled.
linear_regression_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,9.682021,-0.449893,15.779397,-75.323971,4.412181,-0.78467,9.179665,-4.745322,10.829473,-1.640101,0.148069,-0.264108,5.417407,-2.742843,5.181779,0.31948,1.127446,0.746079
No IST,9.987281,-0.54276,15.771049,-75.243239,4.412431,-0.784872,9.01255,-4.53804,11.041402,-1.744444,0.13715,-0.084536,5.557009,-2.938227,5.19614,0.315703,1.128222,0.745729
IST,10.061338,-0.565725,15.779976,-75.329569,4.372709,-0.752881,8.783183,-4.259744,11.056287,-1.751848,0.131427,0.004079,5.595098,-2.992398,5.154955,0.326508,1.125585,0.746916
Complet,10.106541,-0.579825,15.81133,-75.633201,4.410317,-0.783162,8.764135,-4.236956,11.082844,-1.765084,0.131545,0.002296,5.633565,-3.047484,5.162273,0.324594,1.128052,0.745806
Basic Adj.,1.989852,0.93876,1.325316,0.461558,1.787324,0.707177,1.43313,0.859967,1.920743,0.916944,0.114051,0.249897,1.085512,0.849753,1.329069,0.955222,0.970673,0.811815
No IST Adj.,1.924263,0.942731,1.342144,0.447798,1.790533,0.706124,1.497278,0.84715,1.944009,0.91492,0.115398,0.232076,1.127083,0.838025,1.318062,0.95596,0.968395,0.812697
IST Adj.,1.944941,0.941493,1.315292,0.469673,1.786248,0.707529,1.452531,0.85615,1.935656,0.915649,0.115039,0.236839,1.109617,0.843006,1.317865,0.955973,0.967281,0.813128
Complet Adj.,1.917787,0.943116,1.327721,0.459603,1.789184,0.706567,1.463872,0.853895,1.931093,0.916046,0.114872,0.239049,1.126381,0.838227,1.319179,0.955885,0.965903,0.81366


In [341]:
# Rank the filtered linear-regression rows and show the winner per category.
linear_regression_results_df = evaluate_datasets(df=linear_regression_filtered_df)
linear_regression_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,ist_adj,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
1,Best Non-Adj,no_ist,2,-75.243239,-0.784872,-4.53804,-1.744444,-0.54276,-0.084536,-2.938227,0.315703,0.745729
2,Best Adj,ist_adj,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
3,Best 0,basic_adj,9,0.461558,0.707177,0.859967,0.916944,0.93876,0.249897,0.849753,0.955222,0.811815
4,Best 1,ist_adj,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
5,Best 2,no_ist_adj,9,0.447798,0.706124,0.84715,0.91492,0.942731,0.232076,0.838025,0.95596,0.812697
6,Best 3,complete_adj,9,0.459603,0.706567,0.853895,0.916046,0.943116,0.239049,0.838227,0.955885,0.81366


In [342]:
# Keep the rows flagged neither as `dummy` nor as `shifts`, then build the
# display table for KNN.
knn_filtered_df = knn_df.loc[~(knn_df["dummy"] | knn_df["shifts"])]
knn_summary_df = summary_dataframe(df=knn_filtered_df)
# NOTE: styling with highlight_max_min(...) is currently disabled.
knn_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,4.569198,0.677088,5.90326,-9.682282,4.542381,-0.891552,4.555096,-0.414672,3.809411,0.673321,0.125551,0.091139,2.456761,0.23026,4.742536,0.429961,3.129825,-0.956806
No IST,6.293924,0.387301,5.947137,-9.841667,3.862562,-0.367735,2.275099,0.647092,5.583878,0.298096,0.168751,-0.641895,1.845951,0.565431,3.635312,0.66506,4.141697,-2.426607
IST,4.645694,0.666185,6.28342,-11.102424,4.285248,-0.683461,3.476862,0.175794,3.33436,0.749717,0.12393,0.114466,2.41744,0.254702,4.300366,0.531301,3.191553,-1.034753
Complet,6.226988,0.400264,5.869157,-9.559215,3.8801,-0.380184,2.224856,0.662507,5.657277,0.279522,0.170281,-0.671807,1.84678,0.56504,3.587216,0.673864,4.152088,-2.443823
Basic Adj.,3.177918,0.843801,4.494425,-5.192243,5.417902,-1.690675,3.626935,0.103112,2.700899,0.835771,0.119341,0.178691,2.320475,0.313422,4.11724,0.570279,3.156583,-0.990092
No IST Adj.,4.266759,0.718428,4.626827,-5.562452,4.510813,-0.865128,2.079378,0.705201,4.264679,0.590545,0.130873,0.0123,1.822407,0.576526,3.393498,0.708076,3.950731,-2.117408
IST Adj.,3.301615,0.831405,4.627819,-5.565267,5.2004,-1.478977,3.306673,0.25451,2.729611,0.832261,0.122368,0.136505,2.303414,0.32348,4.283623,0.534847,2.990601,-0.786306
Complet Adj.,4.271278,0.717831,4.637,-5.591343,4.502836,-0.858537,2.029772,0.719099,4.294645,0.58477,0.131719,-0.000517,1.812138,0.581284,3.349782,0.715549,3.95884,-2.130219


In [343]:
# Rank the filtered KNN rows and show the winner per category.
knn_results_df = evaluate_datasets(df=knn_filtered_df)
knn_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,no_ist_adj,5,-5.562452,-0.865128,0.705201,0.590545,0.718428,0.0123,0.576526,0.708076,-2.117408
1,Best Non-Adj,complete,5,-9.559215,-0.380184,0.662507,0.279522,0.400264,-0.671807,0.56504,0.673864,-2.443823
2,Best Adj,no_ist_adj,5,-5.562452,-0.865128,0.705201,0.590545,0.718428,0.0123,0.576526,0.708076,-2.117408
3,Best 0,basic_adj,4,-5.192243,-1.690675,0.103112,0.835771,0.843801,0.178691,0.313422,0.570279,-0.990092
4,Best 1,ist_adj,5,-5.565267,-1.478977,0.25451,0.832261,0.831405,0.136505,0.32348,0.534847,-0.786306
5,Best 2,no_ist_adj,5,-5.562452,-0.865128,0.705201,0.590545,0.718428,0.0123,0.576526,0.708076,-2.117408
6,Best 3,complete_adj,5,-5.591343,-0.858537,0.719099,0.58477,0.717831,-0.000517,0.581284,0.715549,-2.130219


In [344]:
# Keep the rows flagged neither as `dummy` nor as `shifts`, then build the
# display table for the decision tree.
decision_tree_filtered_df = decision_tree_df.loc[
    ~(decision_tree_df["dummy"] | decision_tree_df["shifts"])
]
decision_tree_summary_df = summary_dataframe(df=decision_tree_filtered_df)
# NOTE: styling with highlight_max_min(...) is currently disabled.
decision_tree_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,7.047472,0.231807,6.558479,-12.185192,4.439703,-0.807003,6.224744,-1.641823,4.98644,0.440259,0.152559,-0.341937,2.610875,0.130658,8.352714,-0.768228,2.959547,-0.749678
No IST,5.766946,0.485606,5.330188,-7.708938,5.036845,-1.325779,6.289172,-1.696794,5.220571,0.386461,0.2028,-1.371318,2.816742,-0.011842,5.845079,0.134108,3.686253,-1.714425
IST,6.623759,0.321401,5.508329,-8.30079,4.860161,-1.165472,4.946183,-0.668019,5.081792,0.418647,0.164082,-0.552296,2.777788,0.015951,7.533521,-0.438398,3.223599,-1.075819
Complet,5.949225,0.452575,10.520813,-32.929601,4.509628,-0.864373,4.543579,-0.407527,4.960601,0.446045,0.188383,-1.046162,2.667437,0.092583,5.606658,0.203307,4.654623,-3.327894
Basic Adj.,5.75589,0.48759,8.440767,-20.840543,4.719179,-1.041418,4.282063,-0.250159,6.117535,0.157468,0.164212,-0.555016,2.426852,0.249029,4.395062,0.510329,3.745934,-1.802587
No IST Adj.,5.611423,0.512989,6.446214,-11.73823,4.866303,-1.170687,5.083022,-0.761582,6.245287,0.121911,0.178126,-0.829701,3.513956,-0.574451,6.805887,-0.174203,4.108603,-2.37153
IST Adj.,6.02418,0.438708,6.199269,-10.780957,4.600247,-0.93982,3.097255,0.345947,6.473279,0.05663,0.14015,-0.132686,2.761606,0.027567,5.550141,0.219124,4.189866,-2.50622
Complet Adj.,6.598178,0.32665,6.397301,-11.54565,4.206873,-0.62225,3.745183,0.043676,6.190062,0.137372,0.135676,-0.061535,3.253559,-0.349752,6.621583,-0.111469,3.023541,-0.825873


In [345]:
# Rank the filtered decision-tree rows and show the winner per category.
decision_tree_results_df = evaluate_datasets(df=decision_tree_filtered_df)
decision_tree_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,ist_adj,3,-10.780957,-0.93982,0.345947,0.05663,0.438708,-0.132686,0.027567,0.219124,-2.50622
1,Best Non-Adj,complete,3,-32.929601,-0.864373,-0.407527,0.446045,0.452575,-1.046162,0.092583,0.203307,-3.327894
2,Best Adj,ist_adj,3,-10.780957,-0.93982,0.345947,0.05663,0.438708,-0.132686,0.027567,0.219124,-2.50622
3,Best 0,basic_adj,3,-20.840543,-1.041418,-0.250159,0.157468,0.48759,-0.555016,0.249029,0.510329,-1.802587
4,Best 1,ist_adj,3,-10.780957,-0.93982,0.345947,0.05663,0.438708,-0.132686,0.027567,0.219124,-2.50622
5,Best 2,no_ist,2,-7.708938,-1.325779,-1.696794,0.386461,0.485606,-1.371318,-0.011842,0.134108,-1.714425
6,Best 3,complete,3,-32.929601,-0.864373,-0.407527,0.446045,0.452575,-1.046162,0.092583,0.203307,-3.327894


In [346]:
# Keep the rows flagged neither as `dummy` nor as `shifts`, then build the
# display table for the single-output XGBoost model.
xgboost_single_output_filtered_df = xgboost_single_output_df.loc[
    ~(xgboost_single_output_df["dummy"] | xgboost_single_output_df["shifts"])
]
xgboost_single_output_summary_df = summary_dataframe(
    df=xgboost_single_output_filtered_df
)
# NOTE: styling with highlight_max_min(...) is currently disabled.
xgboost_single_output_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,5.350012,0.557296,6.604884,-12.372438,4.049565,-0.503377,5.881713,-1.358676,2.673913,0.839047,1.210934,-83.546494,1.748998,0.609881,3.724095,0.648501,1.428328,0.592466
No IST,5.350006,0.557297,6.586429,-12.29781,4.048545,-0.50262,5.867871,-1.347588,2.704132,0.835388,1.21214,-83.714912,1.756331,0.606603,3.116825,0.753789,1.413467,0.600903
IST,5.309212,0.564023,6.622328,-12.443165,4.050509,-0.504078,5.960576,-1.422351,2.640426,0.843053,1.203383,-82.49541,1.753199,0.608005,3.114418,0.754169,1.412391,0.60151
Complet,5.296578,0.566095,6.609354,-12.390545,4.055174,-0.507545,5.970642,-1.43054,2.674836,0.838935,1.20338,-82.494955,1.758439,0.605658,3.063865,0.762085,1.410832,0.602389
Basic Adj.,2.576074,0.897362,2.471621,-0.87268,1.861966,0.682208,1.790849,0.781336,2.849646,0.817183,0.493514,-13.045089,1.32938,0.774662,2.28087,0.868121,1.354208,0.633723
No IST Adj.,2.492158,0.90394,2.487225,-0.896399,1.843565,0.688459,1.725181,0.797078,2.861518,0.815657,0.410008,-8.694153,1.323071,0.776796,2.218847,0.875196,1.313546,0.655389
IST Adj.,2.550959,0.899353,2.481455,-0.887611,1.831924,0.692381,1.729279,0.796113,2.826726,0.820112,0.425479,-9.439555,1.324006,0.77648,2.215812,0.875537,1.317546,0.653287
Complet Adj.,2.488638,0.904211,2.49422,-0.907082,1.818256,0.696954,1.725907,0.796907,2.860045,0.815847,0.395704,-8.029545,1.328076,0.775104,2.202618,0.877015,1.307065,0.658781


In [347]:
# Rank the filtered single-output XGBoost rows and show the winner per category.
xgboost_single_output_results_df = evaluate_datasets(
    df=xgboost_single_output_filtered_df
)
xgboost_single_output_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,basic_adj,7,-0.87268,0.682208,0.781336,0.817183,0.897362,-13.045089,0.774662,0.868121,0.633723
1,Best Non-Adj,no_ist,5,-12.29781,-0.50262,-1.347588,0.835388,0.557297,-83.714912,0.606603,0.753789,0.600903
2,Best Adj,basic_adj,7,-0.87268,0.682208,0.781336,0.817183,0.897362,-13.045089,0.774662,0.868121,0.633723
3,Best 0,basic_adj,7,-0.87268,0.682208,0.781336,0.817183,0.897362,-13.045089,0.774662,0.868121,0.633723
4,Best 1,ist_adj,7,-0.887611,0.692381,0.796113,0.820112,0.899353,-9.439555,0.77648,0.875537,0.653287
5,Best 2,no_ist_adj,7,-0.896399,0.688459,0.797078,0.815657,0.90394,-8.694153,0.776796,0.875196,0.655389
6,Best 3,complete_adj,7,-0.907082,0.696954,0.796907,0.815847,0.904211,-8.029545,0.775104,0.877015,0.658781


In [348]:
# Keep the rows flagged neither as `dummy` nor as `shifts`, then build the
# display table for the multi-output XGBoost model.
xgboost_multi_output_filtered_df = xgboost_multi_output_df.loc[
    ~(xgboost_multi_output_df["dummy"] | xgboost_multi_output_df["shifts"])
]
xgboost_multi_output_summary_df = summary_dataframe(
    df=xgboost_multi_output_filtered_df
)
# NOTE: styling with highlight_max_min(...) is currently disabled.
xgboost_multi_output_summary_df

Unnamed: 0_level_0,Junts_RMSE,Junts_R^2,C's_RMSE,C's_R^2,CUP_RMSE,CUP_R^2,Comuns_RMSE,Comuns_R^2,ERC_RMSE,ERC_R^2,Altres_RMSE,Altres_R^2,PP_RMSE,PP_R^2,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Basic,5.177592,0.585371,4.399473,-4.933097,4.256328,-0.660815,3.672982,0.08019,4.343811,0.575236,1.61316,-149.040867,2.203686,0.380676,3.277475,0.727754,2.226568,0.009671
No IST,5.047546,0.605938,4.46637,-5.114902,4.250221,-0.656053,3.618283,0.107382,4.201909,0.602535,1.57471,-141.973529,2.038958,0.469806,3.269386,0.729096,2.234209,0.002862
IST,5.070928,0.602279,4.460168,-5.097932,4.246994,-0.653539,3.55332,0.139146,4.284774,0.586704,1.596398,-145.938866,2.124206,0.424544,3.306046,0.722987,2.243341,-0.005306
Complet,5.05537,0.604715,4.417788,-4.982598,4.229398,-0.639866,3.55928,0.136256,4.367289,0.570632,1.617464,-149.842433,2.11195,0.431166,3.281483,0.727088,2.26961,-0.028988
Basic Adj.,3.94627,0.759139,3.893913,-3.648064,3.875659,-0.376861,2.35293,0.622535,3.298682,0.755029,1.127348,-72.289385,2.246566,0.356461,4.002266,0.593944,2.432609,-0.181908
No IST Adj.,3.868922,0.768488,3.908467,-3.682874,3.963973,-0.440324,2.250831,0.654582,3.35549,0.746519,1.0547,-63.148017,2.243624,0.358146,4.054045,0.58337,2.513241,-0.261557
IST Adj.,3.828611,0.773288,3.933207,-3.742346,3.857701,-0.364131,2.159933,0.681918,3.319534,0.751922,1.040543,-61.437418,2.187898,0.389634,4.019024,0.590537,2.502905,-0.251202
Complet Adj.,3.773532,0.779764,3.952826,-3.789774,3.885351,-0.383756,2.173119,0.678022,3.36529,0.745036,1.060047,-63.800098,2.154922,0.407894,3.985297,0.59738,2.456536,-0.205272


In [349]:
# Rank the filtered multi-output XGBoost rows and show the winner per category.
xgboost_multi_output_results_df = evaluate_datasets(
    df=xgboost_multi_output_filtered_df
)
xgboost_multi_output_results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,basic_adj,5,-3.648064,-0.376861,0.622535,0.755029,0.759139,-72.289385,0.356461,0.593944,-0.181908
1,Best Non-Adj,basic,4,-4.933097,-0.660815,0.08019,0.575236,0.585371,-149.040867,0.380676,0.727754,0.009671
2,Best Adj,basic_adj,5,-3.648064,-0.376861,0.622535,0.755029,0.759139,-72.289385,0.356461,0.593944,-0.181908
3,Best 0,basic_adj,5,-3.648064,-0.376861,0.622535,0.755029,0.759139,-72.289385,0.356461,0.593944,-0.181908
4,Best 1,ist_adj,5,-3.742346,-0.364131,0.681918,0.751922,0.773288,-61.437418,0.389634,0.590537,-0.251202
5,Best 2,no_ist_adj,5,-3.682874,-0.440324,0.654582,0.746519,0.768488,-63.148017,0.358146,0.58337,-0.261557
6,Best 3,complete_adj,5,-3.789774,-0.383756,0.678022,0.745036,0.779764,-63.800098,0.407894,0.59738,-0.205272


In [350]:
# Filtered per-model frames paired with the model label used in the combined
# table.
dataframes = [
    (linear_regression_filtered_df, 'Linear Regression'),
    (knn_filtered_df, 'KNN'),
    (decision_tree_filtered_df, 'Decision Tree'),
    (xgboost_multi_output_filtered_df, 'XGBoost Multi-Output'),
    (xgboost_single_output_filtered_df, 'XGBoost Single-Output')
]

# Label each frame, then concatenate once. Growing a frame with pd.concat
# inside the loop, seeded with an empty DataFrame, re-copies the accumulated
# rows on every iteration and relies on the deprecated handling of empty
# entries in pd.concat.
labelled_frames = []
for model_df, model_name in dataframes:
    labelled = model_df.copy()  # Copy to avoid modifying the original dataframe
    labelled['Model'] = model_name
    labelled['Dataset'] = labelled.index
    # Index rows by "<dataset> <model>" so they stay unique across models
    labelled['Experiment'] = labelled['Dataset'] + ' ' + labelled['Model']
    labelled_frames.append(labelled.set_index('Experiment'))

concatenated_df = pd.concat(labelled_frames)

concatenated_df

Unnamed: 0_level_0,CS_RMSE,CS_R^2,CUP_RMSE,CUP_R^2,ECPGUANYEM_RMSE,ECPGUANYEM_R^2,ERC_RMSE,ERC_R^2,JXCATJUNTS_RMSE,JXCATJUNTS_R^2,...,PSC_RMSE,PSC_R^2,VOX_RMSE,VOX_R^2,adj,dummy,shifts,type,Model,Dataset
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
basic Linear Regression,15.779397,-75.323971,4.412181,-0.78467,9.179665,-4.745322,10.829473,-1.640101,9.682021,-0.449893,...,5.181779,0.31948,1.127446,0.746079,False,False,False,0,Linear Regression,basic
basic_adj Linear Regression,1.325316,0.461558,1.787324,0.707177,1.43313,0.859967,1.920743,0.916944,1.989852,0.93876,...,1.329069,0.955222,0.970673,0.811815,True,False,False,0,Linear Regression,basic_adj
complete Linear Regression,15.81133,-75.633201,4.410317,-0.783162,8.764135,-4.236956,11.082844,-1.765084,10.106541,-0.579825,...,5.162273,0.324594,1.128052,0.745806,False,False,False,3,Linear Regression,complete
complete_adj Linear Regression,1.327721,0.459603,1.789184,0.706567,1.463872,0.853895,1.931093,0.916046,1.917787,0.943116,...,1.319179,0.955885,0.965903,0.81366,True,False,False,3,Linear Regression,complete_adj
ist Linear Regression,15.779976,-75.329569,4.372709,-0.752881,8.783183,-4.259744,11.056287,-1.751848,10.061338,-0.565725,...,5.154955,0.326508,1.125585,0.746916,False,False,False,1,Linear Regression,ist
ist_adj Linear Regression,1.315292,0.469673,1.786248,0.707529,1.452531,0.85615,1.935656,0.915649,1.944941,0.941493,...,1.317865,0.955973,0.967281,0.813128,True,False,False,1,Linear Regression,ist_adj
no_ist Linear Regression,15.771049,-75.243239,4.412431,-0.784872,9.01255,-4.53804,11.041402,-1.744444,9.987281,-0.54276,...,5.19614,0.315703,1.128222,0.745729,False,False,False,2,Linear Regression,no_ist
no_ist_adj Linear Regression,1.342144,0.447798,1.790533,0.706124,1.497278,0.84715,1.944009,0.91492,1.924263,0.942731,...,1.318062,0.95596,0.968395,0.812697,True,False,False,2,Linear Regression,no_ist_adj
basic KNN,5.90326,-9.682282,4.542381,-0.891552,4.555096,-0.414672,3.809411,0.673321,4.569198,0.677088,...,4.742536,0.429961,3.129825,-0.956806,False,False,False,0,KNN,basic
basic_adj KNN,4.494425,-5.192243,5.417902,-1.690675,3.626935,0.103112,2.700899,0.835771,3.177918,0.843801,...,4.11724,0.570279,3.156583,-0.990092,True,False,False,0,KNN,basic_adj


In [351]:
# Rank every (dataset, model) experiment together and show the winner per category.
results_df = evaluate_datasets(df=concatenated_df)
results_df

Unnamed: 0,Category,Dataset,Count_R2>0.2,CS_R^2,CUP_R^2,ECPGUANYEM_R^2,ERC_R^2,JXCATJUNTS_R^2,OTH_R^2,PP_R^2,PSC_R^2,VOX_R^2
0,Best Overall,ist_adj Linear Regression,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
1,Best Non-Adj,complete KNN,5,-9.559215,-0.380184,0.662507,0.279522,0.400264,-0.671807,0.56504,0.673864,-2.443823
2,Best Adj,ist_adj Linear Regression,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
3,Best 0,basic_adj Linear Regression,9,0.461558,0.707177,0.859967,0.916944,0.93876,0.249897,0.849753,0.955222,0.811815
4,Best 1,ist_adj Linear Regression,9,0.469673,0.707529,0.85615,0.915649,0.941493,0.236839,0.843006,0.955973,0.813128
5,Best 2,no_ist_adj Linear Regression,9,0.447798,0.706124,0.84715,0.91492,0.942731,0.232076,0.838025,0.95596,0.812697
6,Best 3,complete_adj Linear Regression,9,0.459603,0.706567,0.853895,0.916046,0.943116,0.239049,0.838227,0.955885,0.81366
