## Analysis of the results obtained by the models

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import matplotlib.ticker as mticker
import re

import os

In [None]:
# Every folder is resolved relative to the parent of the current working directory.
_PARENT_DIRECTORY = Path("..")
REPORTS_DIRECTORY = _PARENT_DIRECTORY / "reports_strat"
RESULTS_DIRECTORY = _PARENT_DIRECTORY / "results_strat"
FIGURES_DIRECTORY = _PARENT_DIRECTORY / "figures_strat" / "metrics"
DATA_DIRECTORY = _PARENT_DIRECTORY / "input"

# Create the figure folder up front so that later savefig calls have a target.
FIGURES_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Target table: blue, green, orange and pink columns.
# Each of its columns is treated as one property to analyse.
outputs = pd.read_pickle(DATA_DIRECTORY / "outputs_wp3_wp5.pkl")
property_name_list = outputs.columns.tolist()

# Metric columns and model rows that are kept from every report file.
metrics_name_list = ['R2', 'RMSE', 'MAE']
models_name_list = ['PLS', 'CNN2D', 'CNN2D_DI', 'CNN2D_DIPCA', 'MLPCWT6_DI']

# Model names used in the paper (mathtext labels for the figures).
model_names_for_paper = {
    'PLS': 'PLS',
    'CNN2D': '$CNN_{cwt}$',
    'CNN2D_DI': '$CNN_{cwt,cp}$',
    'CNN2D_DIPCA': '$CNN_{cwt,pca(cp)}$',
    'MLPCWT6_DI': 'MLP',
}

# NOTE(review): target_cwt_coeff is not referenced by the cells visible in this notebook.
target_cwt_coeff = 5
# Number of input-priority strategies; report files are indexed 1..num_strats.
num_strats = 6

In [None]:
# Accumulators filled by the loop below and read by the later cells.
results_lst = []                  # one metrics frame per (property, strategy)
property_no_data_lst = []         # properties whose processing raised an error
best_results_dict = {}            # metric -> list of best-result records (one per property)
best_results_pollutant_dict = {}  # cleaned property name -> list of best-result records (one per metric)


def _use_plain_tick_labels(ax):
    """Format the y tick labels of ``ax`` as plain numbers (no scientific notation)."""
    formatter = mticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    ax.yaxis.set_major_formatter(formatter)


def _save_show_close(fig, figure_path):
    """Save ``fig`` to ``figure_path``, display it, then close it."""
    fig.tight_layout()
    fig.savefig(figure_path, dpi=600)
    plt.show()
    # Closing keeps figures from piling up on backends that do not close them on show().
    plt.close(fig)


for property_name in tqdm(property_name_list, desc="Properties..."):
    property_results_lst = []
    try:
        # Strip punctuation once per property so the name can be embedded in file names.
        # Kept inside the try so a failure here is reported like any other per-property error.
        property_name_clean = re.sub(r'[^\w\s]', '', property_name)

        for strat_index in tqdm(range(1, num_strats + 1), desc="Input strats..."):
            report_path = REPORTS_DIRECTORY / f"report_{property_name_clean}_input_priority_{strat_index}.csv"
            results_df = pd.read_csv(report_path, index_col='model')

            # A single .loc selection plus an explicit copy: adding columns below then
            # writes to an independent frame instead of a slice of the report.
            _df = results_df.loc[results_df.index.isin(models_name_list), metrics_name_list].copy()
            _df['property'] = property_name
            _df['strat'] = strat_index
            # NOTE(review): if a later strategy fails to load, the rows already appended
            # here stay in results_lst although the property ends up in property_no_data_lst.
            results_lst.append(_df)
            property_results_lst.append(_df)

            # One bar chart per metric comparing the models for this strategy.
            for metric in metrics_name_list:
                finite_values = _df[metric].replace([np.inf, -np.inf], np.nan).dropna()
                # Series.plot draws on the current axes when a figure is already open,
                # so give every chart its own figure explicitly.
                fig, ax = plt.subplots()
                finite_values.plot(grid=True, kind='bar', ax=ax)
                ax.set_ylabel(metric, fontsize=16)
                _use_plain_tick_labels(ax)
                ax.set_title(property_name)
                _save_show_close(
                    fig,
                    FIGURES_DIRECTORY / f"{property_name_clean}_{metric}_input_priority_{strat_index}.pdf",
                )

        property_results_df = pd.concat(property_results_lst, axis=0)
        for metric in tqdm(metrics_name_list, desc="Best metric..."):
            metric_values = property_results_df[metric]

            # R2 is maximised; every other metric is an error and is minimised.
            if metric != 'R2':
                best_value = metric_values.min()
                best_model = metric_values.idxmin()
            else:
                best_value = metric_values.max()
                best_model = metric_values.idxmax()

            # First row reaching the best value gives the strategy it was obtained with.
            best_strat = property_results_df[metric_values == best_value].iloc[0]['strat']

            best_results_dict.setdefault(metric, []).append(
                {'property': property_name, 'best_value': best_value, 'best_model': best_model, 'best_strat': best_strat}
            )
            best_results_pollutant_dict.setdefault(property_name_clean, []).append(
                {'metric': metric, 'best_value': best_value, 'best_model': best_model, 'best_strat': best_strat}
            )

            # Evolution of the metric across strategies: one line per model.
            property_metric_df = (
                property_results_df[[metric, 'strat']]
                .reset_index()
                .pivot(index='strat', columns='model', values=metric)
            )
            fig, ax = plt.subplots()
            property_metric_df.plot(grid=True, kind='line', marker='o', ax=ax)
            ax.set_ylabel(metric, fontsize=16)
            ax.set_xlabel("Input strat.", fontsize=16)
            _use_plain_tick_labels(ax)
            ax.set_title(property_name)
            # Legend is anchored outside the axes, to the right of the plot area.
            ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
            _save_show_close(fig, FIGURES_DIRECTORY / f"{property_name_clean}_{metric}_evol.pdf")
    except Exception as e:
        # Best effort: report the failure, remember the property and go on with the next one.
        print("ERROR::", property_name, e)
        property_no_data_lst.append(property_name)


In [None]:
# Stack every collected frame row-wise; reset_index moves the index into a regular column.
all_results_df = pd.concat(results_lst).reset_index()
all_results_df

In [None]:
# Reshape the long results table into a wide one: one row per property and
# three column levels (model, strategy, metric).
df_pivot = (
    all_results_df
    .melt(id_vars=["property", "model", "strat"], var_name="metric", value_name="value")
    .pivot(index="property", columns=["model", "strat", "metric"], values="value")
)

# Rebuild the column index with capitalised level names.
df_pivot.columns = pd.MultiIndex.from_tuples(df_pivot.columns, names=["Model", "Strat", "Metric"])

df_pivot = df_pivot.reset_index().set_index("property")
print(df_pivot)
df_pivot.to_csv(REPORTS_DIRECTORY / "all_metrics.csv")

In [None]:
def mean_plot(df, metric, strats=None):
    """Plot the per-model mean of ``metric`` as a bar chart with std error bars.

    Rows whose ``metric`` value is NaN or infinite are ignored. The figure is
    saved as ``<metric>_mean.pdf`` in ``FIGURES_DIRECTORY`` and then shown.

    Args:
        df: Long-format results with at least the columns ``model``, ``strat``
            and ``metric``.
        metric: Name of the metric column to summarise. Every metric other
            than ``'R2'`` is drawn on a logarithmic y axis.
        strats: Optional collection of strategy indices to keep. ``None`` or an
            empty collection keeps every strategy.

    Returns:
        None.
    """
    clean_df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=[metric])

    # Explicit None/length test so numpy arrays and Series are accepted as well as lists.
    if strats is not None and len(strats) > 0:
        clean_df = clean_df[clean_df['strat'].isin(strats)]

    # assign() builds a new frame, so no column is written into a filtered slice.
    clean_df = clean_df.assign(model=clean_df['model'].map(model_names_for_paper))

    stats = clean_df.groupby('model')[metric].agg(['mean', 'std'])

    # Dedicated figure: Series.plot would otherwise draw on whatever axes are current.
    fig, ax = plt.subplots()
    stats['mean'].plot(grid=True, kind='bar', yerr=stats['std'], ax=ax)
    ax.set_ylabel(metric, fontsize=15)
    ax.set_xlabel('Model', fontsize=15)

    # The scale must be chosen before the formatter: changing the scale installs
    # the scale's default formatter and would discard a formatter set earlier.
    if metric != 'R2':
        ax.set_yscale('log')
    formatter = mticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    ax.yaxis.set_major_formatter(formatter)

    # Write the mean value on top of every bar.
    for bar in ax.patches:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}',
                ha='center', va='bottom', fontsize=9)

    fig.tight_layout()
    fig.savefig(FIGURES_DIRECTORY / f"{metric}_mean.pdf", dpi=600)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# One mean/std summary figure per evaluation metric, without any strategy filter.
for metric_name in metrics_name_list:
    mean_plot(all_results_df, metric_name)


Generate a table with the best model per property and evaluation metric.

In [None]:
# Write one CSV per metric listing, for every property, the best value and the
# model and strategy that achieved it; echo each table between star banners.
for metric, best_result in best_results_dict.items():
    best_result_df = pd.DataFrame(best_result).set_index('property')
    best_result_df.to_csv(REPORTS_DIRECTORY / f"best_report_metric_{metric}.csv")

    print(f"{'*' * 10} {metric} {'*' * 10}")
    print(best_result_df)
    print("*" * 20)

In [None]:
# Display the raw mapping: metric -> list of best-result records.
best_results_dict

In [None]:
# Preview the first rows of best_result_df, i.e. whichever per-metric table was built last.
best_result_df.head()

In [None]:
# For every metric, count how often each (model, strategy) pair is the best one
# across properties and draw the counts as a bar chart.
for metric, best_result in best_results_dict.items():
    best_result_df = pd.DataFrame(best_result)
    best_result_df['best_model'] = best_result_df['best_model'].map(model_names_for_paper)
    counts = (
        best_result_df
        .groupby(["best_model", "best_strat"])
        .size()
        .reset_index(name="count")
        .sort_values(by="best_strat")
    )

    # Bar labels combine the model name and the strategy index.
    bar_labels = [f"{row.best_model}-{row.best_strat}" for row in counts.itertuples(index=False)]

    # Draw the bar chart.
    plt.figure(figsize=(10, 6))
    plt.bar(x=bar_labels, height=counts["count"], color='skyblue')
    plt.xlabel("Model-input priority", fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.grid(True)

    plt.savefig(FIGURES_DIRECTORY / f"frequency_model_input_{metric}.png")
    plt.show()

In [None]:
# For every metric, count how often each model is the best one across
# properties and draw the counts as a bar chart.
for metric, best_result in best_results_dict.items():
    best_result_df = pd.DataFrame(best_result)
    best_result_df['best_model'] = best_result_df['best_model'].map(model_names_for_paper)

    counts = (
        best_result_df["best_model"]
        .value_counts()
        .rename_axis("best_model")
        .reset_index(name="count")
    )

    # Draw the bar chart.
    plt.figure(figsize=(10, 6))
    plt.bar(x=counts["best_model"], height=counts["count"], color='skyblue')
    plt.xlabel("Model", fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True)
    plt.savefig(FIGURES_DIRECTORY / f"frequency_model_{metric}.png")
    plt.show()


In [None]:

# For every metric, count how often each strategy yields the best result
# across properties and draw the counts as a bar chart.
for metric, best_result in best_results_dict.items():
    best_result_df = pd.DataFrame(best_result)
    counts = (
        best_result_df["best_strat"]
        .value_counts()
        .rename_axis("best_strat")
        .reset_index(name="count")
    )

    # Draw the bar chart.
    plt.figure(figsize=(10, 6))
    plt.bar(x=counts["best_strat"], height=counts["count"], color='skyblue')
    plt.xlabel("CP group", fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True)
    plt.savefig(FIGURES_DIRECTORY / f"frequency_input_priority_{metric}.png")
    plt.show()

In [None]:
# Write one CSV per property listing, for every metric, the best value and the
# model and strategy that achieved it; echo each table between star banners.
# The loop variable is deliberately not called `property`: at notebook scope
# that name would replace the builtin `property` for every later cell.
for prop_key, best_result in best_results_pollutant_dict.items():
    best_prop_result_df = pd.DataFrame(best_result)
    best_prop_result_df = best_prop_result_df.set_index('metric')
    best_prop_result_df.to_csv(REPORTS_DIRECTORY / f"best_report_property_{prop_key}.csv")

    print("*" * 10, prop_key, "*" * 10)
    print(best_prop_result_df)
    print("*" * 20)

In [None]:
# Final cell: printed once the whole notebook has run to the end.
print("That's all folks!")