In [None]:
# MLflow experiment IDs to pull runs from, one experiment per model family.
experiment_ids = [
    256,  # Extra Trees Regressor (ETR)
    231,  # Random Forest
    215,  # Elastic Net
    214,  # Ridge
    210,  # LASSO
    257,  # ANN
    258,  # CNN
    144,  # NGBoost (NGB)
    140,  # PLS
    136,  # XGBoost
    134,  # SVR
    45,   # Gradient Boosting Regressor (GBR)
]

In [None]:
# Maps the model name used as run-name prefix to the LaTeX text shown in the
# generated tables. The insertion order is also the column order of the full
# results table.
models = {
    "Ridge": "Ridge",
    "LASSO": r"\gls{lasso}",
    "ElasticNet": r"\gls{enet}",
    "PLS": r"\gls{pls}",
    "SVR": r"\gls{svr}",
    "RandomForest": r"\gls{rf}",
    "NGB": r"\gls{ngboost}",
    "GBR": r"\gls{gbr}",
    "XGB": r"\gls{xgboost}",
    "ExtraTrees": r"\gls{etr}",
    "ANN": r"\gls{ann}",
    "CNN": r"\gls{cnn}",
}

In [None]:
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from lib.reproduction import major_oxides

In [None]:
client = mlflow.tracking.MlflowClient()

# Runs fetched from the tracking server, keyed by experiment ID.
data = {}

for experiment_id in experiment_ids:
    # search_runs hands back one page per call (1000 runs unless max_results
    # says otherwise). Keep following the page token so that experiments with
    # more runs than one page are not silently truncated.
    experiment_runs = []
    page_token = None

    while True:
        page = client.search_runs([str(experiment_id)], page_token=page_token)
        experiment_runs.extend(page)
        page_token = page.token

        if not page_token:
            break

    data[experiment_id] = experiment_runs

In [None]:
# results[model][target] holds the metrics of the run with the lowest
# cross-validated RMSE among all fetched runs of that model and target.
results = {}
# model_means[model][metric] is the mean of that metric over the model's targets.
model_means = {}

metric_names = ("rmse", "rmse_cv", "std_dev", "std_dev_cv")

for runs in data.values():
    for run in runs:
        # Runs without a "target" parameter are skipped
        if "target" not in run.data.params:
            continue

        # The model name is the part of the run name before the first underscore
        model_name = run.data.tags["mlflow.runName"].split("_")[0]
        # Fall back to the raw name when the model has no LaTeX label
        latex_name = models.get(model_name, model_name)

        target = run.data.params["target"]
        run_metrics = {name: run.data.metrics[name] for name in metric_names}

        model_results = results.setdefault(model_name, {})
        best_so_far = model_results.get(target)

        # Keep only the run with the smallest RMSECV per (model, target)
        if best_so_far is None or run_metrics["rmse_cv"] < best_so_far["rmse_cv"]:
            model_results[target] = {"latex_name": latex_name, **run_metrics}

# Calculate the mean of the results.
# NOTE: the loop variable must not be called `data`; that name is the dict of
# fetched runs read at the top of this cell, and overwriting it makes the cell
# fail when it is executed a second time.
for model_name, targets in results.items():
    model_means[model_name] = {
        name: np.mean([target_metrics[name] for target_metrics in targets.values()])
        for name in metric_names
    }

results

In [None]:
# Generate the full table showing all the metrics.
# Models are laid out side by side, `models_per_row` at a time; every group
# gets its own header, one row per oxide and a closing row with the means.
n_decimals = 3
models_per_row = 3
metric_keys = ("rmse", "rmse_cv", "std_dev", "std_dev_cv")
missing_cells = " & - & - & - & -"
metric_header = (
    " & \\multicolumn{1}{c}{RMSEP} & \\multicolumn{1}{c}{RMSECV}"
    " & \\multicolumn{1}{c}{Std. dev.} & \\multicolumn{1}{c}{Std. dev. CV}"
)


def _metric_cells(metric_values):
    """Format the four metrics in `metric_values` as LaTeX cells with `n_decimals` digits."""
    return "".join(f" & {metric_values[key]:.{n_decimals}f}" for key in metric_keys)


table_lines = [
    "\\begin{table*}[]",
    "\\centering",
    "\\resizebox{1\\textwidth}{!}{%",
    # One label column followed by four metric columns per model
    "\\begin{tabular}{l" + "|cccc" * models_per_row + "}",
]

model_keys = list(models.keys())

for i in range(0, len(model_keys), models_per_row):
    chunk = model_keys[i:i + models_per_row]

    # Header row
    header_row = "Model" + "".join(
        f" & \\multicolumn{{4}}{{c}}{{{models[model]}}}" for model in chunk
    )
    table_lines.append(header_row + " \\\\")

    # Metric row
    table_lines.append("Metric" + metric_header * len(chunk) + " \\\\")
    table_lines.append("\\hline")

    # Data rows
    for target in major_oxides:
        row = f"$\\ce{{{target}}}$"

        for model in chunk:
            # Deliberately not named `data`: that name holds the fetched runs
            target_metrics = results.get(model, {}).get(target)

            if target_metrics is None:
                print(f"Missing data for {model} - {target}")
                row += missing_cells
            else:
                row += _metric_cells(target_metrics)

        table_lines.append(row + " \\\\")

    table_lines.append("\\hline")

    # Mean row
    mean_row = "Mean"
    for model in chunk:
        mean_row += _metric_cells(model_means[model]) if model in model_means else missing_cells

    table_lines.append(mean_row + " \\\\")
    table_lines.append("\\hline")

table_lines += [
    "\\end{tabular}%",
    "}",
    "\\caption{Initial results for the different models and metrics.}",
    "\\label{tab:init_results}",
    "\\end{table*}",
]
latex_table = "\n".join(table_lines) + "\n"

# Write the LaTeX table string to a file
path = Path("./../report_thesis/src/sections/results/init_results_table.tex")
path.write_text(latex_table, encoding="utf-8")

In [None]:
# Identify the best model for each target including the metric value.
# best_models[target][metric] is a (model_name, value) pair with the lowest
# value of that metric, or (None, None) when no model has results for the target.
best_models = {}
metric_keys = ("rmse", "rmse_cv", "std_dev", "std_dev_cv")

for target in major_oxides:
    best_models[target] = {}

    for metric in metric_keys:
        # Named `model_results` rather than `data` so the dict of fetched
        # runs stored under `data` is not overwritten.
        candidates = [
            (model, model_results[target][metric])
            for model, model_results in results.items()
            if target in model_results
        ]

        if candidates:
            # min() keeps the first of several equal values, which matches a
            # scan that only replaces the best value on a strict "<".
            best_models[target][metric] = min(candidates, key=lambda candidate: candidate[1])
        else:
            best_models[target][metric] = (None, None)

best_models


In [None]:
# Generate a LaTeX table showing the best metric for each oxide (RMSEP, RMSECV, Std. dev., Std. dev. CV).
# Each cell is the best value followed by the model in parentheses, for example "0.123 (Ridge)".
metric_keys = ("rmse", "rmse_cv", "std_dev", "std_dev_cv")

table_lines = [
    "\\begin{tabular}{l|llll}",
    "Oxide & RMSEP & RMSECV & Std. dev. & Std. dev. CV \\\\",
    "\\hline",
]

for target in major_oxides:
    row = f"$\\ce{{{target}}}$"

    for metric in metric_keys:
        best_model, best_value = best_models[target][metric]

        if best_model is None:
            # No model reported this oxide; a dash avoids formatting None as a float
            row += " & -"
        else:
            # Models without a LaTeX label are shown under their raw name,
            # the same fallback used when the results are collected
            label = models.get(best_model, best_model)
            row += f" & {best_value:.{n_decimals}f} ({label})"

    table_lines.append(row + " \\\\")

table_lines += ["\\end{tabular}%", "\\label{tab:best_results}"]
latex_table = "\n".join(table_lines) + "\n"

# Write the LaTeX table string to a file
path = Path("./../report_thesis/src/sections/results/best_results_table.tex")
path.write_text(latex_table, encoding="utf-8")

In [None]:
# Create a dataframe showing how many times each of the best models was the best,
# counted over every (oxide, metric) pair in `best_models`.
best_model_counts = {}

for best_per_metric in best_models.values():
    for best_model, _best_value in best_per_metric.values():
        # A None model marks an oxide without any results; there is nothing to count
        if best_model is None:
            continue

        best_model_counts[best_model] = best_model_counts.get(best_model, 0) + 1

# Turn it into a dataframe
df = pd.DataFrame.from_dict(best_model_counts, orient="index", columns=["Occurrences"])
df = df.sort_values(by="Occurrences", ascending=False)

# Turn the dataframe into a LaTeX table
table_lines = [
    "\\begin{tabular}{lc}",
    "Model & Occurrences \\\\",
    "\\hline",
]

for model, count in df["Occurrences"].items():
    # Fall back to the raw model name when it has no LaTeX label
    table_lines.append(f"{models.get(model, model)} & {count} \\\\")

table_lines += ["\\end{tabular}", "\\label{tab:best_model_occurrences}"]
latex_table = "\n".join(table_lines) + "\n"

# Write the LaTeX table string to a file
path = Path("./../report_thesis/src/sections/results/best_model_occurrences_table.tex")
path.write_text(latex_table, encoding="utf-8")

In [None]:
# Bar charts of the per-model mean of every metric, one panel per metric,
# each panel sorted from the lowest to the highest mean.
metric_columns = {
    "rmse": "RMSE",
    "rmse_cv": "RMSE_CV",
    "std_dev": "Std_Dev",
    "std_dev_cv": "Std_Dev_CV",
}

# Kept under its own name so the dict of fetched runs in `data` is not overwritten
means_df = pd.DataFrame({
    "Model": list(model_means),
    **{
        column: [model_metrics[key] for model_metrics in model_means.values()]
        for key, column in metric_columns.items()
    },
})

columns = list(metric_columns.values())
metrics_sorted = {column: means_df.sort_values(by=column) for column in columns}

color = sns.color_palette("Blues")[2]

fig, axes = plt.subplots(2, 2, figsize=(18, 12))

titles = [
    "Mean RMSEP",
    "Mean RMSECV",
    "Mean Standard Deviation of Prediction Errors",
    "Mean of Standard Deviation of Cross-Validation Prediction Errors",
]
# The y-axis label of each panel repeats its title
y_labels = list(titles)

for i, (col, ax) in enumerate(zip(columns, axes.flatten())):
    sns.barplot(x='Model', y=col, data=metrics_sorted[col], color=color, ax=ax)
    ax.set_title(titles[i], fontsize=16)
    ax.set_ylabel(y_labels[i], fontsize=14)
    ax.set_xlabel("Model", fontsize=14)
    ax.tick_params(axis='x', rotation=45, labelsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.7)
    ax.grid(False, which='both', axis='x')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.subplots_adjust(hspace=0.75)

# Save before showing: some backends release the figure once it has been shown
path = Path("./../report_thesis/src/images/init_results_means.png")
fig.savefig(path, dpi=300)

plt.show()


In [None]:
# Heatmap of RMSECV per model and oxide. Cell colour is the RMSECV min-max
# normalized within each oxide, the annotation is the raw RMSECV, and rows are
# ordered from the lowest to the highest average normalized RMSECV.
data_list = [
    {'Model': model, 'Oxide': oxide, 'RMSE_CV': oxide_metrics['rmse_cv']}
    for model, oxides in results.items()
    for oxide, oxide_metrics in oxides.items()
]

df = pd.DataFrame(data_list)

# Scale every oxide to [0, 1] separately so oxides with different value ranges are comparable
scaler = MinMaxScaler()
df['Normalized_RMSE_CV'] = df.groupby('Oxide')['RMSE_CV'].transform(
    lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten()
)

df['Average_Normalized_RMSE_CV'] = df.groupby('Model')['Normalized_RMSE_CV'].transform('mean')

df_sorted = df.sort_values(by='Average_Normalized_RMSE_CV').drop(columns='Average_Normalized_RMSE_CV')

# pivot() returns its index sorted alphabetically, which throws away the
# ranking established by sort_values above. Capture the ranked model order and
# re-apply it to both the colour matrix and the annotation matrix.
model_order = df_sorted['Model'].unique()

heatmap_data = df_sorted.pivot(index="Model", columns="Oxide", values="Normalized_RMSE_CV").reindex(model_order)
heatmap_annotations = df_sorted.pivot(index="Model", columns="Oxide", values="RMSE_CV").reindex(model_order)

plt.figure(figsize=(16, 9))
ax = sns.heatmap(heatmap_data,
                 annot=heatmap_annotations,
                 fmt=".2f", cmap="Blues", linewidths=.5, cbar=False)

plt.title('RMSECV Comparison Across Models and Oxides', fontsize=20, pad=20)
plt.xlabel('Oxide', fontsize=16, labelpad=20)
plt.ylabel('Model', fontsize=16, labelpad=20)
plt.xticks(rotation=45, fontsize=14)
plt.yticks(fontsize=14)

plt.tight_layout()
plt.show()


In [None]:
# Table of each model's average RMSECV relative to the best model (best = 100 %),
# together with the gap to the next model in the ranking.
data_list = [
    {'Model': model, 'Oxide': oxide, 'RMSE_CV': oxide_metrics['rmse_cv']}
    for model, oxides in results.items()
    for oxide, oxide_metrics in oxides.items()
]

df = pd.DataFrame(data_list)

average_rmse_cv = df.groupby('Model')['RMSE_CV'].mean().reset_index()
average_rmse_cv.columns = ['Model', 'Average_RMSE_CV']

best_performance = average_rmse_cv['Average_RMSE_CV'].min()
average_rmse_cv['Relative_Performance (%)'] = (average_rmse_cv['Average_RMSE_CV'] / best_performance) * 100

average_rmse_cv = average_rmse_cv.sort_values(by='Relative_Performance (%)')
# diff(-1) subtracts the following row, so the last row has no successor and
# is NaN already. No explicit assignment is needed (a chained
# `frame[col].iloc[-1] = ...` would not reliably write to the frame anyway).
average_rmse_cv['Percent Difference vs Next (%)'] = average_rmse_cv['Relative_Performance (%)'].diff(-1).abs()

final_table = average_rmse_cv[['Model', 'Relative_Performance (%)', 'Percent Difference vs Next (%)']].reset_index(drop=True)
# The column headers contain "%" and "_", which start a comment or raise an
# error in LaTeX when emitted verbatim, so let pandas escape them.
latex_table = final_table.to_latex(index=False, escape=True)

print(latex_table)


In [None]:
def prepare_data_for_plotting(results, metric):
    """Flatten the nested results into a long-format frame for one metric.

    Args:
        results: Mapping of model name to a mapping of oxide name to a dict
            of metric values.
        metric: Key of the metric to extract from every innermost dict.

    Returns:
        A DataFrame with the columns "Model", "Oxide" and "Value" and one row
        per (model, oxide) pair, in the iteration order of ``results``. The
        columns are present even when ``results`` is empty.

    Raises:
        KeyError: If ``metric`` is missing for any (model, oxide) pair.
    """
    records = [
        {'Model': model, 'Oxide': oxide, 'Value': values[metric]}
        for model, oxides in results.items()
        for oxide, values in oxides.items()
    ]

    # Naming the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(records, columns=['Model', 'Oxide', 'Value'])

# One grouped bar chart per metric: oxides on the x-axis, one bar per model.
metrics = ['rmse', 'rmse_cv', 'std_dev', 'std_dev_cv']

# Every model needs its own colour. "tab10" only has ten, so with more models
# its colours would repeat and two models would be indistinguishable; switch
# to a larger palette when needed.
n_models = len(results)

if n_models <= 10:
    palette = sns.color_palette("tab10", n_colors=n_models)
elif n_models <= 20:
    palette = sns.color_palette("tab20", n_colors=n_models)
else:
    palette = sns.color_palette("husl", n_colors=n_models)

for metric in metrics:
    df = prepare_data_for_plotting(results, metric)
    plt.figure(figsize=(15, 10))
    sns.barplot(data=df, x='Oxide', y='Value', hue='Model', palette=palette)
    plt.title(f'{metric.upper()} for each Oxide and Model')
    plt.xticks(rotation=45)
    # Place the legend outside the axes so it does not cover the bars
    plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
