In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
from lib.config import AppConfig
from experiment_analysis.experiment_data_utils import get_full_runs_df
from IPython.display import display, HTML
from pathlib import Path

import pandas as pd


# Project configuration object; later cells read the MLflow tracking URI and
# the optimization experiment results path from it.
config = AppConfig()
# Point the MLflow client at the tracking URI given by the configuration.
mlflow.set_tracking_uri(config.mlflow_tracking_uri)

In [None]:
# Load the experiment runs from the configured optimization results path.
# NOTE(review): later cells index `runs` like a pandas DataFrame with
# `params.*` / `metrics.*` columns — confirm against get_full_runs_df.
runs = get_full_runs_df(config.optimization_experiment_results_path)

In [None]:
# Number of runs loaded, before any cleaning or filtering is applied.
len(runs)

In [None]:
from lib.reproduction import major_oxides

# Oxide examined in the "Univariate" part of the notebook.
analysis_target = "SiO2"
# Number of cross-validation folds; used to build the per-fold metric column names.
n_splits = 4

# Fail early when the chosen target is not one of the known major oxides.
assert analysis_target in major_oxides, (
    f"{analysis_target} is not a valid oxide. Please choose from {major_oxides}"
)

In [None]:
from experiment_analysis.experiment_data_utils import clean_experiment_data

# Run the project's cleaning step on the loaded runs.
cleaned_runs = clean_experiment_data(runs)

# Keep only the runs whose cross-validated RMSE is at most 50.
filtered_runs = cleaned_runs[cleaned_runs['metrics.rmse_cv'] <= 50]

In [None]:
# Number of remaining runs per oxide after cleaning and the RMSE-CV cut-off.
filtered_runs["params.oxide"].value_counts()

In [None]:
def display_table_with_options(df, max_columns=10, max_rows=100, display_func=lambda x: display(x)):
    """Display ``df`` with temporary pandas display limits.

    Args:
        df: Object handed to ``display_func`` (typically a DataFrame).
        max_columns: Value used for ``display.max_columns`` while displaying.
        max_rows: Value used for ``display.max_rows`` while displaying.
        display_func: Callable that renders ``df``; defaults to IPython's
            ``display``.

    The previous option values are restored when the function exits, also
    when ``display_func`` raises, so a failed display cannot leak the
    temporary limits into the rest of the session.
    """
    # option_context restores the prior values on exit, including on error.
    with pd.option_context('display.max_columns', max_columns, 'display.max_rows', max_rows):
        display_func(df)

# Multivariate

In [None]:
# Columns shown in the per-oxide overview tables.
overview_columns = [
    "params.oxide", "params.model_type", "params.transformer_type",
    "params.pca_type", "params.scaler_type",
    "metrics.rmse_cv", "metrics.std_dev_cv", "metrics.rmse",
]

# One table per oxide: runs ordered by ascending RMSE-CV, reduced to the first
# (i.e. lowest RMSE-CV) row of every model type.
overview_list = [
    filtered_runs.loc[filtered_runs["params.oxide"] == oxide, overview_columns]
    .sort_values(by="metrics.rmse_cv")
    .drop_duplicates(subset=["params.model_type"])
    for oxide in major_oxides
]

for oxide, best_per_model in zip(major_oxides, overview_list):
    display(HTML(f"<h2>{oxide}</h2>"))
    display_table_with_options(best_per_model, max_columns=10, max_rows=100)


In [None]:
# Every column name of filtered_runs, each followed by ", ", as one string
# for quick reference.
cols = "".join(f"{col}, " for col in filtered_runs.columns)

cols

In [None]:
from pathlib import Path


def _latex_texttt(value):
    """Wrap ``value`` in a LaTeX texttt command with underscores escaped.

    The escaping happens outside the f-string because a backslash inside an
    f-string replacement field is a syntax error before Python 3.12.
    """
    escaped = value.replace("_", "\\_")
    return f"\\texttt{{{escaped}}}"


# Columns exported to the per-oxide LaTeX tables, in output order.
overview_columns = [
    "params.oxide", "params.model_type", "params.transformer_type",
    "params.pca_type", "params.scaler_type",
    "metrics.rmse_cv", "metrics.std_dev_cv", "metrics.rmse",
]
# Columns rendered in typewriter font / as fixed-precision numbers.
text_columns = ["params.model_type", "params.transformer_type", "params.pca_type", "params.scaler_type"]
metric_columns = ["metrics.rmse_cv", "metrics.std_dev_cv", "metrics.rmse"]

for oxide in major_oxides:
    # "FeOT" is written as FeO_T inside \ce{...}; file name and label keep the raw name.
    oxide_name = "FeO_T" if oxide == "FeOT" else oxide

    overview_df = (
        filtered_runs.loc[filtered_runs["params.oxide"] == oxide, overview_columns]
        .sort_values(by="metrics.rmse_cv")
    )

    # Keep the lowest-RMSE-CV run per model type. De-duplicating before the
    # formatting gives the same rows (the formatting maps distinct model types
    # to distinct strings) and avoids formatting rows that are dropped anyway.
    # .copy() makes the assignments below write to an independent frame.
    unique_model_types_df = overview_df.drop_duplicates(subset=["params.model_type"]).copy()

    # Prepare the DataFrame for LaTeX conversion.
    # The oxide column is left blank; the oxide only appears in its header.
    unique_model_types_df["params.oxide"] = ""
    for column in text_columns:
        unique_model_types_df[column] = unique_model_types_df[column].map(_latex_texttt)

    # Format numerical columns to show 3 decimals
    for column in metric_columns:
        unique_model_types_df[column] = unique_model_types_df[column].map("{:.3f}".format)

    # Rename columns to match the required headers
    unique_model_types_df = unique_model_types_df.rename(columns={
        'params.oxide': "\\ce{" + oxide_name + "}",
        'params.model_type': 'Model Type',
        'params.transformer_type': 'Transformer Type',
        'params.pca_type': 'PCA Type',
        'params.scaler_type': 'Scaler Type',
        'metrics.rmse_cv': '\\gls{rmsecv}',
        'metrics.std_dev_cv': 'Std. dev. CV',
        'metrics.rmse': '\\gls{rmsep}'
    })

    path = Path(f"./../../report_thesis/src/sections/appendix/tables/{oxide}_overview.tex")

    # escape=False: the cells and headers already contain LaTeX markup.
    latex_table = unique_model_types_df.to_latex(index=False, escape=False)

    # Wrap the tabular in a table* environment with caption and label.
    with open(path, "w", encoding="utf-8") as file:
        file.write("\\begin{table*}[htbp]\n")
        file.write("\\centering\n")
        file.write(latex_table)
        file.write("\\caption{Overview of model types for \\ce{" + oxide_name + "} oxide}\n")
        file.write("\\label{tab:" + oxide + "_overview}\n")
        file.write("\\end{table*}\n")


In [None]:
from experiment_analysis.experiment_data_utils import pretty_format_params

# Model families. A model type that appears in none of these lists is not reported.
model_categories = {
    "gradient_boosting": ["gbr", "xgboost", "ngboost"],
    "tree_based": ["extra_trees", "random_forest"],
    "linear_models": ["lasso", "ridge", "elasticnet"],
    "svm": ["svr"],
    "pls": ["pls"]
}

# Reverse lookup model type -> category. setdefault keeps the first category
# that lists a model, matching a first-match scan over model_categories.
category_by_model = {}
for category, models in model_categories.items():
    for model in models:
        category_by_model.setdefault(model, category)

overview_columns = [
    "params.oxide", "params.model_type", "params.transformer_type",
    "params.pca_type", "params.scaler_type",
    "metrics.rmse_cv", "metrics.std_dev_cv",
]

# Per oxide: runs ordered by ascending RMSE-CV, first row of each model type kept.
overview_list = []
for oxide in major_oxides:
    oxide_runs = filtered_runs.loc[filtered_runs['params.oxide'] == oxide, overview_columns]
    unique_model_types_df = oxide_runs.sort_values(by='metrics.rmse_cv').drop_duplicates(subset=['params.model_type'])
    overview_list.append(unique_model_types_df)

max_models_per_category = 10
max_models_per_oxide = 10
for oxide, df in zip(major_oxides, overview_list):
    display(HTML(f"<h2>{oxide}</h2>"))
    # Heading follows the configured limit instead of a hard-coded number.
    display(HTML(f"<h3>Top {max_models_per_oxide} Configurations</h3>"))
    category_counter = dict.fromkeys(model_categories, 0)

    model_counter = 0
    for run_label, row in df.iterrows():
        if model_counter >= max_models_per_oxide:
            break
        category = category_by_model.get(row['params.model_type'])
        # Skip uncategorised model types and categories that already hit their limit.
        if category is None or category_counter[category] >= max_models_per_category:
            continue
        category_counter[category] += 1
        model_counter += 1
        # Look the run up in filtered_runs: the overview frame only carries a
        # subset of columns and lacks metrics.rmse / metrics.std_dev.
        data_row = filtered_runs.loc[run_label]
        print(pretty_format_params(data_row))
        print(f"RMSEP: {data_row['metrics.rmse']}")
        print(f"Std.Dev: {data_row['metrics.std_dev']}")
        print(f"RMSE CV: {data_row['metrics.rmse_cv']}")
        print(f"STD Dev CV: {data_row['metrics.std_dev_cv']}")
        print("\n")
    print("\n")


In [None]:
# Summary statistics of the cross-validated RMSE over the runs of the analysis target.
target_mask = filtered_runs['params.oxide'] == analysis_target
filtered_runs.loc[target_mask, "metrics.rmse_cv"].describe()

In [None]:
# For every (model type, transformer, PCA, scaler, oxide) combination keep the
# single run with the lowest metrics.rmse_cv: each group is sorted ascending
# and only its first row is retained. reset_index(drop=True) discards the
# index produced by groupby/apply.
grouped_runs = filtered_runs.groupby(
    ['params.model_type', 'params.transformer_type', 'params.pca_type', 'params.scaler_type', 'params.oxide']
).apply(lambda x: x.sort_values(by='metrics.rmse_cv').head(1)).reset_index(drop=True)

# Pivot to one row per configuration and one column per oxide; each cell holds
# the RMSE-CV of the run kept above for that configuration/oxide pair.
pivot_table = grouped_runs.pivot_table(
    index=['params.model_type', 'params.transformer_type', 'params.scaler_type', 'params.pca_type'],
    columns='params.oxide',
    values='metrics.rmse_cv',
    aggfunc='first'
)

# Uncomment to lift pandas' display limits and show the full table:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Display the pivot table
display(pivot_table)

# Univariate

In [None]:
# From here on `filtered_runs` holds only the runs of the analysis target.
# NOTE: this rebinds the name used by the multivariate cells above, so
# re-running those afterwards will only see this one oxide.
is_target_oxide = filtered_runs['params.oxide'] == analysis_target
filtered_runs = filtered_runs[is_target_oxide]
len(filtered_runs)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size of the unfiltered run set, quoted in the title for context.
total_runs = len(runs)

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))
# Distribution of metrics.rmse per model type for the analysis target.
sns.boxplot(data=filtered_runs, x='params.model_type', y='metrics.rmse', ax=ax)
ax.set_title(f"{analysis_target}: RMSE for each model type - {len(filtered_runs)} runs out of {total_runs} total runs")
ax.set_xlabel("Model Type")
ax.set_ylabel("RMSEP")
plt.show()


In [None]:
metric_columns = ['metrics.rmse', 'metrics.rmse_cv', 'metrics.std_dev', 'metrics.std_dev_cv']

# Index label of the run with the lowest value of each metric (one label per metric),
# then the corresponding rows; a run that is best on several metrics appears repeatedly.
best_run_labels = filtered_runs[metric_columns].idxmin()
optimal_runs = filtered_runs.loc[best_run_labels]

# Show the four metrics together with the model type of each optimal run.
optimal_runs[metric_columns + ['params.model_type']]


In [None]:
# Seaborn theme shared by both figures.
sns.set(style="whitegrid")

# (metric column, figure title, y-axis label) for each box plot, in display order.
cv_boxplots = [
    (
        'metrics.rmse_cv',
        f'{analysis_target}: Average Cross-Validation RMSE by Model Type',
        'Average RMSE (Cross-Validation)',
    ),
    (
        'metrics.std_dev_cv',
        f'{analysis_target}: Standard Deviation of Errors (Cross-Validation) by Model Type',
        'Standard Deviation of Errors (CV)',
    ),
]

# One box plot per metric, grouped by model type.
for metric_column, plot_title, y_label in cv_boxplots:
    plt.figure(figsize=(12, 7))
    sns.boxplot(x='params.model_type', y=metric_column, data=filtered_runs)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.show()


In [None]:
# Per-fold RMSE columns: metrics.rmse_cv_1 ... metrics.rmse_cv_<n_splits>.
fold_columns = [f'metrics.rmse_cv_{fold}' for fold in range(1, n_splits + 1)]

# Long format for seaborn: one row per (run, fold) with the fold's RMSE.
melted_df = pd.melt(
    filtered_runs,
    id_vars=['params.model_type'],
    value_vars=fold_columns,
    var_name='CV Fold',
    value_name='Fold RMSE',
)

# Box plot per model type and fold; outliers are hidden (showfliers=False).
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=melted_df, x='params.model_type', y='Fold RMSE', hue='CV Fold', showfliers=False, ax=ax)
ax.set_title(f'{analysis_target}: Distribution of RMSE Across CV Folds by Model Type')
plt.show()


In [None]:
cv_columns = [
    'metrics.rmse_cv', 'params.model_type', 'params.scaler_type',
    'params.transformer_type', 'params.pca_type'
]

# Build the frame from the cleaned runs of the analysis target only. The plots
# fed by filtered_runs_new are all titled "<analysis_target>: ...", so drawing
# from the unfiltered `runs` would mix every oxide into them. The oxide filter
# is applied explicitly so this cell is correct whether or not filtered_runs
# has already been narrowed to the analysis target.
target_runs = filtered_runs[filtered_runs['params.oxide'] == analysis_target]

# Same RMSE-CV cut-off as used for filtered_runs; only the columns needed below.
filtered_runs_new = target_runs.loc[target_runs['metrics.rmse_cv'] <= 50, cv_columns]

# Rename columns for clarity: drop the "metrics." / "params." prefix.
rename_dict = {col: col.split('.')[-1] for col in cv_columns}
filtered_runs_new = filtered_runs_new.rename(columns=rename_dict)

In [None]:
sns.set(style="whitegrid")

parameter_columns = ['model_type', 'scaler_type', 'transformer_type', 'pca_type']

# Individual Parameters: mean RMSE-CV per value of each configuration parameter.
for parameter in parameter_columns:
    plt.figure(figsize=(10, 6))
    chart = sns.barplot(x=parameter, y='rmse_cv', data=filtered_runs_new)
    # Rotate the labels seaborn already placed instead of re-creating ticks
    # from the column's unique values: unique() counts NaN as a value while
    # the plot has no bar for it, which makes tick and label counts disagree.
    plt.setp(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
    plt.title(f'{analysis_target}: Average RMSE (CV) by {parameter.capitalize()}')
    plt.ylabel('Average RMSE (CV)')
    plt.show()

# Combinations of Parameters
# Considering combinations might result in a lot of categories, focus on the top few based on average RMSE
combination_data = (
    filtered_runs_new.groupby(parameter_columns)['rmse_cv']
    .mean()
    .reset_index()
    .sort_values(by='rmse_cv', ascending=True)
)

# Display top 10 combinations
print(combination_data.head(10))

# Visualize these top combinations as horizontal bars.
plt.figure(figsize=(14, 8))
combination_data_top10 = combination_data[:10]
# Label = the parameter values joined by ", ", leaving out entries equal to 'none'.
combination_labels = combination_data_top10.apply(
    lambda row: ', '.join([str(row[param]) for param in parameter_columns if row[param] != 'none']),
    axis=1,
)
sns.barplot(x='rmse_cv', y=combination_labels, data=combination_data_top10, orient='h')
plt.title(f'{analysis_target}: Top 10 Combinations for RMSE Performance')
plt.xlabel('Average RMSE (Cross-Validation)')
plt.ylabel('Combinations')
plt.show()

In [None]:
# Mean and standard deviation of RMSE-CV per configuration
# (model, scaler, transformer, PCA). Lower is better for both: a low mean is
# a good configuration, a low standard deviation a consistent one.
configuration_keys = ['model_type', 'scaler_type', 'transformer_type', 'pca_type']
aggregated_data = (
    filtered_runs_new.groupby(configuration_keys)['rmse_cv']
    .agg(['mean', 'std'])
    .reset_index()
)

# Presentation names for the four key columns and the two statistics.
aggregated_data.columns = ['Model Type', 'Scaler Type', 'Transformer Type', 'PCA Type', 'Mean RMSECV', 'STD RMSECV']

# Order by mean first, ties broken by standard deviation (both ascending).
sorted_data = aggregated_data.sort_values(by=['Mean RMSECV', 'STD RMSECV'])

# Display the top 10 consistently good configurations
print(sorted_data.head(10))

In [None]:
sns.set(style="whitegrid")

# Plotting the top configurations based on Mean RMSE
top_n = 50
for parameter in ['Model Type', 'Scaler Type', 'Transformer Type', 'PCA Type']:
    # Create the figure inside the loop: plt.show() ends the current figure,
    # so a single figure created before the loop would give only the first
    # plot the requested size.
    plt.figure(figsize=(12, 8))
    top_configurations = sns.barplot(x='Mean RMSECV', y=parameter, hue=parameter, data=sorted_data.head(top_n), dodge=False)
    plt.title(f'{analysis_target}: Top {top_n} Configurations by Mean RMSECV and Their Consistency')
    plt.xlabel('Mean RMSECV')
    plt.ylabel(parameter)
    # Annotate each bar with the value of Mean RMSE
    for p in top_configurations.patches:
        width = p.get_width()
        plt.text(width + 0.01, p.get_y()+0.2 + p.get_height() / 2, f'{width:.2f}', ha='left', va='center')
    plt.show()


In [None]:
# Run with the lowest cross-validated RMSE among the remaining runs.
runs_by_rmse_cv = filtered_runs.sort_values(by="metrics.rmse_cv")
first_row = runs_by_rmse_cv.iloc[0]

# Show only the fields that actually hold a value for this run.
non_none_columns = first_row.dropna().index.tolist()
first_row[non_none_columns]
