In [None]:
import pandas as pd

In [None]:
# Load the PLS-SM target predictions and discard the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv -- TODO confirm).
pred_PLS_SM = pd.read_csv("./data/data/PLS_SM/predictions/tar_pred.csv").drop(columns=["Unnamed: 0"])

# Load the ICA target predictions and move the "target" column to "Sample Name".
pred_ICA = pd.read_csv("./data/data/jade/ica/tar_pred.csv")
pred_ICA["Sample Name"] = pred_ICA.pop("target")

In [None]:
# Inspect the ICA column names after "target" was moved to "Sample Name".
pred_ICA.columns

In [None]:
# Keep only the part of each ICA "Sample Name" that precedes the first underscore
# (names without an underscore are left unchanged).
pred_ICA["Sample Name"] = pred_ICA["Sample Name"].map(lambda name: name.partition("_")[0])

In [None]:
# Preview the first ICA prediction rows with the shortened sample names.
pred_ICA.head()

In [None]:
# Index both prediction tables by "ID" so they can be aligned row-for-row.
# The guards keep this cell re-runnable: once "ID" has been moved into the
# index, a second set_index("ID") would raise KeyError.
if pred_PLS_SM.index.name != "ID":
    pred_PLS_SM = pred_PLS_SM.set_index("ID")
if pred_ICA.index.name != "ID":
    pred_ICA = pred_ICA.set_index("ID")

In [None]:
# Restrict both tables to the row labels (IDs) they have in common; columns are left untouched.
aligned_pred_PLS_SM, aligned_pred_ICA = pred_PLS_SM.align(pred_ICA, join="inner", axis=0)

In [None]:
# Show the PLS-SM predictions restricted to the IDs shared with the ICA table.
aligned_pred_PLS_SM

In [None]:
# Join the two aligned tables on "ID" and "Sample Name"; every other column name
# present in both tables receives the "_ICA" / "_PLS_SM" suffix.
merged_df = pd.merge(aligned_pred_ICA, aligned_pred_PLS_SM, on=['ID', 'Sample Name'], suffixes=('_ICA', '_PLS_SM'))

In [None]:
# Show the combined ICA / PLS-SM prediction table.
merged_df

In [None]:
from lib.reproduction import weighted_sum_oxide_percentages, major_oxides

# Blend the ICA and PLS1-SM predictions into one prediction per oxide:
#     blended = ICA * w_ica + PLS_SM * w_pls_sm
# The weights come from weighted_sum_oxide_percentages and are divided by 100 to
# turn them into fractions (the table name suggests they are stored as percentages).
moc_predictions = pd.DataFrame(index=merged_df.index)

for oxide in major_oxides:
    oxide_weights = weighted_sum_oxide_percentages[oxide]
    w_ica = oxide_weights['ICA'] / 100
    w_pls_sm = oxide_weights['PLS1-SM'] / 100
    moc_predictions[oxide] = merged_df[oxide + '_ICA'] * w_ica + merged_df[oxide + '_PLS_SM'] * w_pls_sm

moc_predictions['Sample Name'] = merged_df['Sample Name']
# Take the IDs from merged_df itself: the blended rows are merged_df's rows, so
# this stays correct (and the lengths stay equal) even if the merge kept fewer
# rows than aligned_pred_ICA contains.
moc_predictions['ID'] = merged_df.index

In [None]:
# Show the blended (weighted-sum) predictions.
moc_predictions

In [None]:
from lib.data_handling import CompositionData

# Composition lookup built from the calibration CSV; merge_with_actual_data queries
# it through cd.get_composition_for_sample(...) to obtain each sample's "actual" values.
cd = CompositionData("data/data/calib/ccam_calibration_compositions.csv")

In [None]:
def merge_with_actual_data(moc_predictions):
    """Pair every prediction row with the reference composition of its sample.

    For each row of ``moc_predictions`` the composition of ``row['Sample Name']``
    is looked up through the notebook-level ``cd`` object. Rows whose lookup
    comes back empty are left out of the result.

    Args:
        moc_predictions: DataFrame with a 'Sample Name' column and one column
            per entry of the notebook-level ``major_oxides``.

    Returns:
        DataFrame labelled with the index labels of the rows that were kept,
        holding ``<oxide>_pred`` / ``<oxide>_actual`` columns for every major
        oxide followed by 'Sample Name'. An empty DataFrame when no row had
        reference data.
    """
    kept_labels = []
    records = []

    for index, row in moc_predictions.iterrows():
        actual_data = cd.get_composition_for_sample(row['Sample Name'])
        if actual_data.empty:
            continue

        record = {}
        for oxide in major_oxides:
            record[oxide + '_pred'] = row[oxide]
            # Only the first returned composition row is used.
            record[oxide + '_actual'] = actual_data[oxide].values[0]
        record['Sample Name'] = row['Sample Name']

        kept_labels.append(index)
        records.append(record)

    # Build the frame once from the collected records instead of enlarging it
    # cell by cell: this is faster, yields properly typed columns, and keeps one
    # output row per input row even when index labels repeat.
    return pd.DataFrame(records, index=kept_labels)

In [None]:
from sklearn.metrics import mean_squared_error

# Suppress every FutureWarning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation notices
# from any library used below -- consider narrowing it.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def calculate_rmse(merged_data):
    """Compute the root-mean-squared error of the predictions for each major oxide.

    Args:
        merged_data: DataFrame containing an ``<oxide>_actual`` and an
            ``<oxide>_pred`` column for every entry of the notebook-level
            ``major_oxides``.

    Returns:
        dict mapping each oxide name to the RMSE between its actual and
        predicted columns.
    """
    rmse_values = {}
    for oxide in major_oxides:
        y_actual = merged_data[oxide + '_actual']
        y_pred = merged_data[oxide + '_pred']
        # Take the square root explicitly rather than passing squared=False:
        # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6,
        # whereas sqrt(MSE) works on every version.
        rmse_values[oxide] = mean_squared_error(y_actual, y_pred) ** 0.5
    return rmse_values

# Score the blended predictions: attach the reference compositions, then compute the per-oxide RMSE.
merged_moc_actual_df = merge_with_actual_data(moc_predictions)
moc_rmse_values = calculate_rmse(merged_moc_actual_df)

In [None]:
# Side-by-side SiO2 table: the two individual model predictions, the blended
# prediction and the reference value.
# The frames are paired on their shared row index instead of on "Sample Name":
# merged_moc_actual_df carries the row labels of moc_predictions, which in turn
# come from merged_df, so the index identifies each row one-to-one. Merging on
# "Sample Name" would pair every row of a sample with every other row of that
# sample whenever a sample name occurs more than once.
SiO2_eval_df = (
    merged_df[["SiO2_ICA", "SiO2_PLS_SM", "Sample Name"]]
    .join(merged_moc_actual_df[["SiO2_actual", "SiO2_pred"]], how="inner")
    .reset_index(drop=True)
)

# Keep only the first five rows for the example plot.
SiO2_eval_df = SiO2_eval_df.head(5)
SiO2_eval_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bar chart: for every sample in SiO2_eval_df, one bar per model
# prediction, blended prediction and actual value.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(12, 8))

# Column name -> label shown in the legend.
aliases = {
    'SiO2_ICA': 'ICA Model',
    'SiO2_PLS_SM': 'PLS SM Model',
    'SiO2_pred': 'Predicted Value',
    'SiO2_actual': 'Actual Value'
}

# Long format: one row per (sample, series) pair, with readable column names.
melted_data = (
    pd.melt(SiO2_eval_df, id_vars='Sample Name', value_vars=list(aliases))
    .rename(columns={'variable': 'Model/Value', 'value': 'SiO2 Composition %'})
)
melted_data['Model/Value'] = melted_data['Model/Value'].map(aliases)

sns.barplot(data=melted_data, x='Sample Name', y='SiO2 Composition %', hue='Model/Value', ax=ax)

ax.set_xlabel('Sample Name')
ax.set_ylabel('SiO2 Composition %')
ax.set_title('SiO2 Composition % for Each Model and Actual Values by Sample')
plt.xticks(rotation=45)
ax.legend(title='Data Type')

# NOTE(review): the y axis is flipped, so the bars hang from the top -- confirm this is intended.
ax.invert_yaxis()

fig.tight_layout()
plt.show()

In [None]:
# Show predicted vs. actual oxide values for the blended predictions.
merged_moc_actual_df

In [None]:
# Per-oxide RMSE of the blended predictions.
moc_rmse_values

In [None]:
# Score the two individual models the same way as the blended one:
# attach the reference compositions first, then compute the per-oxide RMSE.
ICA_actual_merged_df, PLS_SM_actual_merged_df = (
    merge_with_actual_data(predictions)
    for predictions in (aligned_pred_ICA, aligned_pred_PLS_SM)
)
ICA_rmses, PLS_SM_rmses = (
    calculate_rmse(with_actuals)
    for with_actuals in (ICA_actual_merged_df, PLS_SM_actual_merged_df)
)

ICA_rmses

In [None]:
# Show predicted vs. actual oxide values for the PLS-SM predictions.
PLS_SM_actual_merged_df

In [None]:
# Per-oxide RMSE of the PLS-SM predictions.
PLS_SM_rmses

In [None]:
# Collect the three RMSE dicts into one table: one row per replicated model, one column per oxide key.
RMSEs_df = pd.DataFrame([ICA_rmses, PLS_SM_rmses, moc_rmse_values], index=["ICA (replica)", "PLS_SM (replica)", "MOC (replica)"])

In [None]:
# Transposed view: oxides as rows, models as columns.
RMSEs_df.T

In [None]:
# Reference RMSE per oxide for each model; these are the values shown under the
# "(original)" labels in the comparison table and chart.
# NOTE(review): the source of these numbers is not recorded here -- add a citation.
MOC_model_rmses = {
    "SiO2": 5.83,
    "TiO2": 1.10,
    "Al2O3": 3.18,
    "FeOT": 2.90,
    "MgO": 2.30,
    "CaO": 1.14,
    "Na2O": 1.34,
    "K2O": 1.49,
}

ICA_original_rmses = {
    "SiO2": 8.31,
    "TiO2": 1.44,
    "Al2O3": 4.77,
    "FeOT": 5.17,
    "MgO": 4.08,
    "CaO": 3.07,
    "Na2O": 2.29,
    "K2O": 0.98,
}

PLS_SM_original_rmses = {
    "SiO2": 4.33,
    "TiO2": 0.94,
    "Al2O3": 2.85,
    "FeOT": 2.01,
    "MgO": 1.06,
    "CaO": 2.65,
    "Na2O": 0.62,
    "K2O": 0.72,
}

In [None]:
# Comparison table: oxides as rows, replicated models as columns, plus one
# column per set of reference values.
R_df = RMSEs_df.T

# Wrap each reference dict in a Series so the values are matched to R_df's rows
# by oxide label. Assigning dict.values() directly is purely positional and
# would silently put numbers on the wrong oxide if the row order ever differed
# from the dict order.
for column_label, reference_rmses in (
    ('MOC (original)', MOC_model_rmses),
    ('ICA (original)', ICA_original_rmses),
    ('PLS_SM (original)', PLS_SM_original_rmses),
):
    R_df[column_label] = pd.Series(reference_rmses)

In [None]:
# Show the replica-vs-original RMSE comparison table.
R_df
# Optional export of the table (disabled):
# R_df.to_csv("eval.csv")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Put each replica column next to its original counterpart.
column_order = [
    'ICA (replica)', 'ICA (original)',
    'PLS_SM (replica)', 'PLS_SM (original)',
    'MOC (replica)', 'MOC (original)',
]
R_df = R_df[column_order]

# Long format for seaborn: one row per (oxide, model) pair.
R_df_melted = (
    R_df.reset_index()
    .melt(id_vars='index', var_name='Model', value_name='RMSE')
    .rename(columns={'index': 'Major Oxides'})
)

sns.set(style="whitegrid")

# Same hue family per model: darker shade for the original, lighter for the replica.
palette = {
    'MOC (original)': '#1f77b4',     # darker blue
    'ICA (original)': '#2ca02c',     # darker green
    'PLS_SM (original)': '#d62728',  # darker red
    'MOC (replica)': '#aec7e8',      # lighter blue
    'ICA (replica)': '#98df8a',      # lighter green
    'PLS_SM (replica)': '#ff9896',   # lighter red
}

# Grouped bar chart of RMSE per oxide, one bar per model.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=R_df_melted, x='Major Oxides', y='RMSE', hue='Model', palette=palette, ax=ax)

ax.set_xlabel('Major Oxides')
ax.set_ylabel('RMSE')

ax.legend(title='Model')

plt.show()


In [None]:
import numpy as np
# Blended-model RMSE values (in dict order) rounded to two decimals.
np.around(np.array(list(moc_rmse_values.values())), 2)