In [1]:
import pandas as pd

In [17]:
# PLS1-SM predictions, minus the index column that was written into the CSV.
pred_PLS_SM = (
    pd.read_csv("./data/data/PLS_SM/tar_pred.csv")
    .drop(columns=["Unnamed: 0"])
)

# ICA predictions: expose the sample identifier under "Sample Name" (the key
# used by the merges further down) and discard the original "target" column
# together with the saved index column.
pred_ICA = (
    pd.read_csv("./data/data/jade/ica/tar_pred.csv")
    .assign(**{"Sample Name": lambda frame: frame["target"]})
    .drop(columns=["target", "Unnamed: 0"])
)

In [18]:
# Show which columns the ICA predictions frame holds at this point.
pred_ICA.columns

Index(['SiO2', 'TiO2', 'Al2O3', 'FeOT', 'MgO', 'CaO', 'Na2O', 'K2O',
       'Sample Name'],
      dtype='object')

In [22]:
# Unique sample names that occur in both prediction sets, in order of first
# appearance in pred_PLS_SM.
# A membership filter is used rather than an inner merge: pred_PLS_SM can
# repeat a sample name (a later cell drops duplicates on it), and a merge would
# build every row pairing and carry every column only to read the key back.
common_samples = (
    pred_PLS_SM.loc[
        pred_PLS_SM["Sample Name"].isin(pred_ICA["Sample Name"]),
        "Sample Name",
    ]
    .unique()
)

In [30]:
from lib.reproduction import weighted_sum_oxide_percentages, major_oxides

# Keep one PLS1-SM row per sample (the first one encountered) so that joining
# on "Sample Name" cannot repeat an ICA row once per duplicated PLS1-SM entry.
pred_PLS_SM_unique = pred_PLS_SM.drop_duplicates(subset=["Sample Name"])

# Inner join on the sample name; oxide columns present in both frames get the
# "_ICA" / "_PLS_SM" suffixes that are read back below.
merged_df = pred_ICA.merge(
    pred_PLS_SM_unique,
    on="Sample Name",
    suffixes=("_ICA", "_PLS_SM"),
)

# Blend the two models per oxide. The stored weights are divided by 100 before
# being applied to the ICA and PLS1-SM predictions respectively.
moc_predictions = pd.DataFrame(
    {
        oxide: (
            merged_df[f"{oxide}_ICA"]
            * (weighted_sum_oxide_percentages[oxide]["ICA"] / 100)
            + merged_df[f"{oxide}_PLS_SM"]
            * (weighted_sum_oxide_percentages[oxide]["PLS1-SM"] / 100)
        )
        for oxide in major_oxides
    }
)
moc_predictions["Sample Name"] = merged_df["Sample Name"]

In [31]:
# Display the blended predictions: one column per oxide plus "Sample Name".
moc_predictions

Unnamed: 0,SiO2,TiO2,Al2O3,FeOT,MgO,CaO,Na2O,K2O,Sample Name
0,47.270106,1.231479,14.619428,11.609770,6.733275,20.085983,1.837830,0.448170,jsc1399
1,73.224912,-0.442000,5.192132,9.680539,-0.293107,0.822618,1.967540,2.122200,vzo106
2,69.930910,0.303280,7.975111,4.840909,2.853847,1.650559,1.812294,2.921538,201424
3,58.312666,1.069382,16.054505,12.980315,3.944293,11.720324,3.034553,1.383671,jsc1424
4,67.097752,0.760564,17.628106,11.443023,2.707780,0.393040,1.999986,4.368429,g11mt
...,...,...,...,...,...,...,...,...,...
66,67.792756,0.266962,9.636920,11.236940,7.034231,7.763452,2.193786,1.974555,221123
67,39.653174,2.340592,21.923825,13.297745,2.224052,16.359638,4.032146,-0.033680,jsc1408
68,43.073251,-0.194166,27.696565,-4.483689,3.055087,24.241609,3.852041,0.936910,jsc1440
69,62.720121,0.823097,22.159999,9.602230,3.576740,0.414386,0.644354,2.978943,pg6


In [33]:
from lib.data_handling import CompositionData

# Calibration compositions. merge_with_actual_data reads this notebook-level
# `cd` to look up the actual composition of each predicted sample.
cd = CompositionData("data/data/calib/ccam_calibration_compositions.csv")

In [50]:
def merge_with_actual_data(moc_predictions, composition_data=None, oxides=None):
    """Pair every predicted oxide value with the actual value for the same sample.

    Parameters
    ----------
    moc_predictions : pandas.DataFrame
        One row per sample, with a 'Sample Name' column and one column per
        oxide.
    composition_data : object, optional
        Any object exposing ``get_composition_for_sample(name)`` that returns
        a DataFrame with one column per oxide. Defaults to the notebook-level
        ``cd``.
    oxides : iterable of str, optional
        Names of the oxide columns to pair up. Defaults to the imported
        ``major_oxides``.

    Returns
    -------
    pandas.DataFrame
        For each sample whose composition lookup is non-empty: an
        '<oxide>_pred' and '<oxide>_actual' column per oxide, followed by
        'Sample Name'. Rows keep their index label from ``moc_predictions``;
        samples without a composition are left out. An empty frame is
        returned when no sample could be matched.
    """
    # Resolve the defaults lazily so existing one-argument calls keep using
    # the notebook globals.
    if composition_data is None:
        composition_data = cd
    if oxides is None:
        oxides = major_oxides
    oxides = list(oxides)

    # Collect plain records and build the frame once at the end. Writing cell
    # by cell with `.at` re-enlarges the frame on every new row/column and
    # leaves the column dtypes to whatever each enlargement upcasts to; here
    # the dtypes are inferred once from all collected values.
    records = []
    labels = []
    for index, row in moc_predictions.iterrows():
        actual_data = composition_data.get_composition_for_sample(row['Sample Name'])
        if actual_data.empty:
            # No composition available for this sample: skip it.
            continue

        record = {}
        for oxide in oxides:
            record[oxide + '_pred'] = row[oxide]
            # Only the first row of the lookup result is used.
            record[oxide + '_actual'] = actual_data[oxide].values[0]
        record['Sample Name'] = row['Sample Name']

        records.append(record)
        labels.append(index)

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, index=labels)

In [51]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def calculate_rmse(merged_data):
    """Compute the root-mean-squared prediction error for each major oxide.

    For every oxide in ``major_oxides`` the '<oxide>_actual' column of
    ``merged_data`` is compared against its '<oxide>_pred' column.

    Returns a dict mapping the oxide name to its RMSE.
    """
    return {
        oxide: sqrt(
            mean_squared_error(
                merged_data[f"{oxide}_actual"],
                merged_data[f"{oxide}_pred"],
            )
        )
        for oxide in major_oxides
    }

# Pair the blended predictions with their actual compositions, then compute
# the per-oxide RMSE over the samples that could be paired.
merged_data = merge_with_actual_data(moc_predictions)
rmse_values = calculate_rmse(merged_data)

In [53]:
# Display predicted and actual values side by side for each paired sample.
merged_data

Unnamed: 0,SiO2_pred,SiO2_actual,TiO2_pred,TiO2_actual,Al2O3_pred,Al2O3_actual,FeOT_pred,FeOT_actual,MgO_pred,MgO_actual,CaO_pred,CaO_actual,Na2O_pred,Na2O_actual,K2O_pred,K2O_actual,Sample Name
0,47.270106,51.03,1.231479,1,14.619428,13.83,11.609770,9.93,6.733275,6.33,20.085983,10.70,1.837830,2.01,0.448170,0.6,jsc1399
1,73.224912,69.08,-0.442000,0.49,5.192132,0.57,9.680539,6.65,-0.293107,0.02,0.822618,0.42,1.967540,2.72,2.122200,0.73,vzo106
2,69.930910,77.94,0.303280,0.31,7.975111,10.97,4.840909,2.76,2.853847,1.18,1.650559,1.33,1.812294,2.95,2.921538,1.6,201424
3,58.312666,54.23,1.069382,1.18,16.054505,17.37,12.980315,7.43,3.944293,4.57,11.720324,7.95,3.034553,3.42,1.383671,1,jsc1424
4,67.097752,58.71,0.760564,0.8,17.628106,17.90,11.443023,7.52,2.707780,2.34,0.393040,0.72,1.999986,2.57,4.368429,3.65,g11mt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,67.792756,58.56,0.266962,0.52,9.636920,12.79,11.236940,11.58,7.034231,7.48,7.763452,5.02,2.193786,1.6,1.974555,1.44,221123
67,39.653174,50.34,2.340592,0.74,21.923825,23.05,13.297745,5.78,2.224052,3.15,16.359638,9.99,4.032146,4.32,-0.033680,0.23,jsc1408
68,43.073251,48.22,-0.194166,0.04,27.696565,31.99,-4.483689,0.40,3.055087,0.23,24.241609,15.33,3.852041,2.66,0.936910,0.04,jsc1440
69,62.720121,60.22,0.823097,0.58,22.159999,19.87,9.602230,6.06,3.576740,4.28,0.414386,0.01,0.644354,0.31,2.978943,3.67,pg6


In [52]:
# Display the RMSE computed for each major oxide.
rmse_values

{'SiO2': 7.855909819491709,
 'TiO2': 0.5142222471126368,
 'Al2O3': 3.056659051643873,
 'FeOT': 6.713714395799662,
 'MgO': 2.017968089861998,
 'CaO': 4.545433495026475,
 'Na2O': 0.9787366680324986,
 'K2O': 1.3833947634655068}