# Accuracy Enhancement in Refractive Index Sensing via Full-Spectrum Machine Learning Modeling

This notebook contains the code to replicate selected results from the [paper on arXiv](https://arxiv.org/abs/2504.06195).

# imports & load data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_decomposition import PLSRegression

import optuna

from sklearn.decomposition import PCA

from tqdm import tqdm

## load matrices

In [2]:
# Ti_TE is stored in a different directory than the other matrices, so it is
# loaded on its own. The sheet is read without a header row and transposed;
# from here on each row is handled as one sample and each column as one feature.
ti_te_frame = pd.read_excel("../p04_TE_TM_1D_compare/Ti_TE_matrix.xlsx", header=None).T
n_samples, n_features = ti_te_frame.shape
ti_te_frame.columns = [f"feature_{col_no}" for col_no in range(1, n_features + 1)]
# The target of a sample is its 1-based row position.
ti_te_frame["target"] = list(range(1, n_samples + 1))

# Registry of every dataset, keyed by its short name.
all_data = {"Ti_TE": ti_te_frame}

In [5]:
# The remaining matrices share one directory and get the same treatment as
# Ti_TE: read without header, transpose, name the columns feature_1..feature_N
# and append a 1-based "target" column.
for dat in ("Si_TE", "Si_TM", "Ti_TM"):
    matrix = pd.read_excel(f"../p06_multvar_SI_fitting/{dat}_matrix.xlsx", header=None).T
    matrix.columns = [f"feature_{col_no}" for col_no in range(1, matrix.shape[1] + 1)]
    matrix["target"] = list(range(1, len(matrix) + 1))
    all_data[dat] = matrix

## Build the 5-fold cross-validation groups and construct the principal components (PCs)

In [6]:
# Shuffled 5-fold splitter with a fixed seed. `kf` is reused by later cells,
# so every experiment sees the same row partition.
kf = KFold(n_splits=5, shuffle=True, random_state=19890417)

# Both dictionaries map dataset name -> list of
# (X_train, y_train, X_test, y_test) tuples, one tuple per fold.
folds = {}     # X_* are the raw feature DataFrames (no dimensionality reduction)
PC_folds = {}  # X_* are arrays of PCA scores

for dat, frame in all_data.items():
    spectra = frame.drop(columns=["target"])
    labels = frame["target"].values
    raw_splits = []
    pc_splits = []
    # NOTE: there is no guard that forces the first/last sample (positions 0 and
    # 100) into the training split, so a test split may contain the extreme
    # targets and the model then has to extrapolate for them.
    for train_index, test_index in kf.split(frame):
        X_train_raw = spectra.iloc[train_index]
        X_test_raw = spectra.iloc[test_index]
        y_train = labels[train_index]
        y_test = labels[test_index]
        raw_splits.append((X_train_raw, y_train, X_test_raw, y_test))

        # Standardisation and PCA are both fitted on the training rows only and
        # then applied unchanged to the test rows.
        scaler = StandardScaler().fit(X_train_raw)
        X_train_std = scaler.transform(X_train_raw)
        X_test_std = scaler.transform(X_test_raw)

        # Reduce to 80 principal components. fit() and transform() stay separate
        # calls so train and test scores come from the same projection code path.
        pca = PCA(n_components=80, svd_solver='full', random_state=19890417).fit(X_train_std)
        pc_splits.append((pca.transform(X_train_std), y_train, pca.transform(X_test_std), y_test))
    folds[dat] = raw_splits
    PC_folds[dat] = pc_splits

# Identify the best single-variate performer for each dataset

In [7]:
# For every dataset, fold and individual raw feature, fit a one-predictor linear
# regression and record its test and training metrics.
# results_dfs maps dataset name -> DataFrame with one row per (fold, feature).
results_dfs = {}
for dat in ["Ti_TM", "Ti_TE", "Si_TE", "Si_TM"]:
    records = []
    # `folds` holds the raw (non-PCA) feature frames.
    for fold_i, (X_train_all, y_train, X_test_all, y_test) in enumerate(folds[dat]):
        n_feats = X_train_all.shape[1]
        for feat_no in tqdm(range(1, n_feats + 1)):
            feat_name = f"feature_{feat_no}"
            # scikit-learn expects 2-D inputs, hence the single-column reshape.
            x_tr = X_train_all[feat_name].to_numpy().reshape(-1, 1)
            x_te = X_test_all[feat_name].to_numpy().reshape(-1, 1)

            model = LinearRegression()
            model.fit(x_tr, y_train)
            pred_te = model.predict(x_te)
            pred_tr = model.predict(x_tr)

            mse_te = mean_squared_error(y_test, pred_te)
            mse_tr = mean_squared_error(y_train, pred_tr)
            records.append({
                "feature": feat_name,
                "mse": mse_te,
                "r2": r2_score(y_test, pred_te),
                "rmse": np.sqrt(mse_te),
                "mae": np.mean(np.abs(y_test - pred_te)),
                "fold": fold_i,
                "mse.train": mse_tr,
                "r2.train": r2_score(y_train, pred_tr),
                "rmse.train": np.sqrt(mse_tr),
                "mae.train": np.mean(np.abs(y_train - pred_tr)),
            })
    results_dfs[dat] = pd.DataFrame(records)


100%|██████████| 10000/10000 [00:30<00:00, 325.06it/s]
100%|██████████| 10000/10000 [00:34<00:00, 289.81it/s]
100%|██████████| 10000/10000 [00:40<00:00, 246.68it/s]
100%|██████████| 10000/10000 [00:39<00:00, 250.55it/s]
100%|██████████| 10000/10000 [00:35<00:00, 278.34it/s]
100%|██████████| 10000/10000 [00:36<00:00, 271.31it/s]
100%|██████████| 10000/10000 [00:37<00:00, 263.37it/s]
100%|██████████| 10000/10000 [00:33<00:00, 302.71it/s]
100%|██████████| 10000/10000 [00:30<00:00, 324.97it/s]
100%|██████████| 10000/10000 [00:37<00:00, 266.66it/s]
100%|██████████| 10000/10000 [00:36<00:00, 275.31it/s]
100%|██████████| 10000/10000 [00:37<00:00, 268.79it/s]
100%|██████████| 10000/10000 [00:32<00:00, 308.09it/s]
100%|██████████| 10000/10000 [00:29<00:00, 334.57it/s]
100%|██████████| 10000/10000 [00:36<00:00, 274.33it/s]
100%|██████████| 10000/10000 [00:40<00:00, 249.47it/s]
100%|██████████| 10000/10000 [00:31<00:00, 320.62it/s]
100%|██████████| 10000/10000 [00:29<00:00, 335.10it/s]
100%|█████

In [8]:
# For each dataset: average every metric over the folds per feature, pick the
# feature with the lowest mean *training* MSE, print its cross-validated test
# metrics, and keep the whole row in `best_results` as a lookup table.
metric_cols = ["mse", "r2", "rmse", "mae", "mse.train", "r2.train", "rmse.train", "mae.train"]
lookup_keys = [
    "feature",
    "mse_mean", "mse_std",
    "r2_mean", "r2_std",
    "rmse_mean", "rmse_std",
    "mae_mean", "mae_std",
    "mse.train_mean", "mse.train_std",
    "r2.train_mean", "r2.train_std",
    "rmse.train_mean", "rmse.train_std",
    "mae.train_mean", "mae.train_std",
]

best_results = {}
for dat in ["Ti_TM", "Ti_TE", "Si_TE", "Si_TM"]:
    print(f"Dataset: {dat}")
    # mean and std across folds, one row per feature
    per_feature = results_dfs[dat].groupby("feature")[metric_cols].agg(["mean", "std"])
    # flatten the (metric, statistic) column pairs into "<metric>_<statistic>"
    per_feature.columns = ["_".join(pair).strip() for pair in per_feature.columns.values]
    per_feature = per_feature.reset_index()
    # selection is made on the training error only; idxmin() returns an index label
    best_feature = per_feature.loc[per_feature["mse.train_mean"].idxmin()]
    print(f"Best feature: {best_feature['feature']} (train MSE: {best_feature['mse.train_mean']:.4f} std ({best_feature['mse.train_std']:.4f}))")
    print(f"Mean MSE: {best_feature['mse_mean']:.4f} std ({best_feature['mse_std']:.4f})")
    print(f"Mean R2: {best_feature['r2_mean']:.4f} std ({best_feature['r2_std']:.4f})")
    print(f"Mean RMSE: {best_feature['rmse_mean']:.4f} std ({best_feature['rmse_std']:.4f})")
    print(f"Mean MAE: {best_feature['mae_mean']:.4f} std ({best_feature['mae_std']:.4f})")
    print()
    # store the winning row so later cells can compare against it
    best_results[dat] = {key: best_feature[key] for key in lookup_keys}

Dataset: Ti_TM
Best feature: feature_5094 (train MSE: 0.4017 std (0.0247))
Mean MSE: 0.4462 std (0.1050)
Mean R2: 0.9994 std (0.0002)
Mean RMSE: 0.6636 std (0.0855)
Mean MAE: 0.5234 std (0.0653)

Dataset: Ti_TE
Best feature: feature_4956 (train MSE: 0.0628 std (0.0038))
Mean MSE: 0.0655 std (0.0150)
Mean R2: 0.9999 std (0.0000)
Mean RMSE: 0.2547 std (0.0284)
Mean MAE: 0.2151 std (0.0306)

Dataset: Si_TE
Best feature: feature_390 (train MSE: 0.2336 std (0.0204))
Mean MSE: 0.2416 std (0.0866)
Mean R2: 0.9997 std (0.0001)
Mean RMSE: 0.4853 std (0.0873)
Mean MAE: 0.4001 std (0.0558)

Dataset: Si_TM
Best feature: feature_1206 (train MSE: 0.1355 std (0.0089))
Mean MSE: 0.1417 std (0.0378)
Mean R2: 0.9998 std (0.0001)
Mean RMSE: 0.3736 std (0.0509)
Mean MAE: 0.3097 std (0.0525)



# Now Evaluate the performance when the entire peak is used (Figure 7)

In [9]:
# Evaluate a multivariate linear regression on the PCA scores of the whole
# spectrum, fold by fold. results_entire_wave_dfs maps dataset name -> DataFrame
# with one row of test metrics per fold.
results_entire_wave_dfs = {}
for dat in ["Ti_TM", "Ti_TE", "Si_TE", "Si_TM"]:
    fold_records = []
    for fold_i, (train_data_pca, train_target, test_data_pca, test_target) in enumerate(PC_folds[dat]):
        # Fit on the training scores, predict the held-out scores
        model = LinearRegression()
        model.fit(train_data_pca, train_target)
        y_pred = model.predict(test_data_pca)
        # Calculate MSE, R2, RMSE, MAE on the test split
        mse = mean_squared_error(test_target, y_pred)
        fold_records.append({
            "mse": mse,
            "r2": r2_score(test_target, y_pred),
            "rmse": np.sqrt(mse),
            "mae": np.mean(np.abs(test_target - y_pred)),
            "fold": fold_i,
        })
    results_entire_wave_dfs[dat] = pd.DataFrame(fold_records)

# Report the mean and std over the folds and compare each mean with the best
# single-feature model stored in `best_results`.
# Each entry: (printed label, column name, number format, higher_is_better)
report_spec = [
    ("MSE", "mse", ".4e", False),
    ("R2", "r2", ".4f", True),
    ("RMSE", "rmse", ".4f", False),
    ("MAE", "mae", ".4f", False),
]
for dat in ["Ti_TM", "Ti_TE", "Si_TE", "Si_TM"]:
    print(f"Dataset: {dat}")
    # rows are labelled "mean" and "std", columns are the metrics
    summary = results_entire_wave_dfs[dat][["mse", "r2", "rmse", "mae"]].agg(["mean", "std"])
    for label, col, fmt, higher_is_better in report_spec:
        mean_val = summary.loc["mean", col]
        std_val = summary.loc["std", col]
        baseline = best_results[dat][f"{col}_mean"]
        # Error metrics improve when they shrink (baseline / new); R2 improves
        # when it grows, so its ratio is taken the other way round (new / baseline).
        if higher_is_better:
            fold_improvement = mean_val / baseline
        else:
            fold_improvement = baseline / mean_val
        print(f"Mean {label}: {mean_val:{fmt}} std ({std_val:{fmt}}). {fold_improvement:.2f}x fold improvement")
    print()

Dataset: Ti_TM
Mean MSE: 6.6233e-05 std (2.1992e-05). 6736.42x fold improvement
Mean R2: 1.0000 std (0.0000). 1.00x fold improvement
Mean RMSE: 0.0080 std (0.0014). 82.52x fold improvement
Mean MAE: 0.0066 std (0.0010). 79.74x fold improvement

Dataset: Ti_TE
Mean MSE: 1.0488e-04 std (6.4012e-05). 624.73x fold improvement
Mean R2: 1.0000 std (0.0000). 1.00x fold improvement
Mean RMSE: 0.0099 std (0.0031). 25.82x fold improvement
Mean MAE: 0.0072 std (0.0016). 29.84x fold improvement

Dataset: Si_TE
Mean MSE: 2.4732e-02 std (9.9685e-03). 9.77x fold improvement
Mean R2: 1.0000 std (0.0000). 1.00x fold improvement
Mean RMSE: 0.1547 std (0.0314). 3.14x fold improvement
Mean MAE: 0.1257 std (0.0288). 3.18x fold improvement

Dataset: Si_TM
Mean MSE: 1.0345e-01 std (2.1553e-01). 1.37x fold improvement
Mean R2: 0.9999 std (0.0003). 1.00x fold improvement
Mean RMSE: 0.2035 std (0.2785). 1.84x fold improvement
Mean MAE: 0.0871 std (0.0670). 3.56x fold improvement



# Check the single-variate (sv) Si performance when using the 1D peak-shift datasets (Table 1)

In [10]:
def _linear_fit_record(feature, fold, X_train, y_train, X_test, y_test):
    """Fit a LinearRegression on the training split and score it on both splits.

    Parameters
    ----------
    feature : str
        Label stored under the "feature" key of the returned record.
    fold : int
        Fold number stored under the "fold" key.
    X_train, X_test : 2-D array
        Predictor matrices, one column per predictor.
    y_train, y_test : 1-D array
        Targets matching the rows of X_train / X_test.

    Returns
    -------
    dict
        "feature", the test-split "mse", "r2", "rmse", "mae", then "fold",
        followed by the same four metrics on the training split under the
        ".train"-suffixed keys.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    mse = mean_squared_error(y_test, y_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    return {
        "feature": feature,
        "mse": mse,
        "r2": r2_score(y_test, y_pred),
        "rmse": np.sqrt(mse),
        "mae": np.mean(np.abs(y_test - y_pred)),
        "fold": fold,
        "mse.train": mse_train,
        "r2.train": r2_score(y_train, y_train_pred),
        "rmse.train": np.sqrt(mse_train),
        "mae.train": np.mean(np.abs(y_train - y_train_pred)),
    }


# Each sheet has no header row; the first column is the target and the other
# four are named feature_1..feature_4.
Si_TE_1D = pd.read_excel("../p07_1D_SI_fitting/Si_TE_1Dfitting_4peaks.xlsx", header=None)
Si_TE_1D.columns = ["target","feature_1", "feature_2", "feature_3", "feature_4"]

Si_TM_1D = pd.read_excel("../p07_1D_SI_fitting/Si_TM_1Dfitting_4peaks.xlsx", header=None)
Si_TM_1D.columns = ["target","feature_1", "feature_2", "feature_3", "feature_4"]

# The row indices produced by splitting Si_TE_1D are also applied to Si_TM_1D,
# which is only valid when both frames have the same number of rows.
assert len(Si_TE_1D) == len(Si_TM_1D), (
    f"Si_TE_1D has {len(Si_TE_1D)} rows but Si_TM_1D has {len(Si_TM_1D)}; "
    "the shared fold indices would not line up"
)

Si_TE_1D_results = []
Si_TM_1D_results = []
# perform 5-fold cross validation on the 1D data (same splitter as before)
for fold_i, (train_index, test_index) in enumerate(kf.split(Si_TE_1D)):
    for df, result in zip([Si_TE_1D, Si_TM_1D],[Si_TE_1D_results, Si_TM_1D_results]):
        train_data = df.iloc[train_index]
        test_data = df.iloc[test_index]
        y_train = train_data["target"].values
        y_test = test_data["target"].values
        # one model per individual feature
        for feat_name in [f"feature_{i+1}" for i in range(4)]:
            result.append(_linear_fit_record(
                feat_name,
                fold_i,
                train_data[feat_name].to_numpy().reshape(-1, 1),
                y_train,
                test_data[feat_name].to_numpy().reshape(-1, 1),
                y_test,
            ))
        # and one model that uses all 4 features together
        result.append(_linear_fit_record(
            "all_features",
            fold_i,
            train_data.drop(columns=["target"]).to_numpy(),
            y_train,
            test_data.drop(columns=["target"]).to_numpy(),
            y_test,
        ))
Si_TE_1D_result_df = pd.DataFrame(Si_TE_1D_results)
Si_TM_1D_result_df = pd.DataFrame(Si_TM_1D_results)
    

In [11]:
# For comparison, the result obtained above with the entire wave data:
# Dataset: Si_TE
# Mean MSE: 0.02473 std (9.9685e-03). ~22.7x fold improvement
# Mean R2: 1.0000 std (0.0000). 1.00x fold improvement
# Mean RMSE: 0.1547 std (0.0314). 3.14x fold improvement
# Mean MAE: 0.1257 std (0.0288). 3.18x fold improvement
# NOTE(review): the ~22.7x MSE factor looks like a comparison against the best
# single peak in the table below, whereas the R2/RMSE/MAE factors repeat the
# single-wavelength comparison printed earlier -- confirm the intended baseline.

# Mean and std over the folds of every test/train metric, one row per feature
# (the four individual peaks plus "all_features").
Si_TE_1D_result_df.groupby("feature")[
    ["mse", "r2", "rmse", "mae", "mse.train", "r2.train", "rmse.train", "mae.train"]
].agg(["mean", "std"])

Unnamed: 0_level_0,mse,mse,r2,r2,rmse,rmse,mae,mae,mse.train,mse.train,r2.train,r2.train,rmse.train,rmse.train,mae.train,mae.train
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
feature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
all_features,0.135694,0.026329,0.999824,2.1e-05,0.366952,0.036061,0.296414,0.037559,0.121764,0.00681,0.999856,8e-06,0.348838,0.009749,0.280877,0.010746
feature_1,10.035007,1.843891,0.986622,0.003594,3.157665,0.283188,2.729086,0.278675,8.801394,0.272485,0.989567,0.000602,2.966428,0.046052,2.568578,0.061155
feature_2,1.822854,0.590767,0.99755,0.000949,1.336948,0.210429,1.125622,0.127932,1.55451,0.100828,0.998158,0.000142,1.246267,0.040747,1.057536,0.034875
feature_3,2.907157,0.638258,0.996132,0.001105,1.697132,0.183369,1.436463,0.196374,2.599922,0.108381,0.99692,0.000161,1.612147,0.03363,1.369071,0.05245
feature_4,0.562897,0.073247,0.99925,0.000174,0.749018,0.048342,0.604117,0.032712,0.512121,0.016505,0.999392,4.4e-05,0.715551,0.011602,0.582389,0.008291


In [12]:
# For comparison, the result obtained above with the entire wave data:
# Dataset: Si_TM
# Mean MSE: 0.1034 std (2.1553e-01). ~5x fold worsening
# Mean R2: 0.9999 std (0.0003). ~20% worse
# Mean RMSE: 0.2035 std (0.2785). ~20% worse
# Mean MAE: 0.0871 std (0.0670). ~2x fold improvement ?
# NOTE(review): these factors appear to be measured against the best single
# peak in the table below -- TODO confirm the baseline and the quoted factors.

# Mean and std over the folds of every test/train metric, one row per feature
# (the four individual peaks plus "all_features").
Si_TM_1D_result_df.groupby("feature")[
    ["mse", "r2", "rmse", "mae", "mse.train", "r2.train", "rmse.train", "mae.train"]
].agg(["mean", "std"])

Unnamed: 0_level_0,mse,mse,r2,r2,rmse,rmse,mae,mae,mse.train,mse.train,r2.train,r2.train,rmse.train,rmse.train,mae.train,mae.train
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
feature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
all_features,0.004556,0.001969,0.999994,3e-06,0.065996,0.015823,0.053686,0.012416,0.004328,0.000483,0.999995,7.940732e-07,0.065707,0.003633,0.051843,0.003193
feature_1,0.791981,0.165257,0.998944,0.000304,0.886253,0.090394,0.754197,0.086529,0.692697,0.02748,0.999179,5.231713e-05,0.832152,0.016579,0.710209,0.019945
feature_2,5.897534,1.077055,0.992177,0.001999,2.421099,0.211578,2.103621,0.170806,5.226674,0.179268,0.993809,0.0002495997,2.285918,0.039535,2.005111,0.052579
feature_3,0.755887,0.109434,0.998999,0.000221,0.867713,0.060837,0.736634,0.057786,0.668325,0.015962,0.999208,2.993195e-05,0.817464,0.0098,0.696552,0.014512
feature_4,0.029436,0.006827,0.999961,1.2e-05,0.17065,0.019834,0.137132,0.014416,0.027322,0.00181,0.999968,3.098957e-06,0.165218,0.005523,0.132627,0.004225


# save the actual and fitted values for the best ML model for each dataset (Figure 8)

In [13]:
# For every dataset, refit the PCA-score linear model per fold and collect the
# true value, the fitted/predicted value and the train/test membership of every
# sample. The per-fold tables are joined on "true", giving one wide table per
# dataset with a predicted_fold<k> and a train/test_fold<k> column for each fold.
pred_fitted_results_entire_wave_dfs = {}
for dat in ["Ti_TM", "Ti_TE", "Si_TE", "Si_TM"]:
    merged = None
    for fold_i, (train_scores, train_target, test_scores, test_target) in enumerate(PC_folds[dat]):
        model = LinearRegression()
        model.fit(train_scores, train_target)

        # rows for the samples the model was fitted on
        train_part = pd.DataFrame({
            "true": train_target,
            f"predicted_fold{fold_i}": model.predict(train_scores),
            f"train/test_fold{fold_i}": ["train"] * len(train_target),
        })
        # rows for the held-out samples
        test_part = pd.DataFrame({
            "true": test_target,
            f"predicted_fold{fold_i}": model.predict(test_scores),
            f"train/test_fold{fold_i}": ["test"] * len(test_target),
        })
        fold_frame = pd.concat([train_part, test_part])

        # the first fold starts the table; later folds are outer-joined onto it
        if merged is None:
            merged = fold_frame
        else:
            merged = merged.merge(fold_frame, on="true", how="outer")
    pred_fitted_results_entire_wave_dfs[dat] = merged

In [14]:
# Export the per-dataset tables into a single workbook: one worksheet per
# dataset, named after the dataset key, without the DataFrame index.
with pd.ExcelWriter("pred_fitted_results_entire_wave_dfs.xlsx") as writer:
    for sheet, frame in pred_fitted_results_entire_wave_dfs.items():
        frame.to_excel(writer, sheet_name=sheet, index=False)