# All features Minimum MSE model comparison

This notebook will assume that all features (the whole peak) can be used for prediction. This notebook aims to specifically focus only on assessing the difference in modeling techniques.

We will split the data into a 5-fold cross-validation. The approach will consist of first fitting an 80 component PC model on the training fold. Then we project the test fold into that PC space. We train a model using the PC projected training data and evaluate the performance on the PC projected test data. We take the average squared error for each model we will evaluate.

The models we will be evaluating:
- Lasso
- LinearRegressor
- RandomForestRegressor
- SVRegressor
- HistGBRegressor
- K-Nearest Neighbor Regressor

# imports and globals

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor

import optuna

from sklearn.decomposition import PCA

from tqdm import tqdm

# load the data 

In [3]:
# load the matrix
matrix = pd.read_excel("Ti_TE_matrix.xlsx", header=None).transpose()
matrix.columns = [f"feature_{i+1}" for i in range(matrix.shape[1])]
matrix

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,...,feature_9961,feature_9962,feature_9963,feature_9964,feature_9965,feature_9966,feature_9967,feature_9968,feature_9969,feature_9970,feature_9971,feature_9972,feature_9973,feature_9974,feature_9975,feature_9976,feature_9977,feature_9978,feature_9979,feature_9980,feature_9981,feature_9982,feature_9983,feature_9984,feature_9985,feature_9986,feature_9987,feature_9988,feature_9989,feature_9990,feature_9991,feature_9992,feature_9993,feature_9994,feature_9995,feature_9996,feature_9997,feature_9998,feature_9999,feature_10000
0,0.053574,0.053581,0.053767,0.053701,0.053784,0.054233,0.054402,0.054869,0.055483,0.055838,0.056650,0.057226,0.058008,0.058842,0.059539,0.060431,0.061082,0.061924,0.062578,0.063312,0.063863,0.064469,0.064756,0.065236,0.065463,0.065653,0.065934,0.066090,0.066167,0.066058,0.066103,0.066234,0.066136,0.066102,0.066112,0.065995,0.065507,0.063215,0.063104,0.063371,...,0.916525,0.916534,0.916542,0.916550,0.916558,0.916567,0.916575,0.916583,0.916591,0.916599,0.916607,0.916615,0.916623,0.916631,0.916639,0.916647,0.916655,0.916663,0.916671,0.916679,0.916687,0.916695,0.916702,0.916710,0.916718,0.916726,0.916733,0.916741,0.916749,0.916756,0.916764,0.916771,0.916779,0.916786,0.916794,0.916801,0.916809,0.916816,0.916824,0.916831
1,0.053621,0.053574,0.053611,0.053694,0.053756,0.053987,0.054315,0.054711,0.055178,0.055731,0.056319,0.056977,0.057753,0.058469,0.059335,0.060040,0.060887,0.061551,0.062388,0.062959,0.063742,0.064147,0.064692,0.065043,0.065310,0.065581,0.065829,0.066029,0.065972,0.066121,0.066226,0.066124,0.066047,0.065943,0.065847,0.065795,0.065751,0.065107,0.063009,0.063259,...,0.916463,0.916472,0.916480,0.916489,0.916497,0.916505,0.916514,0.916522,0.916530,0.916539,0.916547,0.916555,0.916563,0.916571,0.916579,0.916588,0.916596,0.916604,0.916612,0.916620,0.916628,0.916636,0.916643,0.916651,0.916659,0.916667,0.916675,0.916683,0.916690,0.916698,0.916706,0.916713,0.916721,0.916729,0.916736,0.916744,0.916751,0.916759,0.916766,0.916774
2,0.053609,0.053552,0.053499,0.053659,0.053743,0.053782,0.054188,0.054582,0.054911,0.055553,0.056087,0.056683,0.057497,0.058167,0.059015,0.059787,0.060541,0.061331,0.062028,0.062787,0.063382,0.064061,0.064399,0.064975,0.065132,0.065488,0.065722,0.065904,0.065971,0.066043,0.066256,0.066024,0.065932,0.065950,0.065774,0.065737,0.065359,0.064783,0.062865,0.063112,...,0.916339,0.916347,0.916355,0.916364,0.916372,0.916381,0.916389,0.916398,0.916406,0.916415,0.916423,0.916432,0.916440,0.916449,0.916457,0.916466,0.916475,0.916483,0.916492,0.916501,0.916510,0.916518,0.916527,0.916536,0.916545,0.916553,0.916562,0.916571,0.916580,0.916589,0.916597,0.916606,0.916615,0.916624,0.916633,0.916642,0.916650,0.916659,0.916668,0.916677
3,0.053482,0.053534,0.053528,0.053560,0.053638,0.053743,0.054038,0.054357,0.054793,0.055314,0.055819,0.056511,0.057119,0.057962,0.058650,0.059537,0.060217,0.061071,0.061723,0.062524,0.063171,0.063722,0.064387,0.064661,0.065037,0.065395,0.065602,0.065801,0.065882,0.065987,0.066140,0.066103,0.066076,0.066029,0.065890,0.065869,0.065966,0.065496,0.063099,0.062909,...,0.916329,0.916335,0.916341,0.916347,0.916353,0.916359,0.916365,0.916371,0.916378,0.916384,0.916390,0.916396,0.916402,0.916408,0.916414,0.916420,0.916426,0.916432,0.916438,0.916444,0.916450,0.916456,0.916462,0.916468,0.916475,0.916481,0.916487,0.916493,0.916499,0.916505,0.916512,0.916518,0.916524,0.916531,0.916537,0.916543,0.916550,0.916556,0.916563,0.916569
4,0.053432,0.053441,0.053511,0.053541,0.053545,0.053671,0.053895,0.054207,0.054621,0.055073,0.055645,0.056220,0.056873,0.057663,0.058364,0.059216,0.059968,0.060721,0.061516,0.062178,0.062948,0.063522,0.064100,0.064566,0.064890,0.065228,0.065505,0.065659,0.065834,0.066089,0.065942,0.065950,0.066017,0.065995,0.066048,0.065916,0.065801,0.065858,0.065143,0.062890,...,0.916271,0.916278,0.916286,0.916294,0.916302,0.916310,0.916318,0.916325,0.916333,0.916341,0.916348,0.916356,0.916364,0.916371,0.916379,0.916386,0.916394,0.916401,0.916409,0.916416,0.916424,0.916431,0.916438,0.916446,0.916453,0.916460,0.916468,0.916475,0.916482,0.916489,0.916497,0.916504,0.916511,0.916518,0.916525,0.916533,0.916540,0.916547,0.916554,0.916561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.045977,0.046342,0.046630,0.046984,0.047278,0.047477,0.047631,0.047833,0.047842,0.048108,0.048038,0.048220,0.048178,0.048252,0.048407,0.048590,0.048810,0.049196,0.049516,0.049893,0.050148,0.050409,0.050783,0.049645,0.049898,0.050147,0.050422,0.050427,0.050596,0.050697,0.050590,0.050666,0.050707,0.050598,0.050671,0.050888,0.051057,0.051292,0.051683,0.052124,...,0.910080,0.910089,0.910097,0.910106,0.910115,0.910124,0.910132,0.910141,0.910150,0.910159,0.910168,0.910177,0.910185,0.910194,0.910203,0.910212,0.910221,0.910230,0.910239,0.910248,0.910257,0.910266,0.910275,0.910284,0.910293,0.910302,0.910311,0.910320,0.910329,0.910338,0.910347,0.910356,0.910365,0.910374,0.910383,0.910393,0.910402,0.910411,0.910420,0.910429
97,0.045820,0.046212,0.046547,0.046808,0.047224,0.047353,0.047597,0.047727,0.047824,0.048006,0.048019,0.048185,0.048091,0.048288,0.048261,0.048498,0.048748,0.049075,0.049294,0.049738,0.050081,0.050583,0.050285,0.049481,0.049787,0.050046,0.050323,0.050402,0.050511,0.050639,0.050619,0.050604,0.050649,0.050625,0.050614,0.050761,0.050976,0.051185,0.051498,0.051952,...,0.910048,0.910058,0.910068,0.910078,0.910088,0.910098,0.910108,0.910118,0.910128,0.910137,0.910147,0.910157,0.910167,0.910176,0.910186,0.910196,0.910206,0.910215,0.910225,0.910234,0.910244,0.910253,0.910263,0.910272,0.910282,0.910291,0.910301,0.910310,0.910319,0.910328,0.910338,0.910347,0.910356,0.910365,0.910374,0.910383,0.910392,0.910401,0.910410,0.910419
98,0.045771,0.046061,0.046351,0.046736,0.047111,0.047294,0.047459,0.047688,0.047807,0.047878,0.047992,0.048159,0.048045,0.048221,0.048239,0.048389,0.048643,0.048893,0.049281,0.049557,0.049880,0.050368,0.050727,0.049494,0.049640,0.049995,0.050162,0.050395,0.050437,0.050582,0.050581,0.050603,0.050623,0.050547,0.050641,0.050661,0.050846,0.051104,0.051350,0.051800,...,0.909951,0.909961,0.909972,0.909982,0.909992,0.910003,0.910013,0.910024,0.910034,0.910044,0.910055,0.910065,0.910075,0.910086,0.910096,0.910106,0.910117,0.910127,0.910137,0.910148,0.910158,0.910168,0.910178,0.910188,0.910199,0.910209,0.910219,0.910229,0.910239,0.910249,0.910259,0.910269,0.910279,0.910289,0.910299,0.910309,0.910319,0.910328,0.910338,0.910348
99,0.045634,0.045973,0.046243,0.046596,0.046988,0.047187,0.047410,0.047620,0.047714,0.047825,0.047988,0.048029,0.048079,0.048155,0.048173,0.048321,0.048537,0.048790,0.049085,0.049456,0.049849,0.050102,0.050331,0.050676,0.049533,0.049864,0.050075,0.050323,0.050380,0.050522,0.050559,0.050568,0.050588,0.050550,0.050574,0.050613,0.050752,0.050989,0.051235,0.051598,...,0.909858,0.909868,0.909877,0.909887,0.909897,0.909907,0.909917,0.909927,0.909936,0.909946,0.909956,0.909966,0.909976,0.909986,0.909996,0.910006,0.910016,0.910026,0.910036,0.910046,0.910056,0.910066,0.910076,0.910087,0.910097,0.910107,0.910117,0.910127,0.910137,0.910147,0.910157,0.910167,0.910177,0.910188,0.910198,0.910208,0.910218,0.910228,0.910238,0.910248


In [4]:
y = list(range(1, matrix.shape[0] + 1))
full_data = pd.DataFrame(matrix.values, columns=matrix.columns)
full_data["target"] = y

# split the data into 5-folds cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=19890417)
folds = []
for train_index, test_index in kf.split(full_data):
    train_data = full_data.iloc[train_index]
    test_data = full_data.iloc[test_index]
    # now we also want to dimensionally reduce the data into 80 PCs following the training data
    pca = PCA(n_components=80, svd_solver='full', random_state=19890417)
    pca.fit(train_data.drop(columns=["target"]))
    train_data_pca = pca.transform(train_data.drop(columns=["target"]))
    test_data_pca = pca.transform(test_data.drop(columns=["target"]))
    folds.append((train_data_pca, train_data["target"].values, test_data_pca, test_data["target"].values))

# linear regressor

In [5]:
# so linear regression has no hyperparameters to tune, we can just fit it and get the results
lin_reg = LinearRegression()
lin_reg_results = []
for train_data, train_target, test_data, test_target in folds:
    lin_reg.fit(train_data, train_target)
    pred = lin_reg.predict(test_data)
    mse = mean_squared_error(test_target, pred)
    r2 = r2_score(test_target, pred)
    lin_reg_results.append((mse, r2))
lin_reg_results = np.array(lin_reg_results)
lin_reg_mse = lin_reg_results[:, 0].mean()
lin_reg_r2 = lin_reg_results[:, 1].mean()
print(f"Linear Regression MSE: {lin_reg_mse:.4e}, R2: {lin_reg_r2:.4e}")

Linear Regression MSE: 2.0632e-04, R2: 1.0000e+00


# Lasso

Moving onto the Lasso model, there is a single hyper-parameter to tune. I will use optuna to tune this hyperparameter.

In [6]:
def objective_lasso(trial):
    alpha = trial.suggest_float('alpha', 1e-20, 1e2, log=True)

    # Store the validation losses for each fold
    fold_mse_scores = []

    for train_data, train_target, test_data, test_target in folds:
        model = make_pipeline(
            StandardScaler(),
            Lasso(alpha=alpha, max_iter=10000)
        )
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)
    
    # Return the average MSE across all folds (Optuna minimizes this)
    return np.mean(fold_mse_scores)

# Run the study
study = optuna.create_study(direction='minimize')  # Now minimizing MSE directly
study.optimize(objective_lasso, n_trials=500, timeout=1000)

# Best result
print("Best alpha:", study.best_params['alpha'])
print("Best average MSE:", study.best_value)

[I 2025-04-01 15:18:28,680] A new study created in memory with name: no-name-5cd9f7b6-13d3-4696-bbff-6c6b15d2670f
[I 2025-04-01 15:18:28,696] Trial 0 finished with value: 0.00020631694273101994 and parameters: {'alpha': 4.027359839498368e-12}. Best is trial 0 with value: 0.00020631694273101994.
[I 2025-04-01 15:18:28,708] Trial 1 finished with value: 0.0002063493006988567 and parameters: {'alpha': 1.4087152300070268e-19}. Best is trial 0 with value: 0.00020631694273101994.
[I 2025-04-01 15:18:28,719] Trial 2 finished with value: 0.00020631699754025825 and parameters: {'alpha': 1.5524950942984004e-10}. Best is trial 0 with value: 0.00020631694273101994.
[I 2025-04-01 15:18:28,729] Trial 3 finished with value: 0.00020631694131457572 and parameters: {'alpha': 1.1943209386336676e-13}. Best is trial 3 with value: 0.00020631694131457572.
[I 2025-04-01 15:18:28,739] Trial 4 finished with value: 0.0002063783851787453 and parameters: {'alpha': 1.6949684316466025e-07}. Best is trial 3 with value

Best alpha: 7.567823065691611e-16
Best average MSE: 0.00020631694127164977


In [7]:
# now we can firt the lasso model with the best alpha and evaluate it like we did with the linear regression
lasso = make_pipeline(
    StandardScaler(),
    Lasso(alpha=study.best_params['alpha'], max_iter=10000)
)
tol = 1e-6
lasso_results = []
for train_data, train_target, test_data, test_target in folds:
    lasso.fit(train_data, train_target)
    pred = lasso.predict(test_data)
    mse = mean_squared_error(test_target, pred)
    r2 = r2_score(test_target, pred)
    num_nonzero = np.sum(abs(lasso.named_steps['lasso'].coef_) > tol)
    print(f"Number of non-zero coefficients: {num_nonzero}")
    lasso_results.append((mse, r2))
lasso_results = np.array(lasso_results)
lasso_mse = lasso_results[:, 0].mean()
lasso_r2 = lasso_results[:, 1].mean()
print(f"Lasso Regression MSE: {lasso_mse:.4e}, R2: {lasso_r2:.4e}")

Number of non-zero coefficients: 79
Number of non-zero coefficients: 80
Number of non-zero coefficients: 80
Number of non-zero coefficients: 80
Number of non-zero coefficients: 79
Lasso Regression MSE: 2.0632e-04, R2: 1.0000e+00


# Support Vector Regressor

In [8]:
# use optuna to tune the hyper-parameters of the SVR model

def objective_svr(trial):
    # Suggest hyperparameters
    kernel = trial.suggest_categorical('kernel', ['linear','rbf','poly'])
    C = trial.suggest_float('C', 1e10, 1e15, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-4, 1e-1, log=False)
    
    # gamma is only used with non-linear kernels
    if kernel == 'rbf':
        gamma = trial.suggest_float('gamma', 1e-20, 1e-8, log=True)
    else:
        gamma = 'auto'  # or you can skip setting it

    # Set up model
    model = make_pipeline(
        StandardScaler(),
        SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma)
    )

    fold_mse_scores = []
    for train_data, train_target, test_data, test_target in folds:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)

    return np.mean(fold_mse_scores)

# Create and run the study
study = optuna.create_study(direction='minimize')
study.optimize(objective_svr, n_trials=500, timeout=1000)

# Best result
print("Best kernel:", study.best_params['kernel'])
print("Best C:", study.best_params['C'])
print("Best epsilon:", study.best_params['epsilon'])
if study.best_params['kernel'] == 'rbf':
    print("Best gamma:", study.best_params['gamma'])
print("Best average MSE:", study.best_value)

[I 2025-04-01 15:18:36,874] A new study created in memory with name: no-name-a0a453e6-0d62-490f-8cae-8cdb33e43933
[I 2025-04-01 15:18:36,895] Trial 0 finished with value: 1.9657476701272354e+60 and parameters: {'kernel': 'poly', 'C': 16855472451458.822, 'epsilon': 0.07479052844761307}. Best is trial 0 with value: 1.9657476701272354e+60.
[I 2025-04-01 15:18:36,910] Trial 1 finished with value: 119292.72510493745 and parameters: {'kernel': 'rbf', 'C': 120919494868.17213, 'epsilon': 0.08480799294942183, 'gamma': 5.884574706967022e-17}. Best is trial 1 with value: 119292.72510493745.
[I 2025-04-01 15:18:36,925] Trial 2 finished with value: 76037431.35218771 and parameters: {'kernel': 'rbf', 'C': 4741384316075.126, 'epsilon': 0.07125368902904058, 'gamma': 1.2848996858171887e-11}. Best is trial 1 with value: 119292.72510493745.
[I 2025-04-01 15:18:36,939] Trial 3 finished with value: 3.4036782777141643e+21 and parameters: {'kernel': 'linear', 'C': 1466940265266.6746, 'epsilon': 0.07790937713

Best kernel: rbf
Best C: 32473787814.054447
Best epsilon: 0.050060537769127604
Best gamma: 3.6890139115808204e-10
Best average MSE: 191.99279951305783


This model is not training well and looks like it might be facing some hyper-param optimization errors.

In [9]:
# now we can fit the SVR model with the best hyper-parameters and evaluate it like we did with the linear regression
svr = make_pipeline(
    StandardScaler(),
    SVR(kernel=study.best_params['kernel'], C=study.best_params['C'], epsilon=study.best_params['epsilon'], gamma=study.best_params.get('gamma', 'auto'))
)
tol = 1e-6
svr_results = []
for train_data, train_target, test_data, test_target in folds:
    svr.fit(train_data, train_target)
    pred = svr.predict(test_data)
    mse = mean_squared_error(test_target, pred)
    r2 = r2_score(test_target, pred)
    svr_results.append((mse, r2))
svr_results = np.array(svr_results)
svr_mse = svr_results[:, 0].mean()
svr_r2 = svr_results[:, 1].mean()
print(f"SVR MSE: {svr_mse:.4e}, R2: {svr_r2:.4e}")

SVR MSE: 1.9199e+02, R2: 7.4464e-01


# Random Forest Regressor

In [13]:
# use optuna to tune the hyper-parameters of the Random Forest model

def objective_rf(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 5, 200)
    max_depth = trial.suggest_int('max_depth', 1, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # Set up model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=19890417
    )

    fold_mse_scores = []
    step = 0
    for train_data, train_target, test_data, test_target in folds:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)
        trial.report(mse, step)
        if trial.should_prune():
            raise optuna.TrialPruned()
        step += 1    
    
    return np.mean(fold_mse_scores)

# Create and run the study
sampler = optuna.samplers.TPESampler(n_startup_trials=10, multivariate=True, group=True)
study = optuna.create_study(direction='minimize', sampler=sampler, pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective_rf, n_trials=500, timeout=1000)

# Best result
print("Best Parameters:", study.best_params)
print("Best average MSE:", study.best_value)

[I 2025-04-01 15:36:36,299] A new study created in memory with name: no-name-c14e78da-6797-4afb-b945-00da53902efb
[I 2025-04-01 15:36:37,423] Trial 0 finished with value: 30.49377098147258 and parameters: {'n_estimators': 169, 'max_depth': 45, 'min_samples_split': 16, 'min_samples_leaf': 9}. Best is trial 0 with value: 30.49377098147258.
[I 2025-04-01 15:36:38,374] Trial 1 finished with value: 17.030501053031653 and parameters: {'n_estimators': 133, 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 7}. Best is trial 1 with value: 17.030501053031653.
[I 2025-04-01 15:36:38,553] Trial 2 finished with value: 33.81703571416504 and parameters: {'n_estimators': 27, 'max_depth': 33, 'min_samples_split': 7, 'min_samples_leaf': 9}. Best is trial 1 with value: 17.030501053031653.
[I 2025-04-01 15:36:39,029] Trial 3 finished with value: 26.22087659434975 and parameters: {'n_estimators': 60, 'max_depth': 23, 'min_samples_split': 16, 'min_samples_leaf': 4}. Best is trial 1 with value: 17

Best Parameters: {'n_estimators': 105, 'max_depth': 28, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best average MSE: 1.5576756289817517


In [14]:
# now we can fit the random forest model with the best parameters and evaluate it like we did with the linear regression
rf = RandomForestRegressor(
    n_estimators=study.best_params['n_estimators'],
    max_depth=study.best_params['max_depth'],
    min_samples_split=study.best_params['min_samples_split'],
    min_samples_leaf=study.best_params['min_samples_leaf'],
    random_state=19890417
)
rf_results = []
for train_data, train_target, test_data, test_target in folds:
    rf.fit(train_data, train_target)
    pred = rf.predict(test_data)
    mse = mean_squared_error(test_target, pred)
    r2 = r2_score(test_target, pred)
    rf_results.append((mse, r2))
rf_results = np.array(rf_results)
rf_mse = rf_results[:, 0].mean()
rf_r2 = rf_results[:, 1].mean()
print(f"Random Forest MSE: {rf_mse:.4e}, R2: {rf_r2:.4e}")


Random Forest MSE: 1.5577e+00, R2: 9.9788e-01


# Histogram Gradient Boosting Regressor

In [15]:
# use optuna to tune the hyper-parameters of the HistGradientBoosting model

def objective_hgb(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-2, 1e0, log=False)
    max_iter = trial.suggest_int('max_iter', 400, 800)
    max_depth = trial.suggest_int('max_depth', 40, 100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Set up model
    model = HistGradientBoostingRegressor(
        learning_rate=learning_rate,
        max_iter=max_iter,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=19890417
    )

    fold_mse_scores = []
    for train_data, train_target, test_data, test_target in folds:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)

    if trial.should_prune():
        raise optuna.TrialPruned()

    return np.mean(fold_mse_scores)

# Create and run the study
study = optuna.create_study(direction='minimize')
study.optimize(objective_hgb, n_trials=500, timeout=1000)

# Best result
print("Best Parameters:", study.best_params)
print("Best average MSE:", study.best_value)


[I 2025-04-01 15:47:49,718] A new study created in memory with name: no-name-a36d186a-5682-4ac2-810d-dca787d010b5
[I 2025-04-01 15:47:52,764] Trial 0 finished with value: 2.7955723183040733 and parameters: {'learning_rate': 0.8110091810672965, 'max_iter': 648, 'max_depth': 65, 'min_samples_leaf': 2}. Best is trial 0 with value: 2.7955723183040733.
[I 2025-04-01 15:47:55,037] Trial 1 finished with value: 7.438420109055526 and parameters: {'learning_rate': 0.6663945389039152, 'max_iter': 500, 'max_depth': 50, 'min_samples_leaf': 8}. Best is trial 0 with value: 2.7955723183040733.
[I 2025-04-01 15:48:00,550] Trial 2 finished with value: 1.314928850741023 and parameters: {'learning_rate': 0.36251324266240614, 'max_iter': 496, 'max_depth': 66, 'min_samples_leaf': 4}. Best is trial 2 with value: 1.314928850741023.
[I 2025-04-01 15:48:03,916] Trial 3 finished with value: 1.3766582581104365 and parameters: {'learning_rate': 0.08902057322427452, 'max_iter': 417, 'max_depth': 43, 'min_samples_le

Best Parameters: {'learning_rate': 0.11054699207220904, 'max_iter': 621, 'max_depth': 69, 'min_samples_leaf': 8}
Best average MSE: 0.8434689329872229


In [16]:
# now we can fit the HistGradientBoosting model with the best parameters and evaluate it like we did with the linear regression
hgb = HistGradientBoostingRegressor(
    learning_rate=study.best_params['learning_rate'],
    max_iter=study.best_params['max_iter'],
    max_depth=study.best_params['max_depth'],
    min_samples_leaf=study.best_params['min_samples_leaf'],
    random_state=19890417
)
hgb_results = []
for train_data, train_target, test_data, test_target in folds:
    hgb.fit(train_data, train_target)
    pred = hgb.predict(test_data)
    mse = mean_squared_error(test_target, pred)
    r2 = r2_score(test_target, pred)
    hgb_results.append((mse, r2))
hgb_results = np.array(hgb_results)
hgb_mse = hgb_results[:, 0].mean()
hgb_r2 = hgb_results[:, 1].mean()
print(f"HistGradientBoosting MSE: {hgb_mse:.4e}, R2: {hgb_r2:.4e}")

HistGradientBoosting MSE: 8.4347e-01, R2: 9.9887e-01


In [None]:
# use optuna to tune the hyper-parameters of the KNN model

def objective_knn(trial):
    # Suggest hyperparameters
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    
    # Set up model
    model = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm
    )

    fold_mse_scores = []
    for train_data, train_target, test_data, test_target in folds:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)

    return np.mean(fold_mse_scores)

# Create and run the study
sampler = optuna.samplers.TPESampler(n_startup_trials=10, multivariate=True, group=True)
study = optuna.create_study(direction='minimize',sampler=sampler)
study.optimize(objective_knn, n_trials=500, timeout=1000)

# Best result
print("Best Parameters:", study.best_params)
print("Best average MSE:", study.best_value)

[I 2025-04-04 00:50:40,938] A new study created in memory with name: no-name-06b44acb-9a2e-4b65-b3de-d3d456240136
[I 2025-04-04 00:50:41,372] Trial 0 finished with value: 1.2610052910052911 and parameters: {'n_neighbors': 6, 'weights': 'uniform', 'algorithm': 'brute'}. Best is trial 0 with value: 1.2610052910052911.
[I 2025-04-04 00:50:41,390] Trial 1 finished with value: 0.6207052544487311 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 1 with value: 0.6207052544487311.
[I 2025-04-04 00:50:41,413] Trial 2 finished with value: 0.653182290270421 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 1 with value: 0.6207052544487311.
[I 2025-04-04 00:50:41,433] Trial 3 finished with value: 9.497754497354496 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 1 with value: 0.6207052544487311.
[I 2025-04-04 00:50:41,452] Trial 4 finished with value: 2.797865961199

Best Parameters: {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'brute'}
Best average MSE: 0.3661572092662183


# Put all model results into table

In [17]:
# format all the model results into a dataframe for easy comparison
results_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Lasso', 'SVR', 'Random Forest', 'HistGradientBoosting'],
    'MSE': [lin_reg_mse, lasso_mse, svr_mse, rf_mse, hgb_mse],
    'R2': [lin_reg_r2, lasso_r2, svr_r2, rf_r2, hgb_r2]
})
results_df['MSE'] = results_df['MSE'].apply(lambda x: float(f"{x:.4e}"))
results_df['R2'] = results_df['R2'].apply(lambda x: float(f"{x:.4e}") if x is not None else "N/A")
print(results_df.sort_values(by="MSE").to_markdown())

|    | Model                |          MSE |      R2 |
|---:|:---------------------|-------------:|--------:|
|  0 | Linear Regression    |   0.00020632 | 1       |
|  1 | Lasso                |   0.00020632 | 1       |
|  4 | HistGradientBoosting |   0.84347    | 0.99887 |
|  3 | Random Forest        |   1.5577     | 0.99788 |
|  2 | SVR                  | 191.99       | 0.74464 |


# Try PLS Regression on the original data

PLS Regression is something the other author had tried. I think it should perform similar to the PC approach, but since the latent variables/dimensions are informed by Y it might perform a bit better.

In [7]:
folds_noPC = []
for train_index, test_index in kf.split(full_data):
    train_data = full_data.iloc[train_index]
    test_data = full_data.iloc[test_index]
    folds_noPC.append((train_data.drop(columns=["target"]), train_data["target"].values, test_data.drop(columns=["target"]), test_data["target"].values))

In [19]:
# no need to use optuna to tune the hyper-parameters of the PLS model, we can just fit it and get the results
# for all 80 values of the number of components variable
# note that pls regression errors out if the number of components is greater than the number of features, so we need to limit it to 75

pls_results = []
for n_components in tqdm(range(1, 75)):
    pls = PLSRegression(n_components=n_components, scale=True)
    pls_results_fold = []
    for train_data, train_target, test_data, test_target in folds_noPC:
        pls.fit(train_data, train_target)
        pred = pls.predict(test_data)
        mse = mean_squared_error(test_target, pred)
        r2 = r2_score(test_target, pred)
        pls_results_fold.append((mse, r2))
    pls_results.append(np.array(pls_results_fold))
pls_results = np.array(pls_results)
pls_mse = pls_results[:, :, 0].mean(axis=1)
pls_r2 = pls_results[:, :, 1].mean(axis=1)

# find the best number of components
best_n_components = np.argmin(pls_mse) + 1
best_mse = pls_mse[best_n_components - 1]
best_r2 = pls_r2[best_n_components - 1]
print(f"Best number of components: {best_n_components}, MSE: {best_mse:.4e}, R2: {best_r2:.4e}")


100%|██████████| 74/74 [01:52<00:00,  1.52s/it]

Best number of components: 22, MSE: 9.9914e-05, R2: 1.0000e+00





# Try KNN on original data

In [8]:
# use optuna to tune the hyper-parameters of the KNN model

def objective_knn(trial):
    # Suggest hyperparameters
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    
    # Set up model
    model = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm
    )

    fold_mse_scores = []
    for train_data, train_target, test_data, test_target in folds_noPC:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)

    return np.mean(fold_mse_scores)

# Create and run the study
sampler = optuna.samplers.TPESampler(n_startup_trials=10, multivariate=True, group=True)
study = optuna.create_study(direction='minimize',sampler=sampler)
study.optimize(objective_knn, n_trials=500, timeout=1000)

# Best result
print("Best Parameters:", study.best_params)
print("Best average MSE:", study.best_value)

[I 2025-04-04 00:52:41,510] A new study created in memory with name: no-name-0910636a-6b77-4e0a-b9f0-1b5fd8d5b4df
[I 2025-04-04 00:52:41,937] Trial 0 finished with value: 1.7133956284895233 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 0 with value: 1.7133956284895233.
[I 2025-04-04 00:52:42,378] Trial 1 finished with value: 0.6523015625291444 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 1 with value: 0.6523015625291444.
[I 2025-04-04 00:52:42,780] Trial 2 finished with value: 4.406477764659581 and parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 1 with value: 0.6523015625291444.
[I 2025-04-04 00:52:43,362] Trial 3 finished with value: 14.993377425044091 and parameters: {'n_neighbors': 18, 'weights': 'uniform', 'algorithm': 'kd_tree'}. Best is trial 1 with value: 0.6523015625291444.
[I 2025-04-04 00:52:43,854] Trial 4 finished with value: 0.5407708

Best Parameters: {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'kd_tree'}
Best average MSE: 0.3743600287292021


# save the file to npz file under TTER

In [20]:
# so the old code used the data stored in a numpy matrix with the y values in the first column and the features in the rest of the columns
# we want to put the data into the same format as it was before
# so we can use the same code to plot the results

# we can just bind the y variable to the matrix variable
y = np.array(y).reshape(-1, 1)
# and then we can bind the y variable to the matrix variable
new_mat = np.hstack((y, matrix))
print(new_mat.shape)

# show the head of the matrix
print(new_mat[:5, :5])

# now we want to save the matrix to a file
all_data = {"TTER":new_mat}
# save the data to a file
np.savez_compressed('../p05_evaluate_coauth_codes/all.npz', **all_data)


(101, 10001)
[[1.         0.05357352 0.05358149 0.05376663 0.05370087]
 [2.         0.05362125 0.05357391 0.05361085 0.05369367]
 [3.         0.05360937 0.05355164 0.05349888 0.05365914]
 [4.         0.05348237 0.05353369 0.05352754 0.05356049]
 [5.         0.05343162 0.05344124 0.05351068 0.0535407 ]]


# Try HGBR with all the data

In [23]:
def objective_hgb(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-2, 1e0, log=False)
    max_iter = trial.suggest_int('max_iter', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Set up model
    model = HistGradientBoostingRegressor(
        learning_rate=learning_rate,
        max_iter=max_iter,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=19890417
    )
    step = 0
    fold_mse_scores = []
    for train_data, train_target, test_data, test_target in folds_noPC:
        model.fit(train_data, train_target)
        preds = model.predict(test_data)
        mse = mean_squared_error(test_target, preds)
        fold_mse_scores.append(mse)
        trial.report(mse, step)
        if trial.should_prune():
            raise optuna.TrialPruned()
        step += 1   

    if trial.should_prune():
        raise optuna.TrialPruned()

    return np.mean(fold_mse_scores)

# Create and run the study
sampler = optuna.samplers.TPESampler(n_startup_trials=10, multivariate=True, group=True)
study = optuna.create_study(direction='minimize', sampler=sampler, pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective_hgb, n_trials=500, timeout=1000)

# Best result
print("Best Parameters:", study.best_params)
print("Best average MSE:", study.best_value)

[I 2025-04-01 16:18:14,621] A new study created in memory with name: no-name-108db503-d7c7-40f1-9a13-c90acf4ac919
[I 2025-04-01 16:18:46,734] Trial 0 finished with value: 52.46934792462631 and parameters: {'learning_rate': 0.9246156106315383, 'max_iter': 71, 'max_depth': 3, 'min_samples_leaf': 7}. Best is trial 0 with value: 52.46934792462631.
[I 2025-04-01 16:20:11,364] Trial 1 finished with value: 6.4748507925285566 and parameters: {'learning_rate': 0.24494715260201042, 'max_iter': 83, 'max_depth': 17, 'min_samples_leaf': 7}. Best is trial 1 with value: 6.4748507925285566.
[I 2025-04-01 16:20:54,247] Trial 2 finished with value: 23.216128212182902 and parameters: {'learning_rate': 0.6162187994508563, 'max_iter': 38, 'max_depth': 14, 'min_samples_leaf': 7}. Best is trial 1 with value: 6.4748507925285566.
[I 2025-04-01 16:21:25,734] Trial 3 finished with value: 61.702542948923295 and parameters: {'learning_rate': 0.9741027001505104, 'max_iter': 53, 'max_depth': 4, 'min_samples_leaf': 9

Best Parameters: {'learning_rate': 0.1757479864732147, 'max_iter': 52, 'max_depth': 10, 'min_samples_leaf': 6}
Best average MSE: 5.328619095155375
