# REGRESSION MODELS: GPLVM IMPUTATION

## Description of this notebook

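This notebook evaluates the regression models on the datasets imputed with the Gaussian Process Latent Variable Model (GPLVM) and on the corresponding GPLVM latent spaces. Every model is assessed with nested cross-validation (10 outer folds, 3 inner folds for hyperparameter tuning) and reported as the mean ± standard deviation of the outer-fold R².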

In [None]:
# Import all the necessary libraries 

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, WhiteKernel, ExpSineSquared, DotProduct
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
import os
from sklearn.neural_network import MLPRegressor
from GPy import models
from GPy import kern
from sklearn.gaussian_process import GaussianProcessRegressor
import warnings
warnings.filterwarnings("ignore")
from sklearn.base import clone

In [None]:
#os.chdir('..') # move to the general directory

In [None]:
def scale_data(input, target):
    """Drop rows without an ADAS13 value and standardise the numerical features.

    The two frames are joined column-wise, rows whose ``ADAS13`` entry is
    missing are removed, and every feature except the categorical columns
    ``PTGENDER`` and ``APOE4`` is standardised with ``StandardScaler``.

    Parameters
    ----------
    input : pandas.DataFrame
        Feature table; must contain the ``PTGENDER`` and ``APOE4`` columns.
    target : pandas.DataFrame or pandas.Series
        Object providing the ``ADAS13`` column, row-aligned with ``input``.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` holds the scaled numerical
        columns followed by the untouched categorical columns, and ``target``
        is the ``ADAS13`` series. Both share the same ``0..n-1`` index.

    Notes
    -----
    The scaler is fitted on every remaining row, i.e. before any
    cross-validation split is made by the caller.
    """
    categorical_cols = ['PTGENDER', 'APOE4']

    all_data = pd.concat([input, target], axis=1)

    # Re-index once, right after filtering, so that the features AND the
    # target end up with identical row labels. Resetting only the features
    # would leave X and y aligned by position but not by label.
    all_data = all_data.dropna(subset=['ADAS13']).reset_index(drop=True)

    # Divide again in input and target
    features = all_data.drop(columns=['ADAS13'])
    target = all_data['ADAS13']

    # Divide numerical and categorical variables
    numerical = features.drop(columns=categorical_cols)
    categorical = features[categorical_cols]

    # Scale the numerical block only; the categorical codes are kept as they are
    scaler = StandardScaler()
    numerical_scaled = pd.DataFrame(
        scaler.fit_transform(numerical), columns=numerical.columns
    )

    # Concatenate scaled data with categorical variables
    input = pd.concat([numerical_scaled, categorical], axis=1)

    return input, target

## **1.** Model Definitions

### **1.1** Random Forest Model

In [None]:
def random_forest(X, y, imp='None'):
    """Nested cross-validation of a random-forest regressor.

    The outer loop is a shuffled 10-fold split used to estimate R². Inside
    each outer training fold a 3-fold grid search, scored with the negated
    mean absolute error, picks the hyperparameters; the resulting
    ``best_estimator_`` predicts the held-out outer fold.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        One-line summary with the mean and standard deviation of the
        outer-fold R² scores.
    """
    search_space = dict(
        n_estimators=[30, 40, 50, 55],
        max_depth=[1, 5, 10, 20, 30],
        min_samples_split=[2, 5, 10, 15, 20],
        min_samples_leaf=[1, 2, 4, 6, 8, 10],
        max_features=[15, 16, 17, 18, 19, 20],
    )
    forest = RandomForestRegressor(random_state=42)

    # Lower MAE is better, hence the sign flip for the grid search
    neg_mae = make_scorer(mean_absolute_error, greater_is_better=False)
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

    r2_scores = []
    for fold, (fit_rows, eval_rows) in enumerate(outer_cv.split(X), start=1):
        print(f"Fold {fold}:")

        tuner = GridSearchCV(
            estimator=forest,
            param_grid=search_space,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=neg_mae,
            verbose=0,
            n_jobs=-1,
        )
        tuner.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        held_out_pred = tuner.best_estimator_.predict(X.iloc[eval_rows])
        r2 = r2_score(y.iloc[eval_rows], held_out_pred)
        print(f"Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nRandom Forest 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"


### **1.2** Gradient Boosting Regressor

In [None]:
def gradient_boosting(X, y, imp='None'):
    """Nested cross-validation of a gradient-boosting regressor.

    Ten shuffled outer folds provide the R² estimate. For every outer
    training fold a 3-fold grid search (negated MAE as the selection score)
    tunes the model, and its ``best_estimator_`` is evaluated on the
    held-out outer fold.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    booster = GradientBoostingRegressor(random_state=42)
    candidate_params = dict(
        n_estimators=[500, 700, 900],
        learning_rate=[0.0001, 0.001, 0.01],
        max_depth=[1, 2, 3],
        min_samples_split=[3, 4, 5],
        min_samples_leaf=[1, 2, 3],
    )
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    r2_scores = []
    outer_splits = KFold(n_splits=10, shuffle=True, random_state=42).split(X)
    for fold, (train_rows, test_rows) in enumerate(outer_splits, start=1):
        print(f"Fold {fold}:")

        # Inner loop: hyperparameter selection on the outer-training rows only
        inner_search = GridSearchCV(
            estimator=booster,
            param_grid=candidate_params,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=mae_scorer,
            verbose=0,
            n_jobs=-1,
        )
        inner_search.fit(X.iloc[train_rows], y.iloc[train_rows])
        tuned_model = inner_search.best_estimator_

        # Outer loop: score the tuned model on rows it has never seen
        r2 = r2_score(y.iloc[test_rows], tuned_model.predict(X.iloc[test_rows]))
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nGradient Boosting 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"


### **1.3** XGBRegressor

In [None]:
def xgb_regressor(X, y, imp='None'):
    """Nested cross-validation of an XGBoost regressor.

    Outer loop: shuffled 10-fold split, scored with R². Inner loop: 3-fold
    grid search over the number of trees, learning rate and tree depth,
    selected by negated mean absolute error.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    tuning_grid = dict(
        n_estimators=[100, 500, 1000],
        learning_rate=[1e-4, 1e-3, 1e-2, 1e-1],
        max_depth=[1, 3, 5, 7],
    )
    base_regressor = XGBRegressor(random_state=42)
    selection_score = make_scorer(mean_absolute_error, greater_is_better=False)
    outer_folds = KFold(n_splits=10, shuffle=True, random_state=42)

    r2_scores = []
    for fold, (idx_train, idx_test) in enumerate(outer_folds.split(X), start=1):
        print(f"Fold {fold}:")

        features_train, targets_train = X.iloc[idx_train], y.iloc[idx_train]
        features_test, targets_test = X.iloc[idx_test], y.iloc[idx_test]

        best_model = GridSearchCV(
            estimator=base_regressor,
            param_grid=tuning_grid,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=selection_score,
            verbose=0,
            n_jobs=-1,
        ).fit(features_train, targets_train).best_estimator_

        r2 = r2_score(targets_test, best_model.predict(features_test))
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nXGB Regressor 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"


### **1.4** Support Vector Regressor

In [None]:
def svr(X, y, imp='None'):
    """Nested cross-validation of a support vector regressor.

    A shuffled 10-fold outer split yields the R² estimate. Within each
    outer training fold a 3-fold grid search over kernel, ``C``, ``gamma``
    and ``epsilon`` selects the configuration with the lowest mean absolute
    error.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    support_vector_model = SVR()
    hyperparameters = dict(
        kernel=['linear', 'rbf', 'poly', 'sigmoid'],
        C=[1e-3, 1e-2, 1e-1, 1, 10],
        gamma=[1e-4, 1e-3, 1e-2, 1, 10],
        epsilon=[1e-4, 1e-3, 1e-2, 1, 10],
    )

    # make_scorer negates the MAE so that "greater is better" holds for the search
    neg_mae = make_scorer(mean_absolute_error, greater_is_better=False)
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

    r2_scores = []
    for fold, (learn_idx, holdout_idx) in enumerate(outer_cv.split(X), start=1):
        print(f"Fold {fold}:")

        searcher = GridSearchCV(
            estimator=support_vector_model,
            param_grid=hyperparameters,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=neg_mae,
            verbose=0,
            n_jobs=-1,
        )
        searcher.fit(X.iloc[learn_idx], y.iloc[learn_idx])

        holdout_pred = searcher.best_estimator_.predict(X.iloc[holdout_idx])
        r2 = r2_score(y.iloc[holdout_idx], holdout_pred)
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nSVR 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"


### **1.5** Ridge Regression Model

In [None]:
def ridge_regression(X, y, imp='None'):
    """Nested cross-validation of a ridge regression model.

    The R² estimate comes from a shuffled 10-fold outer split. For each
    outer training fold a 3-fold grid search (selection score: negated MAE)
    tunes the regularisation strength, intercept, tolerance, iteration cap
    and solver.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    linear_model = Ridge(random_state=42)
    ridge_grid = dict(
        alpha=[1e-10, 1e-5, 1e-4, 1e-3, 1e-2],
        fit_intercept=[True, False],
        tol=[1e-5, 1e-4, 1e-3, 1e-2],
        max_iter=[100, 500, 1000, 2000, 3000],
        solver=['auto', 'saga'],
    )
    mae_based_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    r2_scores = []
    outer_partitions = KFold(n_splits=10, shuffle=True, random_state=42).split(X)
    for fold, (rows_in, rows_out) in enumerate(outer_partitions, start=1):
        print(f"Fold {fold}:")

        X_in, y_in = X.iloc[rows_in], y.iloc[rows_in]
        X_out, y_out = X.iloc[rows_out], y.iloc[rows_out]

        tuning = GridSearchCV(
            estimator=linear_model,
            param_grid=ridge_grid,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=mae_based_scorer,
            verbose=0,
            n_jobs=-1,
        )
        tuning.fit(X_in, y_in)

        r2 = r2_score(y_out, tuning.best_estimator_.predict(X_out))
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nRidge Regression 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"


### **1.6** MultiLayer Perceptron (MLP)

In [None]:
def mlp(X, y, imp='None'):
    """Nested cross-validation of a multi-layer perceptron regressor.

    Outer loop: shuffled 10-fold split, scored with R². Inner loop: 3-fold
    grid search over layer sizes, activation, solver and L2 penalty,
    selected by negated mean absolute error.

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    # Named `network` rather than `mlp` so the local does not hide this function
    network = MLPRegressor(random_state=42, max_iter=1000000)
    architecture_grid = dict(
        hidden_layer_sizes=[(2,), (5,), (10,), (2, 2), (5, 2), (10, 2)],
        activation=['relu', 'tanh'],
        solver=['adam', 'lbfgs'],
        alpha=[1e-10, 1e-5, 1e-4, 1e-3, 1e-2],
    )
    neg_mae = make_scorer(mean_absolute_error, greater_is_better=False)
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

    r2_scores = []
    for fold, (train_pos, test_pos) in enumerate(outer_cv.split(X), start=1):
        print(f"Fold {fold}:")

        grid = GridSearchCV(
            estimator=network,
            param_grid=architecture_grid,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=neg_mae,
            verbose=0,
            n_jobs=-1,
        )
        grid.fit(X.iloc[train_pos], y.iloc[train_pos])

        test_predictions = grid.best_estimator_.predict(X.iloc[test_pos])
        r2 = r2_score(y.iloc[test_pos], test_predictions)
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nMLP 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"

### **1.7** GP Regressor

In [None]:
def gp_regressor(X, y, imp='None'):
    """Nested cross-validation of a Gaussian-process regressor.

    The outer loop is a shuffled 10-fold split used to estimate R². Inside
    each outer training fold a 3-fold grid search, scored with the negated
    mean absolute error, chooses the kernel among 15 candidates:

    * every unordered pair of the five structural kernels (RBF, Matern,
      RationalQuadratic, ExpSineSquared, DotProduct) combined as
      ``A + WhiteKernel + B`` (10 candidates);
    * every structural kernel on its own plus a ``WhiteKernel`` noise term
      (5 candidates).

    Parameters
    ----------
    X : pandas object supporting ``.iloc``
        Feature matrix.
    y : pandas object supporting ``.iloc``
        Target values, positionally aligned with ``X``.
    imp : str, default 'None'
        Label of the imputation variant, inserted into the summary string.

    Returns
    -------
    str
        Summary line with mean ± standard deviation of the outer-fold R².
    """
    # Factories: every candidate gets its own fresh kernel objects, all with
    # the same initial values and optimisation bounds.
    def rbf():
        return RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))

    def matern():
        return Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))

    def rational_quadratic():
        return RationalQuadratic(alpha=1.0, alpha_bounds=(1e-2, 1e3),
                                 length_scale=1.0, length_scale_bounds=(1e-2, 1e3))

    def periodic():
        return ExpSineSquared(length_scale=1.0, length_scale_bounds=(1e-2, 1e3),
                              periodicity=3.0, periodicity_bounds=(1e-2, 10.0))

    def dot_product():
        return DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-2, 1e3))

    def white():
        return WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-10, 1e1))

    structural = [rbf, matern, rational_quadratic, periodic, dot_product]

    # Pairwise candidates "A + White + B". Building them programmatically
    # guarantees each one is a single summed kernel: with a hand-written list
    # a stray "," in place of "+" silently turns one candidate into a
    # noise-free bare kernel plus a separate "White + B" kernel.
    kernel_options = [
        first() + white() + second()
        for position, first in enumerate(structural)
        for second in structural[position + 1:]
    ]

    # Single structural kernel plus noise
    kernel_options += [
        base() + white()
        for base in (dot_product, rbf, matern, rational_quadratic, periodic)
    ]

    gpr = GaussianProcessRegressor(random_state=42, normalize_y=True)
    param_grid = {'kernel': kernel_options}

    # Lower MAE is better, hence the sign flip for the grid search
    scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    kf_outer = KFold(n_splits=10, shuffle=True, random_state=42)

    r2_scores = []
    for fold, (train_index, test_index) in enumerate(kf_outer.split(X), start=1):
        print(f"Fold {fold}:")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        grid_search = GridSearchCV(
            estimator=gpr,
            param_grid=param_grid,
            cv=KFold(n_splits=3, shuffle=True, random_state=42),
            scoring=scorer,
            verbose=0,
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Only the predictive mean is needed for R², so the standard
        # deviation is not requested.
        y_pred = best_model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        print(f"  Fold R²: {r2:.3f}")
        r2_scores.append(r2)

    return f"\nGPR 10-Fold CV R² Score {imp}: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}"
    


## **2.** GPLVM Imputed Datasets

### **2.1** Mean Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (mean pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_mean_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score GPLVM Mean: 0.713 ± 0.040


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM Mean: 0.720 ± 0.035


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM Mean: 0.719 ± 0.034


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM Mean: 0.731 ± 0.034


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM Mean: 0.731 ± 0.035


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM Mean: 0.726 ± 0.037


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM Mean')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM Mean: 0.730 ± 0.037


### **2.2** Median Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (median pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_median_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score GPLVM Median: 0.711 ± 0.040


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM Median: 0.718 ± 0.035


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM Median: 0.717 ± 0.035


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM Median: 0.730 ± 0.034


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM Median: 0.730 ± 0.035


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM Median: 0.729 ± 0.038


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM Median')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM Median: 0.730 ± 0.037


### **2.3** Mode Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (mode pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_mode_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score GPLVM Mode: 0.707 ± 0.036


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM Mode: 0.721 ± 0.033


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM Mode: 0.719 ± 0.033


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM Mode: 0.732 ± 0.034


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM Mode: 0.731 ± 0.036


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM Mode: 0.729 ± 0.035


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM Mode')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM Mode: 0.731 ± 0.038


### **2.4** KNN Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (KNN pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_knn_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score GPLVM KNN: 0.715 ± 0.036


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM KNN: 0.719 ± 0.035


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM KNN: 0.720 ± 0.034


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM KNN: 0.730 ± 0.034


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM KNN: 0.730 ± 0.034


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM KNN: 0.728 ± 0.037


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM KNN')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM KNN: 0.731 ± 0.037


### **2.5** Iterative Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (iterative pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_iter_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score GPLVM Iterative: 0.715 ± 0.036


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM Iterative: 0.717 ± 0.035


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM Iterative: 0.718 ± 0.035


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM Iterative: 0.723 ± 0.037


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM Iterative: 0.729 ± 0.035


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM Iterative: 0.729 ± 0.038


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM Iterative')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM Iterative: 0.730 ± 0.037


### **2.6** MICE Pre-Imputation

In [None]:
# Load the GPLVM-imputed features (MICE pre-imputation) and the target table.
# Both paths are relative to the current working directory.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/imputed_gplvm_mice_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Drop rows without an ADAS13 value and standardise the numerical features.
scaled_input, y = scale_data(input, target)

##### Random Forest

In [None]:
# Nested-CV random forest on the MICE pre-imputed GPLVM dataset; `imp` labels
# the summary line so it is not reported under the default label 'None'.
r2_score_rf = random_forest(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score None: 0.709 ± 0.039


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score GPLVM MICE: 0.721 ± 0.034


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score GPLVM MICE: 0.717 ± 0.033


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score GPLVM MICE: 0.730 ± 0.034


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score GPLVM MICE: 0.730 ± 0.035


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score GPLVM MICE: 0.729 ± 0.038


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='GPLVM MICE')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score GPLVM MICE: 0.730 ± 0.037


## **3.** GPLVM Latent Spaces

### **3.1** Mean Pre-Imputation

In [None]:
# Load the GPLVM latent-space coordinates (mean pre-imputation) and the target table.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/latent_mean_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Join column-wise, then keep only the rows that have an ADAS13 value.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13'])

# Split back into features and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation split.
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# `scaled_input` gets a fresh 0..n-1 index while `y` keeps its post-dropna row
# labels; the model functions slice both with .iloc, so only positions must agree.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns)
y = target

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM Mean: 0.198 ± 0.037


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM Mean: 0.199 ± 0.033


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM Mean: 0.195 ± 0.028


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM Mean: 0.203 ± 0.032


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM Mean: 0.091 ± 0.040


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM Mean: 0.200 ± 0.044


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM Mean')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM Mean: 0.211 ± 0.036


### **3.2** Median Pre-Imputation

In [None]:
# Load the GPLVM latent-space coordinates (median pre-imputation) and the target table.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/latent_median_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Join column-wise, then keep only the rows that have an ADAS13 value.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13'])

# Split back into features and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation split.
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# `scaled_input` gets a fresh 0..n-1 index while `y` keeps its post-dropna row
# labels; the model functions slice both with .iloc, so only positions must agree.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns)
y = target

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM Median: 0.195 ± 0.054


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM Median: 0.190 ± 0.044


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM Median: 0.181 ± 0.043


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM Median: 0.189 ± 0.039


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM Median: 0.186 ± 0.047


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM Median: 0.189 ± 0.056


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM Median')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM Median: 0.201 ± 0.052


### **3.3** Mode Pre-Imputation

In [None]:
# Load the GPLVM latent-space coordinates (mode pre-imputation) and the target table.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/latent_mode_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Join column-wise, then keep only the rows that have an ADAS13 value.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13'])

# Split back into features and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation split.
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# `scaled_input` gets a fresh 0..n-1 index while `y` keeps its post-dropna row
# labels; the model functions slice both with .iloc, so only positions must agree.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns)
y = target

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM Mode: 0.203 ± 0.045


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM Mode: 0.200 ± 0.040


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM Mode: 0.207 ± 0.043


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM Mode: 0.203 ± 0.036


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM Mode: 0.155 ± 0.046


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM Mode: 0.197 ± 0.043


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM Mode')

In [None]:
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM Mode: 0.217 ± 0.044


### **3.4** KNN Pre-Imputation

In [None]:
# Load the GPLVM latent-space coordinates (KNN pre-imputation) and the target table.
# NOTE(review): the name `input` shadows Python's built-in input() from here on.
input = pd.read_csv('GPLVM IMPUTATION/latent_knn_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Join column-wise, then keep only the rows that have an ADAS13 value.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13'])

# Split back into features and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation split.
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# `scaled_input` gets a fresh 0..n-1 index while `y` keeps its post-dropna row
# labels; the model functions slice both with .iloc, so only positions must agree.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns)
y = target

##### Random Forest

In [None]:
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM KNN: 0.166 ± 0.063


##### Gradient Boosting

In [None]:
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM KNN: 0.177 ± 0.046


##### Extreme Gradient Boosting

In [None]:
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM KNN: 0.180 ± 0.047


##### SVR

In [None]:
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM KNN: 0.152 ± 0.044


##### Ridge Regression

In [None]:
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM KNN: 0.168 ± 0.049


##### MLP

In [None]:
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM KNN: 0.186 ± 0.046


##### GP Regression

In [None]:
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM KNN')

In [None]:
# Display the summary text returned by gp_regressor for the KNN variant.
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM KNN: 0.196 ± 0.056


### **3.5** Iterative Pre-Imputation

In [None]:
# Load the GPLVM latent features obtained after iterative pre-imputation, plus the target table.
# NOTE: `input` shadows the Python builtin; the name is kept because other cells may rely on it.
input = pd.read_csv('GPLVM IMPUTATION/latent_iter_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Put features and target side by side, discard rows without an ADAS13 value,
# and renumber the surviving rows so every object derived below shares one 0..n-1 index.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13']).reset_index(drop=True)

# Divide again in input and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Scale the data
# NOTE(review): the scaler is fit on all rows before cross-validation, so held-out folds
# contribute to the scaling statistics — consider scaling inside the CV loop (e.g. a Pipeline).
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# Reuse the feature index explicitly so scaled_input and y stay aligned label-wise as well as positionally.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns, index=input.index)
y = target

##### Random Forest

In [None]:
# Run the random_forest helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by random_forest for the iterative variant.
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM Iterative: 0.197 ± 0.048


##### Gradient Boosting

In [None]:
# Run the gradient_boosting helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by gradient_boosting for the iterative variant.
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM Iterative: 0.203 ± 0.045


##### Extreme Gradient Boosting

In [None]:
# Run the xgb_regressor helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by xgb_regressor for the iterative variant.
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM Iterative: 0.205 ± 0.046


##### SVR

In [None]:
# Run the svr helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by svr for the iterative variant.
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM Iterative: 0.186 ± 0.052


##### Ridge Regression

In [None]:
# Run the ridge_regression helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by ridge_regression for the iterative variant.
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM Iterative: 0.187 ± 0.051


##### MLP

In [None]:
# Run the mlp helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by mlp for the iterative variant.
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM Iterative: 0.194 ± 0.048


##### GP Regression

In [None]:
# Run the gp_regressor helper (defined elsewhere in the notebook) on the iterative-variant features; the `imp` label appears in the returned summary text.
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM Iterative')

In [None]:
# Display the summary text returned by gp_regressor for the iterative variant.
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM Iterative: 0.204 ± 0.046


### **3.6** MICE Pre-Imputation

In [None]:
# Load the GPLVM latent features obtained after MICE pre-imputation, plus the target table.
# NOTE: `input` shadows the Python builtin; the name is kept because other cells may rely on it.
input = pd.read_csv('GPLVM IMPUTATION/latent_mice_df.csv')
target = pd.read_csv('DATA/target_COMPLETE_DATA.csv')

# Put features and target side by side, discard rows without an ADAS13 value,
# and renumber the surviving rows so every object derived below shares one 0..n-1 index.
all_data = pd.concat([input, target], axis=1)
all_data = all_data.dropna(subset=['ADAS13']).reset_index(drop=True)

# Divide again in input and target
input = all_data.drop(columns=['ADAS13'])
target = all_data['ADAS13']

# Scale the data
# NOTE(review): the scaler is fit on all rows before cross-validation, so held-out folds
# contribute to the scaling statistics — consider scaling inside the CV loop (e.g. a Pipeline).
scaler = StandardScaler()
input_scaled = scaler.fit_transform(input)

# Reuse the feature index explicitly so scaled_input and y stay aligned label-wise as well as positionally.
scaled_input = pd.DataFrame(input_scaled, columns=input.columns, index=input.index)
y = target

##### Random Forest

In [None]:
# Run the random_forest helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_rf = random_forest(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by random_forest for the MICE variant.
print(r2_score_rf)


Random Forest 10-Fold CV R² Score Latent GPLVM MICE: 0.209 ± 0.049


##### Gradient Boosting

In [None]:
# Run the gradient_boosting helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_grad = gradient_boosting(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by gradient_boosting for the MICE variant.
print(r2_score_grad)


Gradient Boosting 10-Fold CV R² Score Latent GPLVM MICE: 0.198 ± 0.050


##### Extreme Gradient Boosting

In [None]:
# Run the xgb_regressor helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_xgb = xgb_regressor(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by xgb_regressor for the MICE variant.
print(r2_score_xgb)


XGB Regressor 10-Fold CV R² Score Latent GPLVM MICE: 0.194 ± 0.045


##### SVR

In [None]:
# Run the svr helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_svr = svr(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by svr for the MICE variant.
print(r2_score_svr)


SVR 10-Fold CV R² Score Latent GPLVM MICE: 0.192 ± 0.044


##### Ridge Regression

In [None]:
# Run the ridge_regression helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_ridge = ridge_regression(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by ridge_regression for the MICE variant.
print(r2_score_ridge)


Ridge Regression 10-Fold CV R² Score Latent GPLVM MICE: 0.166 ± 0.049


##### MLP

In [None]:
# Run the mlp helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_mlp = mlp(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by mlp for the MICE variant.
print(r2_score_mlp)


MLP 10-Fold CV R² Score Latent GPLVM MICE: 0.205 ± 0.045


##### GP Regression

In [None]:
# Run the gp_regressor helper (defined elsewhere in the notebook) on the MICE-variant features; the `imp` label appears in the returned summary text.
r2_score_gpr = gp_regressor(scaled_input, y, imp='Latent GPLVM MICE')

In [None]:
# Display the summary text returned by gp_regressor for the MICE variant.
print(r2_score_gpr)


GPR 10-Fold CV R² Score Latent GPLVM MICE: 0.213 ± 0.046
