# Running regression models on the clusters

In [None]:
#| hide
from nbdev.showdoc import *

## Import Modules and Import Data

In [None]:
#| export
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from tabulate import tabulate
from scipy.stats import ttest_rel, ttest_ind

df_pheno = pd.read_pickle('../../../D/2023-ca4021-bradyd-35-mcdaida-3/data/proc/pheno_cluster.pkl')

In [None]:
df_pheno.columns

Index(['DonAge', 'RecAge', 'DonSex', 'RecSex', 'SexMismatch', 'Year',
       'IntracranialHaemorrhage', 'RecPC1', 'RecHypertensionPRS',
       'DonHypertensionPRS', 'DoneGFRPRS', 'DonStrokePRS', 'RecHAKVPRS',
       'ColdIschemiaTime', 'GraftNo', 'eGFR1Year', 'eGFR5Year',
       'GraftSurvivalDays', 'MClustClusters', 'KamilaClusters'],
      dtype='object')

In [None]:
df_pheno.shape

(1878, 20)

## Data Preparation
1. Convert categorical data to numerical.
2. Dropping features that I am not including for now
3. Remove null values from eGFR1year

In [None]:
df_pheno.dtypes

DonAge                      float64
RecAge                      float64
DonSex                     category
RecSex                     category
SexMismatch                    bool
Year                        float64
IntracranialHaemorrhage        bool
RecPC1                      float64
RecHypertensionPRS          float64
DonHypertensionPRS          float64
DoneGFRPRS                  float64
DonStrokePRS                float64
RecHAKVPRS                  float64
ColdIschemiaTime            float64
GraftNo                     float64
eGFR1Year                   float64
eGFR5Year                   float64
GraftSurvivalDays           float64
MClustClusters             category
KamilaClusters             category
dtype: object

In [None]:
## Changing column MClustClusters from category to type int
## ## Changing column KamilaClusters from category to type int
## Changing column RecSex/DonSex from category to type int
df_pheno['MClustClusters'] = df_pheno['MClustClusters'].astype(int)
df_pheno['KamilaClusters'] = df_pheno['KamilaClusters'].astype(int)
df_pheno['DonSex'].replace(['Male', 'Female'], [0,1], inplace=True)
df_pheno['RecSex'].replace(['Male', 'Female'], [0,1], inplace=True)
df_pheno['DonSex'] = df_pheno['DonSex'].astype(int)
df_pheno['RecSex'] = df_pheno['RecSex'].astype(int)

## Dropping 'eGFR5Year' and 'GraftSurvivalDays' as post transplant features
df_pheno = df_pheno.drop(['eGFR5Year', 'GraftSurvivalDays'], axis = 1)

## remove null values in eGFR 1 year
df_pheno = df_pheno.dropna(subset=['eGFR1Year'])

In [None]:
df_pheno.dtypes

DonAge                     float64
RecAge                     float64
DonSex                       int64
RecSex                       int64
SexMismatch                   bool
Year                       float64
IntracranialHaemorrhage       bool
RecPC1                     float64
RecHypertensionPRS         float64
DonHypertensionPRS         float64
DoneGFRPRS                 float64
DonStrokePRS               float64
RecHAKVPRS                 float64
ColdIschemiaTime           float64
GraftNo                    float64
eGFR1Year                  float64
MClustClusters               int64
KamilaClusters               int64
dtype: object

In [None]:
df_pheno.head()

Unnamed: 0,DonAge,RecAge,DonSex,RecSex,SexMismatch,Year,IntracranialHaemorrhage,RecPC1,RecHypertensionPRS,DonHypertensionPRS,DoneGFRPRS,DonStrokePRS,RecHAKVPRS,ColdIschemiaTime,GraftNo,eGFR1Year,MClustClusters,KamilaClusters
0,25.0,54.0,1,1,True,2000.0,False,-0.00181,0.574557,0.041781,-0.06846,-2.471242,-0.329893,1080.0,1.0,37.016586,3,4
1,37.0,35.0,0,0,True,2001.0,False,-0.009447,-0.37424,0.829304,0.975353,0.069456,0.439058,1110.0,1.0,75.068169,3,1
2,22.0,53.0,1,1,True,2002.0,False,-0.006079,0.050664,0.20412,-0.613161,-0.701665,0.004753,1102.0,1.0,59.60294,3,4
3,48.0,33.0,1,0,False,2002.0,True,-0.005711,-0.178995,0.322234,-0.446593,-0.399453,-1.000807,740.0,1.0,50.073505,1,3
4,39.0,61.0,0,0,True,1999.0,False,-0.010986,-0.090834,-0.195021,1.232662,1.297929,-1.184922,1072.0,1.0,43.704456,3,5


## Evaluation function

In [None]:
#| export
def evaluate_model(predict, y_test):
    ### Calculate the mean squared error
    ## Average squared difference between the observed and the predicted values.
    ### Calculate the absolute errors
    ### Difference between the observed and the predicted values.
    ## R squared score
    ## Explains to what extent the variance of one variable explains the variance of the second variable.
    rmse = np.sqrt(mean_squared_error(y_test, predict))
    errors = abs(predict - y_test)
    mae = np.mean(errors)
    r_square = r2_score(y_test, predict)
    
    table = [['RMSE', 'MAE', 'R Squared'], [rmse, mae, r_square]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    
    ## Return list of errors in an array
    difference = y_test - predict
    error_array = np.array(difference)
    return error_array

# MClust Clusters

In [None]:
## Removing KAMILA clusters
df_mclust = df_pheno.drop(['KamilaClusters'], axis = 1)

In [None]:
df_mclust['MClustClusters'].min(), df_mclust['MClustClusters'].max()

(1, 3)

## Splitting the data based on cluster value

In [None]:
df_mclust_1 = df_mclust.loc[df_mclust.MClustClusters==1]
df_mclust_2 = df_mclust.loc[df_mclust.MClustClusters==2]
df_mclust_3 = df_mclust.loc[df_mclust.MClustClusters==3]

In [None]:
df_mclust_3['MClustClusters'].min(), df_mclust_3['MClustClusters'].max()

(3, 3)

In [None]:
x1 = df_mclust_1.drop(['MClustClusters', 'eGFR1Year'], axis = 1)
y1 = df_mclust_1['eGFR1Year']

x2 = df_mclust_2.drop(['MClustClusters', 'eGFR1Year'], axis = 1)
y2 = df_mclust_2['eGFR1Year']

x3 = df_mclust_3.drop(['MClustClusters', 'eGFR1Year'], axis = 1)
y3 = df_mclust_3['eGFR1Year']


## Splitting each dataset into training and testing data

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.2, random_state=123)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.2, random_state=123)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size=0.2, random_state=123)

## Carrying out Machine learning on Cluster 1
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train1, y_train1)
predicts = lin_reg.predict(x_test1)
lin_error1 = evaluate_model(predicts, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.5732 │ 13.2541 │    0.212628 │
╘═════════╧═════════╧═════════════╛


### Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'normalize': [True, False],
          'fit_intercept': [True, False]}
# create linear regression model
lr = LinearRegression()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lr, param_grid=params, cv=5)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

lin_error1 = evaluate_model(grid_search_pred, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.5732 │ 13.2541 │    0.212628 │
╘═════════╧═════════╧═════════════╛


#### Regularization 

In [None]:
## Apply L1 regularization using Lasso regression:
lasso_model = Lasso(alpha=0.5)
lasso_model.fit(x_train1, y_train1)
y_pred_lasso = lasso_model.predict(x_test1)
lin_error_use1 = evaluate_model(y_pred_lasso, y_test1)

##Lasso regression can help to reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.3766 │ 13.0373 │    0.231199 │
╘═════════╧═════════╧═════════════╛


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train1,y_train1)
predicts = xg_reg.predict(x_test1)
xg_error1 = evaluate_model(predicts, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 26.5064 │ 21.6227 │    -1.01405 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.3, 0.5, 0.7]
}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

xg_error_use1 = evaluate_model(grid_search_pred, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.1537 │ 13.2975 │    0.251986 │
╘═════════╧═════════╧═════════════╛


#### Regularization 

In [None]:
## Apply L1 regularization using the alpha parameter:

xgb_model_l1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=123, n_estimators=1000, max_depth=5, learning_rate=0.1, alpha=0.5, colsample_bytree=0.5)
xgb_model_l1.fit(x_train1,y_train1)
predicts = xgb_model_l1.predict(x_test1)
xg_error1 = evaluate_model(predicts, y_test1)

##By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.5462 │ 13.4405 │    0.215193 │
╘═════════╧═════════╧═════════════╛


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train1,y_train1)
predictions = rf_reg.predict(x_test1)
rf_error1 = evaluate_model(predictions, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.4848 │ 13.6401 │    0.221002 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

rf_error_use1 = evaluate_model(grid_search_pred, y_test1)

╒═════════╤════════╤═════════════╕
│    RMSE │    MAE │   R Squared │
╞═════════╪════════╪═════════════╡
│ 16.1891 │ 13.463 │    0.248698 │
╘═════════╧════════╧═════════════╛


#### Regularization

In [None]:
rf_model_l1 = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=123, ccp_alpha=0.5)
rf_model_l1.fit(x_train1,y_train1)
predictions = rf_model_l1.predict(x_test1)
rf_error1 = evaluate_model(predictions, y_test1)

## We are adding L1 regularization by setting the ccp_alpha parameter to 0.5. 
## The ccp_alpha parameter controls the complexity of the tree and can help to prevent overfitting. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.4258 │ 13.6485 │    0.226568 │
╘═════════╧═════════╧═════════════╛


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train1,y_train1)
predictions = svr_reg.predict(x_test1)
svr_error_use1 = evaluate_model(predictions, y_test1)

╒═════════╤════════╤═════════════╕
│    RMSE │    MAE │   R Squared │
╞═════════╪════════╪═════════════╡
│ 16.3345 │ 12.737 │    0.235147 │
╘═════════╧════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)

grid_search_pred = grid_search.predict(x_test1)

svr_error1 = evaluate_model(grid_search_pred, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.3386 │ 12.8898 │    0.234758 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
svr_model_l1 = SVR(kernel='rbf', C=1.0, epsilon=0.1, coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1, gamma='scale', degree=3)
svr_model_l1.set_params(C=0.5, epsilon=0.2, kernel='linear')
svr_model_l1.fit(x_train1,y_train1)
predictions = svr_model_l1.predict(x_test1)
svr_error1 = evaluate_model(predictions, y_test1)

## We are adding L1 regularization by setting the C parameter to 0.5 and the kernel to 'linear'. 
## We are also setting the epsilon parameter to 0.2 to control the width of the epsilon-insensitive zone. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.3572 │ 12.8447 │    0.233014 │
╘═════════╧═════════╧═════════════╛


## Carrying out Machine learning on Cluster 2
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train2, y_train2)
predicts = lin_reg.predict(x_test2)
lin_error2 = evaluate_model(predicts, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.8311 │ 13.4509 │    0.103644 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'normalize': [True, False],
          'fit_intercept': [True, False]}
# create linear regression model
lr = LinearRegression()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lr, param_grid=params, cv=5)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

lin_error2 = evaluate_model(grid_search_pred, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.6241 │ 13.2563 │    0.125552 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
## Apply L1 regularization using Lasso regression:
lasso_model = Lasso(alpha=0.5)
lasso_model.fit(x_train2, y_train2)
y_pred_lasso = lasso_model.predict(x_test2)
lin_error_use2 = evaluate_model(y_pred_lasso, y_test2)

##Lasso regression can help to reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.5095 │ 13.1655 │    0.137572 │
╘═════════╧═════════╧═════════════╛


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train2,y_train2)
predicts = xg_reg.predict(x_test2)
xg_error2 = evaluate_model(predicts, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 25.2523 │ 20.7985 │    -1.01771 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.3, 0.5, 0.7]
}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

xg_error_use2 = evaluate_model(grid_search_pred, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.8189 │ 13.3597 │    0.104945 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
## Apply L1 regularization using the alpha parameter:

xgb_model_l1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=123, n_estimators=1000, max_depth=5, learning_rate=0.1, alpha=0.5, colsample_bytree=0.5)
xgb_model_l1.fit(x_train2,y_train2)
predicts = xgb_model_l1.predict(x_test2)
xg_error2 = evaluate_model(predicts, y_test2)

##By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.8387 │ 14.2114 │ -0.00689356 │
╘═════════╧═════════╧═════════════╛


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train2,y_train2)
predictions = rf_reg.predict(x_test2)
rf_error2 = evaluate_model(predictions, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.9134 │ 12.8405 │    0.198725 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

rf_error_use2 = evaluate_model(grid_search_pred, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.9673 │ 12.7441 │    0.193284 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
rf_model_l1 = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=123, ccp_alpha=0.5)
rf_model_l1.fit(x_train2,y_train2)
predictions = rf_model_l1.predict(x_test2)
rf_error2 = evaluate_model(predictions, y_test2)

## We are adding L1 regularization by setting the ccp_alpha parameter to 0.5. 
## The ccp_alpha parameter controls the complexity of the tree and can help to prevent overfitting. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.0055 │ 12.8786 │    0.189422 │
╘═════════╧═════════╧═════════════╛


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train2,y_train2)
predictions = svr_reg.predict(x_test2)
svr_error2 = evaluate_model(predictions, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.6197 │ 13.0975 │    0.126021 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)

grid_search_pred = grid_search.predict(x_test2)

svr_error_use2 = evaluate_model(grid_search_pred, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.6954 │ 13.2443 │    0.118041 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
svr_model_l1 = SVR(kernel='rbf', C=1.0, epsilon=0.1, coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1, gamma='scale', degree=3)
svr_model_l1.set_params(C=0.5, epsilon=0.2, kernel='linear')
svr_model_l1.fit(x_train2,y_train2)
predictions = svr_model_l1.predict(x_test2)
svr_error2 = evaluate_model(predictions, y_test2)

## We are adding L1 regularization by setting the C parameter to 0.5 and the kernel to 'linear'. 
## We are also setting the epsilon parameter to 0.2 to control the width of the epsilon-insensitive zone. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒════════╤═════════╤═════════════╕
│   RMSE │     MAE │   R Squared │
╞════════╪═════════╪═════════════╡
│ 16.639 │ 13.1215 │    0.123983 │
╘════════╧═════════╧═════════════╛


## Carrying out Machine learning on Cluster 3
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train3, y_train3)
predicts = lin_reg.predict(x_test3)
lin_error3 = evaluate_model(predicts, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.2227 │ 12.2408 │    0.233061 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'normalize': [True, False],
          'fit_intercept': [True, False]}
# create linear regression model
lr = LinearRegression()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lr, param_grid=params, cv=5)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

lin_error_use3 = evaluate_model(grid_search_pred, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.2227 │ 12.2408 │    0.233061 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
## Apply L1 regularization using Lasso regression:
lasso_model = Lasso(alpha=0.5)
lasso_model.fit(x_train3, y_train3)
y_pred_lasso = lasso_model.predict(x_test3)
lin_error3 = evaluate_model(y_pred_lasso, y_test3)

##Lasso regression can help to reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.1726 │ 12.0822 │    0.237795 │
╘═════════╧═════════╧═════════════╛


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train3,y_train3)
predicts = xg_reg.predict(x_test3)
xg_error3 = evaluate_model(predicts, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 27.7968 │ 22.8583 │    -1.25165 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.3, 0.5, 0.7]
}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

xg_error_use3 = evaluate_model(grid_search_pred, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.0036 │ 12.9782 │    0.157449 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
## Apply L1 regularization using the alpha parameter:

xgb_model_l1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=123, n_estimators=1000, max_depth=5, learning_rate=0.1, alpha=0.5, colsample_bytree=0.5)
xgb_model_l1.fit(x_train3,y_train3)
predicts = xgb_model_l1.predict(x_test3)
xg_error3 = evaluate_model(predicts, y_test3)

##By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤════════╤═════════════╕
│    RMSE │    MAE │   R Squared │
╞═════════╪════════╪═════════════╡
│ 17.6773 │ 13.595 │   0.0893599 │
╘═════════╧════════╧═════════════╛


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train3,y_train3)
predictions = rf_reg.predict(x_test3)
rf_error_use3 = evaluate_model(predictions, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.0738 │ 13.1958 │    0.150485 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

rf_error3 = evaluate_model(grid_search_pred, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.2144 │ 13.2035 │     0.13643 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
rf_model_l1 = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=123, ccp_alpha=0.5)
rf_model_l1.fit(x_train3,y_train3)
predictions = rf_model_l1.predict(x_test3)
rf_error3 = evaluate_model(predictions, y_test3)

## We are adding L1 regularization by setting the ccp_alpha parameter to 0.5. 
## The ccp_alpha parameter controls the complexity of the tree and can help to prevent overfitting. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.1082 │ 13.3525 │    0.147057 │
╘═════════╧═════════╧═════════════╛


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train3,y_train3)
predictions = svr_reg.predict(x_test3)
svr_error3 = evaluate_model(predictions, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.8411 │ 12.8622 │    0.173477 │
╘═════════╧═════════╧═════════════╛


#### Hyperparameter Tuning

In [None]:
## Hyperparameter Tuning
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)

grid_search_pred = grid_search.predict(x_test3)

svr_error_use3 = evaluate_model(grid_search_pred, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.6726 │ 12.6173 │    0.189933 │
╘═════════╧═════════╧═════════════╛


#### Regularization

In [None]:
svr_model_l1 = SVR(kernel='rbf', C=1.0, epsilon=0.1, coef0=0.0, shrinking=True, tol=0.001, cache_size=200, 
                   verbose=False, max_iter=-1, gamma='scale', degree=3)
svr_model_l1.set_params(C=0.5, epsilon=0.2, kernel='linear')
svr_model_l1.fit(x_train3,y_train3)
predictions = svr_model_l1.predict(x_test3)
svr_error3 = evaluate_model(predictions, y_test3)

## We are adding L1 regularization by setting the C parameter to 0.5 and the kernel to 'linear'. 
## We are also setting the epsilon parameter to 0.2 to control the width of the epsilon-insensitive zone. 
## By applying L1 regularization, we can reduce the impact of less important features on the model and improve its performance.

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.7047 │ 12.7459 │    0.186813 │
╘═════════╧═════════╧═════════════╛


# KAMILA Clusters

In [None]:
df_pheno.dtypes

DonAge                     float64
RecAge                     float64
DonSex                       int64
RecSex                       int64
SexMismatch                   bool
Year                       float64
IntracranialHaemorrhage       bool
RecPC1                     float64
RecHypertensionPRS         float64
DonHypertensionPRS         float64
DoneGFRPRS                 float64
DonStrokePRS               float64
RecHAKVPRS                 float64
ColdIschemiaTime           float64
GraftNo                    float64
eGFR1Year                  float64
MClustClusters               int64
KamilaClusters               int64
dtype: object

In [None]:
## Removing MClustClusters
df_kclust = df_pheno.drop(['MClustClusters'], axis = 1)

In [None]:
df_kclust['KamilaClusters'].min(), df_kclust['KamilaClusters'].max()

(1, 5)

## Splitting the data based on the cluster values

In [None]:
df_kclust_1 = df_kclust.loc[df_kclust.KamilaClusters==1]
df_kclust_2 = df_kclust.loc[df_kclust.KamilaClusters==2]
df_kclust_3 = df_kclust.loc[df_kclust.KamilaClusters==3]
df_kclust_4 = df_kclust.loc[df_kclust.KamilaClusters==4]
df_kclust_5 = df_kclust.loc[df_kclust.KamilaClusters==5]

In [None]:
x1 = df_kclust_1.drop(['KamilaClusters', 'eGFR1Year'], axis = 1)
y1 = df_kclust_1['eGFR1Year']

x2 = df_kclust_2.drop(['KamilaClusters', 'eGFR1Year'], axis = 1)
y2 = df_kclust_2['eGFR1Year']

x3 = df_kclust_3.drop(['KamilaClusters', 'eGFR1Year'], axis = 1)
y3 = df_kclust_3['eGFR1Year']

x4 = df_kclust_4.drop(['KamilaClusters', 'eGFR1Year'], axis = 1)
y4 = df_kclust_4['eGFR1Year']

x5 = df_kclust_5.drop(['KamilaClusters', 'eGFR1Year'], axis = 1)
y5 = df_kclust_5['eGFR1Year']

## Splitting each dataset into training and testing data

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.2, random_state=123)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.2, random_state=123)
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size=0.2, random_state=123)
x_train4, x_test4, y_train4, y_test4 = train_test_split(x4, y4, test_size=0.2, random_state=123)
x_train5, x_test5, y_train5, y_test5 = train_test_split(x5, y5, test_size=0.2, random_state=123)

## Cluster 1

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train1, y_train1)
predicts = lin_reg.predict(x_test1)
error = evaluate_model(predicts, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.1373 │ 14.1517 │  -0.0279897 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          'normalize': [True, False],
          'fit_intercept': [True, False]}

# create linear regression model
lasso_model = Lasso()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

lin_error1 = evaluate_model(grid_search_pred, y_test1)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.6767 │ 13.4456 │   0.0265268 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'alpha': 0.1, 'fit_intercept': False, 'normalize': True}


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train1,y_train1)
predicts = xg_reg.predict(x_test1)
xg_error1 = evaluate_model(predicts, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 29.0294 │ 24.6877 │    -1.94973 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Combining regularization and hyperparameter tuning

param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.5],
              'max_depth': [3, 5, 7],
              'reg_alpha': [0.5],
              'reg_lambda': [0.5]}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

xg_error1 = evaluate_model(grid_search_pred, y_test1)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤════════╤═════════════╕
│    RMSE │    MAE │   R Squared │
╞═════════╪════════╪═════════════╡
│ 15.7266 │ 12.915 │    0.134282 │
╘═════════╧════════╧═════════════╛
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 0.5}


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train1,y_train1)
predictions = rf_reg.predict(x_test1)
rf_error1 = evaluate_model(predictions, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.3111 │ 12.7206 │    0.179431 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.5]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)
grid_search_pred = grid_search.predict(x_test1)

rf_error1 = evaluate_model(grid_search_pred, y_test1)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒════════╤═════════╤═════════════╕
│   RMSE │     MAE │   R Squared │
╞════════╪═════════╪═════════════╡
│ 15.335 │ 12.6996 │    0.176858 │
╘════════╧═════════╧═════════════╛
Best parameters: {'ccp_alpha': 0.5, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train1,y_train1)
predictions = svr_reg.predict(x_test1)
svr_error_use1 = evaluate_model(predictions, y_test1)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.4768 │ 14.2982 │  -0.0691224 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['scale']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train1, y_train1)

grid_search_pred = grid_search.predict(x_test1)

svr_error1 = evaluate_model(grid_search_pred, y_test1)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.5512 │ 13.0655 │   0.0411167 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


## Carrying out Machine learning on Cluster 2
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train2, y_train2)
predicts = lin_reg.predict(x_test2)
lin_error2 = evaluate_model(predicts, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.1971 │ 12.7902 │  -0.0365454 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          'normalize': [True, False],
          'fit_intercept': [True, False]}

# create linear regression model
lasso_model = Lasso()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

lin_error2 = evaluate_model(grid_search_pred, y_test2)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.8055 │ 12.1633 │   0.0129758 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'alpha': 1, 'fit_intercept': True, 'normalize': False}


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train2,y_train2)
predicts = xg_reg.predict(x_test2)
xg_error2 = evaluate_model(predicts, y_test2)

╒════════╤═════════╤═════════════╕
│   RMSE │     MAE │   R Squared │
╞════════╪═════════╪═════════════╡
│ 23.597 │ 18.8374 │    -1.20002 │
╘════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Combining regularization and hyperparameter tuning

param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.5],
              'max_depth': [3, 5, 7],
              'reg_alpha': [0.5],
              'reg_lambda': [0.5]}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

xg_error2 = evaluate_model(grid_search_pred, y_test2)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.8753 │ 12.6224 │  0.00423524 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 0.5}


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train2,y_train2)
predictions = rf_reg.predict(x_test2)
rf_error2 = evaluate_model(predictions, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.3685 │ 12.1512 │   0.0667986 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.5]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)
grid_search_pred = grid_search.predict(x_test2)

rf_error2 = evaluate_model(grid_search_pred, y_test2)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.3793 │ 12.1898 │   0.0654917 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'ccp_alpha': 0.5, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train2,y_train2)
predictions = svr_reg.predict(x_test2)
svr_error2 = evaluate_model(predictions, y_test2)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.4487 │ 12.6183 │  -0.0689904 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['scale']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train2, y_train2)

grid_search_pred = grid_search.predict(x_test2)

svr_error2 = evaluate_model(grid_search_pred, y_test2)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.2793 │ 12.4047 │  -0.0470841 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


## Carrying out Machine learning on Cluster 3
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train3, y_train3)
predicts = lin_reg.predict(x_test3)
lin_error3 = evaluate_model(predicts, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.9203 │ 13.4708 │   0.0691534 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          'normalize': [True, False],
          'fit_intercept': [True, False]}

# create linear regression model
lasso_model = Lasso()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

lin_error3 = evaluate_model(grid_search_pred, y_test3)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤════════╤═════════════╕
│    RMSE │    MAE │   R Squared │
╞═════════╪════════╪═════════════╡
│ 18.1133 │ 13.596 │   0.0489944 │
╘═════════╧════════╧═════════════╛
Best parameters: {'alpha': 0.1, 'fit_intercept': False, 'normalize': True}


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train3,y_train3)
predicts = xg_reg.predict(x_test3)
xg_error3 = evaluate_model(predicts, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 27.1018 │ 22.2032 │    -1.12903 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Combining regularization and hyperparameter tuning

param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.5],
              'max_depth': [3, 5, 7],
              'reg_alpha': [0.5],
              'reg_lambda': [0.5]}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

xg_error3 = evaluate_model(grid_search_pred, y_test3)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 19.0387 │ 14.3408 │  -0.0506554 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 0.5}


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train3,y_train3)
predictions = rf_reg.predict(x_test3)
rf_error_use3 = evaluate_model(predictions, y_test3)

╒════════╤═════════╤═════════════╕
│   RMSE │     MAE │   R Squared │
╞════════╪═════════╪═════════════╡
│ 17.913 │ 13.3223 │    0.069916 │
╘════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.5]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)
grid_search_pred = grid_search.predict(x_test3)

rf_error3 = evaluate_model(grid_search_pred, y_test3)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 18.2673 │ 13.7879 │   0.0327552 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'ccp_alpha': 0.5, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train3,y_train3)
predictions = svr_reg.predict(x_test3)
svr_error3 = evaluate_model(predictions, y_test3)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 18.2083 │ 13.7659 │   0.0389936 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['scale']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train3, y_train3)

grid_search_pred = grid_search.predict(x_test3)

svr_error3 = evaluate_model(grid_search_pred, y_test3)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.7225 │ 13.3239 │   0.0895876 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


## Carrying out Machine learning on Cluster 4
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train4, y_train4)
predicts = lin_reg.predict(x_test4)
lin_error4 = evaluate_model(predicts, y_test4)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 18.3882 │ 14.5975 │   -0.344816 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          'normalize': [True, False],
          'fit_intercept': [True, False]}

# create linear regression model
lasso_model = Lasso()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5)
grid_search.fit(x_train4, y_train4)
grid_search_pred = grid_search.predict(x_test4)

lin_error4 = evaluate_model(grid_search_pred, y_test4)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.3739 │ 13.1981 │   -0.200554 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'alpha': 1, 'fit_intercept': True, 'normalize': False}


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train4,y_train4)
predicts = xg_reg.predict(x_test4)
xg_error4 = evaluate_model(predicts, y_test4)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 29.2988 │ 25.2185 │    -2.41417 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Combining regularization and hyperparameter tuning

param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.5],
              'max_depth': [3, 5, 7],
              'reg_alpha': [0.5],
              'reg_lambda': [0.5]}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train4, y_train4)
grid_search_pred = grid_search.predict(x_test4)

xg_error4 = evaluate_model(grid_search_pred, y_test4)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 18.2315 │ 13.6225 │   -0.321991 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 0.5}


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train4,y_train4)
predictions = rf_reg.predict(x_test4)
rf_error4 = evaluate_model(predictions, y_test4)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.8595 │ 12.6498 │   -0.130503 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.5]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train4, y_train4)
grid_search_pred = grid_search.predict(x_test4)

rf_error4 = evaluate_model(grid_search_pred, y_test4)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═══════╤═════════════╕
│    RMSE │   MAE │   R Squared │
╞═════════╪═══════╪═════════════╡
│ 17.1262 │ 13.29 │   -0.166557 │
╘═════════╧═══════╧═════════════╛
Best parameters: {'ccp_alpha': 0.5, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train4,y_train4)
predictions = svr_reg.predict(x_test4)
svr_error4 = evaluate_model(predictions, y_test4)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.4414 │ 13.3035 │   -0.209892 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['scale']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train4, y_train4)

grid_search_pred = grid_search.predict(x_test4)

svr_error4 = evaluate_model(grid_search_pred, y_test4)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 17.1431 │ 13.2313 │   -0.168858 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


## Carrying out Machine learning on Cluster 5
1. Linear Regression
2. XGBoost Regression
3. Random Forest Regression
4. Support Vector Regression

### 1. Linear Regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(x_train5, y_train5)
predicts = lin_reg.predict(x_test5)
lin_error5 = evaluate_model(predicts, y_test5)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.6897 │ 12.4033 │  -0.0370005 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# define hyperparameters to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          'normalize': [True, False],
          'fit_intercept': [True, False]}

# create linear regression model
lasso_model = Lasso()

# use grid search to find best hyperparameters
grid_search = GridSearchCV(lasso_model, param_grid=params, cv=5)
grid_search.fit(x_train5, y_train5)
grid_search_pred = grid_search.predict(x_test5)

lin_error5 = evaluate_model(grid_search_pred, y_test5)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 16.2202 │ 12.9364 │   -0.108311 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'alpha': 1, 'fit_intercept': True, 'normalize': False}


### 2. XGBoost Regression

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(x_train5,y_train5)
predicts = xg_reg.predict(x_test5)
xg_error5 = evaluate_model(predicts, y_test5)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 23.4932 │ 19.5428 │    -1.32507 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Combining regularization and hyperparameter tuning

param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.1, 0.5],
              'max_depth': [3, 5, 7],
              'reg_alpha': [0.5],
              'reg_lambda': [0.5]}

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
grid_search = GridSearchCV(xg_reg, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train5, y_train5)
grid_search_pred = grid_search.predict(x_test5)

xg_error5 = evaluate_model(grid_search_pred, y_test5)

## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.8401 │ 12.6523 │  -0.0569823 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 0.5}


### 3. Random Forest Regression

In [None]:
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_reg.fit(x_train5,y_train5)
predictions = rf_reg.predict(x_test5)
rf_error_use5 = evaluate_model(predictions, y_test5)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.3542 │ 12.2607 │  0.00686932 │
╘═════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'max_features': ['auto','sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.5]
}

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train5, y_train5)
grid_search_pred = grid_search.predict(x_test5)

rf_error5 = evaluate_model(grid_search_pred, y_test5)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.3228 │ 12.1498 │    0.010936 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'ccp_alpha': 0.5, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


### 4. Support Vector Regression

In [None]:
svr_reg = SVR(kernel = 'linear', C=1, epsilon=0.1)
svr_reg.fit(x_train5,y_train5)
predictions = svr_reg.predict(x_test5)
svr_error5 = evaluate_model(predictions, y_test5)

╒════════╤═════════╤═════════════╕
│   RMSE │     MAE │   R Squared │
╞════════╪═════════╪═════════════╡
│ 15.373 │ 12.0874 │  0.00444092 │
╘════════╧═════════╧═════════════╛


### Regularization and Hyperparameter Tuning

In [None]:
# Define the parameter grid for the SVR model
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['scale']
}

svr = SVR()

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train5, y_train5)

grid_search_pred = grid_search.predict(x_test5)

svr_error5 = evaluate_model(grid_search_pred, y_test5)
## Printing the parameters that generated the best results
print("Best parameters:", grid_search.best_params_)

╒═════════╤═════════╤═════════════╕
│    RMSE │     MAE │   R Squared │
╞═════════╪═════════╪═════════════╡
│ 15.4274 │ 12.3896 │ -0.00261412 │
╘═════════╧═════════╧═════════════╛
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


# Evaluating Cluster Prediction Results
1. Analyse cluster results and choose best model for each cluster
2. Average the scores from each cluster to produce errors
3. t-test to determine significant difference between the XGBoost previous and the clustered data.

## 1. Results

Best Model and resulting RMSE and R-Squared for each cluster.
1. RFR - 15.335, 0.176858
2. RFR - 15.3793, 0.0654917
3. SVR - 17.7225, 0.0895876
4. RFR - 17.1262, -0.166557
5. RFR - 15.3228, 0.010936


In [None]:
## Calculating the average RMSE and R-Squared

sumRMSE = 15.335 + 15.3793 + 17.7225 + 17.1262 + 15.3228
averageRMSE = sumRMSE / 5
averageRMSE



16.17716

In [None]:
sumR = 0.176858 + 0.0654917 + 0.0895876 + (-0.166557) + 0.010936
averageR = sumR / 5
averageR

0.03526326

## Student T-Test

In [None]:
XGError = np.array([-1.49930141e+01, -1.05391868e+01, -2.28876797e+01, -5.57648221e+00,
       -8.77451437e+00,  2.48051978e+01,  1.47653985e+00, -6.33486857e+00,
        4.61792870e+01,  6.44005270e+00, -1.32366804e+01, -1.27059778e+01,
       -2.89741814e+00, -1.96761159e+00, -2.00630232e+01,  7.80281890e+00,
       -2.95987886e+01, -1.37410507e+01,  1.01659603e+01,  3.97299456e+00,
        1.38344183e+01, -1.57039601e+01, -3.88069447e+00,  2.27151903e+01,
        2.11823201e+01,  3.09669242e+01,  1.95808960e+01, -2.07234200e+01,
       -1.10437354e+01, -3.01690782e+00,  2.14914611e+01, -5.19517385e+00,
        2.46319871e+01, -3.31543344e+00,  1.80235940e+01, -9.56499256e+00,
       -9.92516499e-01,  4.19898753e+00,  7.36667482e-02,  1.40164645e+01,
       -7.86428163e+00, -1.39020197e+01, -2.18899131e+01, -1.02733649e+01,
       -1.56543003e+01, -2.44553433e+01,  1.37606358e+00, -5.42034620e+00,
       -3.18388223e+01, -1.22763792e+01, -6.31948420e+00,  1.19252618e+01,
       -2.68161443e+00,  1.08115435e+01,  1.61543720e+01, -1.32026894e+01,
       -1.99664792e+01, -1.25157095e+01,  7.84949842e+00, -9.09626558e+00,
        2.83582583e-01,  1.64829847e+01, -1.63520003e+01, -6.80884870e+00,
        7.56051622e+00,  8.61054400e+00, -1.35872271e+01, -3.00533002e+00,
       -8.40634336e+00,  2.36423995e+01, -1.34421109e+01,  2.16712957e-01,
        1.63225581e+01, -1.00933719e+01, -1.27840793e+01,  1.61993888e+01,
       -1.15606374e+01,  2.77314959e+00, -6.30839841e+00, -3.22966041e-01,
        2.19677191e+01, -9.97578772e+00, -1.30084042e+01, -7.12404108e+00,
       -6.58753150e+00,  8.73450175e+00, -1.48527927e-02, -1.95438410e+01,
       -2.53688981e+00, -2.12319329e-01, -2.25116903e+01,  1.49873933e+01,
       -1.76085508e+01, -1.08776291e+01, -3.88609236e-01, -6.21969551e+00,
       -1.89662218e+01, -9.72874792e+00,  1.07893609e+01, -2.06914687e+01,
       -1.35700612e+00, -2.05220639e+01,  2.60096593e+00,  3.29990422e+01,
        5.89199518e+00, -1.24953546e+00,  1.73845348e+01, -4.21492511e+00,
        1.52851166e+01, -6.91719769e+00, -1.33298857e+01,  2.15751711e+01,
        6.88017902e+00, -1.23764581e+01,  2.37540663e+00, -7.35959196e+00,
       -9.13404968e+00,  6.13457601e+00,  1.34922507e+01, -1.62509093e+01,
       -1.25583144e+01,  1.43031578e+01, -9.86035365e+00, -5.04106702e+00,
       -2.42061427e+00,  3.82688791e+00,  1.93612844e+01, -2.76980389e+01,
       -2.27479302e+00, -6.62757928e+00,  1.39671447e+01, -1.32880566e+01,
        2.34751117e+01,  8.35911371e+00,  1.33979070e+00, -2.26170200e+01,
       -3.52700626e+00,  2.09248042e+00, -8.32927635e+00,  1.03610363e+01,
        8.54605225e+00,  2.49819369e+01,  1.47744144e+01,  5.00630885e+00,
        3.20212942e+00, -1.44215769e+01,  1.12022048e+01,  9.21048953e+00,
       -1.51147398e+00,  1.69821621e+01,  1.91685442e+01, -1.22319686e+01,
       -1.33281251e+01, -1.30768485e+01, -2.12449421e+01,  4.26985791e+00,
        4.30423210e+01, -7.01209005e+00,  2.53399401e+01, -1.40437381e+01,
       -2.30243780e+01, -9.53316099e+00, -2.84378407e+01, -9.98043045e+00,
       -1.88538999e+01,  8.36485296e+00, -7.37331318e+00,  1.58260296e+00,
        4.60100547e+00, -3.15724209e+00,  4.83556680e+00,  1.36344079e+01,
        4.24354616e-01, -9.13472577e+00, -1.04129518e+01,  1.35597047e+01,
        4.16861737e+01, -1.60500477e+01, -2.45157770e+01,  4.98308269e+01,
        2.48720811e+01,  1.22703159e+01,  1.86726374e+01, -2.04306458e+01,
        1.42479339e+01, -1.96266053e+01, -5.60695201e+00,  1.94340720e+01,
        2.40868443e+01,  1.76832983e+01,  1.01662904e+01, -8.90735914e+00,
       -5.57777784e+00,  3.49917722e+01, -9.01888824e+00, -6.23040669e+00,
        2.56289658e+01,  7.85200291e+00,  1.79460587e+01, -1.82618168e+01,
       -6.75668686e+00,  5.32840371e+00,  1.93551994e+01,  1.27487651e+01,
       -3.00986259e+01,  2.51873873e+01,  3.22827475e+01, -7.05218826e+00,
       -1.21465570e+00,  6.45410539e+00, -9.27516992e+00,  3.69940489e+00,
        8.09052027e+00,  3.44840142e+00, -2.27953254e+01, -5.88707282e+00,
       -1.45219543e+01,  4.10595158e+00, -6.24087995e+00, -2.16154337e+01,
       -4.85071534e+00, -1.54410664e+00, -2.45990743e+01, -1.93091584e+01,
        2.68235841e+01,  7.91804730e+00, -1.77968565e+01,  1.18970440e+01,
       -5.23247229e+00,  3.45594218e+01,  2.09748079e+01, -1.15721868e+01,
       -2.90559928e+01, -1.42283791e+01,  1.89928025e+01, -7.08926742e+00,
        8.68237321e+00, -7.44196642e-01,  7.84147817e-01, -4.45033396e+00,
       -3.34914994e+00, -1.98789287e+01,  1.62509104e+01, -4.71212337e+00,
       -3.47985020e+00,  7.25475934e+00,  3.55735568e+00,  1.05179621e+01,
       -2.32680968e+00,  4.27055320e+01, -9.27691065e+00,  2.31746986e+01,
       -1.81204600e+01, -5.02341170e-01, -2.26571438e+01, -3.97284827e+01,
       -2.59066582e+01,  1.51269015e+01,  3.84963292e+00, -6.87070045e+00,
       -1.68441822e+01, -5.61200503e+00, -2.48155927e+01, -2.44082756e+01,
       -6.84623535e+00, -1.27387784e+01, -1.36952768e+01,  7.03011931e+00,
       -3.26094271e+00,  5.17194848e+00, -8.53576933e+00,  7.54840700e+00,
       -1.06420851e+01,  3.34901211e+00,  6.70479721e+00,  1.35097442e+01,
       -4.23829637e+00, -3.35112231e+00,  3.33642512e+00, -2.58237802e+01,
        3.99886997e+00, -4.39409822e+00,  5.29665789e+00,  5.80445717e-01,
        1.70016914e+01,  5.95002349e+00, -1.28830396e+01, -9.60932012e+00,
       -7.04938505e+00,  8.14227251e+00,  9.51256744e+00, -2.39737655e+01,
       -2.59570567e+00, -8.33122184e+00, -4.46079856e+00, -1.17284090e+01,
       -1.08156548e+01,  4.67349430e-01,  3.32044820e+01, -1.20561734e+01,
       -9.08896241e+00,  1.32767485e+01, -6.19532068e+00, -7.03304409e+00,
        1.96759471e+01, -1.25877281e+01,  2.04535034e+01,  1.00551005e+01,
       -9.69625254e+00, -6.14030927e+00, -1.48899146e+01, -8.05146086e+00,
        1.20855945e+01, -3.20091939e+00,  5.02771147e+00, -1.31742192e+01,
       -2.09159919e+01,  3.73850873e+00, -5.03880064e+00, -7.35948309e-02,
        2.11833564e+00, -1.69549719e+01, -1.95176674e+00])

### Comparing XGBoost overall to Cluster 1

In [None]:
## Using python package
# Perform t-test
# lin_error = array of errors generated from the linear regression model
# xg_error = array of errors generated from the xgboost regression model

t_statistic, p_value = ttest_ind(rf_error1, XGError)
# Print the results
print("Paired t-test results:")
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value with a significance level alpha (e.g., 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference between the RMSE scores is statistically significant.")
else:
    print("The difference between the RMSE scores is not statistically significant.")

Paired t-test results:
t-statistic: 0.6878684398619253
p-value: 0.49196546273370145
The difference between the RMSE scores is not statistically significant.


In [None]:
## Using python package
# Perform t-test
# lin_error = array of errors generated from the linear regression model
# xg_error = array of errors generated from the xgboost regression model

t_statistic, p_value = ttest_ind(rf_error2, XGError)
# Print the results
print("Paired t-test results:")
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value with a significance level alpha (e.g., 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference between the RMSE scores is statistically significant.")
else:
    print("The difference between the RMSE scores is not statistically significant.")

Paired t-test results:
t-statistic: 1.664455510049037
p-value: 0.09681644661716271
The difference between the RMSE scores is not statistically significant.


In [None]:
## Using python package
# Perform t-test
# lin_error = array of errors generated from the linear regression model
# xg_error = array of errors generated from the xgboost regression model

t_statistic, p_value = ttest_ind(svr_error3, XGError)
# Print the results
print("Paired t-test results:")
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value with a significance level alpha (e.g., 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference between the RMSE scores is statistically significant.")
else:
    print("The difference between the RMSE scores is not statistically significant.")

Paired t-test results:
t-statistic: 1.7216458363376803
p-value: 0.08589727732947233
The difference between the RMSE scores is not statistically significant.


In [None]:
## Using python package
# Perform t-test
# lin_error = array of errors generated from the linear regression model
# xg_error = array of errors generated from the xgboost regression model

t_statistic, p_value = ttest_ind(rf_error4, XGError)
# Print the results
print("Paired t-test results:")
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value with a significance level alpha (e.g., 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference between the RMSE scores is statistically significant.")
else:
    print("The difference between the RMSE scores is not statistically significant.")

Paired t-test results:
t-statistic: 0.7996968085529483
p-value: 0.42441236815203187
The difference between the RMSE scores is not statistically significant.


In [None]:
## Using python package
# Perform t-test
# lin_error = array of errors generated from the linear regression model
# xg_error = array of errors generated from the xgboost regression model

t_statistic, p_value = ttest_ind(rf_error5, XGError)
# Print the results
print("Paired t-test results:")
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value with a significance level alpha (e.g., 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference between the RMSE scores is statistically significant.")
else:
    print("The difference between the RMSE scores is not statistically significant.")

Paired t-test results:
t-statistic: 0.5324319496626041
p-value: 0.5947232288822357
The difference between the RMSE scores is not statistically significant.
