In [163]:
import joblib
import numpy as np
import pandas as pd

In [164]:
preprocesser = joblib.load('../models/preprocesser.pkl')
preprocesser.feature_names_in_

array(['SEX', 'AGE', 'TC', 'HDL', 'SMOKE_', 'BPMED', 'DIAB_noyes'],
      dtype=object)

In [140]:
data = np.load('../data/data.npy')
target = np.load('../data/target.npy')

In [141]:
data, target

(array([[ 0.        , -0.74305822,  0.43577994, ...,  0.        ,
          1.        ,  0.        ],
        [ 1.        , -0.74305822,  0.98855147, ...,  0.        ,
          1.        ,  1.        ],
        [ 1.        , -1.09896301, -0.69279528, ...,  1.        ,
          1.        ,  0.        ],
        ...,
        [ 1.        ,  0.76953712, -0.1169916 , ...,  1.        ,
          0.        ,  0.        ],
        [ 1.        ,  0.85851332, -0.76189172, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , -1.00998681, -0.32428092, ...,  0.        ,
          1.        ,  0.        ]]),
 array([0.011, 0.07 , 0.07 , ..., 0.306, 0.238, 0.01 ]))

In [142]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest

In [143]:
fs = SelectKBest(score_func=mutual_info_regression, k=3)
fs.fit(data, target)

In [144]:
fs.scores_

array([0.08552118, 0.46619246, 0.02691189, 0.03374404, 0.03191114,
       0.04435743, 0.05539569])

In [145]:
# Calculate Mutual Information 
mi = mutual_info_regression(data, target)
print("Mutual information scores:", mi)

Mutual information scores: [0.06635552 0.46367226 0.03247265 0.02985524 0.02606181 0.06259301
 0.05971403]


In [146]:
from sklearn.model_selection import train_test_split

In [147]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

In [148]:
np.save('../data/test_data', X_test)
np.save('../data/test_target', y_test)


### Model list with HPs

In [149]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [150]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet": ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "SVR (Support Vector Regressor)": SVR(),
    "Decision Tree": DecisionTreeRegressor()
}


In [151]:
# Define hyperparameter grids for each model

param_grids = {
    "Linear Regression": {},
    
    "Ridge Regression": {
        "alpha": [0.1, 1, 10, 100],
        "solver": ["auto", "svd", "cholesky", "lsqr", "saga"]
    },
    
    "Lasso Regression": {
        "alpha": [0.1, 1, 10, 100],
        "max_iter": [1000, 5000, 10000]
    },
    
    "ElasticNet": {
        "alpha": [0.1, 1, 10],
        "l1_ratio": [0.2, 0.5, 0.7, 1]
    },
    
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    },
    
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "subsample": [0.8, 1.0]
    },
    
    "AdaBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1]
    },
    
    "SVR (Support Vector Regressor)": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
        "gamma": ["scale", "auto"]
    },
    
    "Decision Tree": {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
}

### Tuning Hyperparameters

In [152]:
from sklearn.model_selection import GridSearchCV

In [153]:
best_models = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best score for {model_name}: {grid_search.best_score_}")
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")


Training Linear Regression...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best score for Linear Regression: -0.003073605452294612
Best parameters for Linear Regression: {}
Training Ridge Regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best score for Ridge Regression: -0.0030736049601034258
Best parameters for Ridge Regression: {'alpha': 0.1, 'solver': 'lsqr'}
Training Lasso Regression...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score for Lasso Regression: -0.01300309213712934
Best parameters for Lasso Regression: {'alpha': 0.1, 'max_iter': 1000}
Training ElasticNet...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score for ElasticNet: -0.006723282915017651
Best parameters for ElasticNet: {'alpha': 0.1, 'l1_ratio': 0.2}
Training Random Forest...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best score for Random Forest: -0.0019521889497138845
Best parameters for Random Forest: {'boo

In [154]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [155]:
# Evaluate the best models using the test data
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"{model_name} Model Performance:")
    print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}")
    print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}")
    print(f"R² Score: {r2_score(y_test, y_pred)}")


Linear Regression Model Performance:
Mean Squared Error (MSE): 0.0028219952250920373
Mean Absolute Error (MAE): 0.04080622923753691
R² Score: 0.761918278297532
Ridge Regression Model Performance:
Mean Squared Error (MSE): 0.0028219845938511233
Mean Absolute Error (MAE): 0.04080582185102784
R² Score: 0.7619191752175971
Lasso Regression Model Performance:
Mean Squared Error (MSE): 0.01187203621983972
Mean Absolute Error (MAE): 0.08856515459510429
R² Score: -0.001601561264624296
ElasticNet Model Performance:
Mean Squared Error (MSE): 0.006225921244000518
Mean Absolute Error (MAE): 0.05916994747725011
R² Score: 0.4747411208297595
Random Forest Model Performance:
Mean Squared Error (MSE): 0.001806113553903352
Mean Absolute Error (MAE): 0.02633500050673137
R² Score: 0.8476246094677751
Gradient Boosting Model Performance:
Mean Squared Error (MSE): 0.0017009039596349394
Mean Absolute Error (MAE): 0.02654705378249661
R² Score: 0.8565007695407337
AdaBoost Model Performance:
Mean Squared Error (M

In [156]:
from sklearn.model_selection import cross_val_score

In [157]:
# Cross-validation with each model (for regression tasks)
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    print(f"{model_name} Cross-validation MSE scores: {-cv_scores}")
    print(f"Mean cross-validation MSE: {-np.mean(cv_scores)}\n")


Linear Regression Cross-validation MSE scores: [0.00308868 0.00311583 0.00279308 0.00306717 0.00330328]
Mean cross-validation MSE: 0.003073605452294612

Ridge Regression Cross-validation MSE scores: [0.00308882 0.00311611 0.00279256 0.00306694 0.00330359]
Mean cross-validation MSE: 0.003073605402699612

Lasso Regression Cross-validation MSE scores: [0.01341233 0.01330362 0.01147547 0.01285508 0.01396895]
Mean cross-validation MSE: 0.01300309213712934

ElasticNet Cross-validation MSE scores: [0.01341233 0.01330362 0.01147547 0.01285508 0.01396895]
Mean cross-validation MSE: 0.01300309213712934

Random Forest Cross-validation MSE scores: [0.00202298 0.00198191 0.00182982 0.00207377 0.00226922]
Mean cross-validation MSE: 0.0020355419689275065

Gradient Boosting Cross-validation MSE scores: [0.00199975 0.00195738 0.00172124 0.00188017 0.00210509]
Mean cross-validation MSE: 0.001932725301147229

AdaBoost Cross-validation MSE scores: [0.00504311 0.00540608 0.00509839 0.00514792 0.00549444]
M

In [158]:
# Select the Best Model
best_model_name = min(best_models, key=lambda k: mean_squared_error(y_test, best_models[k].predict(X_test)))
best_model = best_models[best_model_name]
print(f"Best model: {best_model_name} with MSE: {mean_squared_error(y_test, best_model.predict(X_test))}")


Best model: Gradient Boosting with MSE: 0.0017009039596349394


In [159]:
from sklearn.pipeline import make_pipeline

In [160]:
model_pipeline = make_pipeline(preprocesser, best_model)
model_pipeline

In [161]:
joblib.dump(best_model, "../models/model.pkl")

['../models/model.pkl']

In [162]:
joblib.dump(model_pipeline, "../models/model_pipe.pkl")

['../models/model_pipe.pkl']

In [168]:
# Column names
columns = ['SEX', 'AGE', 'TC', 'HDL', 'SMOKE_', 'BPMED', 'DIAB_noyes']
test_data = np.array([[0, 48, 236, 66, 0, 1, 0]])


# Create DataFrame
df = pd.DataFrame(test_data, columns=columns)
df

Unnamed: 0,SEX,AGE,TC,HDL,SMOKE_,BPMED,DIAB_noyes
0,0,48,236,66,0,1,0
