In [1]:
from pycaret.datasets import get_data
from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.model_selection import train_test_split


# Models

In [2]:
# Load the two PyCaret demo datasets; the tables shown under this cell are their first rows.
data = get_data('diabetes')
data_reg = get_data('insurance')

# Feature matrix: every diabetes column except the classification target.
new_data = data.drop("Class variable", axis=1)

# Select rows and column in ONE .loc call instead of chained indexing
# (`.loc[rows]["charges"]`), which materialises an intermediate frame first.
# NOTE(review): this pairs insurance charges with diabetes rows purely by index
# label, and assumes every label in data.index also exists in data_reg — confirm.
charges = data_reg.loc[data.index, "charges"]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Regression features and their labels; the first CSV column is used as the index in both.
regre = pd.read_csv(
    "../data/regression.csv",
    index_col=0,
)
labels = pd.read_csv(
    "../data/regression_labels.csv",
    index_col=0,
)

## Regression

In [4]:
import BioML.models.regression as regression
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np

In [5]:
# Names of the diagnostic plots of interest (not referenced by the cells visible below).
plot = (
    "learning",
    "error",
    "residuals",
)

In [6]:
# Read one sheet of the esterase feature workbook, using its first column as the index.
excel = pd.read_excel(
    "../data/esterase_features.xlsx",
    sheet_name="ch2_20",
    index_col=0,
)

# Placeholder labels: one consecutive integer (0 .. n_rows - 1) per row of the sheet.
n_rows = excel.shape[0]
label = [i for i in range(n_rows)]

In [7]:
# Pair the feature workbook with the integer labels built in the previous cell.
data = regression.DataParser("../data/esterase_features.xlsx", label)

# PyCaret experiment configuration. The log captured under this cell reports
# seed 200, a budget time of 20 and 3 models to select.
experiment = regression.PycaretInterface(
    "regression",
    200,
    budget_time=20,
    best_model=3,
    output_path="regression_results",
    optimize="RMSE",
)

# Models are ranked by RMSE.
regressor = regression.Regressor(optimize="RMSE")

# NOTE(review): the captured log reports "Number of kfolds: 5" and "Test size: 30";
# confirm against the Trainer signature which parameter the positional 30 binds to.
training = regression.Trainer(experiment, regressor, 5, 30)


31-05-2024 21:57:40 INFO ------------------------------------------------------------------------------
31-05-2024 21:57:40 INFO PycaretInterface parameters
31-05-2024 21:57:40 INFO Seed: 200
31-05-2024 21:57:40 INFO Budget time: 20
31-05-2024 21:57:40 INFO The number of models to select: 3
31-05-2024 21:57:40 INFO Output path: regression_results
31-05-2024 21:57:40 INFO ----------------Trainer inputs-------------------------
31-05-2024 21:57:40 INFO Number of kfolds: 5
31-05-2024 21:57:40 INFO Number of retuning iterations: 50
31-05-2024 21:57:40 INFO Test size: 30


In [8]:
# Cluster-aware splitter built from the cluster TSV, seeded with the experiment seed
# so the split is tied to the same seed as training.
# NOTE(review): the positional 5 is presumably the number of splits — confirm.
c = regression.split.ClusterSpliter(
    "../data/resultsDB_clu.tsv",
    5,
    random_state=experiment.seed,
)

# Split the parsed features into train and test portions with that splitter.
X_train, X_test = c.train_test_split(data.features)

In [9]:
# Train the regression models on X_train and evaluate them against X_test, reusing
# the cluster splitter `c` as the fold strategy. Returns the result tables and the
# trained models; later cells index `results` by tuning status (e.g. "not_tuned").
# NOTE(review): the positional True presumably enables tuning — confirm against
# generate_training_results.
results, models_dict = training.generate_training_results(X_train, data.label, True,
                                                          test_data=X_test, fold_strategy=c)

2024/05/31 21:57:52 INFO mlflow.tracking.fluent: Experiment with name 'Regression' does not exist. Creating a new experiment.
31-05-2024 21:57:53 INFO --------------------------------------------------------
31-05-2024 21:57:53 INFO Training regression models
31-05-2024 21:57:53 INFO The models used ['lr', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'ard', 'par', 'huber', 'svm', 'knn', 'dt', 'rf', 'et', 'ada', 'gbr', 'mlp', 'xgboost', 'catboost', 'dummy']
31-05-2024 21:57:53 INFO The number of models used 22
31-05-2024 21:57:53 INFO Time budget is 20 minutes
31-05-2024 21:57:58 INFO Model ridge trained in 0.091 minutes
31-05-2024 21:58:02 INFO Model lr trained in 0.067 minutes
31-05-2024 21:58:07 INFO Model xgboost trained in 0.076 minutes
31-05-2024 21:58:09 INFO Model llar trained in 0.035 minutes
31-05-2024 21:58:11 INFO Model dt trained in 0.035 minutes
31-05-2024 21:58:13 INFO Model mlp trained in 0.044 minutes
31-05-2024 21:58:15 INFO Model en trained in 0.034 minutes
31-

In [13]:
# Display the first table stored under the not-tuned training results
# (per-model metrics by Split and Fold, as shown in the output below).
results["not_tuned"]["train"][0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Unnamed: 0_level_1,Split,Fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
catboost,CV-Train,Mean,0.1192,0.1091,0.1477,0.1999,0.0106,0.0052
catboost,CV-Train,Std,0.2383,0.2183,0.2955,0.3999,0.0213,0.0103
catboost,CV-Val,Mean,7.4438,371.0364,8.6144,0.0141,0.1954,0.4896
catboost,CV-Val,Std,14.8877,742.0728,17.2287,0.0282,0.3908,0.9793
omp,CV-Train,Mean,30.9773,1354.9729,36.8079,0.1956,0.7452,1.3546
...,...,...,...,...,...,...,...,...
lar,CV-Val,Std,15.1894,3024.6323,23.4350,1.5407,0.4418,5.7405
dt,CV-Train,Mean,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000
dt,CV-Train,Std,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
dt,CV-Val,Mean,48.2105,3460.0692,57.8207,-1.1355,1.0222,2.7474


In [7]:
# Hold-out predictions for the trained models; a later cell indexes the result
# by tuning status when writing the "test_results" sheet.
test_set_predictions = training.generate_holdout_prediction(models_dict)

In [8]:
# Run the experiment's evaluate_model callable over every trained model.
# NOTE(review): "regression_evaluation" is presumably the output location — confirm.
regression.evaluate_all_models(experiment.evaluate_model, models_dict, "regression_evaluation")

In [9]:
# Persist every result table, one output target per tuning status.
training_output = "regression_results"
for tune_status, result_dict in results.items():
    # Same destination for all sheets of this tuning status.
    destination = f"{training_output}/{tune_status}"

    # One sheet per result key; each value is unpacked into positional arguments.
    for key, value in result_dict.items():
        regression.write_results(destination, *value, sheet_name=key)

    # Hold-out predictions of the same tuning status go into their own sheet.
    holdout = test_set_predictions[tune_status]
    regression.write_results(destination, holdout, sheet_name="test_results")

## Save models from pre-trained models

In [None]:
from BioML.models import save_model

In [None]:
# Finalize and save a model that was already trained earlier in this notebook.
generate = save_model.GenerateModel(training)

# Use `models_dict`, the second value returned by training.generate_training_results(...).
# The previous code referenced `models`, which is not bound anywhere above this cell
# (it is only assigned further down the notebook, to a different object), so the cell
# raised NameError when the notebook was run top to bottom.
# NOTE(review): assumes models_dict is keyed by tuning status and then by strategy
# ("tuned" -> "majority") — confirm against generate_training_results.
final_model = generate.finalize_model(models_dict["tuned"]["majority"])

generate.save_model(final_model, "model_output/majority")

## Save models from scratch

In [11]:
from BioML.models import save_model

In [6]:
# Features and labels come from two separate CSV files.
data = save_model.DataParser("../data/regression.csv", "../data/regression_labels.csv")

# PyCaret experiment configuration. The log captured under this cell reports
# seed 200, a budget time of 20 and 3 models to select.
experiment = save_model.PycaretInterface(
    "regression",
    200,
    budget_time=20,
    best_model=3,
    output_path="regression_training",
    optimize="RMSE",
    experiment_name="generate_model",
)

# Restrict training to the "br" and "lr" models, ranked by RMSE.
regressor = save_model.Regressor(
    test_size=0.2,
    optimize="RMSE",
    selected=["br", "lr"],
)

# The captured log reports "Number of kfolds: 5" and "Number of iterations: 30".
training = save_model.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:21:03 INFO ------------------------------------------------------------------------------
23-02-2024 12:21:03 INFO PycaretInterface parameters
23-02-2024 12:21:03 INFO Seed: 200
23-02-2024 12:21:03 INFO Budget time: 20
23-02-2024 12:21:03 INFO The number of models to select: 3
23-02-2024 12:21:03 INFO Output path: regression_training
23-02-2024 12:21:03 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:21:03 INFO Number of kfolds: 5
23-02-2024 12:21:03 INFO Number of iterations: 30


In [7]:
# Sanity check: show which model ids the regressor was restricted to.
regressor.selected

['br', 'lr']

In [10]:
# Train the selected models on the full feature/label set. Only `sorted_models`
# is used by the later cells visible in this notebook.
sorted_results, sorted_models, top_params = training.run_training(data.features, data.label)

2024/02/23 12:22:41 INFO mlflow.tracking.fluent: Experiment with name 'generate_model' does not exist. Creating a new experiment.
23-02-2024 12:22:49 INFO --------------------------------------------------------
23-02-2024 12:22:49 INFO Training regression models
23-02-2024 12:22:49 INFO The models used ['br', 'lr']
23-02-2024 12:22:49 INFO Time budget is 20 minutes
23-02-2024 12:23:34 INFO Training over: Total runtime 0.765 minutes
23-02-2024 12:23:34 INFO Analyse the best models and plotting them
23-02-2024 12:23:34 INFO Analyse the top 1 model: br
23-02-2024 12:23:41 INFO Analyse the top 2 model: lr


In [28]:
# Wrap the trainer so that models can be assembled, finalized and saved.
generate = save_model.GenerateModel(training)

# Combine the sorted models with the "majority" strategy
# (the log below reports a majority voting model being created).
models = generate.train_by_strategy(
    sorted_models,
    "majority",
)

# Finalize the model (the log below mentions training on all data, including
# the test set), then persist it.
final_model = generate.finalize_model(models)
generate.save_model(
    final_model,
    "model_output/majority",
)

26-12-2023 13:40:52 INFO --------Creating an ensemble model--------
26-12-2023 13:40:52 INFO ----------Creating a majority voting model--------------
26-12-2023 13:40:52 INFO fold: 5
26-12-2023 13:40:52 INFO weights: None
26-12-2023 13:40:53 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [1]:
from BioML.models import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [11]:
# --- Prediction configuration ---

# Training data: feature file path and the labels loaded as a frame.
training_features = "../data/regression.csv"
label = pd.read_csv(
    "../data/regression_labels.csv",
    index_col=0,
)

# Data to predict on (here the same file as the training features).
test_features = "../data/regression.csv"

# No outliers are excluded from either set.
outlier_train = ()
outlier_test = ()

# Remaining options passed to the parser, predictor and scaler in the next cells.
sheet_name = None
problem = "regression"
model_path = "model_output/majority"
scaler = "zscore"

In [12]:
# Parser over the training features and labels.
feature = predict.DataParser(
    training_features,
    label,
    outliers=outlier_train,
    sheets=sheet_name,
)

# Read the test features, then drop the configured test outliers.
# Note: `test_features` is rebound here from the value set in the config cell to
# the parsed data, so re-running this cell passes the parsed data to read_features.
parsed_test = feature.read_features(test_features)
test_features = feature.remove_outliers(parsed_test, outlier_test)

# Predict with the saved model for the configured problem type.
predictions = predict.predict(test_features, model_path, problem)

In [5]:
# Scale the training features (feature.drop()) and the test features using the
# scaler named by `scaler`.
# NOTE(review): presumably returns the scaled training data, the fitted scaler(s)
# and the scaled test data, in that order — confirm against predict.scale.
transformed, scaler_dict, test_x = predict.scale(scaler, feature.drop(), test_features)

In [27]:
# Filter the predictions using the scaled training and test features.
# NOTE(review): the meaning of the trailing 1 is not visible here — confirm
# against predict.domain_filter. `filtered_pred` is not used by the next cell.
filtered_pred = predict.domain_filter(predictions, transformed, test_x, 1)

In [None]:
# Keep only the columns whose name contains one of these fragments.
col_name = ["prediction_score", "prediction_label", "AD_number"]

# The fragments are OR-ed into one pattern and matched against the column names.
pattern = "|".join(col_name)
wanted = predictions.columns.str.contains(pattern)

# NOTE(review): this filters `predictions`, not `filtered_pred` from the previous
# cell — confirm that is the intended frame.
predictions = predictions.loc[:, wanted]