In [1]:
from pycaret.datasets import get_data
from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.model_selection import train_test_split


### Models

In [2]:
# Load the two pycaret demo datasets; the cell output shows the first rows of each.
data = get_data("diabetes")
data_reg = get_data("insurance")

# Diabetes frame without its "Class variable" column.
new_data = data.drop(columns="Class variable")
# Insurance "charges" looked up by the row labels of the diabetes frame.
charges = data_reg.loc[data.index, "charges"]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Read the regression labels and features from CSV, using the first column as the index.
labels = pd.read_csv("../data/regression_labels.csv", index_col=0)
regre = pd.read_csv("../data/regression.csv", index_col=0)

## Regression

In [3]:
import BioML.models.regression as regression
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np

In [7]:
# Names of three plot kinds, kept as a tuple.
# NOTE(review): not referenced by any other visible cell — confirm whether it is still needed.
plot = "learning", "error", "residuals"

In [4]:
# Sheet "ch2_20" of the esterase workbook, with the first column used as the index.
excel = pd.read_excel("../data/esterase_features.xlsx", sheet_name="ch2_20", index_col=0)
# Labels are simply the consecutive integers 0..n-1, one per row of the sheet.
label = [*range(excel.shape[0])]

In [12]:
# Assemble the regression experiment: parsed features/labels, the PyCaret wrapper,
# the regressor settings and the trainer that ties them together.
# NOTE(review): `label` was sized from sheet "ch2_20" of this workbook, but no sheet is
# passed to DataParser here — confirm it reads the same sheet so the lengths match.
# NOTE(review): this rebinds `data`, which an earlier cell used for the diabetes frame.
data = regression.DataParser("../data/esterase_features.xlsx", label)
# Per the log output below: seed 200, budget time 20, 3 models to select,
# output path "regression_training".
experiment = regression.PycaretInterface("regression", 200, budget_time=20, best_model=3, 
                                        output_path="regression_training", optimize="RMSE")
# RMSE is the optimisation metric; test_size=0.2 is presumably the held-out fraction — TODO confirm.
regressor = regression.Regressor(test_size=0.2, optimize="RMSE")
# Per the log output below: 5 k-folds and 30 iterations.
training = regression.Trainer(experiment, regressor, 5, 30)


26-12-2023 13:25:00 INFO ------------------------------------------------------------------------------
26-12-2023 13:25:00 INFO PycaretInterface parameters
26-12-2023 13:25:00 INFO Seed: 200
26-12-2023 13:25:00 INFO Budget time: 20
26-12-2023 13:25:00 INFO The number of models to select: 3
26-12-2023 13:25:00 INFO Output path: regression_training
26-12-2023 13:25:00 INFO ----------------Trainer inputs-------------------------
26-12-2023 13:25:00 INFO Number of kfolds: 5
26-12-2023 13:25:00 INFO Number of iterations: 30


In [13]:
# Split the parsed features into train and test sets with the cluster-based splitter,
# seeded with the experiment's seed so the split matches the rest of the run.
# NOTE(review): the positional 5 is presumably the number of splits/folds — confirm against ClusterSpliter.
c = regression.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed, test_size=0.2)
X_train, X_test = c.train_test_split(data.features)

In [None]:
# Train on the cluster-based training split, passing the held-out split as test data and
# the same splitter object as the fold strategy.
# NOTE(review): the positional True presumably switches on hyper-parameter tuning
# (results is later iterated by "tune_status") — confirm against the Trainer API.
results, models_dict = training.generate_training_results(X_train, data.label, True,
                                                          test_data=X_test, fold_strategy=c)

In [16]:
# Hold-out predictions for the trained models; indexed by tuning status when written out below.
test_set_predictions = training.generate_holdout_prediction(models_dict)

In [None]:
# Run the experiment's evaluate_model on every trained model, with "regression_evaluation"
# as the target (presumably the output folder — TODO confirm).
regression.evaluate_all_models(experiment.evaluate_model, models_dict, "regression_evaluation")

In [6]:
# Persist the training results: for each tuning status, write every result entry to
# "<training_output>/<tune_status>" under a sheet named after its key, then add the
# hold-out predictions for that status as the "test_results" sheet.
training_output = "regression_results"
for tune_status, result_dict in results.items():
    destination = f"{training_output}/{tune_status}"
    for sheet, tables in result_dict.items():
        regression.write_results(destination, *tables, sheet_name=sheet)
    regression.write_results(destination, test_set_predictions[tune_status], sheet_name="test_results")

## Generate Models

In [11]:
from BioML.models import save_model

In [6]:
# Same experiment set-up as the regression section, but through the save_model module
# and on the CSV features/labels.
data = save_model.DataParser("../data/regression.csv", "../data/regression_labels.csv")
# Per the log output below: seed 200, budget time 20, 3 models to select,
# output path "regression_training"; runs are tracked under the "generate_model" experiment.
experiment = save_model.PycaretInterface("regression", 200, budget_time=20, best_model=3, 
                                  output_path="regression_training", optimize="RMSE", experiment_name="generate_model")
# Restrict training to the "br" and "lr" models (see the "The models used" log line further down).
regressor = save_model.Regressor(test_size=0.2, optimize="RMSE", selected=["br", "lr"])
# Per the log output below: 5 k-folds and 30 iterations.
training = save_model.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:21:03 INFO ------------------------------------------------------------------------------
23-02-2024 12:21:03 INFO PycaretInterface parameters
23-02-2024 12:21:03 INFO Seed: 200
23-02-2024 12:21:03 INFO Budget time: 20
23-02-2024 12:21:03 INFO The number of models to select: 3
23-02-2024 12:21:03 INFO Output path: regression_training
23-02-2024 12:21:03 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:21:03 INFO Number of kfolds: 5
23-02-2024 12:21:03 INFO Number of iterations: 30


In [7]:
# Sanity check: display the models the regressor was restricted to.
regressor.selected

['br', 'lr']

In [10]:
# Train the selected models on the full parsed features and labels.
# NOTE(review): the three return values are presumably ranked results, ranked models and
# the best parameters — confirm against Trainer.run_training.
sorted_results, sorted_models, top_params = training.run_training(data.features, data.label)

2024/02/23 12:22:41 INFO mlflow.tracking.fluent: Experiment with name 'generate_model' does not exist. Creating a new experiment.
23-02-2024 12:22:49 INFO --------------------------------------------------------
23-02-2024 12:22:49 INFO Training regression models
23-02-2024 12:22:49 INFO The models used ['br', 'lr']
23-02-2024 12:22:49 INFO Time budget is 20 minutes
23-02-2024 12:23:34 INFO Training over: Total runtime 0.765 minutes
23-02-2024 12:23:34 INFO Analyse the best models and plotting them
23-02-2024 12:23:34 INFO Analyse the top 1 model: br
23-02-2024 12:23:41 INFO Analyse the top 2 model: lr


In [28]:
# Build the final model from the trained ones and save it.
generate = save_model.GenerateModel(training)
# "majority" strategy: the log reports creation of a majority voting ensemble.
models =  generate.train_by_strategy(sorted_models, "majority")
# Per the log, finalizing retrains the model on all the data, including the test set.
final_model = generate.finalize_model(models)
# Saved under "model_output/majority"; the prediction section loads it via model_path.
generate.save_model(final_model, "model_output/majority")

26-12-2023 13:40:52 INFO --------Creating an ensemble model--------
26-12-2023 13:40:52 INFO ----------Creating a majority voting model--------------
26-12-2023 13:40:52 INFO fold: 5
26-12-2023 13:40:52 INFO weights: None
26-12-2023 13:40:53 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [1]:
from BioML.models import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [11]:
# Configuration for the prediction section.
# NOTE(review): training and test features point at the same file, so the predictions
# below are made on the training data — confirm this is intended.
training_features = "../data/regression.csv"
label = pd.read_csv("../data/regression_labels.csv", index_col=0)
test_features = "../data/regression.csv"
# No outliers are removed from either set.
outlier_train=()
outlier_test=()
sheet_name=None
problem="regression"
# Same path the majority model was saved to in the model-generation section.
model_path="model_output/majority"
scaler="zscore"

In [12]:
# Parse the training features/labels, read the test features, drop the listed outliers
# from them and predict with the saved model.
feature = predict.DataParser(training_features, label, outliers=outlier_train, sheets=sheet_name)
# NOTE(review): this rebinds `test_features` from the path string set in the config cell to
# the value returned by remove_outliers, so re-running this cell on its own passes that
# value (not a path) to read_features. Re-run the config cell first.
test_features = feature.remove_outliers(feature.read_features(test_features), outlier_test)
predictions = predict.predict(test_features, model_path, problem)

In [14]:
# Display the parsed training frame; the output shows the feature columns plus a "target" column.
feature.features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,target
0,0,1.876796,0.756989,0.279969,0.725767,0.481009,1.355638,-1.244655,0.413435,0.869606,...,-1.125489,2.445752,0.129221,0.223884,1.496044,-0.773789,-0.055585,0.109395,-1.778720,109.170595
1,0,-1.457551,-1.406317,-0.160133,-0.796026,1.076007,0.760056,-0.752156,0.082440,-1.504720,...,0.671340,0.213197,-0.751969,0.021312,1.340450,-0.309209,0.115026,-0.319054,0.319175,-250.636124
2,0,-1.662492,-0.134309,-0.308034,-0.209222,-1.683438,-1.748532,1.126705,1.304340,0.793489,...,0.779661,1.310309,1.395684,-0.805870,-0.410814,1.032546,-0.214921,-0.562168,-1.090966,2.603758
3,0,-0.756795,-1.046911,0.455888,0.268592,1.528468,0.718953,1.501334,0.996048,1.185704,...,2.165002,-0.643518,0.927840,0.507836,-0.250833,-1.421811,0.556230,0.057013,-0.322680,128.397377
4,0,-0.401220,0.519347,1.451144,0.183342,2.189803,0.401712,0.012592,0.690144,-0.108760,...,0.959271,2.153182,-0.767348,-0.808298,-0.773010,0.224092,0.497998,0.872321,0.097676,31.433719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,-0.334501,-0.792521,2.122156,-0.707669,0.443819,0.865755,-0.653329,-1.200296,0.504987,...,1.032465,-1.519370,-0.484234,0.774634,0.404982,-0.474945,0.917862,1.266911,1.765454,-95.318477
96,0,0.622850,-1.594428,-1.534114,0.115675,1.179297,0.046981,-0.142379,-0.450065,0.005244,...,1.277677,0.332314,-0.748487,0.067518,0.514439,-1.067620,-1.124642,1.551152,0.120296,-43.333232
97,0,-1.331233,0.133541,-0.006071,-0.290275,0.267392,0.956702,0.507991,-0.785989,0.708109,...,0.838491,0.081829,-0.098890,0.321698,-2.152891,-1.836205,2.493000,0.919076,-1.103367,-235.883143
98,0,0.513085,-0.971657,1.188913,-0.881875,-0.163067,0.862393,0.516178,0.953125,-0.626717,...,0.708304,0.351448,1.070150,-0.744903,0.431923,0.725096,0.754291,-0.026521,-0.641482,100.310892


In [5]:
# Scale the training features (feature.drop()) and the test features with the configured scaler.
# NOTE(review): presumably the scaler is fitted on the training features and then applied to
# the test features — confirm against predict.scale.
transformed, scaler_dict, test_x = predict.scale(scaler, feature.drop(), test_features)

In [12]:
# Instantiate the applicability-domain helper.
# NOTE(review): `domain` is not used by any other visible cell — confirm whether it is still needed.
domain = predict.ApplicabilityDomain()

In [9]:
# Display the predictions; the output shows the input features plus a "prediction_label" column.
predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,prediction_label
0,0,1.876796,0.756989,0.279969,0.725767,0.481009,1.355638,-1.244655,0.413435,0.869606,...,-1.125489,2.445752,0.129221,0.223884,1.496044,-0.773789,-0.055585,0.109395,-1.778720,109.689812
1,0,-1.457551,-1.406317,-0.160133,-0.796026,1.076007,0.760056,-0.752156,0.082440,-1.504720,...,0.671340,0.213197,-0.751969,0.021312,1.340450,-0.309209,0.115026,-0.319054,0.319174,-253.080261
2,0,-1.662492,-0.134309,-0.308034,-0.209222,-1.683438,-1.748532,1.126705,1.304340,0.793489,...,0.779661,1.310309,1.395684,-0.805870,-0.410814,1.032546,-0.214921,-0.562168,-1.090966,-2.699476
3,0,-0.756795,-1.046911,0.455888,0.268592,1.528468,0.718953,1.501334,0.996048,1.185704,...,2.165002,-0.643518,0.927840,0.507836,-0.250833,-1.421811,0.556230,0.057013,-0.322680,125.710091
4,0,-0.401220,0.519347,1.451144,0.183342,2.189803,0.401712,0.012592,0.690144,-0.108760,...,0.959271,2.153183,-0.767348,-0.808298,-0.773010,0.224092,0.497998,0.872321,0.097676,23.891327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,-0.334501,-0.792521,2.122156,-0.707669,0.443819,0.865755,-0.653329,-1.200296,0.504987,...,1.032465,-1.519370,-0.484234,0.774634,0.404982,-0.474945,0.917862,1.266911,1.765454,-91.321083
96,0,0.622850,-1.594428,-1.534114,0.115675,1.179297,0.046981,-0.142379,-0.450065,0.005244,...,1.277677,0.332314,-0.748487,0.067518,0.514439,-1.067620,-1.124642,1.551152,0.120296,-36.768723
97,0,-1.331233,0.133541,-0.006071,-0.290275,0.267392,0.956702,0.507991,-0.785989,0.708109,...,0.838491,0.081829,-0.098890,0.321698,-2.152891,-1.836205,2.493000,0.919077,-1.103367,-234.532959
98,0,0.513085,-0.971657,1.188913,-0.881875,-0.163067,0.862393,0.516178,0.953125,-0.626717,...,0.708304,0.351448,1.070150,-0.744903,0.431923,0.725096,0.754291,-0.026521,-0.641482,107.006805


In [27]:
# Filter the predictions using the scaled training and test features.
# NOTE(review): the meaning of the trailing 1 is not visible here — confirm against
# predict.domain_filter and consider naming it.
domain0 = predict.domain_filter(predictions, transformed, test_x, 1)