In [3]:
from pycaret.datasets import get_data
from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.model_selection import train_test_split


### Models

In [4]:
# Fetch the two PyCaret demo datasets; the recorded output shows the first rows of each.
data = get_data('diabetes')
data_reg = get_data('insurance')

# Diabetes frame without its "Class variable" column.
new_data = data.drop(columns="Class variable")

# "charges" values of the insurance rows whose index labels match the diabetes index.
charges = data_reg.loc[data.index, "charges"]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Read the regression label file and the regression feature file.
# In both cases the first CSV column becomes the DataFrame index.
labels = pd.read_csv("../data/regression_labels.csv", index_col=0)
regre = pd.read_csv("../data/regression.csv", index_col=0)

## Regression

In [1]:
import BioML.models.regression as regression
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np
from pycaret.regression import RegressionExperiment

In [None]:
# Presumably the plot types to request from the training interface — TODO confirm;
# no other cell shown in this notebook reads `plot`.
plot = ("learning", "error", "residuals")

In [3]:
# Load the "ch2_20" sheet of the esterase feature workbook; its first column becomes the index.
excel = pd.read_excel("../data/esterase_features.xlsx", index_col=0, sheet_name="ch2_20")
# One consecutive integer (0 .. n-1) per row of the sheet.
label = list(range(len(excel)))

In [4]:
# Show how many labels were generated (one per row of the sheet loaded above).
len(label)

147

In [5]:
# Wrap the insurance frame in the project's DataParser.
# Note that `data` previously held the diabetes frame; it is rebound here.
data = regression.DataParser(data_reg)
# PyCaret wrapper for a regression experiment. The recorded log reports seed 200,
# budget time 20, 3 models to select and "regression_results" as the output path.
experiment = regression.PycaretInterface("regression", 200, budget_time=20, best_model=3, 
                                        output_path="regression_results", optimize="RMSE")
# NOTE(review): the regressor is told to optimize "NDCG" while the experiment above
# is given optimize="RMSE" — confirm the two settings are meant to differ.
regressor = regression.Regressor(optimize="NDCG")
# The recorded log reports 5 as the number of k-folds.
training = regression.Trainer(experiment, regressor, 5)


01-06-2024 23:37:14 INFO ------------------------------------------------------------------------------
01-06-2024 23:37:14 INFO PycaretInterface parameters
01-06-2024 23:37:14 INFO Seed: 200
01-06-2024 23:37:14 INFO Budget time: 20
01-06-2024 23:37:14 INFO The number of models to select: 3
01-06-2024 23:37:14 INFO Output path: regression_results
01-06-2024 23:37:14 INFO ----------------Trainer inputs-------------------------
01-06-2024 23:37:14 INFO Number of kfolds: 5
01-06-2024 23:37:14 INFO Number of retuning iterations: 50
01-06-2024 23:37:14 INFO Test size: 0.2


In [11]:
# Split the parsed features with the project's ClusterSpliter, seeded from the experiment.
# NOTE(review): the recorded output of this cell is
#   ValueError: Found input variables with inconsistent numbers of samples: [1338, 0]
# Presumably the cluster file does not describe the samples held in `data` — confirm
# which feature table belongs to "../data/resultsDB_clu.tsv" before re-running.
c = regression.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed)
X_train, X_test = c.train_test_split(data.features)

ValueError: Found input variables with inconsistent numbers of samples: [1338, 0]

In [6]:
# Train on the insurance frame; "charges" is presumably the label column — TODO confirm.
# NOTE(review): the third positional argument (False) is presumably the tuning switch —
# verify against Trainer.generate_training_results.
results, models_dict = training.generate_training_results(data_reg, "charges", False)

01-06-2024 23:37:19 INFO --------------------------------------------------------
01-06-2024 23:37:19 INFO Training regression models
01-06-2024 23:37:19 INFO The models used ['lr', 'lasso', 'ridge', 'en', 'lar', 'llar', 'omp', 'br', 'ard', 'par', 'huber', 'svm', 'knn', 'dt', 'rf', 'et', 'ada', 'gbr', 'mlp', 'xgboost', 'catboost', 'dummy']
01-06-2024 23:37:19 INFO The number of models used 22
01-06-2024 23:37:19 INFO Time budget is 20 minutes
01-06-2024 23:37:24 INFO Model ridge trained in 0.087 minutes
01-06-2024 23:37:28 INFO Model lr trained in 0.063 minutes
01-06-2024 23:37:32 INFO Model xgboost trained in 0.071 minutes
01-06-2024 23:37:35 INFO Model llar trained in 0.037 minutes
01-06-2024 23:37:37 INFO Model dt trained in 0.037 minutes
01-06-2024 23:37:41 INFO Model mlp trained in 0.065 minutes
01-06-2024 23:37:43 INFO Model en trained in 0.031 minutes
01-06-2024 23:37:45 INFO Model br trained in 0.034 minutes
01-06-2024 23:37:46 INFO Model lasso trained in 0.031 minutes
01-06-20

In [None]:
# Display the whole results mapping returned by the training cell.
results

In [7]:
# First element of the "train" entry for the untuned models; the recorded output is a
# per-model table of CV-Train / CV-Val mean and std metrics.
results["not_tuned"]["train"][0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,NDCG
Unnamed: 0_level_1,Split,Fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gbr,CV-Train,Mean,2000.5400,1.320064e+07,3631.7598,0.9081,0.3275,0.2354,0.8889
gbr,CV-Train,Std,95.4788,7.587107e+05,104.7157,0.0047,0.0123,0.0137,0.0216
gbr,CV-Val,Mean,2545.7444,2.170922e+07,4642.5606,0.8484,0.4231,0.2879,0.8970
gbr,CV-Val,Std,224.7497,3.636617e+06,394.7798,0.0188,0.0276,0.0148,0.0875
catboost,CV-Train,Mean,1319.0178,5.470697e+06,2338.2056,0.9619,0.2257,0.1579,0.9768
...,...,...,...,...,...,...,...,...,...
svm,CV-Val,Std,617.0308,2.261710e+07,907.3702,0.0434,0.0306,0.0680,0.0993
mlp,CV-Train,Mean,11261.4279,2.475842e+08,15732.4942,-0.7228,2.3073,0.8259,0.6158
mlp,CV-Train,Std,205.0196,8.473238e+06,269.9014,0.0212,0.0388,0.0010,0.0436
mlp,CV-Val,Mean,11268.0921,2.484096e+08,15733.2889,-0.7359,2.2946,0.8242,0.7062


In [7]:
# Hold-out predictions for the trained models; later indexed by tuning status when written out.
test_set_predictions = training.generate_holdout_prediction(models_dict)

In [8]:
# Run the experiment's evaluate_model over every trained model;
# "regression_evaluation" is presumably the output folder — TODO confirm.
regression.evaluate_all_models(experiment.evaluate_model, models_dict, "regression_evaluation")

In [9]:
# Write every result table, plus the hold-out predictions, to one destination per tuning status.
training_output = "regression_results"
for tune_status, result_dict in results.items():
    # All sheets of one tuning status share the same destination, so build it once.
    destination = f"{training_output}/{tune_status}"
    for key, value in result_dict.items():
        # `value` is unpacked into positional arguments; the sheet is named after its key.
        regression.write_results(destination, *value, sheet_name=key)
    # Hold-out predictions of the same tuning status go to the "test_results" sheet
    # (plain string: the original f-string had no placeholders).
    regression.write_results(destination, test_set_predictions[tune_status], sheet_name="test_results")

## Save models from pre-trained results

In [None]:
from BioML.models import save_model

In [None]:
# Wrap the trainer so a trained model can be finalized and saved.
generate = save_model.GenerateModel(training)
# NOTE(review): `models` is not assigned by any earlier cell shown in this notebook
# (a later cell assigns it from train_by_strategy). The training cell above returns
# `models_dict` — presumably that mapping is what should be indexed here. Confirm, and
# confirm that a "tuned" entry exists for the way training was run.
final_model = generate.finalize_model(models["tuned"]["majority"])
# Save the finalized model under "model_output/majority".
generate.save_model(final_model, "model_output/majority")

## Save models from scratch

In [11]:
from BioML.models import save_model

In [6]:
# Parse the regression features and labels straight from their CSV paths.
data = save_model.DataParser("../data/regression.csv", "../data/regression_labels.csv")
# PyCaret wrapper; the recorded log reports seed 200, budget time 20, 3 models to select
# and "regression_training" as the output path.
experiment = save_model.PycaretInterface("regression", 200, budget_time=20, best_model=3, 
                                  output_path="regression_training", optimize="RMSE", experiment_name="generate_model")
# Restrict training to the "br" and "lr" models, optimizing RMSE.
regressor = save_model.Regressor(test_size=0.2, optimize="RMSE", selected=["br", "lr"])
# The recorded log reports 5 k-folds and 30 iterations for these two positional arguments.
training = save_model.Trainer(experiment, regressor, 5, 30)


23-02-2024 12:21:03 INFO ------------------------------------------------------------------------------
23-02-2024 12:21:03 INFO PycaretInterface parameters
23-02-2024 12:21:03 INFO Seed: 200
23-02-2024 12:21:03 INFO Budget time: 20
23-02-2024 12:21:03 INFO The number of models to select: 3
23-02-2024 12:21:03 INFO Output path: regression_training
23-02-2024 12:21:03 INFO ----------------Trainer inputs-------------------------
23-02-2024 12:21:03 INFO Number of kfolds: 5
23-02-2024 12:21:03 INFO Number of iterations: 30


In [7]:
# Check which model ids the regressor was restricted to (recorded output: ['br', 'lr']).
regressor.selected

['br', 'lr']

In [10]:
# Train the selected models on the parsed features and labels. The recorded log shows
# "br" and "lr" being trained and then analysed as the top 1 and top 2 models.
sorted_results, sorted_models, top_params = training.run_training(data.features, data.label)

2024/02/23 12:22:41 INFO mlflow.tracking.fluent: Experiment with name 'generate_model' does not exist. Creating a new experiment.
23-02-2024 12:22:49 INFO --------------------------------------------------------
23-02-2024 12:22:49 INFO Training regression models
23-02-2024 12:22:49 INFO The models used ['br', 'lr']
23-02-2024 12:22:49 INFO Time budget is 20 minutes
23-02-2024 12:23:34 INFO Training over: Total runtime 0.765 minutes
23-02-2024 12:23:34 INFO Analyse the best models and plotting them
23-02-2024 12:23:34 INFO Analyse the top 1 model: br
23-02-2024 12:23:41 INFO Analyse the top 2 model: lr


In [28]:
# Wrap the trainer so models can be combined, finalized and saved.
generate = save_model.GenerateModel(training)
# Combine the trained models with the "majority" strategy; the recorded log reports
# the creation of a majority voting model.
models =  generate.train_by_strategy(sorted_models, "majority")
# Per the recorded log, finalizing retrains the model on all data, including the test set.
final_model = generate.finalize_model(models)
# Save the finalized model under "model_output/majority".
generate.save_model(final_model, "model_output/majority")

26-12-2023 13:40:52 INFO --------Creating an ensemble model--------
26-12-2023 13:40:52 INFO ----------Creating a majority voting model--------------
26-12-2023 13:40:52 INFO fold: 5
26-12-2023 13:40:52 INFO weights: None
26-12-2023 13:40:53 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [1]:
from BioML.models import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [11]:
# --- Inputs -----------------------------------------------------------------
# Training features (path), training labels (loaded frame, first column as index)
# and the features to predict on (path; same file as the training features here).
training_features = "../data/regression.csv"
label = pd.read_csv("../data/regression_labels.csv", index_col=0)
test_features = "../data/regression.csv"

# --- Outliers and sheet selection --------------------------------------------
# Empty tuples: no outliers are listed for either set.
outlier_train = ()
outlier_test = ()
sheet_name = None

# --- Prediction settings ------------------------------------------------------
problem = "regression"
model_path = "model_output/majority"
scaler = "zscore"

In [12]:
# Parse the training features and labels, passing the training outliers and sheet selection.
feature = predict.DataParser(training_features, label, outliers=outlier_train, sheets=sheet_name)
# Read the test features and drop the listed test outliers.
# NOTE(review): `test_features` is rebound here from a file path to the parsed features,
# so running this cell a second time would pass the parsed object to read_features —
# presumably a separate name is wanted; confirm.
test_features = feature.remove_outliers(feature.read_features(test_features), outlier_test)
# Predict on the test features with the model stored at `model_path`.
predictions = predict.predict(test_features, model_path, problem)

In [5]:
# Scale with the configured scaler ("zscore"); feature.drop() presumably returns the
# training features without the label column — TODO confirm.
transformed, scaler_dict, test_x = predict.scale(scaler, feature.drop(), test_features)

In [27]:
# Apply the project's domain filter to the predictions, using the scaled training and
# test features. NOTE(review): the meaning of the last argument (1) is not visible here —
# confirm against predict.domain_filter.
filtered_pred = predict.domain_filter(predictions, transformed, test_x, 1)

In [None]:
# Keep only the columns whose name contains one of these substrings; the names are
# joined with "|" and matched by str.contains.
col_name = ["prediction_score", "prediction_label", "AD_number"]
# NOTE(review): this filters `predictions`, not the `filtered_pred` produced by the
# domain-filter cell, and overwrites `predictions` in place — confirm which frame is
# meant and whether "AD_number" exists on it.
predictions = predictions.loc[:, predictions.columns.str.contains("|".join(col_name))] 