In [1]:

from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np

### Models

In [2]:
from BioML.features import selection
from pycaret.datasets import get_data
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

In [3]:
# Load the PyCaret "diabetes" demo table, then separate the target column
# (`labels`) from the remaining feature columns (`new_data`).
data = get_data('diabetes')
labels = data["Class variable"]
new_data = data.drop(columns="Class variable")

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Esterase labels table; the first CSV column becomes the index.
ester_label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
# Path to the esterase feature workbook, spelled like every other reference to this
# file in the notebook (no stray leading "-", which would not resolve to the file).
ester = "../data/esterase_features.xlsx"

## Classification

In [5]:
import BioML.models.classification as classification
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np

In [6]:
# Names of the diagnostic plots of interest.
# NOTE(review): `plot` is not passed to anything in the visible cells — confirm
# whether it was meant to be handed to the PyCaret interface.
plot = (
    "learning",
    "confusion_matrix",
    "class_report",
)

In [7]:
# Read the "ch2_20" sheet of the feature workbook and the labels table,
# each indexed by its first column.
excel = pd.read_excel(
    "../data/esterase_features.xlsx",
    sheet_name="ch2_20",
    index_col=0,
)
label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
# Keep the "label1" column as a standalone Series.
l = pd.Series(label["label1"])

In [8]:
# Parse the "ch2_20" feature sheet together with the labels file.
data = classification.DataParser(
    "../data/esterase_features.xlsx",
    "../data/esterase_labels.csv",
    sheets="ch2_20",
)
# PyCaret wrapper for a classification run; the log output reports 200 as the
# seed, a budget time of 20 and 3 models to select.
experiment = classification.PycaretInterface(
    "classification",
    200,
    budget_time=20,
    best_model=3,
    output_path="classification_training",
    optimize="MCC",
)
# NOTE(review): despite its name, `regressor` holds a Classifier.
regressor = classification.Classifier(test_size=0.2, optimize="MCC")
# The log output reports 5 as the number of kfolds and 30 as the number of iterations.
training = classification.Trainer(experiment, regressor, 5, 30)


23-02-2024 13:36:43 INFO ------------------------------------------------------------------------------
23-02-2024 13:36:43 INFO PycaretInterface parameters
23-02-2024 13:36:43 INFO Seed: 200
23-02-2024 13:36:43 INFO Budget time: 20
23-02-2024 13:36:43 INFO The number of models to select: 3
23-02-2024 13:36:43 INFO Output path: classification_training
23-02-2024 13:36:43 INFO ----------------Trainer inputs-------------------------
23-02-2024 13:36:43 INFO Number of kfolds: 5
23-02-2024 13:36:43 INFO Number of iterations: 30


In [9]:
# Splitter built from the cluster table, seeded with the experiment's seed.
# NOTE(review): the positional 5 is presumably the number of splits — confirm
# against ClusterSpliter's signature.
c = classification.split.ClusterSpliter(
    "../data/resultsDB_clu.tsv",
    5,
    random_state=experiment.seed,
    test_size=0.2,
)
# Divide the parsed features into training and test partitions.
X_train, X_test = c.train_test_split(data.features)

In [10]:
# Train (with tuning enabled) on the training partition, handing over the test
# partition and reusing the cluster splitter `c` as the fold strategy.
results, models = training.generate_training_results(
    X_train,
    data.label,
    tune=True,
    test_data=X_test,
    fold_strategy=c,
)

2024/02/23 13:36:47 INFO mlflow.tracking.fluent: Experiment with name 'Classification' does not exist. Creating a new experiment.
23-02-2024 13:36:48 INFO --------------------------------------------------------
23-02-2024 13:36:48 INFO Training classification models
23-02-2024 13:36:48 INFO The models used ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'mlp', 'ridge', 'rf', 'qda', 'gbc', 'lda', 'et', 'xgboost', 'catboost', 'dummy']
23-02-2024 13:36:48 INFO Time budget is 20 minutes
23-02-2024 13:37:31 INFO Training over: Total runtime 0.72 minutes
23-02-2024 13:37:31 INFO Analyse the best models and plotting them
23-02-2024 13:37:31 INFO Analyse the top 1 model: qda
23-02-2024 13:37:33 INFO Analyse the top 2 model: et
23-02-2024 13:37:35 INFO Analyse the top 3 model: rbfsvm
23-02-2024 13:37:36 INFO --------Stacking the best models--------
23-02-2024 13:37:36 INFO ----------Stacking the best models--------------
23-02-2024 13:37:39 INFO --------Creating an ensemble model--------
23-02-2024

##### Access the ClassificationExperiment from PyCaret and play with the models if you want

In [8]:
# Bare expression so the notebook displays the underlying PyCaret experiment
# object held by the interface (a ClassificationExperiment, per the cell output).
experiment.pycaret

<pycaret.classification.oop.ClassificationExperiment at 0x1dd52a826d0>

In [11]:
# Hold-out predictions for the trained models; the result is later indexed by
# tune status when the results are written out.
test_set_predictions = training.generate_holdout_prediction(models)

In [14]:
# Write the training results: for every tuning status, each result entry goes to
# its own sheet, followed by the matching hold-out predictions.
training_output = "classification_results"
for tune_status, result_dict in results.items():
    # Every sheet for this tuning status shares one output location.
    status_output = f"{training_output}/{tune_status}"
    for key, value in result_dict.items():
        # `value` is unpacked into the positional arguments of write_results.
        classification.write_results(status_output, *value, sheet_name=key)
    classification.write_results(status_output, test_set_predictions[tune_status], sheet_name="test_results")

## Generate Models

In [32]:
from BioML.models import save_model

In [33]:
# Parse the "ch2_20" feature sheet together with the labels file.
data = save_model.DataParser(
    "../data/esterase_features.xlsx",
    "../data/esterase_labels.csv",
    sheets="ch2_20",
)
# PyCaret wrapper for the model-generation run, tracked under its own experiment name.
experiment = save_model.PycaretInterface(
    "classification",
    200,
    budget_time=20,
    best_model=3,
    output_path="classification_training",
    optimize="MCC",
    experiment_name="generate_model",
)
# Restrict training to the "lr" model.
# NOTE(review): despite its name, `regressor` holds a Classifier.
regressor = save_model.Classifier(
    test_size=0.2,
    optimize="MCC",
    selected="lr",
)
# The log output reports 5 as the number of kfolds and 30 as the number of iterations.
training = save_model.Trainer(experiment, regressor, 5, 30)


23-02-2024 13:09:43 INFO ------------------------------------------------------------------------------
23-02-2024 13:09:43 INFO PycaretInterface parameters
23-02-2024 13:09:43 INFO Seed: 200
23-02-2024 13:09:43 INFO Budget time: 20
23-02-2024 13:09:43 INFO The number of models to select: 3
23-02-2024 13:09:43 INFO Output path: classification_training
23-02-2024 13:09:43 INFO ----------------Trainer inputs-------------------------
23-02-2024 13:09:43 INFO Number of kfolds: 5
23-02-2024 13:09:43 INFO Number of iterations: 30


In [34]:
# Display the model selection stored on the Classifier ('lr', per the cell output).
regressor.selected

'lr'

In [35]:
# Train on the full parsed feature/label set; per the log output only 'lr' is used.
# Returns the sorted results, the sorted models and the top parameters.
sorted_results, sorted_models, top_params = training.run_training(data.features, data.label)

23-02-2024 13:09:47 INFO --------------------------------------------------------
23-02-2024 13:09:47 INFO Training classification models
23-02-2024 13:09:47 INFO The models used ['lr']
23-02-2024 13:09:47 INFO Time budget is 20 minutes
23-02-2024 13:09:52 INFO Training over: Total runtime 0.077 minutes
23-02-2024 13:09:52 INFO Analyse the best models and plotting them
23-02-2024 13:09:52 INFO Analyse the top 1 model: lr


In [36]:
# Build the final model from the sorted training output and persist it.
generate = save_model.GenerateModel(training)
# NOTE(review): "simple:0" presumably selects the first sorted model as-is — confirm
# against train_by_strategy's documentation.
models = generate.train_by_strategy(
    sorted_models,
    "simple:0",
)
# Per the log output, finalizing retrains with all the data, including the test set.
final_model = generate.finalize_model(models)
generate.save_model(
    final_model,
    "model_output/logistic",
)

23-02-2024 13:09:58 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [37]:
from BioML.models import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [42]:
# --- Prediction configuration ---
# Input files (the same workbook serves as both training and test features here).
training_features = "../data/esterase_features.xlsx"
label = "../data/esterase_labels.csv"
test_features = "../data/esterase_features.xlsx"

# Samples to exclude from each set (none).
outlier_train = ()
outlier_test = ()

# Workbook sheet, task type, saved model location and scaling method.
sheet_name = "ch2_20"
problem = "classification"
model_path = "model_output/logistic"
scaler = "zscore"

In [43]:
# Parser over the training features/labels, given the training outliers and sheet.
feature = predict.DataParser(training_features, label, outliers=outlier_train, sheets=sheet_name)
# Read the test sheet through the configured `sheet_name` (rather than a second
# hard-coded copy of the sheet name), then remove the test outliers.
# NOTE(review): `test_features` is rebound from a file path to the loaded table, so
# re-running this cell without first re-running the config cell passes a table,
# not a path, to read_features.
test_features = feature.remove_outliers(feature.read_features(test_features, sheet_name), outlier_test)
# Predict on the test features with the saved model.
predictions = predict.predict(test_features, model_path, problem)


In [44]:
# Scale the training features (from `feature.drop()`) and the test features
# with the configured scaler.
transformed, scaler_dict, test_x = predict.scale(
    scaler,
    feature.drop(),
    test_features,
)
# NOTE(review): `domain` is not used by any visible cell — confirm whether
# domain_filter was meant to receive it.
domain = predict.ApplicabilityDomain()
# NOTE(review): the trailing 5 is presumably a threshold for the filter — confirm
# its meaning against domain_filter's signature.
domain0 = predict.domain_filter(
    predictions,
    transformed,
    test_x,
    5,
)

In [45]:
# FASTA file passed to the extractor, and the directory name it is given for results.
fasta = "../data/whole_sequence.fasta"
res_dir = "prediction_results_domain"

In [46]:
# Relabel the prediction rows positionally as sample_0, sample_1, ...
predictions.index = [f"sample_{position}" for position in range(len(predictions.index))]

In [47]:
# Keep only the columns whose name contains one of these substrings.
col_name = ["prediction_score", "prediction_label", "AD_number"]
# The names are OR-ed into a single pattern for `str.contains`.
column_pattern = "|".join(col_name)
keep_mask = predictions.columns.str.contains(column_pattern)
predictions = predictions.loc[:, keep_mask]

In [48]:
# Extractor over the FASTA file, given the results directory.
extractor = predict.FastaExtractor(fasta, res_dir)
# Split the domain-filtered predictions into positive and negative groups.
# NOTE(review): this consumes `domain0`, not the re-indexed/column-filtered
# `predictions` from the preceding cells — confirm that is intended.
positive, negative = extractor.separate_negative_positive(domain0)


In [49]:
# Write the positive and negative groups to their FASTA files. The file names are
# plain string literals: they contain no placeholders, so no f-string is needed.
extractor.extract(
    positive,
    negative,
    positive_fasta="positive.fasta",
    negative_fasta="negative.fasta",
)