In [1]:

from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np

### Models

In [2]:
from BioML.features import selection
from pycaret.datasets import get_data
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

In [3]:
data = get_data('diabetes')
new_data = data.drop("Class variable", axis=1)
labels = data["Class variable"]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
ester_label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
ester = "-../data/esterase_features.xlsx"

## Classification

In [1]:
import BioML.models.classification as classification
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np

In [2]:
plot = ("learning", "confusion_matrix", "class_report")

In [3]:
excel = pd.read_excel("../data/esterase_features.xlsx", index_col=0, sheet_name="ch2_20")
label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
l = pd.Series(label["label1"])

In [4]:
data = classification.DataParser("../data/esterase_features.xlsx", "../data/esterase_labels.csv", sheets="ch2_20")
experiment = classification.PycaretInterface("classification", 200, budget_time=20, best_model=3, 
                                  output_path="classification_training", optimize="MCC")
classifier = classification.Classifier(test_size=0.2, optimize="MCC", drop=())
training = classification.Trainer(experiment, classifier, 5, 30)


28-03-2024 12:33:06 INFO ------------------------------------------------------------------------------
28-03-2024 12:33:06 INFO PycaretInterface parameters
28-03-2024 12:33:06 INFO Seed: 200
28-03-2024 12:33:06 INFO Budget time: 20
28-03-2024 12:33:06 INFO The number of models to select: 3
28-03-2024 12:33:06 INFO Output path: classification_training
28-03-2024 12:33:06 INFO ----------------Trainer inputs-------------------------
28-03-2024 12:33:06 INFO Number of kfolds: 5
28-03-2024 12:33:06 INFO Number of retuning iterations: 30


In [5]:
c = classification.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed)
X_train, X_test = c.train_test_split(data.features)

In [6]:
results, models = training.generate_training_results(X_train, data.label, tune=True, test_data=X_test, fold_strategy=c)

28-03-2024 12:33:06 INFO --------------------------------------------------------
28-03-2024 12:33:06 INFO Training classification models
28-03-2024 12:33:06 INFO The models used ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy']
28-03-2024 12:33:06 INFO Time budget is 20 minutes
28-03-2024 12:33:09 INFO Model ridge trained in 0.039 minutes
28-03-2024 12:33:11 INFO Model et trained in 0.034 minutes
28-03-2024 12:33:12 INFO Model rbfsvm trained in 0.013 minutes
28-03-2024 12:33:20 INFO Model catboost trained in 0.138 minutes
28-03-2024 12:33:21 INFO Model dt trained in 0.015 minutes
28-03-2024 12:33:22 INFO Model knn trained in 0.018 minutes
28-03-2024 12:33:23 INFO Model lr trained in 0.013 minutes
28-03-2024 12:33:24 INFO Model nb trained in 0.013 minutes
28-03-2024 12:33:25 INFO Model gpc trained in 0.023 minutes
28-03-2024 12:33:26 INFO Model xgboost trained in 0.015 minutes
28-03-2024

In [10]:
results["not_tuned"]["majority"][0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision Score
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CV-Train,Mean,0.797,0.0,0.5742,0.8886,0.6936,0.5523,0.5837,0.0
CV-Train,Std,0.0213,0.0,0.0854,0.0159,0.0637,0.0637,0.0496,0.0
CV-Val,Mean,0.7703,0.0,0.5353,0.8779,0.6455,0.4841,0.5253,0.0
CV-Val,Std,0.0597,0.0,0.1275,0.1119,0.0955,0.0949,0.0812,0.0


In [11]:
results["tuned"]["majority"][0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision Score
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CV-Train,Mean,0.7992,0.0,0.5905,0.8773,0.7027,0.5597,0.5864,0.0
CV-Train,Std,0.017,0.0,0.0763,0.0177,0.0543,0.0532,0.041,0.0
CV-Val,Mean,0.7616,0.0,0.5535,0.8128,0.6364,0.4653,0.4961,0.0
CV-Val,Std,0.0538,0.0,0.1458,0.1267,0.1116,0.1051,0.0867,0.0


In [9]:
results["not_tuned"]["holdout"][0].loc["qda"].loc[("CV-Train", "Mean")]["MCC"]

0.7044

##### Acces the ClaasificationExperiment from Pycaret and play with teh models if you want

In [8]:
experiment.pycaret

<pycaret.classification.oop.ClassificationExperiment at 0x1dd52a826d0>

In [13]:
test_set_predictions = training.generate_holdout_prediction(models)

In [14]:
training_output = "classification_results"
l = []
for tune_status, result_dict in results.items():
    for key, value in result_dict.items():
        classification.write_results(f"{training_output}/{tune_status}", *value, sheet_name=key)
    classification.write_results(f"{training_output}/{tune_status}", test_set_predictions[tune_status] , sheet_name=f"test_results")

## Save Model from pre-trained

In [15]:
from BioML.models import save_model

In [18]:
generate = save_model.GenerateModel(training)
final_model = generate.finalize_model(models["tuned"]["majority"])
generate.save_model(final_model, "model_output/majority")

28-03-2024 12:38:51 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Save Model from scratch

In [12]:
from BioML.models import save_model

In [17]:
data = save_model.DataParser("../data/esterase_features.xlsx", "../data/esterase_labels.csv", sheets="ch2_20")
experiment = save_model.PycaretInterface("classification", 200, budget_time=20, best_model=3, 
                                  output_path="classification_training", optimize="MCC", experiment_name="generate_model")
classifier = save_model.Classifier(test_size=0.2, optimize="MCC", selected=("qda", "rbfsvm", "ridge"))
training = save_model.Trainer(experiment, classifier, 5, 30)


27-03-2024 14:28:52 INFO ------------------------------------------------------------------------------
27-03-2024 14:28:52 INFO PycaretInterface parameters
27-03-2024 14:28:52 INFO Seed: 200
27-03-2024 14:28:52 INFO Budget time: 20
27-03-2024 14:28:52 INFO The number of models to select: 3
27-03-2024 14:28:52 INFO Output path: classification_training
27-03-2024 14:28:52 INFO ----------------Trainer inputs-------------------------
27-03-2024 14:28:52 INFO Number of kfolds: 5
27-03-2024 14:28:52 INFO Number of iterations: 30


In [18]:
regressor.selected

'lr'

In [19]:
sorted_results, sorted_models, top_params = training.run_training(data.features, data.label)

2024/03/27 14:28:55 INFO mlflow.tracking.fluent: Experiment with name 'generate_model' does not exist. Creating a new experiment.
27-03-2024 14:28:56 INFO --------------------------------------------------------
27-03-2024 14:28:56 INFO Training classification models
27-03-2024 14:28:56 INFO The models used ['lr']
27-03-2024 14:28:56 INFO Time budget is 20 minutes
27-03-2024 14:29:01 INFO Model lr trained in 0.092 minutes
27-03-2024 14:29:01 INFO Training over: Total runtime 0.092 minutes


Model lr trained in 0.092 minutes


In [20]:
generate = save_model.GenerateModel(training)
models =  generate.train_by_strategy(sorted_models, "majority")
final_model = generate.finalize_model(models)
generate.save_model(final_model, "model_output/logistic")

27-03-2024 14:29:02 INFO ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [22]:
from BioML.models import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [23]:
training_features = "../data/esterase_features.xlsx"
label = "../data/esterase_labels.csv"
test_features = "../data/esterase_features.xlsx"
outlier_train=()
outlier_test=()
sheet_name="ch2_20"
problem="classification"
model_path="model_output/majority"
scaler="zscore"

#### generate the predictions

In [40]:
feature = predict.DataParser(training_features, label, outliers=outlier_train, sheets=sheet_name)
test_features = feature.remove_outliers(feature.read_features(test_features, "ch2_20"), outlier_test)
predictions = predict.predict(test_features, model_path, problem)


#### Optional if you want to filter predictions based if that test sample is within the applicability domain

Applicability domain compares eucleadian distance between the features from the training and features from the test.   
If the distance is far from a set threshold then that prediction is discarded since it deviates from the samples the model have seen during training

In [41]:
transformed, scaler_dict, test_x = predict.scale(scaler, feature.drop(), test_features)
filtered_pred = predict.domain_filter(predictions, transformed, test_x, 5) # it returns the predictions appended to the features

In [44]:
filtered_pred.shape, predictions.shape

((120, 20), (147, 19))

In [None]:
 # if you don't apply the domain filter, you can just use the predictions but you have to change the index to sample_0, sample_1, etc
predictions.index = [f"sample_{x}" for x, _ in enumerate(predictions.index)]
col_name = ["prediction_score", "prediction_label", "AD_number"]
predictions = predictions.loc[:, predictions.columns.str.contains("|".join(col_name))] # only keep the columns with the prediction scores

### Separate the fasta file into positive or negative

In [34]:
fasta = "../data/whole_sequence.fasta"
res_dir = "prediction_results_domain"

In [35]:
extractor = predict.FastaExtractor(fasta, res_dir)
positive, negative = extractor.separate_negative_positive(predictions)


In [36]:
extractor.extract(positive, negative, positive_fasta=f"positive.fasta", negative_fasta=f"negative.fasta")