In [1]:

from functools import partial
import pandas as pd
import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor as rfr
import shap
import xgboost as xgb
import numpy as np

### Models

In [2]:
from BioML.features import selection
from pycaret.datasets import get_data
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

In [3]:
def hello(a="hello", **kwargs):
    print(a, kwargs)
    if kwargs:
        print("0")

def nested_hello(a="hello", **kwargs):
    hello(a, kwargs)

nested_hello("hello", b="world")

TypeError: hello() takes from 0 to 1 positional arguments but 2 were given

In [3]:
data = get_data('diabetes')
new_data = data.drop("Class variable", axis=1)
labels = data["Class variable"]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
ester_label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
ester = "-../data/esterase_features.xlsx"

## Regression

In [5]:
import BioML.training.classification as classification
from collections import defaultdict
from functools import partial
import pandas as pd
import numpy as np

In [6]:
plot = ("learning", "confusion_matrix", "class_report")

In [7]:
excel = pd.read_excel("../data/esterase_features.xlsx", index_col=0, sheet_name="ch2_20")
label = pd.read_csv("../data/esterase_labels.csv", index_col=0)
l = pd.Series(label["label1"])

In [8]:
data = classification.DataParser("../data/esterase_features.xlsx", "../data/esterase_labels.csv", sheets="ch2_20")
experiment = classification.PycaretInterface("classification", 200, budget_time=20, best_model=3, 
                                  output_path="classification_training", optimize="MCC")
training = classification.Trainer(experiment, 5)
regressor = classification.Classifier(test_size=0.2, optimize="MCC")

26-12-2023 13:18:06 INFO ------------------------------------------------------------------------------
26-12-2023 13:18:06 INFO PycaretInterface parameters
26-12-2023 13:18:06 INFO Seed: 200
26-12-2023 13:18:06 INFO Budget time: 20
26-12-2023 13:18:06 INFO The number of models to select: 3
26-12-2023 13:18:06 INFO Output path: classification_training
26-12-2023 13:18:06 INFO ----------------Trainer inputs-------------------------
26-12-2023 13:18:06 INFO Number of kfolds: 5
26-12-2023 13:18:06 INFO Number of iterations: 30


In [9]:
c = classification.split.ClusterSpliter("../data/resultsDB_clu.tsv", 5, random_state=experiment.seed, test_size=0.2)
X_train, X_test = c.train_test_split(data.features)

In [10]:
results, models = classification.generate_training_results(regressor, training, X_train, data.label, plot=None, tune=True, 
                                                           test_data=X_test, fold_strategy=c)

26-12-2023 13:18:09 INFO --------------------------------------------------------
26-12-2023 13:18:09 INFO Training classification models
26-12-2023 13:18:09 INFO The models used ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'mlp', 'ridge', 'rf', 'qda', 'gbc', 'lda', 'et', 'xgboost', 'catboost', 'dummy']
26-12-2023 13:18:09 INFO Time budget is 20 minutes
26-12-2023 13:18:34 INFO Training over: Total runtime 0.411 minutes
26-12-2023 13:18:34 INFO --------Stacking the best models--------
26-12-2023 13:18:34 INFO ----------Stacking the best models--------------
26-12-2023 13:18:35 INFO --------Creating an ensemble model--------
26-12-2023 13:18:35 INFO ----------Creating a majority voting model--------------
26-12-2023 13:18:35 INFO fold: 5
26-12-2023 13:18:35 INFO weights: None
26-12-2023 13:18:36 INFO --------Retuning the best models--------
26-12-2023 13:18:36 INFO Retuning rbfsvm
26-12-2023 13:18:36 INFO ---------Retuning the best models--------------
26-12-2023 13:18:36 INFO num_iter: 3

In [14]:
experiment.pycaret

5

In [11]:
partial_sort = classification.partial(classification.sort_classification_prediction, optimize="MCC") 
test_set_predictions = classification.generate_test_prediction(models, training, partial_sort)

In [9]:
training_output = "classification_results"
for tune_status, result_dict in results.items():
    for key, value in result_dict.items():
        classification.write_results(f"{training_output}/{tune_status}", *value, sheet_name=key)
    classification.write_results(f"{training_output}/{tune_status}", test_set_predictions[tune_status] , sheet_name=f"test_results")

## Generate Models

In [15]:
from BioML.training import save_model

In [16]:
data = save_model.DataParser("../data/esterase_features.xlsx", "../data/esterase_labels.csv", sheets="ch2_20")
experiment = save_model.PycaretInterface("classification", 200, budget_time=20, best_model=3, 
                                  output_path="classification_training", optimize="MCC", experiment_name="generate_model")
training = save_model.Trainer(experiment, 5, 30)


26-12-2023 13:36:16 INFO ------------------------------------------------------------------------------
26-12-2023 13:36:16 INFO PycaretInterface parameters
26-12-2023 13:36:16 INFO Seed: 200
26-12-2023 13:36:16 INFO Budget time: 20
26-12-2023 13:36:16 INFO The number of models to select: 3
26-12-2023 13:36:16 INFO Output path: classification_training
26-12-2023 13:36:16 INFO ----------------Trainer inputs-------------------------
26-12-2023 13:36:16 INFO Number of kfolds: 5
26-12-2023 13:36:16 INFO Number of iterations: 30


In [18]:
regressor = save_model.Classifier(test_size=0.2, optimize="MCC", selected="lr")

In [15]:
regressor.selected

'lr'

In [19]:
sorted_results, sorted_models, top_params = regressor.run_training(training, data.features, data.label, plot=())

26-12-2023 13:36:48 INFO --------------------------------------------------------
26-12-2023 13:36:48 INFO Training classification models
26-12-2023 13:36:48 INFO The models used ['lr']
26-12-2023 13:36:48 INFO Time budget is 20 minutes
26-12-2023 13:36:51 INFO Training over: Total runtime 0.045 minutes


In [17]:
generate = save_model.GenerateModel(training)
models =  generate.train_by_strategy(sorted_models, "simple:0")
final_model = generate.finalize_model(models)
generate.save_model(final_model, "model_output/logistic")

31-10-2023 13:49:16 - model_training - INFO - ----------Finalizing the model by training it with all the data including test set--------------


Transformation Pipeline and Model Successfully Saved


## Prediction

In [1]:
from BioML import predict
import pandas as pd
import numpy as np
from scipy.spatial import distance

In [2]:
training_features = "../data/esterase_features.xlsx"
label = "../data/esterase_labels.csv"
test_features = "../data/esterase_features.xlsx"
outlier_train=()
outlier_test=()
sheet_name="ch2_20"
problem="classification"
model_path="model_output/svm_model"
scaler="robust"

In [3]:
feature = predict.DataParser(training_features, label, outliers=outlier_train, sheets=sheet_name)
test_features = feature.remove_outliers(feature.read_features(test_features, "ch2_20"), outlier_test)
predictions = predict.predict(test_features, model_path, problem)


In [4]:
transformed, scaler_dict, test_x = predict.scale(scaler, feature.drop(), test_features)
domain = predict.ApplicabilityDomain()
domain0 = predict.domain_filter(predictions, transformed, test_x, 5)

In [5]:
fasta = "../data/whole_sequence.fasta"
res_dir = "prediction_results_domain"

In [6]:
predictions.index = [f"sample_{x}" for x, _ in enumerate(predictions.index)]

In [7]:
col_name = ["prediction_score", "prediction_label", "AD_number"]
predictions = predictions.loc[:, predictions.columns.str.contains("|".join(col_name))]

In [8]:
extractor = predict.FastaExtractor(fasta, res_dir)
positive, negative = extractor.separate_negative_positive(domain0)


In [9]:
extractor.extract(positive, negative, positive_fasta=f"positive.fasta", negative_fasta=f"negative.fasta")