# let's install pycaret !


In [1]:
# Install PyCaret
#!pip install pycaret
#!pip install --upgrade pycaret #if you have installed beta version in past, run the below code to upgrade
import os, sys
sys.path.append('/home/maurizio/miniconda3/envs/mytorch/lib/python3.10/site-packages')
sys.path.append('/home/maurizio/miniconda3/envs/mytorch/lib')

import numpy as np
import pandas as pd
import os, sys
from IPython.display import display

from pycaret.utils import version

# PyCaret version
version()

'3.3.2'

In [2]:
from pycaret.classification import *

In [5]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from typing import List
def load_features(filenames: List[str] = [], fixnans= [], normalizes=[], colname: str="label", 
                  verbose: bool = False, show_progress: bool = False) -> pd.DataFrame:
    """
    Load and assemble features and labels for machine learning tasks.

    :param List[str] features: List of feature filepaths
    :param str colname: Name of the column in the label file to be used as the target variable. Default is "label".
    :param int seed: Random seed for reproducibility. Default is 1.
    :param bool verbose: Whether to print verbose messages during processing. Default is False.
    :param bool show_progress: Whether to print progress bar while loading file. Default is False.

    :returns: Tuple containing the assembled features (X) and labels (Y) DataFrames.
    :rtype: Tuple[pd.DataFrame, pd.DataFrame]
        
    :example:

    .. code-block:: python

        colname = "target_column"
        seed = 1
        verbose = False

        df_label = pd.read_csv("label_file.csv2, index_col=0)
        X, Y = load_features(['path/to/feature_file1.csv', 'path/to/feature_file2.csv'], fix_na=True, colname, seed, verbose)
    """

    # Common indices among labels and features
    x = pd.DataFrame()

    # Process each feature file
    for f,fixna,norm in zip(filenames, fixnans, normalizes):
        feat_df = pd.read_csv(f, index_col=0)
        feat_df.index = feat_df.index.map(str)
        fname = os.path.basename(f).rsplit('.', 1)[0]

        # Handle missing values if required
        if verbose:
            cntnan = feat_df.isna().sum().sum()
            print(f"[{fname}] found {cntnan} Nan...")
        if fixna:
            if verbose:
                print(f"[{fname}] Fixing NaNs with mean ...")
            feat_df = feat_df.fillna(feat_df.mean())

        # Normalize features
        if norm == 'std':
            scaler = MinMaxScaler()
            if verbose:
                print(f"[{fname}] Normalization with {norm} ...")
            feat_df = pd.DataFrame(scaler.fit_transform(feat_df), index=feat_df.index, columns=feat_df.columns)
        elif norm == 'max':
            scaler = StandardScaler()
            if verbose:
                print(f"[{fname}] Normalization with {norm}...")
            feat_df = pd.DataFrame(scaler.fit_transform(feat_df), index=feat_df.index, columns=feat_df.columns)
        else:
            if verbose:
                print(f"[{fname}] No normalization...")

        # merge features features
        x = pd.merge(x, feat_df, left_index=True, right_index=True, how='outer')

    # Return the assembled features (X) and labels (Y)
    return x

# Load the dataset and split

In [6]:
from sklearn.model_selection import train_test_split
path = '/home/maurizio/PLOS_CompBiology/HELP/data/'
attributes = load_features([os.path.join(path, 'Kidney_BIO.csv'), 
                            #os.path.join(path, 'Kidney_CCcfs.csv'),
                            os.path.join(path, 'Kidney_EmbN2V_128.csv')], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=True)
label = pd.read_csv(os.path.join(path,'Kidney_HELP.csv'), index_col=0).replace({'aE':'NE', 'sNE': 'NE'})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

[Kidney_BIO] found 52532 Nan...
[Kidney_BIO] Fixing NaNs with mean ...
[Kidney_BIO] Normalization with std ...
[Kidney_EmbN2V_128] found 0 Nan...
[Kidney_EmbN2V_128] Fixing NaNs with mean ...
[Kidney_EmbN2V_128] Normalization with std ...


# Start tuning session

In [7]:
clf1 = setup(data = train, 
             target = 'label',
             numeric_imputation = 'mean',
             categorical_features = [], session_id = 444,
             fold_strategy = "stratifiedkfold", fold=5,
             #ignore_features = ['Name','Ticket','Cabin'],
             verbose = True)

Unnamed: 0,Description,Value
0,Session id,444
1,Target,label
2,Target type,Binary
3,Target mapping,"E: 0, NE: 1"
4,Original data shape,"(12927, 155)"
5,Transformed data shape,"(12927, 155)"
6,Transformed train set shape,"(9048, 155)"
7,Transformed test set shape,"(3879, 155)"
8,Numeric features,154
9,Preprocess,True


# Define our model

In [15]:
veLGBM(boosting_type='gbdt')

In [17]:
from sklearn.base import clone, BaseEstimator
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier 
from sklearn.preprocessing import LabelEncoder
class veLGBM(BaseEstimator):

    def __init__(self, n_voters=10, voting='soft', n_jobs=-1, verbose=False, random_state=42, boosting_type:str='gbdt', learning_rate:float=0.1):
        # intialize ensemble ov voters
        self.voting = voting
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.n_voters = n_voters
        self.learning_rate = learning_rate
        self.boosting_type = boosting_type
        self.estimators_ = [LGBMClassifier(verbose=-1, random_state=random_state, boosting_type=boosting_type, learning_rate=learning_rate) for i in range(n_voters)]
    
    def __sklearn_clone__(self):
        return self

    def _fit_single_estimator(self, i, X, y, index_ne, index_e):
        """Private function used to fit an estimator within a job."""
        df_X = np.append(X[index_ne], X[index_e], axis=0)
        df_y = np.append(y[index_ne], y[index_e], axis=0)
        clf = clone(self.estimators_[i])
        clf.fit(df_X, df_y)
        return clf
    
    def fit(self, X, y):
        # Find the majority and minority class
        #assert (isinstance(X, np.ndarray) or isinstance(X, pd.DataFrame)) and (isinstance(y, np.ndarray) or isinstance(y, pd.DataFrame)), "Only array or pandas dataframe input!"
        X = X.values
        encoder = LabelEncoder()
        y = encoder.fit_transform(y.values.ravel())

        unique, counts = np.unique(y, return_counts=True)
        minlab = unique[np.argmin(counts)]
        maxlab = unique[np.argmax(counts)]

        if self.verbose:
            print(f"Majority {maxlab} {max(counts)}, minority {minlab} {min(counts)}")

        # Separate majority and minority class
        all_index_ne = np.where(y == maxlab)[0]
        index_e = np.where(y == minlab)[0]

        # Split majority class among voters
        if self.random_state >= 0:
            np.random.seed(self.random_state)
            np.random.shuffle(all_index_ne)
            np.random.shuffle(index_e)
        splits = np.array_split(all_index_ne, self.n_voters)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit_single_estimator)(i,X, y, index_ne, index_e) 
                                                        for i,index_ne in enumerate(splits))
        return self
    
    def predict_proba(self, X, y=None):
        # Find the majority and minority class
        #assert isinstance(X, np.ndarray) or isinstance(X, pd.DataFrame), "Only array or pandas dataframe input!"
        X = X.values
        probabilities = np.array([self.estimators_[i].predict_proba(X) for i in range(self.n_voters)])
        return np.sum(probabilities, axis=0)/self.n_voters
    
    def predict(self, X, y=None):
        #assert isinstance(X, np.ndarray) or isinstance(X, pd.DataFrame), "Only array or pandas dataframe input!"
        X = X.values
        probabilities = np.array([self.estimators_[i].predict_proba(X) for i in range(self.n_voters)])
        return np.argmax(np.sum(probabilities, axis=0)/self.n_voters, axis=1)
    
velgbm = veLGBM()
# train using create_model
#velgbm_trained = create_model(velgbm)

In [9]:
from sklearn.metrics import *
from imblearn.metrics import specificity_score
try: 
    [remove_metric(m) for m in ['Precision', 'F1', 'Kappa']]  # remove unused metrics
    add_metric('Sensitivity', 'Sensitivity', specificity_score, greater_is_better = True)
    add_metric('Specificity', 'Specificity', recall_score, greater_is_better = True)
    add_metric('Balanced Accuracy', 'BA', balanced_accuracy_score, greater_is_better = True)
    #add_metric('ROC-AUC', 'ROC-AUC', roc_auc_score, greater_is_better = True, multiclass=False)
except: 
    pass
get_metrics()

Unnamed: 0_level_0,Name,Display Name,Score Function,Scorer,Target,Args,Greater is Better,Multiclass,Custom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
acc,Accuracy,Accuracy,<function accuracy_score at 0x7fba5041c940>,accuracy,pred,{},True,True,False
auc,AUC,AUC,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(roc_auc_score, response_method=('d...",pred_proba,"{'average': 'weighted', 'multi_class': 'ovr'}",True,True,False
recall,Recall,Recall,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(recall_score, response_method='pre...",pred,{'average': 'weighted'},True,True,False
mcc,MCC,MCC,<function matthews_corrcoef at 0x7fba5041cee0>,"make_scorer(matthews_corrcoef, response_method...",pred,{},True,True,False
Sensitivity,Sensitivity,Sensitivity,<pycaret.internal.metrics.EncodedDecodedLabels...,"make_scorer(specificity_score, response_method...",pred,{},True,True,True
Specificity,Specificity,Specificity,<pycaret.internal.metrics.EncodedDecodedLabels...,"make_scorer(recall_score, response_method='pre...",pred,{},True,True,True
Balanced Accuracy,BA,BA,<pycaret.internal.metrics.EncodedDecodedLabels...,"make_scorer(balanced_accuracy_score, response_...",pred,{},True,True,True


In [100]:
from sklearn.metrics import balanced_accuracy_score
classifiers = [veLGBM(), 'lightgbm', 'xgboost', 'ada', 'rf', 'dt', 'gbc', 'lda', 'lr', 'et', 'svm']
#add_metric('Balanced Accuracy', 'BA', balanced_accuracy_score, greater_is_better = True) 
results = compare_models(include=classifiers, sort='BA')

Unnamed: 0,Model,Accuracy,AUC,Recall,MCC,Specificity,BA,TT (Sec)
0,veLGBM,0.9022,0.0,0.9022,0.5634,0.8717,0.8882,47.49
9,Logistic Regression,0.9497,0.9477,0.9497,0.6022,0.5761,0.7777,3.566
11,SVM - Linear Kernel,0.9415,0.9269,0.9415,0.56,0.5717,0.7712,3.062
8,Linear Discriminant Analysis,0.9224,0.8413,0.9224,0.4796,0.5716,0.7609,19.502
2,Extreme Gradient Boosting,0.9529,0.9505,0.9529,0.6004,0.5009,0.7448,42.216
1,Light Gradient Boosting Machine,0.953,0.9534,0.953,0.5993,0.4902,0.7399,17.828
3,Ada Boost Classifier,0.9415,0.9267,0.9415,0.5265,0.4932,0.7351,18.872
6,CatBoost Classifier,0.9514,0.9527,0.9514,0.5845,0.4797,0.7342,290.044
5,Decision Tree Classifier,0.9213,0.7234,0.9213,0.4355,0.4916,0.7234,16.874
7,Gradient Boosting Classifier,0.949,0.9429,0.949,0.5581,0.4465,0.7176,90.886


Processing:   0%|          | 0/53 [00:00<?, ?it/s]

In [105]:
df = pull()

In [114]:
results.get_params()

{'n_jobs': -1,
 'n_voters': 10,
 'random_state': 42,
 'verbose': False,
 'voting': 'soft'}

In [115]:
df.to_csv("pycaret_best_classifier_metrics.csv", index=True)

In [109]:
print(df.to_latex())

\begin{tabular}{llrrrrrrr}
\toprule
 & Model & Accuracy & AUC & Recall & MCC & Specificity & BA & TT (Sec) \\
\midrule
0 & veLGBM & 0.902200 & 0.000000 & 0.902200 & 0.563400 & 0.871700 & 0.888200 & 47.490000 \\
9 & Logistic Regression & 0.949700 & 0.947700 & 0.949700 & 0.602200 & 0.576100 & 0.777700 & 3.566000 \\
11 & SVM - Linear Kernel & 0.941500 & 0.926900 & 0.941500 & 0.560000 & 0.571700 & 0.771200 & 3.062000 \\
8 & Linear Discriminant Analysis & 0.922400 & 0.841300 & 0.922400 & 0.479600 & 0.571600 & 0.760900 & 19.502000 \\
2 & Extreme Gradient Boosting & 0.952900 & 0.950500 & 0.952900 & 0.600400 & 0.500900 & 0.744800 & 42.216000 \\
1 & Light Gradient Boosting Machine & 0.953000 & 0.953400 & 0.953000 & 0.599300 & 0.490200 & 0.739900 & 17.828000 \\
3 & Ada Boost Classifier & 0.941500 & 0.926700 & 0.941500 & 0.526500 & 0.493200 & 0.735100 & 18.872000 \\
6 & CatBoost Classifier & 0.951400 & 0.952700 & 0.951400 & 0.584500 & 0.479700 & 0.734200 & 290.044000 \\
5 & Decision Tree Classifi

In [None]:
velgbm = veLGBM()
# train using create_model
velgbm_trained = create_model(velgbm)

# tune model
tuned_dt = tune_model(velgbm_trained, 
                      optimize = 'BA',
                      return_train_score=True, 
                      custom_grid={'n_voters':[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]})

In [120]:
df2 = pull()

In [122]:
df2.to_csv("tuned_veLGB.csv")

In [18]:
velgbm = veLGBM()
velgbm_trained  = create_model(velgbm) 
hparams = {"n_voters" :[4, 6, 8, 10, 12], 'boosting_type': ['gbdt', 'dart', 'rf'], 'learning_rate' :[0.1, 0.3, 0.5]}
tuned_rf, tuner = tune_model(velgbm_trained, optimize = 'BA', search_algorithm='grid', custom_grid=hparams, return_tuner=True)


Unnamed: 0_level_0,Accuracy,AUC,Recall,MCC,Sensitivity,Specificity,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8834,0.0,0.8834,0.5308,0.8872,0.8831,0.8852
1,0.8945,0.0,0.8945,0.5466,0.8722,0.8962,0.8842
2,0.8939,0.0,0.8939,0.5556,0.8947,0.8939,0.8943
3,0.8706,0.0,0.8706,0.5019,0.8788,0.87,0.8744
4,0.8806,0.0,0.8806,0.5027,0.8409,0.8837,0.8623
Mean,0.8846,0.0,0.8846,0.5275,0.8748,0.8854,0.8801
Std,0.0089,0.0,0.0089,0.0221,0.0186,0.0093,0.0109


Unnamed: 0_level_0,Accuracy,AUC,Recall,MCC,Sensitivity,Specificity,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8834,0.0,0.8834,0.5308,0.8872,0.8831,0.8852
1,0.8945,0.0,0.8945,0.5466,0.8722,0.8962,0.8842
2,0.8939,0.0,0.8939,0.5556,0.8947,0.8939,0.8943
3,0.8706,0.0,0.8706,0.5019,0.8788,0.87,0.8744
4,0.8806,0.0,0.8806,0.5027,0.8409,0.8837,0.8623
Mean,0.8846,0.0,0.8846,0.5275,0.8748,0.8854,0.8801
Std,0.0089,0.0,0.0089,0.0221,0.0186,0.0093,0.0109


Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [20]:
pd.DataFrame(tuner.cv_results_).to_csv("veLGB_opt_lr_nv_bt.csv", index=True)

In [24]:
print(pd.DataFrame(tuner.cv_results_)[['param_actual_estimator__n_voters', 'param_actual_estimator__boosting_type','param_actual_estimator__learning_rate', 'rank_test_score', 'mean_test_score']].sort_values('mean_test_score', ascending=False).to_latex())

\begin{tabular}{llllrr}
\toprule
 & param_actual_estimator__n_voters & param_actual_estimator__boosting_type & param_actual_estimator__learning_rate & rank_test_score & mean_test_score \\
\midrule
3 & 10 & gbdt & 0.100000 & 1 & 0.880079 \\
23 & 10 & dart & 0.300000 & 1 & 0.880079 \\
13 & 10 & gbdt & 0.500000 & 1 & 0.880079 \\
8 & 10 & gbdt & 0.300000 & 1 & 0.880079 \\
38 & 10 & rf & 0.300000 & 1 & 0.880079 \\
43 & 10 & rf & 0.500000 & 1 & 0.880079 \\
33 & 10 & rf & 0.100000 & 1 & 0.880079 \\
18 & 10 & dart & 0.100000 & 1 & 0.880079 \\
28 & 10 & dart & 0.500000 & 1 & 0.880079 \\
27 & 8 & dart & 0.500000 & 10 & 0.872064 \\
2 & 8 & gbdt & 0.100000 & 10 & 0.872064 \\
32 & 8 & rf & 0.100000 & 10 & 0.872064 \\
17 & 8 & dart & 0.100000 & 10 & 0.872064 \\
22 & 8 & dart & 0.300000 & 10 & 0.872064 \\
42 & 8 & rf & 0.500000 & 10 & 0.872064 \\
37 & 8 & rf & 0.300000 & 10 & 0.872064 \\
12 & 8 & gbdt & 0.500000 & 10 & 0.872064 \\
7 & 8 & gbdt & 0.300000 & 10 & 0.872064 \\
41 & 6 & rf & 0.500000 & 19