# let's install pycaret !


In [1]:
# Install PyCaret
#!pip install pycaret
#!pip install --upgrade pycaret #if you have installed beta version in past, run the below code to upgrade
import os, sys
path = '/Users/maurizio/opt/anaconda3'
sys.path.append(f'{path}/lib/python3.10/site-packages')
sys.path.append(f'{path}/lib')

import numpy as np
import pandas as pd
import os, sys
from IPython.display import display

from pycaret.utils import version

# PyCaret version
version()

'3.2.0'

In [2]:
from pycaret.classification import *

In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from typing import List
def load_features(filenames: List[str] = [], fixnans= [], normalizes=[], colname: str="label", 
                  verbose: bool = False, show_progress: bool = False) -> pd.DataFrame:
    """
    Load and assemble features and labels for machine learning tasks.

    :param List[str] features: List of feature filepaths
    :param str colname: Name of the column in the label file to be used as the target variable. Default is "label".
    :param int seed: Random seed for reproducibility. Default is 1.
    :param bool verbose: Whether to print verbose messages during processing. Default is False.
    :param bool show_progress: Whether to print progress bar while loading file. Default is False.

    :returns: Tuple containing the assembled features (X) and labels (Y) DataFrames.
    :rtype: Tuple[pd.DataFrame, pd.DataFrame]
        
    :example:

    .. code-block:: python

        colname = "target_column"
        seed = 1
        verbose = False

        df_label = pd.read_csv("label_file.csv2, index_col=0)
        X, Y = load_features(['path/to/feature_file1.csv', 'path/to/feature_file2.csv'], fix_na=True, colname, seed, verbose)
    """

    # Common indices among labels and features
    x = pd.DataFrame()

    # Process each feature file
    for f,fixna,norm in zip(filenames, fixnans, normalizes):
        feat_df = pd.read_csv(f, index_col=0)
        feat_df.index = feat_df.index.map(str)
        fname = os.path.basename(f).rsplit('.', 1)[0]

        # Handle missing values if required
        if verbose:
            cntnan = feat_df.isna().sum().sum()
            print(f"[{fname}] found {cntnan} Nan...")
        if fixna:
            if verbose:
                print(f"[{fname}] Fixing NaNs with mean ...")
            feat_df = feat_df.fillna(feat_df.mean())

        # Normalize features
        if norm == 'std':
            scaler = MinMaxScaler()
            if verbose:
                print(f"[{fname}] Normalization with {norm} ...")
            feat_df = pd.DataFrame(scaler.fit_transform(feat_df), index=feat_df.index, columns=feat_df.columns)
        elif norm == 'max':
            scaler = StandardScaler()
            if verbose:
                print(f"[{fname}] Normalization with {norm}...")
            feat_df = pd.DataFrame(scaler.fit_transform(feat_df), index=feat_df.index, columns=feat_df.columns)
        else:
            if verbose:
                print(f"[{fname}] No normalization...")

        # merge features features
        x = pd.merge(x, feat_df, left_index=True, right_index=True, how='outer')

    # Return the assembled features (X) and labels (Y)
    return x

# Load the dataset and split

In [3]:
from sklearn.model_selection import train_test_split
path = '../../data/'
attributes = load_features([os.path.join(path, 'Kidney_BIO.csv'), 
                            os.path.join(path, 'Kidney_CCcfs.csv'),
                            os.path.join(path, 'Kidney_EmbN2V_128.csv')
                            ], 
                            fixnans=[True, True, False], normalizes=['std', 'std', None], verbose=True)
label = pd.read_csv(os.path.join(path,'Kidney_HELP.csv'), index_col=0).replace({'aE':'NE', 'sNE': 'NE'})
idx_common = np.intersect1d(attributes.index.values, label.index.values)
attributes = attributes.loc[idx_common]
label = label.loc[idx_common]
X_train, X_test, y_train, y_test = train_test_split(attributes, label, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

[Kidney_BIO] found 52532 Nan...
[Kidney_BIO] Fixing NaNs with mean ...
[Kidney_BIO] Normalization with std ...
[Kidney_CCcfs] found 6682710 Nan...
[Kidney_CCcfs] Fixing NaNs with mean ...
[Kidney_CCcfs] Normalization with std ...
[Kidney_EmbN2V_128] found 0 Nan...
[Kidney_EmbN2V_128] No normalization...


# Feature elimination

In [4]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

min_features_to_select = 1  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(attributes, label)

print(f"Optimal number of features: {rfecv.n_features_}")

KeyboardInterrupt: 

# Start tuning session

In [34]:
clf1 = setup(data = train, 
             target = 'label',
             numeric_imputation = 'mean',
             categorical_features = [], session_id = 444,
             fold_strategy = "stratifiedkfold", fold=5,
             #ignore_features = ['Name','Ticket','Cabin'],
             verbose = True)

Unnamed: 0,Description,Value
0,Session id,444
1,Target,label
2,Target type,Binary
3,Target mapping,"E: 0, NE: 1"
4,Original data shape,"(12927, 3460)"
5,Transformed data shape,"(12927, 3460)"
6,Transformed train set shape,"(9048, 3460)"
7,Transformed test set shape,"(3879, 3460)"
8,Numeric features,3459
9,Preprocess,True


# Adjuts metrics

In [35]:
from sklearn.metrics import *
from imblearn.metrics import specificity_score
try: 
    [remove_metric(m) for m in ['Precision', 'F1', 'Kappa']]  # remove unused metrics
    add_metric('Sensitivity', 'Sensitivity', specificity_score, greater_is_better = True)
    add_metric('Specificity', 'Specificity', recall_score, greater_is_better = True)
    add_metric('Balanced Accuracy', 'BA', balanced_accuracy_score, greater_is_better = True)
    #add_metric('ROC-AUC', 'ROC-AUC', roc_auc_score, greater_is_better = True, multiclass=False)
except: 
    pass
get_metrics()

Unnamed: 0_level_0,Name,Display Name,Score Function,Scorer,Target,Args,Greater is Better,Multiclass,Custom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
acc,Accuracy,Accuracy,<function accuracy_score at 0x7f8209a22700>,accuracy,pred,{},True,True,False
auc,AUC,AUC,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(roc_auc_score, needs_proba=True, e...",pred_proba,"{'average': 'weighted', 'multi_class': 'ovr'}",True,True,False
recall,Recall,Recall,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(recall_score, average=weighted)",pred,{'average': 'weighted'},True,True,False
mcc,MCC,MCC,<function matthews_corrcoef at 0x7f8209a229d0>,make_scorer(matthews_corrcoef),pred,{},True,True,False
Sensitivity,Sensitivity,Sensitivity,<pycaret.internal.metrics.EncodedDecodedLabels...,make_scorer(specificity_score),pred,{},True,True,True
Specificity,Specificity,Specificity,<pycaret.internal.metrics.EncodedDecodedLabels...,make_scorer(recall_score),pred,{},True,True,True
Balanced Accuracy,BA,BA,<pycaret.internal.metrics.EncodedDecodedLabels...,make_scorer(balanced_accuracy_score),pred,{},True,True,True


# Define our model

In [30]:
from HELPpy.models.prediction import VotingEnsembleLGBM
veLGBM = create_model(VotingEnsembleLGBM())

KeyboardInterrupt: 

In [9]:
veLGBM

VotingEnsembleLGBM(boosting_type='gbdt', learning_rate=0.1, n_jobs=-1,
                   n_voters=10, random_state=42, verbose=False, voting='soft')

In [8]:
from sklearn.metrics import balanced_accuracy_score
classifiers = [VotingEnsembleLGBM(), 'lightgbm', 'xgboost', 'ada', 'rf', 'dt', 'gbc', 'lda', 'lr', 'et', 'svm']
results = compare_models(include=classifiers, sort='BA')

Unnamed: 0,Model,Accuracy,AUC,Recall,MCC,Sensitivity,Specificity,BA,TT (Sec)
0,VotingEnsembleLGBM,0.8846,0.9473,0.8846,0.5275,0.8748,0.8854,0.8801,4.024
7,Linear Discriminant Analysis,0.9422,0.934,0.9422,0.5669,0.5867,0.9703,0.7785,0.364
10,SVM - Linear Kernel,0.9439,0.0,0.9439,0.5519,0.5126,0.9779,0.7453,0.188
3,Ada Boost Classifier,0.9402,0.9238,0.9402,0.5097,0.4691,0.9775,0.7233,2.364
8,Logistic Regression,0.9478,0.9428,0.9478,0.5494,0.448,0.9874,0.7177,0.396
1,Light Gradient Boosting Machine,0.9476,0.946,0.9476,0.5365,0.4057,0.9905,0.6981,0.858
2,Extreme Gradient Boosting,0.947,0.9412,0.947,0.5296,0.4012,0.9901,0.6957,8.574
6,Gradient Boosting Classifier,0.9444,0.9322,0.9444,0.4942,0.353,0.9912,0.6721,14.398
5,Decision Tree Classifier,0.8991,0.6538,0.8991,0.2941,0.3665,0.9412,0.6538,1.218
4,Random Forest Classifier,0.9358,0.9161,0.9358,0.3383,0.1312,0.9994,0.5653,2.388


In [10]:
df = pull()
df.to_csv(os.path.join('../../data4rev/pycaret/', "pycaret_best_classifier_metrics.csv"), index=True)
print(df.to_latex())

\begin{tabular}{llrrrrrrrr}
\toprule
{} &                            Model &  Accuracy &     AUC &  Recall &     MCC &  Sensitivity &  Specificity &      BA &  TT (Sec) \\
\midrule
0  &               VotingEnsembleLGBM &    0.8846 &  0.9473 &  0.8846 &  0.5275 &       0.8748 &       0.8854 &  0.8801 &     4.024 \\
7  &     Linear Discriminant Analysis &    0.9422 &  0.9340 &  0.9422 &  0.5669 &       0.5867 &       0.9703 &  0.7785 &     0.364 \\
10 &              SVM - Linear Kernel &    0.9439 &  0.0000 &  0.9439 &  0.5519 &       0.5126 &       0.9779 &  0.7453 &     0.188 \\
3  &             Ada Boost Classifier &    0.9402 &  0.9238 &  0.9402 &  0.5097 &       0.4691 &       0.9775 &  0.7233 &     2.364 \\
8  &              Logistic Regression &    0.9478 &  0.9428 &  0.9478 &  0.5494 &       0.4480 &       0.9874 &  0.7177 &     0.396 \\
1  &  Light Gradient Boosting Machine &    0.9476 &  0.9460 &  0.9476 &  0.5365 &       0.4057 &       0.9905 &  0.6981 &     0.858 \\
2  &     

In [114]:
results.get_params()

{'n_jobs': -1,
 'n_voters': 10,
 'random_state': 42,
 'verbose': False,
 'voting': 'soft'}

# Tune the veLGBM classifier

In [37]:
velgbm = create_model(VotingEnsembleLGBM())
hparams = {"n_voters" :[4, 6, 8, 10, 12, 14, 16], 
           'boosting_type': ['gbdt', 'dart'], 
           'learning_rate' :[0.1, 0.3, 0.5]
           }
tuned_veLGBM, tuner = tune_model(velgbm, optimize = 'BA', search_algorithm='grid', custom_grid=hparams, return_tuner=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,MCC,Sensitivity,Specificity,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.895,0.9574,0.895,0.5478,0.8722,0.8968,0.8845
1,0.9033,0.962,0.9033,0.5829,0.9098,0.9028,0.9063
2,0.9099,0.9607,0.9099,0.5855,0.8797,0.9123,0.896
3,0.8933,0.9348,0.8933,0.5251,0.8333,0.898,0.8657
4,0.9093,0.9543,0.9093,0.5757,0.8636,0.9129,0.8883
Mean,0.9022,0.9538,0.9022,0.5634,0.8717,0.9046,0.8882
Std,0.007,0.0099,0.007,0.0234,0.0247,0.0069,0.0135


Unnamed: 0_level_0,Accuracy,AUC,Recall,MCC,Sensitivity,Specificity,BA
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8989,0.9594,0.8989,0.5563,0.8722,0.901,0.8866
1,0.9022,0.9634,0.9022,0.5837,0.9173,0.901,0.9092
2,0.9127,0.9602,0.9127,0.5957,0.8872,0.9147,0.901
3,0.8955,0.9385,0.8955,0.5472,0.8712,0.8974,0.8843
4,0.9055,0.9537,0.9055,0.5664,0.8636,0.9088,0.8862
Mean,0.903,0.9551,0.903,0.5698,0.8823,0.9046,0.8934
Std,0.0059,0.0088,0.0059,0.0177,0.0191,0.0063,0.0099


Fitting 5 folds for each of 42 candidates, totalling 210 fits


# learning curve

In [19]:
#plot_model(estimator = tuned_veLGBM, plot = 'learning')

# AUC curve

In [20]:
#plot_model(estimator = tuned_veLGBM, plot = 'BA', display_format="streamlit", save=True)

# Confusion Matrix

In [21]:
#plot_model(estimator = tuned_veLGBM, plot = 'confusion_matrix')

# Future importance

In [22]:
#plot_model(estimator = tuned_veLGBM, plot = 'feature')

In [23]:
#interpret_model(tuned_veLGBM)

## all params

In [24]:
evaluate_model(tuned_veLGBM)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Save reports

In [38]:
df2 = pull()
df2.to_csv(os.path.join('../../data4rev/pycaret/', "tuned_veLGB.csv"))
pd.DataFrame(tuner.cv_results_).to_csv("veLGB_opt_lr_nv_bt.csv", index=True)

In [41]:
pd.DataFrame(tuner.cv_results_)

42

In [40]:
print(pd.DataFrame(tuner.cv_results_)[['param_actual_estimator__n_voters', 
                                 'param_actual_estimator__boosting_type',
                                 'param_actual_estimator__learning_rate',
                                 'rank_test_score', 
                                 'mean_test_score']].sort_values('mean_test_score', 
                                                                       ascending=False).to_latex())

\begin{tabular}{llllrr}
\toprule
{} & param\_actual\_estimator\_\_n\_voters & param\_actual\_estimator\_\_boosting\_type & param\_actual\_estimator\_\_learning\_rate &  rank\_test\_score &  mean\_test\_score \\
\midrule
17 &                               10 &                                  gbdt &                                   0.5 &                1 &         0.893450 \\
10 &                               10 &                                  gbdt &                                   0.3 &                2 &         0.892156 \\
12 &                               14 &                                  gbdt &                                   0.3 &                3 &         0.889302 \\
3  &                               10 &                                  gbdt &                                   0.1 &                4 &         0.888158 \\
4  &                               12 &                                  gbdt &                                   0.1 &                5 &      

In [42]:
tuned_veLGBM

VotingEnsembleLGBM(boosting_type='gbdt', learning_rate=0.5, n_jobs=-1,
                   n_voters=10, random_state=42, verbose=False, voting='soft')