### Importing Required Libraries

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, fbeta_score, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE, ADASYN, KMeansSMOTE, SVMSMOTE, BorderlineSMOTE
from imblearn.under_sampling import TomekLinks, RepeatedEditedNearestNeighbours, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm

import joblib
import itertools

import pandas as pd
import numpy as np

## Pycaret
from pycaret.classification import *

### Loading the dataset after feature selection is done

In [8]:
def load_datasets(file_path: str) -> dict:
    """
    Read the saved feature-selection bundle and hand back its metadata.

    Args:
        file_path (str): Location of the joblib file to read.

    Returns:
        dict: The object stored under the 'meta_data' key of the loaded file.
    """
    # NOTE: joblib.load unpickles the file, so only point it at trusted artefacts.
    saved_bundle = joblib.load(file_path)
    return saved_bundle['meta_data']

### Concatenating the train and test datasets to rebuild the original data, since PyCaret requires the whole dataset to work on.

In [25]:
def get_full_data(data_dict: dict) -> pd.DataFrame:
    """
    Rebuild a single DataFrame from a train/test split, with the label stored
    in a 'target_label' column.

    Args:
        data_dict (dict): Must provide 'X_train', 'X_test', 'y_train' and 'y_test'.
            The X entries are concatenated with pandas (presumably DataFrames);
            the y entries may be pandas Series or plain array-likes
            (NumPy arrays, lists).

    Returns:
        pd.DataFrame: Train rows followed by test rows, features plus 'target_label'.

    Raises:
        ValueError: If an array-like target has a different length than its features.
    """
    def _attach_target(features, target, split_name):
        # Add `target` to `features` as the 'target_label' column.
        if isinstance(target, pd.Series):
            # A Series carries its own labels; pandas aligns it with the features on the index.
            labelled = pd.DataFrame({'target_label': target})
        else:
            # An array-like has no labels. Giving it a fresh 0..n-1 index would make
            # concat(axis=1) outer-join against the feature index and create NaN rows
            # whenever the features are not indexed 0..n-1, so reuse the feature index.
            values = np.asarray(target).ravel()
            if len(values) != len(features):
                raise ValueError(
                    f"y_{split_name} has {len(values)} rows but X_{split_name} has {len(features)}"
                )
            labelled = pd.DataFrame({'target_label': values}, index=features.index)
        return pd.concat([features, labelled], axis=1)

    train = _attach_target(data_dict['X_train'], data_dict['y_train'], 'train')
    test = _attach_target(data_dict['X_test'], data_dict['y_test'], 'test')
    return pd.concat([train, test])

### Method that implements the PyCaret steps and returns the model comparison DataFrame.

In [62]:
def pycarat_modelling(data:pd.DataFrame,test_size:float,kpi_indicator:str, data_type: str) -> pd.DataFrame:
    """
    Run a PyCaret classification experiment on `data` and return the result of
    `pull()` taken right after `compare_models`.

    Args:
        data (pd.DataFrame): Features plus a 'target_label' column.
        test_size (float): Hold-out fraction; PyCaret receives train_size = 1 - test_size.
        kpi_indicator (str): Metric name forwarded to compare_models(sort=...).
        data_type (str): 'balanced' leaves class balance untouched; any other value
            switches on PyCaret's fix_imbalance option.

    Returns:
        pd.DataFrame: The frame returned by pull() after the model comparison.
    """
    target = 'target_label'
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Exclude the target by name rather than slicing off the last numeric column:
    # the slice silently dropped a real feature when the target was non-numeric
    # (or not last) and could leave the target listed as a feature.
    numeric_features = [
        column for column in data.select_dtypes(include=numerics).columns
        if column != target
    ]
    setup(
        data,
        target = target,
        train_size = 1.0 - test_size,
        numeric_features = numeric_features,
        # Only resample when the caller says the data is not already balanced.
        fix_imbalance = (data_type != 'balanced'),
    )
    # Compare models
    compare_models(sort = kpi_indicator)

    return pull()

### Method to get the final models dataframe by iterating through each feature selection method.

In [57]:
def model_building(data_dict:dict, data:pd.DataFrame, test_size: float, kpi_indicator:str, data_type: str,
                   max_methods: int = 4, top_n: int = 10) -> pd.DataFrame:
    """
    Run the PyCaret comparison once per selected feature subset and return the
    best rows across all runs.

    Args:
        data_dict (dict): Must contain 'selected_features', a mapping of
            selection-method name -> {sub-key -> iterable of column names}.
        data (pd.DataFrame): Full dataset including the 'target_label' column.
        test_size (float): Hold-out fraction forwarded to pycarat_modelling.
        kpi_indicator (str): Metric column used both for PyCaret's sort and for
            the final ranking.
        data_type (str): Forwarded to pycarat_modelling.
        max_methods (int): How many selection methods (in dict order) to evaluate;
            None evaluates all. Defaults to 4, the previously hard-coded value.
        top_n (int): Number of best rows to keep; None keeps all. Defaults to 10,
            the previously hard-coded value.

    Returns:
        pd.DataFrame: Comparison rows from every run, tagged with a 'feature_used'
        column ('<method>_<sub-key>'), sorted by `kpi_indicator` descending and
        truncated to `top_n`. Empty if no feature subset was non-empty.
    """
    selected = data_dict['selected_features']
    method_names = list(selected.keys())[:max_methods]

    # Collect per-run frames and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    leaderboards = []
    for data_name in method_names:
        print("--------------------- : ", data_name)
        for key, value in selected[data_name].items():
            print(key)
            columns = list(value)
            if not columns:
                # Nothing was selected for this variant, so there is nothing to model.
                continue
            data_filtered = data[columns + ['target_label']]
            models_res = pycarat_modelling(data = data_filtered, test_size = test_size,
                                           kpi_indicator = kpi_indicator, data_type = data_type)
            leaderboards.append(models_res.assign(feature_used = data_name+'_'+key))

    if not leaderboards:
        # Sorting an empty frame by a missing column would raise KeyError.
        return pd.DataFrame()

    result = pd.concat(leaderboards)
    result = result.sort_values(by = [kpi_indicator], ascending = False).reset_index(drop = True).iloc[:top_n, :]
    return result

### Compile method that executes all required functions to get models list.

In [61]:
def compile_method(file_path:str, data_type: str, test_size: float,kpi_indicator:str):
    """
    Run the three pipeline stages in order: load, merge, model.

    Args:
        file_path (str): Path handed to load_datasets.
        data_type (str): Forwarded to model_building.
        test_size (float): Forwarded to model_building.
        kpi_indicator (str): Forwarded to model_building.

    Returns:
        Whatever model_building returns for the loaded and merged data.
    """
    # Stage 1: read the saved feature-selection bundle.
    print("-------- loading data ----------------------------")
    selection_bundle = load_datasets(file_path=file_path)
    print("-------- loading data done -------------------")

    # Stage 2: stitch the train/test split back into one frame.
    print("-------- Merging the data ---------------")
    merged_frame = get_full_data(selection_bundle)
    print(" ------- Data Merging done -------------------")

    # Stage 3: compare models over the selected feature subsets.
    print("--------- Modelling Starts ----------")
    modelling_args = dict(
        data_dict=selection_bundle,
        data=merged_frame,
        test_size=test_size,
        kpi_indicator=kpi_indicator,
        data_type=data_type,
    )
    leaderboard = model_building(**modelling_args)
    print("-------- Modelling Ends  --------------")
    return leaderboard


### Config file

In [58]:
# Run settings; the keys match compile_method's parameter names so the dict
# can be unpacked straight into the call.
config = dict(
    file_path='fin_feature_selected_data_v1.joblib',  # saved feature-selection bundle
    data_type='balanced',                              # any other value enables fix_imbalance
    test_size=0.2,                                     # hold-out fraction
    kpi_indicator='AUC',                               # metric used for ranking
)

models = compile_method(**config)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.5431,0.5084,0.2036,0.4321,0.2767,0.0022,0.0026,0.597
rf,Random Forest Classifier,0.5438,0.5068,0.2135,0.4359,0.2864,0.0061,0.007,0.498
lightgbm,Light Gradient Boosting Machine,0.5569,0.5042,0.1339,0.4469,0.206,0.0097,0.0134,0.077
et,Extra Trees Classifier,0.5359,0.503,0.2397,0.4276,0.307,-0.0018,-0.0019,0.41
dt,Decision Tree Classifier,0.5261,0.5017,0.2989,0.4259,0.3512,-0.0043,-0.0044,0.052
dummy,Dummy Classifier,0.5708,0.5,0.0,0.0,0.0,0.0,0.0,0.006
qda,Quadratic Discriminant Analysis,0.5566,0.4995,0.1091,0.4346,0.1742,0.0025,0.0036,0.013
nb,Naive Bayes,0.5642,0.4988,0.028,0.4062,0.0514,-0.0051,-0.011,0.007
ada,Ada Boost Classifier,0.5663,0.4962,0.0304,0.4277,0.0568,-0.0003,-0.0007,0.247
gbc,Gradient Boosting Classifier,0.5651,0.4955,0.0261,0.3998,0.0488,-0.004,-0.0104,0.74


In [59]:
# Display the table returned by compile_method (rows ranked by the configured KPI).
models

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec),feature_used
0,Decision Tree Classifier,0.5362,0.5107,0.3029,0.4328,0.3562,0.0113,0.0117,0.038,mutual_info_classif_selection_mutual_info_classif
1,Extreme Gradient Boosting,0.5477,0.5094,0.1949,0.4254,0.2671,0.0017,0.002,0.626,permutation_impt_selection_random_forest
2,Quadratic Discriminant Analysis,0.4415,0.5087,0.8953,0.3831,0.5365,0.0006,0.0057,0.015,anova_f_value_selection_anova_f_value
3,Extreme Gradient Boosting,0.5538,0.5085,0.1501,0.4302,0.2224,0.003,0.0039,0.383,anova_f_value_selection_anova_f_value
4,Extreme Gradient Boosting,0.5431,0.5084,0.2036,0.4321,0.2767,0.0022,0.0026,0.597,permutation_impt_selection_logistic_regression
5,Ada Boost Classifier,0.5743,0.5084,0.0197,0.4875,0.0378,0.0053,0.0172,0.175,anova_f_value_selection_anova_f_value
6,Extra Trees Classifier,0.5408,0.5081,0.2562,0.4298,0.3209,0.0069,0.0072,0.365,mutual_info_classif_selection_mutual_info_classif
7,Linear Discriminant Analysis,0.5745,0.5075,0.0003,0.1333,0.0006,-0.0002,-0.001,0.015,anova_f_value_selection_anova_f_value
8,Extreme Gradient Boosting,0.5562,0.5074,0.1787,0.4396,0.2539,0.0126,0.0152,0.548,logit_selection_logit
9,Naive Bayes,0.4816,0.5074,0.6323,0.4223,0.4819,0.0024,0.0022,0.008,permutation_impt_selection_catboost
