### Importing Required Libraries

In [38]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
     -------------------------------------- 100.3/100.3 kB 2.9 MB/s eta 0:00:00
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0




In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, fbeta_score, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE, ADASYN, KMeansSMOTE, SVMSMOTE, BorderlineSMOTE
from imblearn.under_sampling import TomekLinks, RepeatedEditedNearestNeighbours, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm

import joblib
import itertools

import pandas as pd
import numpy as np

## Pycaret
from pycaret.classification import *

### Loading the dataset after feature selection is done

In [2]:
def load_datasets(file_path: str) -> dict:
    """
    Load Feature Selected Datasets.

    Args:
        file_path (str): Path of saved Data

    Returns:
        dict: Feature Selected Data
    """
    return joblib.load(file_path)['meta_data']

### Concatinating Train and Test datasets to get the original data since pycaret requires whole data to work on.

In [7]:
def get_full_data(data_dict: dict) -> pd.DataFrame:
    """
    Merges Train and Test data into a single dataframe.
    
    Args:
        data_dict (dict) : loaded data dictionary.
        
    Returns:
        pd.DataFrame : Merged DataFrame.
    """
    X_train, X_test, y_train, y_test = data_dict['X_train'], data_dict['X_test'],data_dict['y_train'],data_dict['y_test']
    y_train = pd.DataFrame({'target_label': y_train})
    y_test = pd.DataFrame({'target_label': y_test})
    train = pd.concat([X_train, y_train],axis = 1)
    test = pd.concat([X_test, y_test], axis = 1)
    data = pd.concat([train,test])
    return data

### Method that implements PyCaret steps and returns model_comparisions df and best model.

In [100]:
def pycarat_modelling(data:pd.DataFrame,test_size:float,kpi_indicator:str, data_type: str):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    if data_type =='balanced':
        clf1 = setup(data, target = 'target_label', train_size= 1.0 - test_size, numeric_features = 
                 list(data.select_dtypes(include=numerics).columns)[:-1], html = False)
    else:
        clf1 = setup(data, target = 'target_label', train_size= 1.0 - test_size, numeric_features = 
                 list(data.select_dtypes(include=numerics).columns)[:-1], fix_imbalance = True, html = False)
    # Compare models
    best_model = compare_models(sort = kpi_indicator)

    df = pull()
    return best_model, df

### Method to implement HyperParameter tuning using Bayesian optimization technique.

In [78]:
def hyperparam_tuning(best_model:object, kpi_indicator:str):
    model_tuned  = tune_model(best_model, search_library = 'scikit-optimize',optimize = kpi_indicator)
    return model_tuned

### Method to get the final models dictionary by iterating through each feature selection method along with hyperparameter tuning.

In [79]:
def model_building_tuning(data_dict:dict, data:pd.DataFrame, test_size: float, kpi_indicator:str, data_type: str):
    fts = list(data_dict['selected_features'].keys())[3:5]
    li = []
    al_dict = {}
    for data_name in fts:
        data_name_dict = {}
        print("--------------------- : ", data_name)
        for key,value in data_dict['selected_features'][data_name].items():
            print(key)
            if len(list(data_dict['selected_features'][data_name][key])) > 0:
                data_filtered = data[list(data_dict['selected_features'][data_name][key]) + ['target_label']]
                best_model,df = pycarat_modelling(data = data_filtered, test_size = test_size, kpi_indicator = kpi_indicator,
                                              data_type = data_type)
                tuned_model = hyperparam_tuning(best_model, kpi_indicator)
                #data_name_dict[data_name+'_'+key] = tuned_model
                data_name_dict[key] = tuned_model
        al_dict[data_name] = data_name_dict
        #li.append(data_name_dict)
    return al_dict

### Method to get the final models dataframe by iterating through each feature selection method.

In [8]:
def model_building(data_dict:dict, data:pd.DataFrame, test_size: float, kpi_indicator:str, data_type: str):
    fts = list(data_dict['selected_features'].keys())[0:1]
    result = pd.DataFrame()
    for data_name in fts:
        print("--------------------- : ", data_name)
        for key,value in data_dict['selected_features'][data_name].items():
            print(key)
            if len(list(data_dict['selected_features'][data_name][key])) > 0:
                data_filtered = data[list(data_dict['selected_features'][data_name][key]) + ['target_label']]
                best_model,models_res = pycarat_modelling(data = data_filtered, test_size = test_size, kpi_indicator = kpi_indicator,
                                              data_type = data_type)
                models_res['feature_used'] = data_name+'-'+key
                result = pd.concat([result, models_res])
    result = result.sort_values(by = [kpi_indicator], ascending = False).reset_index(drop = True).iloc[:10, :]
    return result

### Tuning only top 10 feature selection methods from the model comparision results.

In [9]:
def top_10_tuning(data:pd.DataFrame, test_size: float, kpi_indicator:str, data_type: str, res_df: pd.DataFrame):
    features_used = list(res_df['feature_used'].unique())
    tuned_dict = {}
    for ft_used in features_used:
        method_name, sub_method_name = ft_used.split('-')
        data_filtered = data[list(data_dict['selected_features'][method_name][sub_method_name]) + ['target_label']]
        best_model,models_res = pycarat_modelling(data = data_filtered, test_size = test_size, kpi_indicator = kpi_indicator,
                                              data_type = data_type)
        tuned_model = hyperparam_tuning(best_model, kpi_indicator)
        tuned_dict[method_name+':'+sub_method_name] = tuned_model
    return tuned_dict

### Compile method that executes all required functions to get models list.

In [107]:
def compile_method(file_path : str,data_type : str,test_size : float,kpi_indicator : str,with_tuning : bool):
    print("-------- loading data ----------------------------")
    data_dict = load_datasets(file_path=file_path)
    print("-------- loading data done -------------------")
    
    print("-------- Merging the data ---------------")
    data = get_full_data(data_dict)
    print(" ------- Data Merging done -------------------")

    print("--------- Modelling Starts ----------")
    if with_tuning:
        fin_res = model_building_tuning(data_dict = data_dict, data = data, test_size = test_size, kpi_indicator = kpi_indicator,
                                data_type = data_type)
    else:
        res = model_building(data_dict = data_dict, data = data, test_size = test_size, kpi_indicator = kpi_indicator,
                                data_type = data_type)
        
        print("RESULT ______________________")
        print(res)
        fin_res = top_10_tuning(data = data, test_size = test_size, kpi_indicator = kpi_indicator,
                                data_type = data_type, res_df = res)
    print("-------- Modelling Ends  --------------")
    return fin_res


### Config file

In [108]:
config = {
        'file_path' :'fin_feature_selected_data_v1.joblib',
        'data_type': 'balanced',
        'test_size': 0.2,
        'kpi_indicator': 'AUC',
        'with_tuning': False
    }

models = compile_method(**config)

-------- loading data ----------------------------
-------- loading data done -------------------
-------- Merging the data ---------------
 ------- Data Merging done -------------------
--------- Modelling Starts ----------
fts -  ['anova_f_value_selection']
--------------------- :  anova_f_value_selection
anova_f_value


Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
topcoat color_BK05,Numeric
Basecoat Humidity,Numeric
topcoat color_BK08,Numeric
topcoat color_BK06,Numeric
primer color_MK05,Numeric
primer color_MK04,Numeric
topcoat color_BK34,Numeric
topcoat color_BK33,Numeric
primer color_PK02,Numeric
topcoat color_BK25,Numeric


KeyboardInterrupt: Interrupted by user

In [141]:
models ## taking top 10 and tuning them.

{'anova_f_value_selection:anova_f_value': ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0,
                      class_weight='balanced_subsample', criterion='entropy',
                      max_depth=3, max_features=0.46202644076613025,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=2.0738458306192575e-05,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=55, n_jobs=-1, oob_score=False,
                      random_state=7444, verbose=0, warm_start=False),
 'logit_selection:logit': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                      metric_params=None, n_jobs=-1, n_neighbors=45, p=2,
                      weights='uniform')}

In [71]:
models ## tuning for all combinations of methods and sub methods.

{'permutation_impt_selection': {'random_forest': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                             learning_rate=0.012401549537023198, loss='deviance',
                             max_depth=6, max_features=0.6256381592381297,
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.00034065548673317605,
                             min_impurity_split=None, min_samples_leaf=5,
                             min_samples_split=6, min_weight_fraction_leaf=0.0,
                             n_estimators=134, n_iter_no_change=None,
                             presort='deprecated', random_state=7134,
                             subsample=0.7515343192991089, tol=0.0001,
                             validation_fraction=0.1, verbose=0,
                             warm_start=False),
  'catboost': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                     learning_rate=3.

In [None]:
########### TESTING

In [143]:
t_knn = create_model(list(models.values())[1])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5652,0.5012,0.114,0.4514,0.1821,0.0131,0.0189
1,0.5637,0.4878,0.0772,0.4231,0.1306,-0.0005,-0.0008
2,0.57,0.5266,0.1054,0.4688,0.1722,0.0195,0.0294
3,0.5641,0.5125,0.116,0.4459,0.1841,0.011,0.0156
4,0.5529,0.5102,0.0931,0.3869,0.1501,-0.0172,-0.0253
5,0.5156,0.4864,0.4341,0.4296,0.4318,0.0098,0.0098
6,0.5037,0.4948,0.4306,0.4174,0.4239,-0.0118,-0.0118
7,0.5075,0.5057,0.4218,0.4196,0.4207,-0.0077,-0.0077
8,0.5082,0.5068,0.4148,0.4192,0.417,-0.0083,-0.0083
9,0.5328,0.5219,0.4482,0.4489,0.4485,0.0432,0.0432


In [144]:
predict_model(t_knn)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.5624,0.5065,0.1013,0.4431,0.1648,0.0073,0.011


Unnamed: 0,Topcoat Last Used,Ambient Humidity,Ambient Temperature,Paint Temperature,Paint Pressure,Electrical Resistivity,Paint Viscosity,Basecoat Humidity,Basecoat Temperature,Clearcoat Humidity,...,topcoat color_BK16,topcoat color_BK21,topcoat color_BK25,topcoat color_BK26,topcoat color_BK32,topcoat color_BK33,topcoat color_BK34,target_label,Label,Score
0,1.719010,-0.257420,-0.429320,0.41436,-0.331800,2.660760,0.740700,-0.649690,-0.139610,-0.233200,...,-0.774400,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,0.0,0.0,0.5778
1,-0.019485,-0.084228,-0.057762,0.21326,-0.331063,0.078066,0.066668,0.237685,-0.091914,0.020019,...,-0.020119,-0.000323,-0.017884,-0.001604,-0.013088,0.013377,0.019342,0.0,0.0,0.6000
2,-0.019485,-0.084228,-0.057762,0.21326,-0.331063,0.078066,0.066668,0.237685,-0.091914,0.020019,...,-0.020119,-0.000323,-0.017884,-0.001604,-0.013088,0.013377,0.019342,1.0,0.0,0.6000
3,-0.449960,1.063220,-0.188720,-1.30537,-0.379550,-1.063070,1.574000,0.883700,-0.000530,-0.144340,...,1.291310,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,1.0,0.0,0.6000
4,-0.449960,0.837770,-1.947700,-1.81854,-0.702570,1.943380,-0.546910,0.662270,-0.653920,0.203030,...,1.291310,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,0.0,0.0,0.6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5761,-0.019485,-0.084228,-0.057762,0.21326,-0.331063,0.078066,0.066668,0.237685,-0.091914,0.020019,...,-0.020119,-0.000323,-0.017884,-0.001604,-0.013088,0.013377,0.019342,1.0,0.0,0.6000
5762,2.471650,0.694550,0.064900,-1.19519,-0.241770,-0.914350,1.033520,0.780090,-0.000530,-1.500810,...,-0.774400,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,0.0,0.0,0.5111
5763,-0.536700,1.171210,-0.233410,-0.08177,-0.458590,-0.712440,-0.491920,0.811920,0.134360,1.455780,...,1.291310,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,1.0,0.0,0.6889
5764,-0.186200,0.811470,-1.935630,-1.46584,0.906410,1.818950,-1.086700,0.679220,-0.631170,0.293720,...,-0.774400,-0.048240,-0.022720,-0.085600,-0.047700,-0.034470,-0.060640,0.0,0.0,0.5778


In [1]:
# data_dict = load_datasets(file_path='fin_feature_selected_data_v1.joblib')
# data = get_full_data(data_dict)
# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# setup(data, target = 'target_label', train_size= 0.7, numeric_features = 
#                  list(data.select_dtypes(include=numerics).columns)[:-1])
# best_model = compare_models(sort = 'AUC')
# best_model


# td = tune_model(top_1)
# td

In [2]:
# model_tuned = tune_model(estimator = best_model,search_library = 'scikit-optimize', n_iter = 2, optimize = 'AUC')
# model_tuned

In [3]:
#plot_model(model_tuned, plot = 'class_report')

In [4]:
#predict_model(model_tuned)