In [10]:
import sys
sys.path.append(r'Classification/dsa/dsa_single_model')
import os
import pickle
import pandas as pd
import json
import dalex as dx
from sklearn.metrics import accuracy_score
import yaml

import warnings
# NOTE(review): this silences *every* warning for the whole session
# (including deprecation/convergence warnings); consider narrowing the
# filter to specific categories once the notebook is stable.
warnings.filterwarnings(action='ignore')

# Render floats with four decimal places in every DataFrame display.
pd.options.display.float_format = '{:.4f}'.format

In [11]:
# Locate the project configuration file.
#
# Resolution order:
#   1. the DSA_CONFIG_PATH environment variable, when set (explicit override);
#   2. the repository-relative location, which works when the notebook is
#      launched from the repository root (the same assumption the relative
#      sys.path entry in the import cell makes);
#   3. the original absolute path on the author's machine (kept last so the
#      notebook behaves exactly as before there, and so a missing file still
#      surfaces as FileNotFoundError from open()).
yaml_path = os.environ.get("DSA_CONFIG_PATH")
if not yaml_path:
    config_candidates = (
        os.path.join("Classification", "dsa", "dsa_single_model", "src", "config.yaml"),
        r"C:\Users\gustavo\Documents\Data Science\08-GitHub\Portifolio\Classification\dsa\dsa_single_model\src\config.yaml",
    )
    yaml_path = next(
        (candidate for candidate in config_candidates if os.path.isfile(candidate)),
        config_candidates[-1],
    )

# safe_load only builds plain Python objects, so the YAML file cannot execute code.
with open(yaml_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [12]:
# Resolve every path/setting this notebook needs from the loaded config.
# All paths are anchored at config['init_path'].
root_dir = config['init_path']
feat_sel_cfg = config['feat_selection']

# (key in `params`, key in config['feat_selection']) for each data split.
feat_sel_entries = (
    ('X_train_feat_sel', 'X_train'),
    ('y_train_feat_sel', 'y_train'),
    ('X_val_feat_sel', 'X_val'),
    ('y_val_feat_sel', 'y_val'),
)

params = {
    param_key: os.path.join(root_dir, feat_sel_cfg['path'], feat_sel_cfg[cfg_key])
    for param_key, cfg_key in feat_sel_entries
}
params.update({
    # Directory holding the pickled model files.
    'model': os.path.join(root_dir, config['model']['path']),
    # Reports directory (stored under the 'removed_cols' key).
    'removed_cols': os.path.join(root_dir, config['save_reports']['path_reports']),
    # Version tag used to build the model file name.
    'model_version': config['model']['model_version'],
})

In [13]:
# Columns excluded from the feature matrices (listed in the config).
cols_to_drop = config['model_selection']['cols_2_drop']

# Training split: features without the excluded columns, target cast to int.
X_train = pd.read_parquet(params['X_train_feat_sel']).drop(columns=cols_to_drop)
y_train = pd.read_parquet(params['y_train_feat_sel']).astype('int')

# Validation split, prepared the same way as the training split.
X_val = pd.read_parquet(params['X_val_feat_sel']).drop(columns=cols_to_drop)
y_val = pd.read_parquet(params['y_val_feat_sel']).astype('int')

# The model file name embeds the configured model version.
model_file_name = f"model_{params['model_version']}.pkl"
model_path = os.path.join(params['model'], model_file_name)

# NOTE: unpickling executes arbitrary code; only load model files that were
# produced by this project's own training pipeline.
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

In [14]:
# Wrap the fitted model with the training features and target so dalex can
# compute explanations. y_train is a one-column DataFrame here; dalex
# converts it to a numpy array (see the message in the cell output).
explainer = dx.Explainer(model=model, data=X_train, y=y_train)

Preparation of a new explainer is initiated

  -> data              : 450 rows 6 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 450 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x000001EA011AF560> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.013, mean = 0.342, max = 0.983
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.935, mean = -4.61e-06, max = 0.928
  -> model_info        : package sklearn

A new explainer has been created!


In [15]:
# Permutation-based variable importance: each variable is shuffled in turn and
# the increase in loss relative to the full model (`_full_model_` row) is
# reported as `dropout_loss`; `_baseline_` is the loss with all variables
# permuted.
#
# The permutations are random, so a fixed random_state is passed to make the
# table (and the plot built from it) reproducible on Restart & Run All.
vi = explainer.model_parts(random_state=42)
vi.result

Unnamed: 0,variable,dropout_loss,label
0,numerical_pipe__grossura_pele,0.164,LogisticRegression
1,_full_model_,0.1642,LogisticRegression
2,numerical_pipe__insulina,0.1643,LogisticRegression
3,numerical_pipe__idade,0.1697,LogisticRegression
4,numerical_pipe__num_gestacoes,0.1762,LogisticRegression
5,numerical_pipe__bmi,0.2429,LogisticRegression
6,numerical_pipe__glicose,0.3473,LogisticRegression
7,_baseline_,0.4915,LogisticRegression


In [16]:
# Bar chart of the variable-importance table computed above. max_vars=10 is
# presumably an upper bound on the variables drawn; the explainer data has
# 6 columns, so all of them should appear — confirm against the dalex docs.
vi.plot(max_vars=10)

In [17]:
# Model-level profiles for the numerical variables:
#   - 'partial'     -> partial dependence profiles (PDP)
#   - 'accumulated' -> accumulated local effects (ALE)
#
# NOTE(review): dalex appears to compute profiles on a random subsample of the
# explainer data by default (see the `N` parameter in the dalex docs — confirm
# for the installed version), so the same seed is passed to both calls to keep
# the curves reproducible and comparable between runs.
PROFILE_SEED = 42

pdp_num = explainer.model_profile(type='partial', label="pdp", random_state=PROFILE_SEED)
ale_num = explainer.model_profile(type='accumulated', label="ale", random_state=PROFILE_SEED)

Calculating ceteris paribus: 100%|██████████| 6/6 [00:00<00:00, 122.35it/s]
Calculating ceteris paribus: 100%|██████████| 6/6 [00:00<00:00, 236.59it/s]
Calculating accumulated dependency: 100%|██████████| 6/6 [00:00<00:00, 19.40it/s]


In [18]:
# Draw the PDP curves and pass the ALE profiles as an additional object so
# both sets of curves (labelled "pdp" and "ale") can be compared per variable.
pdp_num.plot(ale_num)