## Exploration of SHAP values

In [None]:
import pandas as pd
import numpy as np

from utils_modelling import read_and_preprocess_dataset, split_dataset

In [None]:
# Load the dataset once per task; the `classification` flag is interpreted by
# read_and_preprocess_dataset (imported from utils_modelling).
df_regr = read_and_preprocess_dataset(classification=False)
df_class = read_and_preprocess_dataset(classification=True)

# split_dataset returns three values, unpacked here as (X, y, group_data).
# NOTE(review): the saved output of this cell is a FileNotFoundError for the
# relative path 'task_indices/335_task_indices.npy', so the helpers presumably
# resolve paths against the current working directory -- confirm which
# directory the notebook has to be started from.
X_regr, y_regr, group_data_regr = split_dataset(df_regr)
X_class, y_class, group_data_class = split_dataset(df_class)

FileNotFoundError: [Errno 2] No such file or directory: 'task_indices/335_task_indices.npy'

In [None]:
# Read the stored SHAP values for both tasks; paths are relative to the
# working directory. Later cells index these frames with the column names of
# X_regr / X_class.
# NOTE(review): presumably each row lines up with the same row of X -- confirm
# against the script that wrote these files.
shap_regr = pd.read_csv('results_model/gp_boosting/regression/shap_values/shap_values.csv')
shap_class = pd.read_csv('results_model/gp_boosting/classification/shap_values/shap_values.csv')

In [None]:
# Show the column names of X_regr (the first value returned by split_dataset).
X_regr.columns

Index(['learning_rate', 'min_data_in_leaf', 'max_depth', 'lambda_l2',
       'num_leaves', 'max_bin', 'bagging_fraction', 'feature_fraction'],
      dtype='object')

In [None]:
# Mean SHAP value per column of the regression frame (axis=0 averages over rows).
shap_regr.mean(axis=0)

learning_rate      -0.013586
min_data_in_leaf    0.056955
max_depth          -0.006704
lambda_l2          -0.003365
num_leaves         -0.030945
max_bin            -0.016547
bagging_fraction    0.003472
feature_fraction    0.010720
dtype: float64

In [None]:
# Mean SHAP value per column of the classification frame (axis=0 averages over rows).
shap_class.mean(axis=0)

learning_rate      -0.034114
min_data_in_leaf   -0.023667
max_depth           0.029693
lambda_l2          -0.009739
num_leaves          0.020285
max_bin            -0.009903
bagging_fraction   -0.011570
feature_fraction    0.039015
dtype: float64

In [None]:
# For each task and each SHAP column, print the feature value found at the row
# label where the SHAP value is largest, and at the one where it is smallest.
for setting, shap, X in (
    ('Regression', shap_regr, X_regr),
    ('Classification', shap_class, X_class),
):
    print(setting + ':\n')
    for col in shap.columns:
        col_feature = X[col]
        col_shap = shap[col]
        label_max = col_shap.idxmax()
        label_min = col_shap.idxmin()

        print(f'{col} for maximum SHAP value: ', col_feature[label_max])
        print('Max SHAP value: ', col_shap[label_max])
        print(f'{col} for minimum SHAP value: ', col_feature[label_min])
        print('Min SHAP value: ', col_shap[label_min], '\n')

Regression:

learning_rate for maximum SHAP value:  0.2551823159557322
Max SHAP value:  2.4420473320846168
learning_rate for minimum SHAP value:  0.870254118745322
Min SHAP value:  -5.380000187711938 

min_data_in_leaf for maximum SHAP value:  1
Max SHAP value:  5.55318709904525
min_data_in_leaf for minimum SHAP value:  648
Min SHAP value:  -5.18751054406352 

max_depth for maximum SHAP value:  -1
Max SHAP value:  1.2245962057860778
max_depth for minimum SHAP value:  10
Min SHAP value:  -3.429023595070789 

lambda_l2 for maximum SHAP value:  38.9543739714224
Max SHAP value:  1.5108298862797576
lambda_l2 for minimum SHAP value:  32.17523265648738
Min SHAP value:  -2.459033323800345 

num_leaves for maximum SHAP value:  7
Max SHAP value:  2.5150262629338966
num_leaves for minimum SHAP value:  1024
Min SHAP value:  -1.8457962540859472 

max_bin for maximum SHAP value:  256
Max SHAP value:  4.684500257964882
max_bin for minimum SHAP value:  3598
Min SHAP value:  -3.4696713510140835 

baggi

In [None]:
def summarize(col, X, shap_values):
    """Summarize the SHAP values of one column by feature-value range.

    The feature values ``X[col]`` are split into three bins (low, moderate,
    high) at the points with index 35 and 65 of a 100-point linear grid that
    is defined per column below. The summary reports the mean SHAP value in
    each bin, plus the smallest and largest SHAP value together with the
    feature value of the row where each occurs.

    ``X[col]`` and ``shap_values[col]`` are matched row by row (by position),
    so the two frames do not need to share an index.

    Parameters
    ----------
    col : str
        Column present in both frames; must also be a key of the grid table.
    X : pandas.DataFrame
        Feature values, one row per observation.
    shap_values : pandas.DataFrame
        SHAP values, with rows in the same order as ``X``.

    Returns
    -------
    pandas.DataFrame
        Five rows labelled 'min', 'low', 'moderate', 'high', 'max' in the
        'Feature Value' column. 'Feature' holds the feature value (min/max
        rows) or the bin interval as text; 'SHAP Value' holds the matching
        SHAP value or bin mean (NaN for an empty bin).

    Raises
    ------
    KeyError
        If ``col`` is missing from ``X``, ``shap_values`` or the grid table.
    """
    feature = X[col]
    shap = shap_values[col]

    # 100-point linear grid per column; only points 35 and 65 are used, as the
    # low/moderate and moderate/high boundaries.
    def_intervals = {
        'learning_rate': np.linspace(0.001, 1, 100),
        'min_data_in_leaf': np.linspace(1, 1000, 100),
        'max_depth': np.linspace(1, 10, 100),
        'lambda_l2': np.linspace(0, 100, 100),
        'num_leaves': np.linspace(2, 1024, 100),
        'max_bin': np.linspace(1, 10000, 100),
        'bagging_fraction': np.linspace(0.5, 1, 100),
        'feature_fraction': np.linspace(0.5, 1, 100)
    }
    # Columns whose thresholds stay floating point; every other column has its
    # thresholds truncated to int.
    float_cols = ('learning_rate', 'lambda_l2', 'bagging_fraction', 'feature_fraction')

    grid = def_intervals[col]
    low_thresh, high_thresh = grid[35], grid[65]
    if col not in float_cols:
        low_thresh, high_thresh = int(low_thresh), int(high_thresh)

    # Plain boolean arrays, so that they select SHAP rows by position. Rows
    # whose feature value is NaN match none of the three bins.
    # NOTE(review): values below a grid's lower bound land in 'low'. The data
    # in this notebook contains max_depth == -1; if these are LightGBM-style
    # parameters, max_depth <= 0 means "no limit" -- confirm that binning it
    # as 'low' is intended.
    is_low = (feature <= low_thresh).to_numpy()
    is_moderate = ((feature > low_thresh) & (feature <= high_thresh)).to_numpy()
    is_high = (feature > high_thresh).to_numpy()

    intervals = {
        'low': f"(-inf, {low_thresh:.2f}]",
        'moderate': f"({low_thresh:.2f}, {high_thresh:.2f}]",
        'high': f"({high_thresh:.2f}, +inf)"
    }

    # Locate the extreme SHAP values by position rather than by index label:
    # the bin masks above are positional, and a label lookup fails (or picks
    # the wrong row) when X does not share the SHAP frame's index.
    shap_by_position = shap.reset_index(drop=True)
    pos_min = shap_by_position.idxmin()
    pos_max = shap_by_position.idxmax()

    return pd.DataFrame({
        'Feature Value': ['min', 'low', 'moderate', 'high', 'max'],
        'Feature': [
            feature.iloc[pos_min],
            intervals['low'],
            intervals['moderate'],
            intervals['high'],
            feature.iloc[pos_max]
        ],
        'SHAP Value': [
            shap.min(),
            shap[is_low].mean(),
            shap[is_moderate].mean(),
            shap[is_high].mean(),
            shap.max()
        ]
    })

# Write one summary CSV per (task, SHAP column) into the same directory the
# SHAP values were read from.
for setting, shap, X in (
    ('regression', shap_regr, X_regr),
    ('classification', shap_class, X_class),
):
    for col in shap.columns:
        df = summarize(col, X, shap)
        df.to_csv(
            f'results_model/gp_boosting/{setting}/shap_values/shap_summary_{setting}_{col}.csv',
            index=False,
        )


### Regression

In [None]:
# Regression: mean SHAP value of learning_rate over rows with 0.05 < learning_rate < 0.2.
shap_regr.loc[
    (X_regr['learning_rate'] > 0.05) & (X_regr['learning_rate'] < 0.2),
    'learning_rate',
].mean()

0.020280348149971268

In [None]:
# Regression: mean SHAP value of max_bin over rows with max_bin < 500.
shap_regr.loc[X_regr['max_bin'] < 500, 'max_bin'].mean()

0.10184979960935614

In [None]:
# Regression: mean SHAP value of min_data_in_leaf over rows with min_data_in_leaf < 500.
shap_regr.loc[X_regr['min_data_in_leaf'] < 500, 'min_data_in_leaf'].mean()

0.09651930305577557

In [None]:
# Regression: mean SHAP value of max_depth over rows with max_depth > 5.
shap_regr.loc[X_regr['max_depth'] > 5, 'max_depth'].mean()

-0.09791233412704684

In [None]:
# Regression: mean SHAP value of max_depth over rows with max_depth == -1.
shap_regr.loc[X_regr['max_depth'] == -1, 'max_depth'].mean()

0.06855037049888754

### Classification

In [None]:
# Classification: mean SHAP value of max_bin over rows with max_bin < 500.
shap_class.loc[X_class['max_bin'] < 500, 'max_bin'].mean()

-0.1664793685626219

In [None]:
# Classification: mean SHAP value of num_leaves over rows with num_leaves > 900.
shap_class.loc[X_class['num_leaves'] > 900, 'num_leaves'].mean()

0.14696124320467835

In [None]:
# Compute the mean SHAP value of 'bagging_fraction' when 'bagging_fraction' is above 0.99
shap_class[X_class['bagging_fraction'] > 0.99]['bagging_fraction'].mean()

0.13916882664914162

In [None]:
# Classification: mean SHAP value of max_depth over rows with max_depth == -1.
shap_class.loc[X_class['max_depth'] == -1, 'max_depth'].mean()

0.1896446397783991