In [None]:
import pandas as pd
import os
import sys

In [None]:
# Put the pipelines source folder on the import path so that the
# `data_preparation_utils` module imported in the next cell can be found.
sys.path.append("../src/ga4_mlops/pipelines")

In [None]:
from data_preparation_utils import extract_column_names

In [None]:
df = pd.read_csv('../data/05_model_input/abt_test.csv')

In [None]:
_, _, _, target_col = extract_column_names(df)
target_col

In [None]:
df[target_col].value_counts()

In [None]:
df.shape[0]

In [None]:
df[target_col].value_counts() / df.shape[0]

In [None]:
n_obs = 500
seed = 42

In [None]:
frac = n_obs / df.shape[0]

In [None]:
df_sample = df.groupby(target_col).apply(lambda x: x.sample(frac=frac, random_state=seed)).reset_index(drop=True)
df_sample

In [None]:
df_sample[target_col].value_counts()

In [None]:
df_sample.shape[0]

In [None]:
proportions = df_sample[target_col].value_counts() / df_sample.shape[0]
proportions

In [None]:
proportions.to_string()

---

In [None]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
sys.path.append("../src/ga4_mlops/pipelines")
from data_preparation_utils import extract_column_names

In [None]:
def sample_data(abt: pd.DataFrame, n_obs: int, seed: int) -> pd.DataFrame:
    """Sample model input data preserving target proportions.

    The same sampling fraction is applied within every target class, so the
    class shares of the sample match those of the input up to per-class
    rounding. For the same reason the returned frame can hold slightly more
    or fewer rows than ``n_obs``.

    Args:
        abt (pd.DataFrame): input data frame
        n_obs (int): requested number of observations in a sample; clamped to
            the range [1, number of rows in ``abt``]
        seed (int): random seed that makes the sample reproducible

    Returns:
        pd.DataFrame: data frame sample with a fresh 0..n-1 index

    Raises:
        ValueError: if ``abt`` has no rows
    """
    original_n_obs = abt.shape[0]
    if original_n_obs == 0:
        # Without this guard the fraction below fails with a ZeroDivisionError.
        raise ValueError("Cannot sample from an empty data frame.")
    n_obs = max(1, min(n_obs, original_n_obs))
    # logger.info(
    #     f"Sampling data for SHAP explanations. Original size: {original_n_obs}; Sample size: {n_obs}"
    # )

    _, _, _, target_col = extract_column_names(abt)
    # logger.info(f"Target name: {target_col}")

    # Kept for the (currently disabled) logging call below.
    original_proportions = abt[target_col].value_counts() / original_n_obs
    # logger.info(f"Original target proportions:\n{original_proportions.to_string()}")

    frac = n_obs / original_n_obs
    # `groupby(...).sample` keeps the target column in the result, whereas
    # `groupby(...).apply` on the grouping column is deprecated in pandas 2.2
    # and would drop that column in later releases.
    abt_sample = (
        abt.groupby(target_col)
        .sample(frac=frac, random_state=seed)
        .reset_index(drop=True)
    )

    # Divide by the realised sample size: per-class rounding means it is not
    # guaranteed to equal the requested n_obs.
    proportions = abt_sample[target_col].value_counts() / abt_sample.shape[0]
    # logger.info(f"Sample target proportions:\n{proportions.to_string()}")

    return abt_sample

In [None]:
abt_train = pd.read_csv('../data/05_model_input/abt_train.csv')
abt = pd.read_csv('../data/05_model_input/abt_test.csv')

In [None]:
abt_train_sample = sample_data(abt_train, 100, 42)
abt_sample = sample_data(abt, 100, 42)

_, num_cols, cat_cols, _ = extract_column_names(abt_train_sample)

features_train_sample = abt_train_sample[num_cols + cat_cols]
features_sample = abt_sample[num_cols + cat_cols]

In [None]:
# Load the pickled model object from the models folder.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this must only ever be pointed at model artifacts from a trusted source.
with open('../data/06_models/model.pkl', 'rb') as pickle_file:
    model = pickle.load(pickle_file)

In [None]:
# Build a KernelExplainer around the model's predict_proba, passing the
# training-set sample as the explainer's data argument.
explainer = shap.KernelExplainer(model.predict_proba, features_train_sample)

In [None]:
# SHAP values for the test-set sample.
# NOTE(review): the layout of the result (list of per-class arrays vs. a single
# array) presumably depends on the installed shap version — confirm before
# relying on a particular axis order.
shap_values = explainer.shap_values(features_sample)

In [None]:
# Summary plot of the SHAP values against the sampled feature values, default size.
shap.summary_plot(shap_values, features=features_sample)

In [None]:
# Same plot with an explicit plot_size of (10, 10).
shap.summary_plot(shap_values, features=features_sample, plot_size=(10, 10))

In [None]:
# Feature names ordered from least to most important by mean |SHAP|.
# The SHAP output is first normalised to (n_samples, n_features, n_outputs):
# a list of per-class arrays is stacked on a new last axis, a single array is
# used as is. Averaging over samples and outputs then leaves exactly one score
# per feature, so argsort yields a 1-D ranking (the previous expression
# produced a 2-D indexer for list-style output).
shap_stack = np.stack(shap_values, axis=-1) if isinstance(shap_values, list) else np.asarray(shap_values)
shap_stack = shap_stack.reshape(shap_stack.shape[0], shap_stack.shape[1], -1)
features_sample.columns[np.argsort(np.abs(shap_stack).mean(axis=(0, 2)))]

In [None]:
# Normalise the SHAP output to (n_samples, n_features, n_outputs): a list of
# per-class arrays is stacked on a new last axis, a single array is used as is.
# This keeps the aggregation below correct for either layout instead of
# silently summing over the wrong axis.
shap_stack = np.stack(shap_values, axis=-1) if isinstance(shap_values, list) else np.asarray(shap_values)
shap_stack = shap_stack.reshape(shap_stack.shape[0], shap_stack.shape[1], -1)

# Output-averaged |SHAP| per observation and feature, shape (n_samples, n_features).
vals = np.abs(shap_stack).mean(axis=2)

# One importance score per feature: the sum of `vals` over the observations.
# Building the frame from equally long columns fails loudly on a length
# mismatch, where zip() would silently truncate.
feature_importance = pd.DataFrame(
    {"feature": features_sample.columns, "importance": vals.sum(axis=0)}
)

# Sort by reassignment rather than in place. The index labels are kept on
# purpose: they are the original column positions of the features.
feature_importance = feature_importance.sort_values(by="importance", ascending=False)
feature_importance

In [None]:
# feature_importance = dict(zip(features_sample.columns, sum(vals)))
# feature_importance = dict(sorted(feature_importance.items(), key=lambda item: item[1], reverse=True))
# feature_importance

In [None]:
# Feature name -> importance mapping, in the table's current row order.
dict(zip(feature_importance["feature"], feature_importance["importance"]))

In [None]:
feature_importance.index[:5].to_list()

In [None]:
# top_5_features = feature_importance["feature"].head(5).to_list()
top_5_features = feature_importance.index[:5].to_list()
top_5_features

In [None]:
shap.plots.partial_dependence(30, lambda x: model.predict_proba(x)[:, 1], features_sample)

In [None]:
for idx in top_5_features:
    shap.plots.partial_dependence(idx, lambda x: model.predict_proba(x)[:, 1], features_sample)

In [None]:
type(shap_values)

---

In [None]:
import matplotlib.pyplot as plt
import io

In [None]:
# Draw the summary plot without showing it, then grab the current
# matplotlib figure so it can be reused.
shap.summary_plot(shap_values, features=features_sample, show=False)
fig = plt.gcf()

In [None]:
# Partial dependence plot for the feature at column position 0, with both
# expected-value markers switched on; drawn without showing so that the
# current matplotlib figure can be captured afterwards.
shap.plots.partial_dependence(
    0,
    lambda x: model.predict_proba(x)[:, 1],
    features_sample,
    show=False,
    model_expected_value=True,
    feature_expected_value=True,
)
fig = plt.gcf()

In [None]:
# Collect one partial dependence figure per top feature, keyed by
# "<feature name>.png".
plots_dict = {}
for idx in top_5_features:
    shap.plots.partial_dependence(
        idx,
        lambda x: model.predict_proba(x)[:, 1],
        features_sample,
        show=False,
        model_expected_value=True,
        feature_expected_value=True,
    )
    # `idx` is an index label of the importance table, so look the name up by label.
    feature_name = feature_importance.loc[idx, "feature"]
    plots_dict[f"{feature_name}.png"] = plt.gcf()