Calculate feature importances using SHAP values

In [40]:
# setup

import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor

# add root to sys path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from data_preparation.feature_set_helpers import stratified_cv_feature_importance

# parameters: task / target under analysis and the locations of inputs and outputs
task_name = "cookieTheft"
target = "SemanticFluencyScore"
folds_path = os.path.join(GIT_DIRECTORY, "data", "stratified_folds.csv")
scores_path = os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv")
features_path = os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv")
save_dir = os.path.join(GIT_DIRECTORY, "results/feature_importance", task_name, target)
# output directory is created up front, before any data is read
os.makedirs(save_dir, exist_ok=True)

# load data: feature table, fold assignment table, score table (read in this order)
df, fold_df, scores_df = (
    pd.read_csv(csv_path) for csv_path in (features_path, folds_path, scores_path)
)

# numeric codes for the categorical demographics
gender_codes = {"f": 0, "m": 1}
country_codes = {"uk": 0, "usa": 1}
# Education is ordinal: levels are ranked 1..6 in the order listed here
education_map = {
    level: rank
    for rank, level in enumerate(
        ["less_than_highschool", "high_school", "vocational", "bachelor", "master", "phd"],
        start=1,
    )
}
# "no_answer" is treated as missing
education_map["no_answer"] = np.nan

# extract the demographic columns from fold_df and encode them.
# Series.map yields NaN for any value that is not a key of the mapping; because
# the demographic columns end up in feature_cols, such rows are removed by the
# dropna at the end of this cell.
demo_columns = ["Subject_ID", "Gender", "Education", "Country", "Age"]
df_demo = fold_df.loc[:, demo_columns].copy()
for demo_column, codes in (
    ("Gender", gender_codes),
    ("Education", education_map),
    ("Country", country_codes),
):
    df_demo[demo_column] = df_demo[demo_column].map(codes)

# merge all data: left joins on Subject_ID, so every row of the feature table is kept
for right_table in (
    df_demo,
    fold_df[["Subject_ID", "fold"]],
    scores_df[["Subject_ID", target]],
):
    df = df.merge(right_table, on="Subject_ID", how="left")

# every column except the identifier, the fold label and the target is a feature
non_feature_cols = {"Subject_ID", "fold", target}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# keep only rows that have the target and all features present
df = df.dropna(subset=[target, *feature_cols])

In [41]:
# run CV + importances
# The configuration is gathered in one place and then handed to the project
# helper, which returns two objects: kept here as shap_explanation and shap_table.
# NOTE(review): model_params sets no random_state, so results may differ between
# runs unless the helper seeds the model itself -- confirm against the helper.
cv_settings = {
    "df": df,
    "fold_column": "fold",
    "model_type": RandomForestRegressor,
    "model_params": {"n_estimators": 150},
    "target_column": target,
    "feature_columns": feature_cols,
    "save_dir": save_dir,
    "task_name": task_name,
}
shap_explanation, shap_table = stratified_cv_feature_importance(**cv_settings)

In [42]:
# SHAP plots
# Each entry: (file-name suffix, callable that draws the plot without showing it,
# whether to apply the tight_layout rect used for the shap.plots.* figures).
plot_specs = [
    ("bar", lambda: shap.plots.bar(shap_explanation, max_display=20, show=False), True),
    ("summary", lambda: shap.summary_plot(shap_explanation, plot_type="bar", show=False), False),
    ("violin", lambda: shap.summary_plot(shap_explanation, plot_type="violin", show=False), False),
    ("beeswarm", lambda: shap.plots.beeswarm(shap_explanation, show=False), True),
]

for plot_suffix, draw_plot, apply_layout_rect in plot_specs:
    draw_plot()
    if apply_layout_rect:
        # restrict the layout to the left 85% of the figure width
        plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_{plot_suffix}.png"), dpi=300)
    # close the figure so the next plot starts from a fresh one
    plt.close()