Calculate feature importances using SHAP values

In [1]:
# setup

import os
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor

# add the project's src directory to sys.path so the local packages below import.
# The location can be overridden with the MASTERS_THESIS_SRC environment variable;
# without it the original machine-specific path is used.
src_dir = os.environ.get(
    "MASTERS_THESIS_SRC",
    "/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src",
)
if src_dir not in sys.path:  # re-running the cell must not add duplicate entries
    sys.path.append(src_dir)
from config.constants import GIT_DIRECTORY
from config.feature_sets import get_linguistic_features, get_acoustic_features
from data_preparation.feature_set_helpers import stratified_cv_feature_importance
from regression.evaluation_helpers import format_title

# ---- run configuration ----
task_name = "picnicScene"
target = "PictureNamingScore"

# ---- input files and output directory, all located below GIT_DIRECTORY ----
folds_path = os.path.join(GIT_DIRECTORY, "data/stratified_folds.csv")
scores_path = os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv")
features_path = os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered.csv")
save_dir = os.path.join(GIT_DIRECTORY, "results/feature_importance", task_name, target)
os.makedirs(save_dir, exist_ok=True)

# ---- read the three input tables ----
df = pd.read_csv(features_path)
fold_df = pd.read_csv(folds_path)
scores_df = pd.read_csv(scores_path)

# ---- numeric encoding of the demographics stored in the fold table ----
# ordinal education level; "no_answer" becomes missing
education_map = {
    "less_than_highschool": 1,
    "high_school": 2,
    "vocational": 3,
    "bachelor": 4,
    "master": 5,
    "phd": 6,
    "no_answer": np.nan
}
df_demo = fold_df[["Subject_ID", "Gender", "Education", "Country", "Age"]].copy()
df_demo = df_demo.assign(
    # Gender: f = 0, m = 1
    Gender=df_demo["Gender"].map({"f": 0, "m": 1}),
    Education=df_demo["Education"].map(education_map),
    # collapse the already-encoded education level into three groups
    Education_group_code=lambda demo: demo["Education"].map({
        1: 0,             # low
        2: 1, 3: 1,       # medium
        4: 2, 5: 2, 6: 2  # high
    }),
    # Country: uk = 0, usa = 1
    Country=df_demo["Country"].map({"uk": 0, "usa": 1}),
)

# ---- attach demographics, fold assignment and target score to the features ----
df = (
    df
    .merge(df_demo, on="Subject_ID", how="left")
    .merge(fold_df[["Subject_ID", "fold"]], on="Subject_ID", how="left")
    .merge(scores_df[["Subject_ID", target]], on="Subject_ID", how="left")
)

# every remaining column is a model feature, except identifiers, the fold,
# the target and the ungrouped education level
non_feature_cols = {"Subject_ID", "fold", target, "Education"}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# keep only rows that have the target and all features
df = df.dropna(subset=[target, *feature_cols])

# ---- random forest hyperparameters ----
rf_params = {
    "n_estimators": 524,
    "random_state": 42,
    "min_samples_leaf": 3,
    "max_features": "sqrt",
    "bootstrap": True,
    "max_depth": 15,
    "min_samples_split": 5,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# define categories for features

# demographic columns are listed by hand
demographic_features = [
    "Age",
    "Gender",
    "Education_group_code",
    "Country",
    "Socioeconomic",
]

# linguistic and acoustic feature names come from the project's feature-set config
linguistic_features = get_linguistic_features()
acoustic_features = get_acoustic_features()

def feature_category(name: str) -> str:
    """Return the category label of a feature name.

    The name is looked up in the linguistic, acoustic and demographic feature
    collections, in that order; the first collection containing it decides the
    label. Names found in none of them are labelled "other".
    """
    lookup_order = (
        ("linguistic", linguistic_features),
        ("acoustic", acoustic_features),
        ("demographics", demographic_features),
    )
    for label, members in lookup_order:
        if name in members:
            return label
    return "other"

# Superscript appended to a feature label, keyed by the category returned by
# feature_category(). "other" maps to an empty suffix so that features outside
# the three named categories can be labelled without raising KeyError.
superscript_for = {
    "linguistic": "¹",
    "acoustic": "²",
    "demographics": "³",
    "other": ""
}

In [3]:
# plot style: reset to matplotlib's default style, then apply the overrides below
plt.style.use("default")

plot_style = {
    # white canvas
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    # black frame, labels and ticks
    "axes.edgecolor": "black",
    "axes.labelcolor": "black",
    "xtick.color": "black",
    "ytick.color": "black",
    # typography
    "font.family": "Arial",
    # defaults used by savefig
    "savefig.dpi": 300,
    "savefig.bbox": "tight",
}
plt.rcParams.update(plot_style)

In [4]:
# run CV + importances
# collect the arguments first so the configuration can be read at a glance
cv_settings = dict(
    df=df,
    fold_column="fold",
    model_type=RandomForestRegressor,
    model_params=rf_params,
    target_column=target,
    feature_columns=feature_cols,
    save_dir=save_dir,
    task_name=task_name,
)
shap_explanation, shap_table = stratified_cv_feature_importance(**cv_settings)



In [10]:
# SHAP beeswarm plot with superscripts and legend

orig_expl = shap_explanation
orig_names = list(orig_expl.feature_names)

# Tag every feature name with the superscript of its category.
# feature_category() may return "other"; such features get no superscript
# (empty suffix) instead of triggering a KeyError on the lookup.
name_to_cat = {n: feature_category(n) for n in orig_names}
name_to_label = {n: f"{n}{superscript_for.get(name_to_cat[n], '')}" for n in orig_names}

# same values / base values / data as the original explanation, only the
# feature names are replaced by the labelled versions
shap_expl_labeled = shap.Explanation(
    values=orig_expl.values,
    base_values=orig_expl.base_values,
    data=orig_expl.data,
    feature_names=[name_to_label[n] for n in orig_names]
)

# show=False keeps the figure open so the footnote can be added before saving
shap.plots.beeswarm(shap_expl_labeled, max_display=20, show=False)
# add footnote explaining the superscripts
footnote = "¹ linguistic   ² acoustic   ³ demographics"
plt.figtext(0.5, 0.03, footnote,
            ha="center", va="top", fontsize=10, fontfamily="Arial")

# leave room at the bottom for the footnote and on the right for the colorbar
plt.tight_layout(rect=[0, 0.05, 0.85, 1])
plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_beeswarm_20.png"),
            dpi=300, bbox_inches="tight")
plt.close()

In [6]:
# SHAP plots
# shap.plots.bar(shap_explanation, max_display=20, show=False)
# plt.tight_layout(rect=[0, 0, 0.85, 1])
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_bar.png"), dpi=300)
# plt.close()
#
# shap.summary_plot(shap_explanation, plot_type="bar", max_display=20,show=False)
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_summary.png"), dpi=300)
# plt.close()

# shap.summary_plot(shap_explanation, max_display=20, plot_type="violin", show=False)
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_violin.png"), dpi=300)
# plt.close()
#
# shap.plots.beeswarm(shap_explanation, max_display=20, show=False)
# plt.tight_layout(rect=[0, 0, 0.85, 1])
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_beeswarm.png"), dpi=300)
# plt.close()

In [11]:
# local SHAP-plot: explain the prediction for one held-out subject

# pick a random subject from df (fixed seed, so the same subject every run)
rng = np.random.default_rng(0)
rand_idx = int(rng.integers(0, len(df)))
subject_id = df.iloc[rand_idx]["Subject_ID"]
subject_fold = df.iloc[rand_idx]["fold"]

# split train/test respecting the subject's fold: the model is trained on all
# other folds, so the explained subject is never part of the training data
train_df = df[df["fold"] != subject_fold].copy()
test_row = df[df["Subject_ID"] == subject_id].copy()

X_train = train_df[feature_cols]
y_train = train_df[target]
X_test_one = test_row[feature_cols]
y_test_one = test_row[target].iloc[0]

rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# background sample (at most 200 training rows) for the interventional explainer
background = shap.sample(X_train, min(200, len(X_train)), random_state=0)
explainer = shap.TreeExplainer(
    rf,
    data=background,
    feature_perturbation="interventional",
    model_output="raw"
)
# local explanation for one subject
ex = explainer(X_test_one)

# Tag every feature name with the superscript of its category.
# feature_category() may return "other"; such features get no superscript
# (empty suffix) instead of triggering a KeyError on the lookup.
orig_names = list(ex.feature_names)
name_to_cat = {n: feature_category(n) for n in orig_names}
name_to_label = {n: f"{n}{superscript_for.get(name_to_cat[n], '')}" for n in orig_names}

ex_labeled = shap.Explanation(
    values=ex.values,
    base_values=ex.base_values,
    data=ex.data,
    feature_names=[name_to_label[n] for n in orig_names]
)

# waterfall plot
subject_id_fmt = f"{float(subject_id):.0f}"
title = f"Local SHAP Values: {format_title(target)} (Subject {subject_id_fmt}, {task_name})"

# draw the plot exactly once; show=False keeps the figure open for the
# title, footnote and savefig below
shap.plots.waterfall(ex_labeled[0], max_display=15, show=False)
plt.title(title)

# footnote at bottom explaining the superscripts
footnote = "¹ linguistic   ² acoustic   ³ demographics"
plt.figtext(0.5, 0.03, footnote,
            ha="center", va="top", fontsize=10, fontfamily="Arial")

plt.tight_layout(rect=[0, 0.05, 0.85, 1])
out_path = os.path.join(save_dir, f"{task_name}_{target}_local_waterfall_subject-{subject_id_fmt}.png")
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close()
print(f"Saved: {out_path}")


Saved: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_importance/picnicScene/PictureNamingScore/picnicScene_PictureNamingScore_local_waterfall_subject-1176.png
