calculate feature-importances using SHAP-values

In [6]:
# setup

import os
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor

# add root to sys path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from config.feature_sets import get_linguistic_features, get_acoustic_features
from data_preparation.feature_set_helpers import stratified_cv_feature_importance
from regression.plotting_helpers import format_title

# parameters
task_name = "picnicScene"
target = "PictureNamingScore"
folds_path = os.path.join(GIT_DIRECTORY, "data/stratified_folds.csv")
scores_path = os.path.join(GIT_DIRECTORY, "data/language_scores_all_subjects.csv")
features_path = os.path.join(GIT_DIRECTORY, f"results/features/filtered/{task_name}_filtered2.csv")
save_dir = os.path.join(GIT_DIRECTORY, "results/feature_importance_filtered2", task_name, target)
os.makedirs(save_dir, exist_ok=True)

# load data
df = pd.read_csv(features_path)
fold_df = pd.read_csv(folds_path)
scores_df = pd.read_csv(scores_path)
# extract and encode demographics from fold_df
demographics = fold_df[["Subject_ID", "Gender", "Education", "Country", "Age", "Socioeconomic"]].copy()
demographics["Socioeconomic"] = pd.to_numeric(demographics["Socioeconomic"], errors="coerce")
# Gender: f = 0, m = 1
demographics["Gender"] = demographics["Gender"].map({"f": 0, "m": 1})
# Education: group encoding
education_map = {
    "less_than_highschool": 1,
    "high_school": 2,
    "vocational": 3,
    "bachelor": 4,
    "master": 5,
    "phd": 6,
    "no_answer": np.nan
}
demographics["Education"] = demographics["Education"].map(education_map)
demographics["Education_level"] = demographics["Education"].map({
    1: 0,            # low
    2: 1, 3: 1,      # medium
    4: 2, 5: 2, 6: 2 # high
})
# Country: uk = 0, usa = 1
demographics["Country"] = demographics["Country"].map({"uk": 0, "usa": 1})

# merge all data
df = pd.merge(df, demographics, on="Subject_ID", how="left")
df = pd.merge(df, fold_df[["Subject_ID", "fold"]], on="Subject_ID", how="left")
df = pd.merge(df, scores_df[["Subject_ID", target]], on="Subject_ID", how="left")
feature_cols = [c for c in df.columns if c not in ["Subject_ID", "fold", target, "Education"]]
df = df.dropna(subset=[target] + feature_cols)

# parameters for random forest
rf_params={"n_estimators": 625, "random_state": 42, "min_samples_leaf": 4, "max_features": "sqrt", "bootstrap": True, "max_depth": 14, "min_samples_split": 9}

In [7]:
# define categories for features

linguistic_features = get_linguistic_features()
acoustic_features = get_acoustic_features()

demographic_features = ["Age", "Gender", "Education_level", "Country", "Socioeconomic"]

def feature_category(name: str) -> str:
    if name in linguistic_features:
        return "linguistic"
    if name in acoustic_features:
        return "acoustic"
    if name in demographic_features:
        return "demographics"
    return "other"

superscript_for = {
    "linguistic": "¹",
    "acoustic": "²",
    "demographics": "³"
}

In [8]:
# plot style
plt.style.use("default")
plt.rcParams.update({
    "axes.facecolor": "white",
    "figure.facecolor": "white",
    "axes.edgecolor": "black",
    "axes.labelcolor": "black",
    "xtick.color": "black",
    "ytick.color": "black",
    "font.family": "Arial",
    "savefig.dpi": 600,
    "savefig.bbox": "tight"
})

In [9]:
# run CV + importances
shap_explanation, shap_table = stratified_cv_feature_importance(
    df=df,
    fold_column="fold",
    model_type=RandomForestRegressor,
    model_params=rf_params,
    target_column=target,
    feature_columns=feature_cols,
    save_dir=save_dir,
    task_name=task_name
)

In [11]:
# SHAP beeswarm plot with superscripts and legend

orig_expl = shap_explanation
orig_names = list(orig_expl.feature_names)

name_to_cat = {n: feature_category(n) for n in orig_names}
name_to_label = {n: f"{n}{superscript_for[name_to_cat[n]]}" for n in orig_names}

shap_expl_labeled = shap.Explanation(
    values=orig_expl.values,
    base_values=orig_expl.base_values,
    data=orig_expl.data,
    feature_names=[name_to_label[n] for n in orig_names]
)

shap.plots.beeswarm(shap_expl_labeled, max_display=20, show=False)
# add footnote
footnote = "¹ linguistic   ² acoustic   ³ demographics"
plt.figtext(0.0, 0.03, footnote,
            ha="left", va="top", fontsize=10, fontfamily="Arial")

plt.tight_layout(rect=[0, 0.05, 0.85, 1])
plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_beeswarm_20.png"),
            dpi=600, bbox_inches="tight")
plt.close()

In [62]:
# SHAP plots
# shap.plots.bar(shap_explanation, max_display=20, show=False)
# plt.tight_layout(rect=[0, 0, 0.85, 1])
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_bar.png"), dpi=300)
# plt.close()
#
# shap.summary_plot(shap_explanation, plot_type="bar", max_display=20,show=False)
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_summary.png"), dpi=300)
# plt.close()

# shap.summary_plot(shap_explanation, max_display=20, plot_type="violin", show=False)
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_violin.png"), dpi=300)
# plt.close()
#
# shap.plots.beeswarm(shap_explanation, max_display=20, show=False)
# plt.tight_layout(rect=[0, 0, 0.85, 1])
# plt.savefig(os.path.join(save_dir, f"{task_name}_{target}_shap_beeswarm.png"), dpi=300)
# plt.close()

In [9]:
# Waterfall plot (local SHAP values one subject)

# choose subject & fit model for that subject's fold
rng = np.random.default_rng(0)
rand_idx = int(rng.integers(0, len(df)))
subject_id = int(df.iloc[rand_idx]["Subject_ID"])
subject_fold = int(df.iloc[rand_idx]["fold"])

train_df = df[df["fold"] != subject_fold].copy()
test_row = df[df["Subject_ID"] == subject_id].copy()

X_train = train_df[feature_cols]
y_train = train_df[target]
X_test_one = test_row[feature_cols]
y_test_one = test_row[target].iloc[0]

rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# SHAP explainer
background = shap.sample(X_train, min(200, len(X_train)), random_state=42)
explainer = shap.TreeExplainer(
    rf,
    data=background,
    feature_perturbation="interventional",
    model_output="raw"
)

# local explanation for the chosen subject
ex = explainer(X_test_one)

# label features with superscripts
orig_names = list(ex.feature_names)
name_to_cat = {n: feature_category(n) for n in orig_names}
name_to_label = {n: f"{n}{superscript_for[name_to_cat[n]]}" for n in orig_names}

ex_labeled = shap.Explanation(
    values=ex.values,
    base_values=ex.base_values,
    data=ex.data,
    feature_names=[name_to_label[n] for n in orig_names]
)

# plot: waterfall
subject_id_fmt = f"{subject_id:d}"
title = f"Local SHAP Values: {format_title(target)} (Subject {subject_id_fmt}, {task_name})"

plt.figure(figsize=(9.5, 6))
shap.plots.waterfall(ex_labeled[0], max_display=9, show=False)
plt.title(title)

ax = plt.gca()

# plot spacing
plt.subplots_adjust(left=0.30, right=0.96, bottom=0.15, top=0.88)

xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin - 0.10 * (xmax - xmin), xmax + 0.15 * (xmax - xmin))

for t in list(ax.texts):
    t.set_clip_on(False)

for t in list(ax.texts):
    s = t.get_text() or ""
    if "f(x" in s or "E[f" in s or "$f(x)$" in s or "$E[f(X)]$" in s:
        t.remove()

# footnote
plt.figtext(0.00, 0.02, "¹ linguistic   ² acoustic   ³ demographics",
            ha="left", va="top", fontsize=10, fontfamily="Arial")

# save
os.makedirs(save_dir, exist_ok=True)
out_path = os.path.join(save_dir, f"{task_name}_{target}_local_waterfall_subject-{subject_id_fmt}.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
print(f"Saved: {out_path}")



Saved: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_importance_filtered2/picnicScene/SemanticFluencyScore/picnicScene_SemanticFluencyScore_local_waterfall_subject-1174.png


In [10]:
# add heatmap

# SHAP heatmap across scores & tasks 
import os, sys, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor
from matplotlib.colors import TwoSlopeNorm  

# project paths
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from data_preparation.feature_set_helpers import stratified_cv_feature_importance
from config.feature_sets import get_linguistic_features, get_acoustic_features

# config 
TASKS   = ["cookieTheft","picnicScene","journaling"]
SCORES  = ["PictureNamingScore","SemanticFluencyScore","PhonemicFluencyScore"]
ORDERED_COLS = [
    ("PictureNamingScore","cookieTheft"), ("PictureNamingScore","picnicScene"), ("PictureNamingScore","journaling"),
    ("SemanticFluencyScore","cookieTheft"), ("SemanticFluencyScore","picnicScene"), ("SemanticFluencyScore","journaling"),
    ("PhonemicFluencyScore","cookieTheft"), ("PhonemicFluencyScore","picnicScene"), ("PhonemicFluencyScore","journaling"),
]
FEATURES_DIR_TMPL = os.path.join(GIT_DIRECTORY, "results", "features", "filtered", "{task}_filtered2.csv")
FOLDS_PATH  = os.path.join(GIT_DIRECTORY, "data", "stratified_folds.csv")
SCORES_PATH = os.path.join(GIT_DIRECTORY, "data", "language_scores_all_subjects.csv")

rf_params = {"n_estimators":625, "random_state":42, "min_samples_leaf":4, "max_features":"sqrt",
             "bootstrap":True, "max_depth":14, "min_samples_split":9}

# output 
OUT_DIR = os.path.join(GIT_DIRECTORY, "results", "feature_importance_filtered2", "shap_heatmap")
os.makedirs(OUT_DIR, exist_ok=True)
CSV_PATH = os.path.join(OUT_DIR, "mean_shap_heatmap_RF.csv")
PNG_PATH = os.path.join(OUT_DIR, "mean_shap_heatmap_RF.png")

# helpers 
def _prep_task_df(task, target):
    """Load filtered features, merge folds + target; mirror your setup (no changes)."""
    df = pd.read_csv(FEATURES_DIR_TMPL.format(task=task))
    folds = pd.read_csv(FOLDS_PATH)
    scores = pd.read_csv(SCORES_PATH)

    # demographics (as in your code)
    demographics = folds[["Subject_ID","Gender","Education","Country","Age","Socioeconomic"]].copy()
    demographics["Socioeconomic"] = pd.to_numeric(demographics["Socioeconomic"], errors="coerce")
    demographics["Gender"] = demographics["Gender"].map({"f":0,"m":1})
    education_map = {"less_than_highschool":1,"high_school":2,"vocational":3,"bachelor":4,"master":5,"phd":6,"no_answer":np.nan}
    demographics["Education"] = demographics["Education"].map(education_map)
    demographics["Education_level"] = demographics["Education"].map({1:0, 2:1, 3:1, 4:2, 5:2, 6:2})
    demographics["Country"] = demographics["Country"].map({"uk":0,"usa":1})

    # merge
    df = df.merge(demographics, on="Subject_ID", how="left")
    df = df.merge(folds[["Subject_ID","fold"]], on="Subject_ID", how="left")
    df = df.merge(scores[["Subject_ID", target]], on="Subject_ID", how="left")

    # feature columns 
    feature_cols = [c for c in df.columns if c not in ["Subject_ID","fold",target,"Education"]]
    df = df.dropna(subset=[target] + feature_cols)
    return df, feature_cols

def _extract_mean_shap(shap_explanation, shap_table):
    """Return mean absolute SHAP per feature (global importance, directionless)."""
    vals = np.asarray(shap_explanation.values)  # [n_samples, n_features]
    names = list(shap_explanation.feature_names)
    mean_abs = np.nanmean(np.abs(vals), axis=0)
    return pd.Series(mean_abs, index=names)

# compute matrix 
all_series = {}
for score in SCORES:
    for task in TASKS:
        df, feature_cols = _prep_task_df(task, score)

        # run your existing CV + importance function
        shap_expl, shap_table = stratified_cv_feature_importance(
            df=df,
            fold_column="fold",
            model_type=RandomForestRegressor,
            model_params=rf_params,
            target_column=score,
            feature_columns=feature_cols,
            save_dir=None,
            task_name=task
        )
        s = _extract_mean_shap(shap_expl, shap_table)
        s = s.reindex(feature_cols).dropna()
        all_series[(score, task)] = s

# union of all features
all_features = sorted(set().union(*[s.index for s in all_series.values()]))

# build MultiIndex columns in requested order
cols = pd.MultiIndex.from_tuples(ORDERED_COLS, names=["Score","Task"])
mat = pd.DataFrame(index=all_features, columns=cols, dtype=float)

for (score, task), s in all_series.items():
    if (score, task) in mat.columns:
        mat[(score, task)].loc[s.index] = s.values

# sort rows by overall absolute mean (values are already >=0)
mat["__absmean__"] = mat.mean(axis=1, skipna=True)
mat = mat.sort_values("__absmean__", ascending=False).drop(columns="__absmean__")

# save CSV (MultiIndex header preserved)
mat.to_csv(CSV_PATH, float_format="%.3f")
print("Saved table to:", CSV_PATH)

# plot heatmap 
flat_cols = [f"{sc}\n{ta}" for (sc, ta) in mat.columns.to_list()]
plot_df = mat.copy()
plot_df.columns = flat_cols

# add superscripts to feature labels 
linguistic_features = get_linguistic_features()
acoustic_features = get_acoustic_features()
demographic_features = ["Age","Gender","Education_level","Country","Socioeconomic"]

def feature_category(name):
    if name in linguistic_features:
        return "¹"
    if name in acoustic_features:
        return "²"
    if name in demographic_features:
        return "³"
    return ""

plot_df.index = [f"{f}{feature_category(f)}" for f in plot_df.index]

# sequential color scale from 0 → high |SHAP|
vmax = np.nanpercentile(plot_df.values.astype(float), 99)

plt.figure(figsize=(14, max(6, 0.28*len(plot_df))))
plot_df = plot_df.astype(float)
ax = sns.heatmap(
    plot_df,
    cmap="Reds",            # sequential; no sign
    vmin=0.0,
    vmax=float(vmax),
    linewidths=0.2,
    linecolor="white",
    cbar_kws={"label": "Mean |SHAP|"},
    square=False,
    mask=plot_df.isna(),     # hide NaNs (white)
)

# clean white background
plt.gcf().patch.set_facecolor('white')
ax.set_facecolor('white')

# move labels to top
ax.xaxis.set_ticks_position('top')
ax.xaxis.set_label_position('top')
plt.xticks(rotation=45, ha='left')

ax.set_xlabel("")
ax.set_ylabel("Feature")
ax.set_title("Mean |SHAP| Values by Score × Task (Random Forest)", pad=50)

# footnote: keep at bottom, but tuck it closer to the plot
ax.text(
    0.0, -0.035,
    "¹ linguistic   ² acoustic   ³ demographics",
    transform=ax.transAxes,
    ha="left", va="top", fontsize=10, fontfamily="Arial"
)

plt.tight_layout(rect=[0, 0.03, 1, 1])
plt.savefig(PNG_PATH, dpi=600, bbox_inches="tight")
plt.close()

print("Saved heatmap to:", PNG_PATH)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  mat[(score, task)].loc[s.index] = s.values
  mat = mat.sort_values("__absmean__", ascending=False).drop(columns="__absmean__")


Saved table to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_importance_filtered2/shap_heatmap/mean_shap_heatmap_RF.csv
Saved heatmap to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/feature_importance_filtered2/shap_heatmap/mean_shap_heatmap_RF.png
