# Analysis of Classification Results – TSST vs. f-TSST (Talk Only)

## Imports and Helper Functions

In [None]:
import re
import json

from pathlib import Path

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from fau_colors import cmaps, register_fausans_font

import biopsykit as bp
from biopsykit.classification.model_selection import SklearnPipelinePermuter
from biopsykit.classification.analysis import (
    predictions_as_df,
    predict_proba_from_estimator,
    plot_conf_matrix,
    plot_conf_matrix_proba,
)

from stresspose_analysis.dataset import StressPoseDataset
from stresspose_analysis.classification.utils import (
    feature_data_long_to_wide,
    get_feature_counts,
    feature_counts_to_latex,
    get_number_features_per_fold,
)


%load_ext autoreload
%autoreload 2
%matplotlib widget

In [None]:
# Register the FAU Sans font and close any figures left over from a previous run.
register_fausans_font()
plt.close("all")

# Use the FAU faculty colors as the default seaborn palette.
palette = sns.color_palette(cmaps.faculties)
sns.set_theme(context="notebook", style="ticks", palette=palette)

# Figure defaults; set after sns.set_theme(), which also modifies rcParams.
plt.rcParams.update(
    {
        "figure.figsize": (8, 4),
        "pdf.fonttype": 42,
        "mathtext.default": "regular",
        "font.family": "sans-serif",
        "font.sans-serif": "FAUSans Office",
    }
)

# Display the palette as the cell output.
palette

## Setup Paths

In [None]:
# Top-level key of config.json whose settings (e.g. "base_path") are used below.
deploy_type = "local"

In [None]:
# Read the deployment configuration. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
config_path = Path("../../../config.json")
with config_path.open(encoding="utf-8") as config_file:
    config_dict = json.load(config_file)

# The dataset root directory is configured per deployment type.
base_path = Path(config_dict[deploy_type]["base_path"])
dataset = StressPoseDataset(base_path)

In [None]:
# Maps the raw condition identifiers to the display names used in tables and plots.
label_mapping = {"tsst": "TSST", "ftsst": "f-TSST"}

## Load Data

In [None]:
# Name fragment used to select the matching classification result pickles.
classification_type = "cumulative_time_5min"

In [None]:
# Repository root relative to this notebook; inputs and results both live below it.
root_path = Path("..", "..", "..")
input_path = root_path / "output" / "classification" / "detailed"
output_path = root_path / "results"

In [None]:
# Directory that receives the exported figures; create it before anything is saved.
img_path = output_path / "plots"

bp.utils.file_handling.mkdirs([img_path])

In [None]:
# Result pickles are filtered by classification type; sorting keeps the order deterministic.
pickle_files = sorted(input_path.glob(f"*_{classification_type}_*.pkl"))
# NOTE(review): the CSV glob is NOT filtered by classification type, although both lists
# are later indexed with the same position -- confirm that the two sorted lists line up.
feature_files = sorted(input_path.glob("*.csv"))

In [None]:
# Show the matched pickle files as the cell output for a visual check.
pickle_files

In [None]:
# Position of the result to analyze within the sorted file lists.
index = 0

pickle_file = pickle_files[index]
feature_file = feature_files[index]

# Report which pair of files was picked (one entry per line).
print("Selected Files:", pickle_file.name, feature_file.name, sep="\n")

### Pickled `SklearnPipelinePermuter` object

In [None]:
# Restore the SklearnPipelinePermuter from the selected pickle file.
# NOTE(review): presumably this unpickles the file -- only load result files from a trusted source.
pipeline_permuter = SklearnPipelinePermuter.from_pickle(pickle_file)

### Features

In [None]:
# Load the feature data from the selected CSV file.
data = bp.io.load_long_format_csv(feature_file)
# Replace the raw condition identifiers in the "condition" index level by their display names.
data = data.rename(index=label_mapping, level="condition")
data.head()

In [None]:
# Convert the long-format features to wide format, keeping "subject" and "condition" as index levels.
# NOTE(review): presumably one column per feature -- confirm against feature_data_long_to_wide.
data_wide = feature_data_long_to_wide(data, index_levels_out=["subject", "condition"])
data_wide.head()

## Display Results

### Metric Summary

Summary of all relevant metrics (performance scores, confusion matrix, true and predicted labels) of the **best pipelines** for each fold (i.e., the `best_estimator_` attribute of each inner `cv` object), computed for each evaluated pipeline combination.

In [None]:
# Collect the metric summary (with F1 score and precision added) and rank the
# pipeline combinations by mean test accuracy, best first.
metric_summary = pipeline_permuter.metric_summary(
    additional_metrics=["f1_score", "precision"],
    pos_label="tsst",
).sort_values(by="mean_test_accuracy", ascending=False)
metric_summary.head()

### Best Pipeline per Classifier

In [None]:
def _best_row(group):
    """Return the row of ``group`` with the highest mean test accuracy as a one-row frame."""
    ranked = group.sort_values(by="mean_test_accuracy", ascending=False)
    return ranked.iloc[[0]]


# Keep only the best-scoring pipeline combination per classifier, ordered best
# first, and drop the "pipeline_remove_var" index level from the result.
best_clfs = (
    metric_summary.groupby("pipeline_clf", group_keys=False)
    .apply(_best_row)
    .sort_values(by="mean_test_accuracy", ascending=False)
    .droplevel(level="pipeline_remove_var")
)
best_clfs

In [None]:
# Export the best pipeline per classifier as LaTeX, using the scaler, dimensionality
# reduction, and classifier steps as the row description.
latex_output = pipeline_permuter.metric_summary_to_latex(
    data=best_clfs,
    pipeline_steps=["pipeline_scaler", "pipeline_reduce_dim", "pipeline_clf"],
    clines=None,
    sparse_index=False,
    highlight_best=True,
    caption=r"Mean $\pm$ standard deviation of classification performance metrics over the 5-fold model evaluation CV. For each evaluated classifier, the classification pipeline combination with the highest mean accuracy is shown. The classification pipelines scoring the highest metrics are highlighted in \textbf{bold}.",
    label="tab:classification_results",
)
# Manual post-processing of the generated LaTeX. The substitutions depend on the exact
# text of the generated table and on each other, so their order must be kept.
# 1) Remove the doubled \cline pair that directly precedes \bottomrule.
latex_output = re.sub(r"\\cline{1-4} \\cline{2-4}\n\\bottomrule", r"\\bottomrule", latex_output)
# 2) Collapse every remaining doubled \cline pair into a single \cline{1-4}.
latex_output = re.sub(r"\\cline{1-4} \\cline{2-4}", r"\\cline{1-4}", latex_output)
# 3) Fill the first run of three empty cells with the pipeline-step headings (count=1).
latex_output = re.sub(
    r"{} & {} & {}", r"{Scaler} & {\\makecell[lc]{Feature\\\\ Selection}} & {Classifier}", latex_output, count=1
)
# 4) Delete any row that consists of these headings followed by three empty cells.
latex_output = re.sub(
    r"{Scaler} & {\\makecell\[lc\]{Feature\\\\ Selection}} & {Classifier} & {} & {} & {} \\\\\n", r"", latex_output
)

print(latex_output)

## Detailed Analysis

In [None]:
# Pipeline combination (one entry per pipeline step) that is analyzed in detail below.
selected_pipeline = ("VarianceThreshold", "MinMaxScaler", "RFE", "SVC")

# Display names of the two conditions, passed to the confusion-matrix plots.
labels = ["TSST", "f-TSST"]

In [None]:
# Summary of the best estimators as provided by the pipeline permuter; shown for inspection.
best_estimator_summary = pipeline_permuter.best_estimator_summary()
best_estimator_summary.head()

In [None]:
# Predictions of the selected pipeline, joined with the dataset's
# condition-order and cortisol non-responder information.
predictions = predictions_as_df(pipeline_permuter, data_wide, selected_pipeline, label_mapping)

for extra_info in (dataset.condition_first, dataset.cortisol_non_responder):
    predictions = predictions.join(extra_info)
predictions.head()

### Confusion Matrix

In [None]:
# Confusion matrix over all predictions of the selected pipeline.
fig, ax = plt.subplots(figsize=(3, 3))
plot_conf_matrix(predictions, labels, label_name="condition", ax=ax)
fig.tight_layout(pad=0)

# Export after the layout has been finalized; transparent background for use in documents.
fig.savefig(img_path.joinpath("img_confusion_matrix_talk_only.pdf"), transparent=True)

### Confusion Matrix by Condition Order

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(6, 3))

# One confusion matrix per value of "condition_first", each on its own axis.
predictions_by_order = predictions.groupby("condition_first")
for ax, (condition_first, df_group) in zip(axs, predictions_by_order):
    plot_conf_matrix(df_group, labels, label_name="condition", ax=ax)
    ax.set_title(condition_first)

fig.tight_layout(pad=0, w_pad=1)

### Confusion Matrix by Cortisol Non-Responder

In [None]:
# One confusion matrix per value of "non_responder", side by side.
fig, axs = plt.subplots(figsize=(6, 3), ncols=2)

for (key, df), ax in zip(predictions.groupby("non_responder"), axs):
    # Pass label_name explicitly, exactly like the other confusion-matrix cells,
    # so all confusion-matrix figures are produced with the same settings.
    # NOTE(review): label_name presumably controls the axis label text -- confirm.
    plot_conf_matrix(df, labels, label_name="condition", ax=ax)
    ax.set_title(f"Non-Responder: {key}")

fig.tight_layout(pad=0, w_pad=1)

### Prediction Probability

In [None]:
# Prediction probabilities of the selected pipeline, with "condition" as the label column.
# NOTE(review): column_names presumably renames the probability columns via label_mapping -- confirm.
predictions_proba = predict_proba_from_estimator(
    pipeline_permuter, data_wide, selected_pipeline, label_col="condition", column_names=label_mapping
)

In [None]:
# Probability-based confusion matrix over all predictions of the selected pipeline.
fig, ax = plt.subplots(figsize=(3, 3))

plot_conf_matrix_proba(predictions_proba, labels=labels, label_col="condition", ax=ax)

fig.tight_layout(pad=0)

In [None]:
# Attach condition order and non-responder information, then append both
# columns to the index so they can serve as grouping keys.
predictions_proba_cond = (
    predictions_proba.join(dataset.condition_first)
    .join(dataset.cortisol_non_responder)
    .set_index(["condition_first", "non_responder"], append=True)
)

### Prediction Probability by Condition Order

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(6, 3))

# One probability-based confusion matrix per value of "condition_first".
proba_by_order = predictions_proba_cond.groupby("condition_first")
for ax, (condition_first, df_group) in zip(axs, proba_by_order):
    plot_conf_matrix_proba(df_group, labels=labels, label_col="condition", ax=ax)
    ax.set_title(condition_first)

fig.tight_layout(w_pad=1)

### Prediction Probability by Cortisol Non-Responder

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(6, 3))

# One probability-based confusion matrix per value of "non_responder".
proba_by_responder = predictions_proba_cond.groupby("non_responder")
for ax, (non_responder, df_group) in zip(axs, proba_by_responder):
    plot_conf_matrix_proba(df_group, labels=labels, label_col="condition", ax=ax)
    ax.set_title(f"Non-Responder: {non_responder}")

fig.tight_layout(w_pad=1)

## Get Feature Statistics

In [None]:
# Number of features selected by the feature selection algorithm of the
# selected pipeline in each fold; shown as the cell output.
num_features_per_fold = get_number_features_per_fold(pipeline_permuter, selected_pipeline)
num_features_per_fold

In [None]:
# Feature counts for the selected pipeline (used below via the "Count" column).
# NOTE(review): num_features=1 presumably is the minimum count for a feature to be listed -- confirm.
feature_counts = get_feature_counts(pipeline_permuter, data=data, pipeline=selected_pipeline, num_features=1)
feature_counts.head()

### Total Number of Selected Features

In [None]:
# Number of selected features per feature type as a single-row table,
# extended by a column holding the overall number of selected features.
number_features = feature_counts.groupby("feature_type").size().to_frame("Count").T
number_features["total"] = len(feature_counts)

number_features

### LaTeX Table

In [None]:
# Keep only the features whose count is at least 3 (the threshold named in the caption).
selected_in_most_folds = feature_counts["Count"] >= 3
feature_counts_export = feature_counts.loc[selected_in_most_folds]

caption = r"Overview of features that were selected by the RFE feature selection algorithm in at least 3 out of 5 CV folds.\\\textit{Note:} Stat. Per. = Static Periods."
label = "tab:feature_counts"
feature_counts_tex = feature_counts_to_latex(feature_counts_export, label=label, caption=caption)
# Writing the table to a file is disabled.
# NOTE(review): `tab_path_paper` is not defined in the visible part of this notebook.
# tab_path_paper.joinpath("tab_feature_counts.tex").open(mode="w+").write(feature_counts_tex)
print(feature_counts_tex)

In [None]:
upper_extremities = ["RightHand", "LeftForeArm", "RightForeArm"]
trunk = ["Spine3"]
head = ["Head"]

# Number of selected features per body part, most frequent first.
feature_counts_body_part = (
    feature_counts.groupby("body_part").size().sort_values(ascending=False).to_frame("Counts")
)

# Sum the per-body-part counts for each body region and report them.
body_regions = {"Upper Extremities": upper_extremities, "Trunk": trunk, "Head": head}
for region_name, body_parts in body_regions.items():
    print(f"{region_name}: {feature_counts_body_part.loc[body_parts].sum().sum()}")

feature_counts_body_part

In [None]:
# Pair plot of the features given by the first five index entries of feature_counts.
features_plot = feature_counts.iloc[0:5].index
# Reshape so that rows are (subject, condition) pairs; the remaining index levels become columns.
data_unstack = data["data"].unstack(["subject", "condition"]).T
data_plot = data_unstack.loc[:, features_plot]
# Flatten the column labels into single "-"-joined strings
# (assumes every column label is a tuple of strings -- TODO confirm).
data_plot.columns = ["-".join(col) for col in data_plot.columns]

pairgrid, features = bp.plotting.feature_pairplot(data=data_plot, hue="condition")
display(features)