In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from petTOAD_load import RES_DIR
from scipy import stats

# Folder that receives the generated tables. Create it together with any
# missing parent folders; exist_ok avoids the check-then-create race and makes
# the cell safe to re-run.
TBL_DIR = RES_DIR / "Tables"
TBL_DIR.mkdir(parents=True, exist_ok=True)

In [10]:
# Read the clinical table (first CSV column is the index) and derive a binary
# WMH flag from the subgroup label: any label containing "no_WMH" -> "no_WMH",
# everything else -> "WMH".
df_petTOAD_pre = pd.read_csv(RES_DIR / "df_petTOAD.csv", index_col=0)
label_has_no_wmh = df_petTOAD_pre["Group_bin_Fazekas"].str.contains("no_WMH")
df_petTOAD_pre["WMH_bin"] = np.where(label_has_no_wmh, "no_WMH", "WMH")
# Work on a copy so later cells do not modify the frame that was loaded
df_petTOAD = df_petTOAD_pre.copy()


In [11]:
# Cohort overview: sample size, age range, sex and diagnosis counts
n_pts = len(df_petTOAD)
age_column_stats = df_petTOAD.describe()["Age"]
age_min = round(age_column_stats["min"], 0)
age_max = round(age_column_stats["max"], 0)
sex_counts = df_petTOAD["Sex"].value_counts()
DX_counts = df_petTOAD["Group"].value_counts()
females_num, males_num = sex_counts["F"], sex_counts["M"]
cu_num, mci_num = DX_counts["CN"], DX_counts["MCI"]

# Number of participants in each diagnosis x WMH subgroup
subgroup_labels = df_petTOAD["Group_bin_Fazekas"]
cu_no_wmh = len(df_petTOAD[subgroup_labels == "CN_no_WMH"])
cu_wmh = len(df_petTOAD[subgroup_labels == "CN_WMH"])
mci_no_wmh = len(df_petTOAD[subgroup_labels == "MCI_no_WMH"])
mci_wmh = len(df_petTOAD[subgroup_labels == "MCI_WMH"])

# Percentages are computed first so the sentence below stays readable
pct_females = round(females_num / n_pts * 100, 2)
pct_males = round(males_num / n_pts * 100, 2)
pct_cu = round(cu_num / n_pts * 100, 1)
pct_mci = round(mci_num / n_pts * 100, 1)
print(
    f"After exclusion, {n_pts} patients (age-range: {age_min}-{age_max}, "
    f"{females_num} ({pct_females}%) females, {males_num} ({pct_males}%) males, "
    f"{cu_num} ({pct_cu}%) CU and {mci_num} ({pct_mci}%) MCI) were considered for the modeling pipeline."
)
print(
    "Patients were further subdivided into subgroups according to the previously defined Fazekas score cutoff of <= 2."
)

# Subgroup sizes, each expressed as a share of its own diagnosis group
subgroup_report = [
    (cu_no_wmh, cu_num, "CU without WMH"),
    (cu_wmh, cu_num, "CU with WMH"),
    (mci_no_wmh, mci_num, "MCI without WMH"),
    (mci_wmh, mci_num, "MCI with WMH"),
]
subgroup_phrases = [
    f"{count} ({round(count / total * 100, 1)}%) {label}"
    for count, total, label in subgroup_report
]
print("Accordingly, there were: " + ", ".join(subgroup_phrases) + ",")

# Chi-squared test on the WMH (yes/no) x diagnosis contingency table
ct_wmh = pd.crosstab(df_petTOAD["WMH_bin"], df_petTOAD["Group"])
p_wmh = stats.chi2_contingency(ct_wmh)[1]
wmh_verdict = (
    "statistically significant" if p_wmh < 0.05 else "no statistically significant"
)
print(
    f"with {wmh_verdict} differences in frequency between subgroups (p = {round(p_wmh, 3)})"
)
# Age: quartiles per diagnosis and per subgroup, then Mann-Whitney U comparisons
age_summary = df_petTOAD.groupby("Group")["Age"].describe()
age_summary_subgroups = df_petTOAD.groupby("Group_bin_Fazekas")["Age"].describe()
print("################# Results #################")
print("# Age #")

# Age values of every diagnosis group / subgroup, keyed by its label
age_by_group = {
    dx: df_petTOAD.loc[df_petTOAD["Group"] == dx, "Age"] for dx in ("CN", "MCI")
}
age_by_subgroup = {
    sub: df_petTOAD.loc[df_petTOAD["Group_bin_Fazekas"] == sub, "Age"]
    for sub in ("CN_WMH", "CN_no_WMH", "MCI_WMH", "MCI_no_WMH")
}
pval_age_diff_cn_vs_mci = stats.mannwhitneyu(
    age_by_group["CN"], age_by_group["MCI"]
).pvalue
pval_age_diff_cn_no_wmh_vs_wmh = stats.mannwhitneyu(
    age_by_subgroup["CN_WMH"], age_by_subgroup["CN_no_WMH"]
).pvalue
pval_age_diff_mci_no_wmh_vs_wmh = stats.mannwhitneyu(
    age_by_subgroup["MCI_WMH"], age_by_subgroup["MCI_no_WMH"]
).pvalue

# Each entry: summary table, (printed label, row key) pairs, comparison name, p-value
age_report = (
    (age_summary, (("CU", "CN"), ("MCI", "MCI")), "CU vs MCI", pval_age_diff_cn_vs_mci),
    (
        age_summary_subgroups,
        (("CU no WMH", "CN_no_WMH"), ("CU WMH", "CN_WMH")),
        "CU no WMH vs CU WMH",
        pval_age_diff_cn_no_wmh_vs_wmh,
    ),
    (
        age_summary_subgroups,
        (("MCI no WMH", "MCI_no_WMH"), ("MCI WMH", "MCI_WMH")),
        "MCI no WMH vs MCI WMH",
        pval_age_diff_mci_no_wmh_vs_wmh,
    ),
)
for summary, rows, comparison, pval in age_report:
    for label, key in rows:
        print(
            f"Median age for {label} subjects = {summary['50%'][key]} "
            f"(IQR = {summary['25%'][key]} - {summary['75%'][key]})"
        )
    print(f"P-value age {comparison}: {round(pval, 5)}")


# WMH volume: quartiles per diagnosis and per subgroup, then Mann-Whitney U
# comparisons. P-values are printed unrounded.
print("# WMH #")
wmh_summary = df_petTOAD.groupby(["Group"])["WMH_load_subj_space"].describe()
wmh_summary_subgroups = df_petTOAD.groupby(["Group_bin_Fazekas"])[
    "WMH_load_subj_space"
].describe()

# WMH volumes of every diagnosis group / subgroup, keyed by its label
wmh_by_group = {
    dx: df_petTOAD.loc[df_petTOAD["Group"] == dx, "WMH_load_subj_space"]
    for dx in ("CN", "MCI")
}
wmh_by_subgroup = {
    sub: df_petTOAD.loc[df_petTOAD["Group_bin_Fazekas"] == sub, "WMH_load_subj_space"]
    for sub in ("CN_WMH", "CN_no_WMH", "MCI_WMH", "MCI_no_WMH")
}

pval_wmh_diff_cn_vs_mci = stats.mannwhitneyu(
    wmh_by_group["CN"], wmh_by_group["MCI"]
)[1]
pval_wmh_diff_cn_no_wmh_vs_wmh = stats.mannwhitneyu(
    wmh_by_subgroup["CN_WMH"], wmh_by_subgroup["CN_no_WMH"]
)[1]
pval_wmh_diff_mci_no_wmh_vs_wmh = stats.mannwhitneyu(
    wmh_by_subgroup["MCI_WMH"], wmh_by_subgroup["MCI_no_WMH"]
)[1]

# Each entry: summary table, (printed label, row key) pairs, comparison name, p-value
wmh_report = (
    (wmh_summary, (("CN", "CN"), ("MCI", "MCI")), "CU vs MCI", pval_wmh_diff_cn_vs_mci),
    (
        wmh_summary_subgroups,
        (("CN no WMH", "CN_no_WMH"), ("CN WMH", "CN_WMH")),
        "CU no WMH vs CU WMH",
        pval_wmh_diff_cn_no_wmh_vs_wmh,
    ),
    (
        wmh_summary_subgroups,
        (("MCI no WMH", "MCI_no_WMH"), ("MCI WMH", "MCI_WMH")),
        "MCI no WMH vs MCI WMH",
        pval_wmh_diff_mci_no_wmh_vs_wmh,
    ),
)
for summary, rows, comparison, pval in wmh_report:
    for label, key in rows:
        print(
            f"Median WMH volume load for {label} subjects = {summary['50%'][key]} "
            f"(IQR = {summary['25%'][key]} - {summary['75%'][key]})"
        )
    # The first of these labels used to read "volum"
    print(f"P-value WMH volume {comparison}: {pval}")


# log10 WMH volume: same report as the raw-volume section, on the log scale.
print("# WMH log#")
# Zero volumes are mapped to 0 on the log scale. They are replaced by 1 before
# taking the logarithm (log10(1) == 0) so that log10 is never evaluated at 0,
# which np.where(cond, 0, np.log10(x)) did and which raises a divide-by-zero
# RuntimeWarning.
wmh_load = df_petTOAD["WMH_load_subj_space"]
df_petTOAD["wmh_log"] = np.log10(wmh_load.where(wmh_load != 0, 1))
wmh_log_summary = df_petTOAD.groupby(["Group"])["wmh_log"].describe()
wmh_log_summary_subgroups = df_petTOAD.groupby(["Group_bin_Fazekas"])[
    "wmh_log"
].describe()

# log10 WMH volumes of every diagnosis group / subgroup, keyed by its label
wmh_log_by_group = {
    dx: df_petTOAD.loc[df_petTOAD["Group"] == dx, "wmh_log"] for dx in ("CN", "MCI")
}
wmh_log_by_subgroup = {
    sub: df_petTOAD.loc[df_petTOAD["Group_bin_Fazekas"] == sub, "wmh_log"]
    for sub in ("CN_WMH", "CN_no_WMH", "MCI_WMH", "MCI_no_WMH")
}

pval_wmh_log_diff_cn_vs_mci = stats.mannwhitneyu(
    wmh_log_by_group["CN"], wmh_log_by_group["MCI"]
)[1]
pval_wmh_log_diff_cn_no_wmh_vs_wmh = stats.mannwhitneyu(
    wmh_log_by_subgroup["CN_WMH"], wmh_log_by_subgroup["CN_no_WMH"]
)[1]
pval_wmh_log_diff_mci_no_wmh_vs_wmh = stats.mannwhitneyu(
    wmh_log_by_subgroup["MCI_WMH"], wmh_log_by_subgroup["MCI_no_WMH"]
)[1]

# Each entry: summary table, (printed label, row key) pairs, comparison name, p-value
wmh_log_report = (
    (
        wmh_log_summary,
        (("CN", "CN"), ("MCI", "MCI")),
        "CU vs MCI",
        pval_wmh_log_diff_cn_vs_mci,
    ),
    (
        wmh_log_summary_subgroups,
        (("CN no WMH", "CN_no_WMH"), ("CN WMH", "CN_WMH")),
        "CU no WMH vs CU WMH",
        pval_wmh_log_diff_cn_no_wmh_vs_wmh,
    ),
    (
        wmh_log_summary_subgroups,
        (("MCI no WMH", "MCI_no_WMH"), ("MCI WMH", "MCI_WMH")),
        "MCI no WMH vs MCI WMH",
        pval_wmh_log_diff_mci_no_wmh_vs_wmh,
    ),
)
for summary, rows, comparison, pval in wmh_log_report:
    for label, key in rows:
        # Labels say "log10" so this output cannot be mistaken for the raw
        # volumes printed by the previous section.
        print(
            f"Median log10 WMH volume load for {label} subjects = {summary['50%'][key]} "
            f"(IQR = {summary['25%'][key]} - {summary['75%'][key]})"
        )
    print(f"P-value log10 WMH volume {comparison}: {pval}")


# MMSE: quartiles per diagnosis and per subgroup, then Mann-Whitney U comparisons
print("# MMSE #")
mmse_summary = df_petTOAD.groupby("Group")["MMSE"].describe()
mmse_summary_subgroups = df_petTOAD.groupby("Group_bin_Fazekas")["MMSE"].describe()

# MMSE scores of every diagnosis group / subgroup, keyed by its label
mmse_by_group = {
    dx: df_petTOAD.loc[df_petTOAD["Group"] == dx, "MMSE"] for dx in ("CN", "MCI")
}
mmse_by_subgroup = {
    sub: df_petTOAD.loc[df_petTOAD["Group_bin_Fazekas"] == sub, "MMSE"]
    for sub in ("CN_WMH", "CN_no_WMH", "MCI_WMH", "MCI_no_WMH")
}

pval_mmse_diff_cn_vs_mci = stats.mannwhitneyu(
    mmse_by_group["CN"], mmse_by_group["MCI"]
).pvalue
pval_mmse_diff_cn_no_wmh_vs_wmh = stats.mannwhitneyu(
    mmse_by_subgroup["CN_WMH"], mmse_by_subgroup["CN_no_WMH"]
).pvalue
pval_mmse_diff_mci_no_wmh_vs_wmh = stats.mannwhitneyu(
    mmse_by_subgroup["MCI_WMH"], mmse_by_subgroup["MCI_no_WMH"]
).pvalue

# Each entry: summary table, (printed label, row key) pairs, comparison name, p-value
mmse_report = (
    (mmse_summary, (("CN", "CN"), ("MCI", "MCI")), "CU vs MCI", pval_mmse_diff_cn_vs_mci),
    (
        mmse_summary_subgroups,
        (("CN no WMH", "CN_no_WMH"), ("CN WMH", "CN_WMH")),
        "CU no WMH vs CU WMH",
        pval_mmse_diff_cn_no_wmh_vs_wmh,
    ),
    (
        mmse_summary_subgroups,
        (("MCI no WMH", "MCI_no_WMH"), ("MCI WMH", "MCI_WMH")),
        "MCI no WMH vs MCI WMH",
        pval_mmse_diff_mci_no_wmh_vs_wmh,
    ),
)
for summary, rows, comparison, pval in mmse_report:
    for label, key in rows:
        print(
            f"Median MMSE score for {label} subjects = {summary['50%'][key]} "
            f"(IQR = {summary['25%'][key]} - {summary['75%'][key]})"
        )
    print(f"P-value MMSE score {comparison}: {pval}")

# Education (PTEDUCAT column): quartiles per diagnosis and per subgroup, then
# Mann-Whitney U comparisons. P-values are printed unrounded.
print("# Education #")
educ_summary = df_petTOAD.groupby(["Group"])["PTEDUCAT"].describe()
educ_summary_subgroups = df_petTOAD.groupby(["Group_bin_Fazekas"])[
    "PTEDUCAT"
].describe()

# Education values of every diagnosis group / subgroup, keyed by its label
educ_by_group = {
    dx: df_petTOAD.loc[df_petTOAD["Group"] == dx, "PTEDUCAT"] for dx in ("CN", "MCI")
}
educ_by_subgroup = {
    sub: df_petTOAD.loc[df_petTOAD["Group_bin_Fazekas"] == sub, "PTEDUCAT"]
    for sub in ("CN_WMH", "CN_no_WMH", "MCI_WMH", "MCI_no_WMH")
}

pval_educ_diff_cn_vs_mci = stats.mannwhitneyu(
    educ_by_group["CN"], educ_by_group["MCI"]
)[1]
pval_educ_diff_cn_no_wmh_vs_wmh = stats.mannwhitneyu(
    educ_by_subgroup["CN_WMH"], educ_by_subgroup["CN_no_WMH"]
)[1]
pval_educ_diff_mci_no_wmh_vs_wmh = stats.mannwhitneyu(
    educ_by_subgroup["MCI_WMH"], educ_by_subgroup["MCI_no_WMH"]
)[1]

# Each entry: summary table, (printed label, row key) pairs, comparison name, p-value
educ_report = (
    (educ_summary, (("CN", "CN"), ("MCI", "MCI")), "CU vs MCI", pval_educ_diff_cn_vs_mci),
    (
        educ_summary_subgroups,
        (("CN no WMH", "CN_no_WMH"), ("CN WMH", "CN_WMH")),
        "CU no WMH vs CU WMH",
        pval_educ_diff_cn_no_wmh_vs_wmh,
    ),
    (
        educ_summary_subgroups,
        (("MCI no WMH", "MCI_no_WMH"), ("MCI WMH", "MCI_WMH")),
        "MCI no WMH vs MCI WMH",
        pval_educ_diff_mci_no_wmh_vs_wmh,
    ),
)
for summary, rows, comparison, pval in educ_report:
    for label, key in rows:
        # These lines used to be labelled "MMSE score" (copy-paste from the
        # MMSE section) although they report the education column.
        print(
            f"Median education (years) for {label} subjects = {summary['50%'][key]} "
            f"(IQR = {summary['25%'][key]} - {summary['75%'][key]})"
        )
    print(f"P-value education (years) {comparison}: {pval}")


# Sex distribution: chi-squared tests on sex x diagnosis and, within each
# diagnosis, on sex x WMH subgroup.
print("# Sex #")
# Sex x diagnosis contingency table
ct_sex = pd.crosstab(df_petTOAD["Sex"], df_petTOAD["Group"])
c_sex, p_sex, _, _ = stats.chi2_contingency(ct_sex)
if p_sex < 0.05:
    print(
        f"MCI and CU showed statistically significant sex differences, p = {round(p_sex, 3)}"
    )
else:
    print(f"MCI and CU showed no sex groups differences, p = {round(p_sex,3)}")

# Sex x subgroup contingency table. The per-diagnosis 2x2 tables are selected
# by label rather than by position, so a missing or re-ordered label raises a
# KeyError instead of silently testing the wrong columns.
ct_sex_wmh = pd.crosstab(df_petTOAD["Sex"], df_petTOAD["Group_bin_Fazekas"])
ct_sex_cn = ct_sex_wmh.loc[["F", "M"], ["CN_WMH", "CN_no_WMH"]]
c_sex_cn, p_sex_cn, _, _ = stats.chi2_contingency(ct_sex_cn)
if p_sex_cn < 0.05:
    print(
        f"CU WMH and CU no WMH showed statistically significant sex differences, p = {round(p_sex_cn, 3)}"
    )
else:
    print(
        f"CU WMH and CU no WMH showed no sex groups differences, p = {round(p_sex_cn,3)}"
    )

ct_sex_mci = ct_sex_wmh.loc[["F", "M"], ["MCI_WMH", "MCI_no_WMH"]]
c_sex_mci, p_sex_mci, _, _ = stats.chi2_contingency(ct_sex_mci)
if p_sex_mci < 0.05:
    # This message used to name the CU subgroups although it reports the MCI test
    print(
        f"MCI WMH and MCI no WMH showed statistically significant sex differences, p = {round(p_sex_mci, 3)}"
    )
else:
    print(
        f"MCI WMH and MCI no WMH showed no sex groups differences, p = {round(p_sex_mci,3)}"
    )

After exclusion, 188 patients (age-range: 56.0-90.0, 101 (53.72%) females, 87 (46.28%) males, 120 (63.8%) CU and 68 (36.2%) MCI) were considered for the modeling pipeline.
Patients were further subdivided into subgroups according to the previously defined Fazekas score cutoff of <= 2.
Accordingly, there were: 63 (52.5%) CU without WMH, 57 (47.5%) CU with WMH, 37 (54.4%) MCI without WMH, 31 (45.6%) MCI with WMH,
with no statistically significant differences in frequency between subgroups (p = 0.92)
################# Results #################
# Age #
Median age for CU subjects = 69.5 (IQR = 66.0 - 76.0)
Median age for MCI subjects = 72.0 (IQR = 68.75 - 79.0)
P-value age CU vs MCI: 0.03996
Median age for CU no WMH subjects = 68.0 (IQR = 66.0 - 71.5)
Median age for CU WMH subjects = 73.0 (IQR = 68.0 - 77.0)
P-value age CU no WMH vs CU WMH: 0.00055
Median age for MCI no WMH subjects = 71.0 (IQR = 65.0 - 76.0)
Median age for MCI WMH subjects = 76.0 (IQR = 71.5 - 79.5)
P-value age MCI no WMH 

In [12]:
# Table 1 (CU vs MCI): one entry per characteristic, cells = [CU, MCI, p-value].
# Contingency tables are read by label ("F"/"M", "WMH"/"no_WMH", "CN"/"MCI")
# instead of by position, so a row such as "Woman" can never silently show the
# other category if the label sort order changes.
dx_sizes = {"CN": cu_num, "MCI": mci_num}


def _count_pct(crosstab, row_label):
    """Return ["n (pct%)" for CU, "n (pct%)" for MCI, ""] for one crosstab row.

    Percentages are relative to the size of each diagnosis group; the last
    cell is the (empty) p-value column.
    """
    cells = [
        f"{crosstab.loc[row_label, dx]} ({round(crosstab.loc[row_label, dx] / total * 100, 1)}%)"
        for dx, total in dx_sizes.items()
    ]
    return cells + [""]


summary_data = {
    "Age Median (IQR)": [
        *(
            f"{age_summary['50%'][dx]} ({age_summary['25%'][dx]} - {age_summary['75%'][dx]})"
            for dx in dx_sizes
        ),
        f"{round(pval_age_diff_cn_vs_mci, 4)}",
    ],
    "Sex n (%)": ["", "", round(p_sex, 3)],
    "Woman": _count_pct(ct_sex, "F"),
    "Man": _count_pct(ct_sex, "M"),
    "WMH volume (mm^3)": [
        *(
            f"{int(wmh_summary['50%'][dx])} ({int(wmh_summary['25%'][dx])}-{int(wmh_summary['75%'][dx])})"
            for dx in dx_sizes
        ),
        f"{round(pval_wmh_diff_cn_vs_mci, 3)}",
    ],
    "WMH volume (log)": [
        *(
            f"{wmh_log_summary['50%'][dx]:.2f} ({wmh_log_summary['25%'][dx]:.2f}-{wmh_log_summary['75%'][dx]:.2f})"
            for dx in dx_sizes
        ),
        f"{round(pval_wmh_log_diff_cn_vs_mci, 3)}",
    ],
    "Fazekas score binned n (%)": ["", "", round(p_wmh, 3)],
    "WMH": _count_pct(ct_wmh, "WMH"),
    "no WMH": _count_pct(ct_wmh, "no_WMH"),
}

# One column per characteristic; transposed below so characteristics become rows
df_summary = pd.DataFrame(data=summary_data)

df_table1_bis = df_summary.T
df_table1_bis.columns = [f"CU, n = {cu_num}", f"MCI, n = {mci_num}", "p-value"]
#df_table1_bis.to_csv(RES_DIR / "table_1_group_together.csv")
df_table1_bis

Unnamed: 0,"CU, n = 120","MCI, n = 68",p-value
Age Median (IQR),69.5 (66.0 - 76.0),72.0 (68.75 - 79.0),0.04
Sex n (%),,,0.014
Woman,73 (60.8%),28 (41.2%),
Man,47 (39.2%),40 (58.8%),
WMH volume (mm^3),4686 (2054-11644),4524 (2294-12377),0.629
WMH volume (log),3.67 (3.31-4.07),3.66 (3.36-4.09),0.629
Fazekas score binned n (%),,,0.92
WMH,57 (47.5%),31 (45.6%),
no WMH,63 (52.5%),37 (54.4%),


In [31]:
def format_pval(pval):
    """Format a p-value for a table cell.

    Values below 0.001 are reported as the string "< 0.001"; every other
    value is rendered with exactly three decimals.
    """
    if pval < 0.001:
        return "< 0.001"
    return format(pval, ".3f")


# Table 1 by subgroup. For CU and then for MCI the columns are
# [no WMH, WMH, p-value of the no-WMH vs WMH comparison].
subgroup_pairs = [("CN_no_WMH", "CN_WMH"), ("MCI_no_WMH", "MCI_WMH")]


def _median_iqr_cells(summary, pvals, template, cast=None):
    """Return the six table cells of one continuous characteristic.

    summary:  describe() frame indexed by subgroup label.
    pvals:    (CU p-value, MCI p-value) of the no-WMH vs WMH comparison.
    template: str.format template using the fields {med}, {q1} and {q3}.
    cast:     optional callable applied to the three quartiles before
              formatting (e.g. int to truncate to whole numbers).
    """
    cells = []
    for pair, pval in zip(subgroup_pairs, pvals):
        for key in pair:
            quartiles = {
                "med": summary["50%"][key],
                "q1": summary["25%"][key],
                "q3": summary["75%"][key],
            }
            if cast is not None:
                quartiles = {name: cast(value) for name, value in quartiles.items()}
            cells.append(template.format(**quartiles))
        cells.append(format_pval(pval))
    return cells


def _sex_count_cells(sex):
    """Return "n (pct%)" cells for one sex; p-value columns are left blank.

    Percentages are relative to the size of each subgroup (column sum of the
    corresponding sex x subgroup contingency table).
    """
    cells = []
    for pair, crosstab in zip(subgroup_pairs, (ct_sex_cn, ct_sex_mci)):
        for key in pair:
            count = crosstab[key][sex]
            cells.append(f"{count} ({round(count / crosstab[key].sum() * 100, 1)}%)")
        cells.append(" ")
    return cells


summary_data = {
    "Age Median (IQR)": _median_iqr_cells(
        age_summary_subgroups,
        (pval_age_diff_cn_no_wmh_vs_wmh, pval_age_diff_mci_no_wmh_vs_wmh),
        "{med} ({q1} - {q3})",
    ),
    # Both p-values go through format_pval: the MCI one used to be
    # round(p_sex_mci) with no digits argument, i.e. rounded to an integer.
    "Sex n (%)": ["", "", format_pval(p_sex_cn), "", "", format_pval(p_sex_mci)],
    "Woman": _sex_count_cells("F"),
    "Man": _sex_count_cells("M"),
    "WMH volume (mm^3)": _median_iqr_cells(
        wmh_summary_subgroups,
        (pval_wmh_diff_cn_no_wmh_vs_wmh, pval_wmh_diff_mci_no_wmh_vs_wmh),
        "{med} ({q1}-{q3})",
        cast=int,
    ),
    # The CU p-value of this row used to be taken from the raw-volume test;
    # both now come from the tests run on the log-transformed values.
    "WMH volume (log)": _median_iqr_cells(
        wmh_log_summary_subgroups,
        (pval_wmh_log_diff_cn_no_wmh_vs_wmh, pval_wmh_log_diff_mci_no_wmh_vs_wmh),
        "{med:.2f} ({q1:.2f}-{q3:.2f})",
    ),
    "MMSE": _median_iqr_cells(
        mmse_summary_subgroups,
        (pval_mmse_diff_cn_no_wmh_vs_wmh, pval_mmse_diff_mci_no_wmh_vs_wmh),
        "{med} ({q1}-{q3})",
        cast=int,
    ),
    "Education (yrs)": _median_iqr_cells(
        educ_summary_subgroups,
        (pval_educ_diff_cn_no_wmh_vs_wmh, pval_educ_diff_mci_no_wmh_vs_wmh),
        "{med} ({q1}-{q3})",
        cast=int,
    ),
}

# One column per characteristic; transposed below so characteristics become rows
df_summary = pd.DataFrame(data=summary_data)

df_table1 = df_summary.T
df_table1.columns = [
    f"CU no WMH (n = {cu_no_wmh})",
    f"CU WMH (n = {cu_wmh})",
    "p",
    f"MCI no WMH (n = {mci_no_wmh})",
    f"MCI WMH (n = {mci_wmh})",
    "p",
]
#df_table1.to_csv(TBL_DIR / "table_1.csv")
df_table1

Unnamed: 0,CU no WMH (n = 63),CU WMH (n = 57),p,MCI no WMH (n = 37),MCI WMH (n = 31),p.1
Age Median (IQR),68.0 (66.0 - 71.5),73.0 (68.0 - 77.0),< 0.001,71.0 (65.0 - 76.0),76.0 (71.5 - 79.5),0.003
Sex n (%),,,0.494,,,1
Woman,36 (57.1%),37 (64.9%),,15 (40.5%),13 (41.9%),
Man,27 (42.9%),20 (35.1%),,22 (59.5%),18 (58.1%),
WMH volume (mm^3),2083 (912-3446),12202 (6950-23818),< 0.001,2456 (1629-4416),13519 (6451-24825),< 0.001
WMH volume (log),3.32 (2.96-3.54),4.09 (3.84-4.38),< 0.001,3.39 (3.21-3.65),4.13 (3.81-4.39),< 0.001
MMSE,29 (29-30),29 (29-30),0.468,29 (28-29),28 (27-29),0.519
Education (yrs),18 (16-18),16 (14-18),0.057,16 (15-18),16 (15-18),0.960


In [14]:
# Collect the best-fitting model values of every subgroup in one table.
RES_GROUP_DIR = RES_DIR / "final_simulations"
df_best_val_cn_wmh = pd.read_csv(RES_GROUP_DIR / "CN_WMH" / "group-CN_WMH_data-empirical_model-all_desc-df-best-values-and-indices.csv", index_col = 0)
df_best_val_mci_wmh = pd.read_csv(RES_GROUP_DIR / "MCI_WMH" / "group-MCI_WMH_data-empirical_model-all_desc-df-best-values-and-indices.csv", index_col = 0)
df_best_val_cn_no_wmh = pd.read_csv(RES_GROUP_DIR / "group-CN-no-WMH_desc-best-G.csv", index_col = 0)

# Keep rows 1 and 3 of the side-by-side WMH tables. The explicit copy makes the
# result an independent frame, so adding columns below does not trigger
# pandas' SettingWithCopyWarning.
rows_to_keep = [1, 3]
df_best_val = pd.concat([df_best_val_cn_wmh, df_best_val_mci_wmh], axis = 1).iloc[rows_to_keep, :].copy()

# NOTE(review): both no-WMH columns take their first value from the CN no-WMH
# file ("K_gl") and a hard-coded -0.02 as second value. Presumably the MCI
# no-WMH group intentionally shares the CN no-WMH baseline; confirm.
df_best_val["CN_no_WMH"] = [df_best_val_cn_no_wmh["K_gl"].values[0], -0.02]
df_best_val["MCI_no_WMH"] = [df_best_val_cn_no_wmh["K_gl"].values[0], -0.02]

In [15]:
# Display the assembled table of best-fit values for visual inspection
df_best_val

Unnamed: 0,CN_WMH,MCI_WMH,CN_no_WMH,MCI_no_WMH
best_G_model-homogeneous_G,1.77,1.62,1.98,1.98
best_a_model-homogeneous_a,-0.021,-0.026,-0.02,-0.02


In [16]:
# from itertools import combinations

# def results_combo(df, all_pairs, rand=False):
#     # Create a dictionary to store the results
#     results = {}
#     # Loop through the pairs and perform wilcoxon test to see if model performance is different among all possible pairs
#     for pair in all_pairs:
#         group1 = df[df["Model_type"] == pair[0]]
#         group2 = df[df["Model_type"] == pair[1]]
#         # Note: Reported as Wilcoxon T test since Scipy.stats.wilcoxon() method reports the T value and not the W value
#         t_statistic_comp_score, p_value_comp_score = stats.wilcoxon(
#             group1["comp_score"], group2["comp_score"]
#         )
#         # Save everything in a results dictionary
#         results[f"{pair[0]} vs {pair[1]}"] = {
#             "t_statistic_comp_score": t_statistic_comp_score,
#             "p_value_comp_score": p_value_comp_score,
#         }

#     return results

# # Use itertools.combinations to create a combination of all model types (same for CN and MCI)
# model_pairs = list(combinations(df_max_comp_score_CN["Model_type"].unique(), 2))

# # Perform all the comparisons both for CU with WMH and for MCI with WMH for non-random vs. baseline and vs. one another...
# results_pairs_cn = results_combo(df_max_comp_score_CN, model_pairs, rand=False)
# results_pairs_mci = results_combo(df_max_comp_score_mci, model_pairs, rand=False)
# # Store everything in dataframes
# df_results_pairs_cn = pd.DataFrame().from_dict(results_pairs_cn).T
# df_results_pairs_mci = pd.DataFrame().from_dict(results_pairs_mci).T

# # Now we create a summary dataset with quartiles and median for the different observables for CU WMH
# df_describe_model_fit_cn_wmh = round(
#     df_max_comp_score_CN.groupby("Model_type").describe()[
#         [
#             ("comp_score", "25%"),
#             ("comp_score", "50%"),
#             ("comp_score", "75%"),
#         ]
#     ],
#     3,
# )
# # ... and for MCI WMH
# df_describe_model_fit_mci_wmh = round(
#     df_max_comp_score_mci.groupby("Model_type").describe()[
#         [
#             ("comp_score", "25%"),
#             ("comp_score", "50%"),
#             ("comp_score", "75%"),
#         ]
#     ],
#     3,
# )

# # This is just to create a subIndex with the group name for better visualization
# df_describe_model_fit_cn_wmh_nicer = pd.concat(
#     {"CU with WMH": df_describe_model_fit_cn_wmh}, names=["Group"]
# )
# df_describe_model_fit_mci_wmh_nicer = pd.concat(
#     {"MCI with WMH": df_describe_model_fit_mci_wmh}, names=["Group"]
# )
# df_describe_model_fit_together = pd.concat(
#     [df_describe_model_fit_cn_wmh_nicer, df_describe_model_fit_mci_wmh_nicer]
# )

# # Here we are going to summarise in just one column median (IQR) for all observables...
# df_describe_model_fit_together[("GFS", "Median (IQR)")] = (
#     df_describe_model_fit_together[("comp_score", "50%")].astype(str)
#     + " ("
#     + df_describe_model_fit_together[("comp_score", "25%")].astype(str)
#     + "-"
#     + df_describe_model_fit_together[("comp_score", "75%")].astype(str)
#     + ")"
# )

# # ... and we get rid of previous columns, as they are now redundant
# df_describe_model_fit_final= df_describe_model_fit_together.iloc[:, -1:]

# # Now we want to add to the table all the p-values of the comparisons between model fits that were previously calculated
# # It is a bit tedious now, as I set them one by one, probably there is a faster way?
# # We populate the table twice with the same numbers (col, row) and (row, col), but I feel it's more understandable
# # compared to just leaving a lot of empty cells...
# # We also change the names of the columns, to be in line with the observables described in the paper
# list_obs = [("p-value GFS", "p_value_comp_score"),
#             ]

# # For different group we need to feed from the specific group dataset
# group_dfs = [("CU with WMH", df_results_pairs_cn),
#             ("MCI with WMH", df_results_pairs_mci),]

# for obs in list_obs:
#     # We first create empty columns
#     df_describe_model_fit_final[(obs[0], "vs. Base")] = np.nan
#     #df_describe_model_fit_final[(obs[0], "vs. Disconn.")] = np.nan
#     df_describe_model_fit_final[(obs[0], "vs. G-weight")] = np.nan
#     df_describe_model_fit_final[(obs[0], "vs. Hetero.")] = np.nan
#     df_describe_model_fit_final[(obs[0], "vs. Homo.")] = np.nan

#     for df in group_dfs:
#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Disconn.")], [(obs[0], "vs. Base")]
#         # ] = df[1].loc["Base vs Disconn.", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "G-weight")], [(obs[0], "vs. Base")]
#         ] = df[1].loc["Base vs G-weight", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Hetero.")], [(obs[0], "vs. Base")]
#         ] = df[1].loc["Base vs Hetero.", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Homo.")], [(obs[0], "vs. Base")]
#         ] = df[1].loc["Base vs Homo.", obs[1]]

#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Base")], [(obs[0], "vs. Disconn.")]
#         # ] = df[1].loc["Base vs Disconn.", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Base")], [(obs[0], "vs. G-weight")]
#         ] = df[1].loc["Base vs G-weight", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Base")], [(obs[0], "vs. Hetero.")]
#         ] = df[1].loc["Base vs Hetero.", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Base")], [(obs[0], "vs. Homo.")]
#         ] = df[1].loc["Base vs Homo.", obs[1]]

#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "G-weight")], [(obs[0], "vs. Disconn.")]
#         # ] = df[1].loc["G-weight vs Disconn.", obs[1]]
#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Hetero.")], [(obs[0], "vs. Disconn.")]
#         # ] = df[1].loc["Hetero. vs Disconn.", obs[1]]
#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Homo.")], [(obs[0], "vs. Disconn.")]
#         # ] = df[1].loc["Homo. vs Disconn.", obs[1]]

#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Disconn.")], [(obs[0], "vs. G-weight")]
#         # ] = df[1].loc["G-weight vs Disconn.", obs[1]]
#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Disconn.")], [(obs[0], "vs. Hetero.")]
#         # ] = df[1].loc["Hetero. vs Disconn.", obs[1]]
#         # df_describe_model_fit_final.loc[
#         #     [(df[0], "Disconn.")], [(obs[0], "vs. Homo.")]
#         # ] = df[1].loc["Homo. vs Disconn.", obs[1]]

#         df_describe_model_fit_final.loc[
#             [(df[0], "Hetero.")], [(obs[0], "vs. G-weight")]
#         ] = df[1].loc["Hetero. vs G-weight", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "Homo.")], [(obs[0], "vs. G-weight")]
#         ] = df[1].loc["Homo. vs G-weight", obs[1]]

#         df_describe_model_fit_final.loc[
#             [(df[0], "G-weight")], [(obs[0], "vs. Hetero.")]
#         ] = df[1].loc["Hetero. vs G-weight", obs[1]]
#         df_describe_model_fit_final.loc[
#             [(df[0], "G-weight")], [(obs[0], "vs. Homo.")]
#         ] = df[1].loc["Homo. vs G-weight", obs[1]]

#         df_describe_model_fit_final.loc[
#             [(df[0], "Homo.")], [(obs[0], "vs. Hetero.")]
#         ] = df[1].loc["Homo. vs Hetero.", obs[1]]

#         df_describe_model_fit_final.loc[
#             [(df[0], "Hetero.")], [(obs[0], "vs. Homo.")]
#         ] = df[1].loc["Homo. vs Hetero.", obs[1]]


# # Better formatting
# all_cols = df_describe_model_fit_final.columns.to_list()
# num = df_describe_model_fit_final._get_numeric_data()
# num_cols = num.columns.to_list()
# non_num_cols = [col for col in all_cols if col not in num_cols]
# num[num.astype(float) < 0.001] = "< 0.001"
# df_newly_formatted = pd.concat([num, df_describe_model_fit_final[non_num_cols]], axis = 1)
# new_cols = df_newly_formatted.columns.to_list()
# new_cols_ordered = new_cols[-1:] + new_cols[:-1]
# suppl_table1 = df_newly_formatted[new_cols_ordered]
# suppl_table1.to_csv(RES_DIR / "table_compare_all_models.csv")
# suppl_table1