In [1]:
import pandas as pd
import numpy as np

In [2]:
df =  pd.read_excel('../Data/SAGs.xlsx')

In [3]:
df['demo_sex'].value_counts(normalize=True) * 100

demo_sex
F    67.418033
M    32.581967
Name: proportion, dtype: float64

In [4]:
age = df['demo_age']

age.mean(), age.std(), age.min(), age.max()

(65.23599726775956, 10.847577790993858, 18, 88)

In [5]:
# ============================================================
# Sample by country
# ============================================================

total_by_diag = (
       df
      .groupby(["Country", "clinical_diagnosis"])
      .size()
      .unstack(fill_value=0)
)

order = ["CN", "DCL", "AD", "FTD", "FTD-L"]

total_by_diag = total_by_diag.reindex(columns=order, fill_value=0)

print(total_by_diag)


clinical_diagnosis   CN  DCL   AD  FTD  FTD-L
Country                                      
Argentina            77    0  104   11      3
Chile               158    2  101   29      9
Colombia            403   22  558  100     44
Mexico              353    0  129   23      4
Peru                513    0  176   92     17


In [6]:
# ============================================================
# ANOVA: age ~ group 
# ============================================================
import statsmodels.api as sm


model = ols("demo_age ~ C(clinical_diagnosis)", data=df).fit()
anova_tbl = sm.stats.anova_lm(model, typ=2)

ss_between = float(anova_tbl.loc["C(clinical_diagnosis)", "sum_sq"])
ss_within  = float(anova_tbl.loc["Residual", "sum_sq"])
ss_total   = ss_between + ss_within

df_between = int(anova_tbl.loc["C(clinical_diagnosis)", "df"])
df_within  = int(anova_tbl.loc["Residual", "df"])

F_value = float(anova_tbl.loc["C(clinical_diagnosis)", "F"])
p_value = float(anova_tbl.loc["C(clinical_diagnosis)", "PR(>F)"])
eta_sq = ss_between / ss_total

print(f"\nANOVA Age: F({df_between}, {df_within}) = {F_value:.2f}, p = {p_value:.3g}, eta^2 = {eta_sq:.3f}")

# Summary like the table
age_summary = df.groupby("clinical_diagnosis")["demo_age"].agg(["count", "mean", "std"])
print("\nAge summary:")
print(age_summary)

NameError: name 'ols' is not defined

In [None]:
# ============================================================
# Chi-square: Sex distribution across groups
# ============================================================
from scipy import stats

ct = pd.crosstab(df["clinical_diagnosis"], df["demo_sex"])
chi2, p, dof, expected = stats.chi2_contingency(ct)
N = int(ct.to_numpy().sum())

print("\nSex counts table:")
print(ct)
print(f"\nChi-square Sex: chi2({dof}, N={N}) = {chi2:.2f}, p = {p:.3g}")