# Descriptive statistics for dataset: demographics, scores, tasks

In [105]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from config.constants import DATA_DIRECTORY, GIT_DIRECTORY, ID_COL, SCORES, TASKS
from feature_extraction.helpers import load_audio_file, infer_wav_duration_seconds
from additional_analyses.descriptive_plots import plot_score_distributions

# Input tables, stored under <GIT_DIRECTORY>/data.
demo_path = os.path.join(GIT_DIRECTORY, "data", "demographics_data.csv")
scores_path = os.path.join(GIT_DIRECTORY, "data", "language_scores_all_subjects.csv")

# One feature CSV per task, named after the task.
task_paths = {
    task: os.path.join(GIT_DIRECTORY, "results", "features", f"{task}.csv")
    for task in TASKS
}

# Picture-description feature CSVs, keyed by variant label.
# Joined component-wise (like task_paths) instead of embedding "/" in a
# single string, so every path in this cell is built the same way.
picture_description_paths = {
    label: os.path.join(GIT_DIRECTORY, "results", "features", filename)
    for label, filename in (
        ("1min", "picture_description_1min.csv"),
        ("2min", "picture_description_2min.csv"),
        ("full", "picture_description.csv"),
    )
}

# load data
demographics = pd.read_csv(demo_path)
scores = pd.read_csv(scores_path)

# output directory for all descriptive tables/plots (created if missing)
save_dir = os.path.join(GIT_DIRECTORY, "results", "descriptives")
os.makedirs(save_dir, exist_ok=True)

## Demographic Variables

In [106]:
# create demographics-csv (disabled; uncomment to regenerate data/demographics_data.csv from each subject's submission.csv)

#demographic_rows = {
#    "Age": 4,
#    "Gender": 5,
#    "Education": 6,
#    "Language": 7,
#    "Country": 8,
#    "Socioeconomic": 9,
#}

#valid_subjects = sorted(
#    [
#        name for name in os.listdir(DATA_DIRECTORY)
#        if os.path.isdir(os.path.join(DATA_DIRECTORY, name)) and name.isdigit()
#    ],
#    key=lambda x: int(x),
#)

#rows = []

#for subject_id in valid_subjects:
#    subject_folder = os.path.join(DATA_DIRECTORY, subject_id)
#    submission_file = os.path.join(subject_folder, "submission.csv")
#    if os.path.isfile(submission_file):
#        try:
#            data = pd.read_csv(submission_file, header=None)
#            row = {ID_COL: subject_id}
#            for col_name, row_idx in demographic_rows.items():
#                try:
#                    row[col_name] = data.iloc[row_idx, 1]
#                except Exception:
#                    row[col_name] = None
#            rows.append(row)
#        except Exception as e:
#            print(f"[WARNING] Failed to read demographics from {submission_file}: {e}")
#demo_df = pd.DataFrame(rows)

# save CSV
#out_path = os.path.join(GIT_DIRECTORY, "data", "demographics_data.csv")
#os.makedirs(os.path.dirname(out_path), exist_ok=True)
#demo_df.to_csv(out_path, index=False)

#print(f"saved demographics data to:\n{out_path}")


In [107]:
# Age descriptives: valid N, mean, SD and observed range

# entries that cannot be parsed as numbers become NaN and are excluded below
age_values = pd.to_numeric(demographics["Age"], errors="coerce")

age_summary = {
    "Variable": "Age",
    "N": age_values.notna().sum(),
    "M": age_values.mean(),
    "SD": age_values.std(),
    "youngest": age_values.min(),
    "oldest": age_values.max(),
}
age_table = pd.DataFrame([age_summary])

age_table

Unnamed: 0,Variable,N,M,SD,youngest,oldest
0,Age,1003,65.464606,4.808041,58.0,89.0


In [108]:
# SES: M & SD

# entries that cannot be parsed as numbers become NaN and are excluded below
ses_values = pd.to_numeric(demographics["Socioeconomic"], errors="coerce")

ses_summary = {
    "Variable": "Socioeconomic Status",
    "N": ses_values.notna().sum(),
    "M": ses_values.mean(),
    "SD": ses_values.std(),
}
ses_table = pd.DataFrame([ses_summary])

ses_table

Unnamed: 0,Variable,N,M,SD
0,Socioeconomic Status,998,5.582164,1.636704


In [109]:
# Gender: n & % (missing values are kept as their own category)

gender_counts = demographics["Gender"].value_counts(dropna=False)
total = gender_counts.sum()

# turn the counts into a two-column frame, then add each category's share
gender_table = gender_counts.rename_axis("Gender").reset_index(name="n")
gender_table["percent"] = (gender_table["n"] / total * 100).round(2)

gender_table

Unnamed: 0,Gender,n,percent
0,f,610,60.82
1,m,386,38.48
2,other,5,0.5
3,no_answer,2,0.2


In [110]:
# Country: n & % (missing values are kept as their own category)

country_counts = demographics["Country"].value_counts(dropna=False)
total = country_counts.sum()

# turn the counts into a two-column frame, then add each category's share
country_table = country_counts.rename_axis("Country").reset_index(name="n")
country_table["percent"] = (country_table["n"] / total * 100).round(2)

country_table

Unnamed: 0,Country,n,percent
0,uk,501,49.95
1,usa,501,49.95
2,no_answer,1,0.1


In [111]:
# Language: n & % (missing values are kept as their own category)

language_counts = demographics["Language"].value_counts(dropna=False)
total = language_counts.sum()

# turn the counts into a two-column frame, then add each category's share
language_table = language_counts.rename_axis("Language").reset_index(name="n")
language_table["percent"] = (language_table["n"] / total * 100).round(2)

language_table

Unnamed: 0,Language,n,percent
0,english_british,498,49.65
1,english_american,492,49.05
2,english_other,13,1.3


In [112]:
# Education: n & % (missing values are kept as their own category)

edu_counts = demographics["Education"].value_counts(dropna=False)
total = edu_counts.sum()

# turn the counts into a two-column frame, then add each category's share
education_table = edu_counts.rename_axis("Education").reset_index(name="n")
education_table["percent"] = (education_table["n"] / total * 100).round(2)

education_table

Unnamed: 0,Education,n,percent
0,bachelor,389,38.78
1,high_school,257,25.62
2,master,158,15.75
3,vocational,117,11.67
4,phd,54,5.38
5,less_than_highschool,24,2.39
6,no_answer,4,0.4


### plot demographics distributions

In [113]:
# plot age distribution (bar chart: one bar per age in years)

# ensure Age is numeric; unparseable entries become NaN and drop out of the counts
# NOTE: this overwrites the Age column of the shared `demographics` frame
demographics["Age"] = pd.to_numeric(demographics["Age"], errors="coerce")

# count number of participants per age, ordered by age
age_counts = demographics["Age"].value_counts().sort_index()

ages = age_counts.index.astype(int)  # x axis
counts = age_counts.values  # y axis

# barplot
fig, ax = plt.subplots(figsize=(10, 6))

# draw bars first (so y-limits & ticks are set)
ax.bar(
    ages,
    counts,
    color="skyblue",
    edgecolor="black",
    width=1.0,
    align="center",
    zorder=2,  # bars above reference lines
)

# now draw faint horizontal lines at tick positions
ax.grid(False)
for y in ax.get_yticks():
    ax.hlines(
        y,
        xmin=min(ages) - 0.5,
        xmax=max(ages) + 0.5,
        colors="lightgray",
        linestyles="--",
        linewidth=0.5,
        alpha=0.7,
        zorder=1,  # behind the bars
    )

ax.set_axisbelow(True)

# labels
ax.set_xlabel("Age", fontsize=18)
ax.set_ylabel("Number of Participants", fontsize=18)
ax.tick_params(axis="x", labelsize=16)
ax.tick_params(axis="y", labelsize=16)

# spines: thin grey left/bottom, no top/right
for side in ("left", "bottom"):
    ax.spines[side].set_color("#cccccc")
    ax.spines[side].set_linewidth(0.5)
    ax.spines[side].set_visible(True)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

# save plot
out_path = os.path.join(save_dir, "age_distribution.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
# report the written file (not just its directory), like the other plot cells
print(f"saved age distribution to:\n{out_path}")


saved age distribution to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives


In [114]:
# plot country distribution (pie chart, UK vs. USA only)

# normalize country strings
country_clean = demographics["Country"].astype(str).str.strip().str.lower()

# keep only UK + USA
mask = country_clean.isin(["uk", "usa"])
country_filtered = country_clean[mask]

# counts
country_counts = country_filtered.value_counts()

# display label and wedge colour per country; both are looked up by country
# so that a change in count order (e.g. a tie) cannot swap the colours
label_mapping = {"uk": "UK", "usa": "USA"}
color_mapping = {"uk": "tomato", "usa": "royalblue"}

labels = [label_mapping[c] for c in country_counts.index]
colors = [color_mapping[c] for c in country_counts.index]
sizes = country_counts.values.tolist()

plt.figure(figsize=(6, 6))
wedges, texts, autotexts = plt.pie(
    sizes,
    labels=labels,
    autopct="%1.1f%%",
    startangle=90,
    colors=colors,
)

plt.setp(texts, fontsize=20)
plt.setp(autotexts, fontsize=20)

out_path = os.path.join(save_dir, "country_distribution.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
print(f"saved country distribution to:\n{out_path}")

saved country distribution to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives/country_distribution.png


In [115]:
# plot gender distribution (pie chart, male vs. female only)

# count occurrences of 'm' and 'f'; other categories are not shown, so the
# percentages in the pie are relative to m + f
gender_counts = demographics["Gender"].value_counts()

# labels
labels = ["Male", "Female"]
sizes = [gender_counts.get("m", 0), gender_counts.get("f", 0)]

# pieplot
plt.figure(figsize=(6, 6))
wedges, texts, autotexts = plt.pie(
    sizes, labels=labels, autopct="%1.1f%%", startangle=90, colors=["lightskyblue", "hotpink"]
)

# labels
plt.setp(texts, fontsize=20)
plt.setp(autotexts, fontsize=20)

# save
out_path = os.path.join(save_dir, "gender_distribution.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
# report the written file (not just its directory), like the other plot cells
print(f"saved gender distribution to:\n{out_path}")

saved gender distribution to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives


In [116]:
# plot SES distribution (bar chart: one bar per SES value)

# coerce to numeric; unparseable entries become NaN and drop out of the counts
ses = pd.to_numeric(demographics["Socioeconomic"], errors="coerce")
ses_counts = ses.value_counts().sort_index()

ses_levels = ses_counts.index.astype(int)  # x axis
ses_freq = ses_counts.values  # y axis

fig, ax = plt.subplots(figsize=(8, 6))

# bars go first so that the y ticks used for the guide lines exist
ax.bar(
    ses_levels,
    ses_freq,
    width=0.8,
    align="center",
    color="steelblue",
    edgecolor="black",
    zorder=2,
)

# faint dashed guide lines at every y tick, drawn behind the bars
ax.grid(False)
guide_style = {
    "colors": "lightgray",
    "linestyles": "--",
    "linewidth": 0.5,
    "alpha": 0.7,
    "zorder": 1,
}
for tick in ax.get_yticks():
    ax.hlines(
        tick,
        xmin=min(ses_levels) - 0.5,
        xmax=max(ses_levels) + 0.5,
        **guide_style,
    )

ax.set_axisbelow(True)

# axis titles and tick label sizes
ax.set_xlabel("Socioeconomic Status (SES)", fontsize=18)
ax.set_ylabel("Number of Participants", fontsize=18)
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=16)

# one tick per observed SES value, labelled as a plain integer
ax.set_xticks(ses_levels)
ax.set_xticklabels([str(int(level)) for level in ses_levels], fontsize=16)

# spines: thin grey left/bottom, no top/right
for side in ("left", "bottom"):
    spine = ax.spines[side]
    spine.set_color("#cccccc")
    spine.set_linewidth(0.5)
    spine.set_visible(True)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

out_path = os.path.join(save_dir, "ses_distribution.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
print(f"saved SES distribution to:\n{out_path}")


saved SES distribution to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives/ses_distribution.png


In [117]:
# plot education distribution (bar chart in a fixed, ordinal level order)

# education levels in display order, mapped to their tick labels
education_labels = {
    "less_than_highschool": "Less than High School",
    "high_school": "High School",
    "vocational": "Vocational",
    "bachelor": "Bachelor",
    "master": "Master",
    "phd": "PhD",
    "no_answer": "No Answer",
}
education_order = list(education_labels)

raw_counts = demographics["Education"].value_counts()

# levels outside the known order have no tick label; report them rather than
# letting them shift the bars against the labels
unexpected = sorted(str(level) for level in raw_counts.index if level not in education_labels)
if unexpected:
    print(f"[WARNING] education levels not plotted: {unexpected}")

# reindex to the fixed order: a level absent from the data gets a zero-height
# bar, so the number of bars always equals the number of tick labels
edu_counts = (
    raw_counts.reindex(education_order, fill_value=0)
    .rename_axis("Education")
    .reset_index(name="Count")
)

plt.figure(figsize=(10, 6))
ax = plt.gca()

ax.grid(False)
# draw bars first so ticks are meaningful
ax.bar(
    range(len(edu_counts)),
    edu_counts["Count"],
    color="lightseagreen",
    edgecolor="black",
    width=0.8,
    align="center",
    zorder=2,
)

# faint dashed guide lines at every y tick, drawn behind the bars
for y in ax.get_yticks():
    ax.hlines(
        y,
        xmin=-0.5,
        xmax=len(edu_counts) - 0.5,
        colors="lightgray",
        linestyles="--",
        linewidth=0.5,
        alpha=0.7,
        zorder=1,
    )

plt.xlabel("Education Level", fontsize=18)
plt.ylabel("Number of Participants", fontsize=18)

# tick labels are derived from the plotted rows, so they cannot drift apart
plt.xticks(
    ticks=range(len(edu_counts)),
    labels=[education_labels[level] for level in edu_counts["Education"]],
    fontsize=12,
    rotation=30,
    ha="right",
)
plt.yticks(fontsize=12)

# spines: thin grey left/bottom, no top/right
for side in ("left", "bottom"):
    ax.spines[side].set_color("#cccccc")
    ax.spines[side].set_linewidth(0.5)
    ax.spines[side].set_visible(True)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

out_path = os.path.join(save_dir, "education_distribution.png")
plt.savefig(out_path, dpi=600, bbox_inches="tight")
plt.close()
print(f"saved education distribution to:\n{out_path}")


saved education distribution to:
/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives/education_distribution.png


## Language Scores

In [118]:
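# create scores-csv (disabled; uncomment to regenerate data/language_scores_all_subjects.csv from each subject's score .txt files)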

#valid_subjects = sorted(
#    [
#        name for name in os.listdir(DATA_DIRECTORY)
#        if os.path.isdir(os.path.join(DATA_DIRECTORY, name)) and name.isdigit()
#    ],
#    key=lambda x: int(x),
#)

#rows = []

#score_files = {
#    "PhonemicFluencyScore": "phonemicFluencyScore.txt",
#    "PictureNamingScore": "pictureNamingScore.txt",
#    "SemanticFluencyScore": "semanticFluencyScore.txt",
#}

#for subject_id in valid_subjects:
#    subject_folder = os.path.join(DATA_DIRECTORY, subject_id)
#    row = {ID_COL: subject_id}
#    for score_name in SCORES:
#        filename = score_files.get(score_name)
#        if filename is None:
#            row[score_name] = None
#            continue
#        file_path = os.path.join(subject_folder, filename)
#        value = None
#        if os.path.isfile(file_path):
#            with open(file_path, "r") as f:
#                raw = f.read().strip()
#                try:
#                    value = float(raw)
#                except ValueError:
#                    value = None
#        row[score_name] = value
#    rows.append(row)
#scores_df = pd.DataFrame(rows)

# convert scores to float, then to int
#for score in SCORES:
#    scores_df[score] = pd.to_numeric(scores_df[score], errors="coerce")
#    if (scores_df[score].dropna() % 1 == 0).all():
#        scores_df[score] = scores_df[score].astype("Int64")

# save to CSV
#out_path = os.path.join(GIT_DIRECTORY, "data", "language_scores_all_subjects.csv")
#os.makedirs(os.path.dirname(out_path), exist_ok=True)
#scores_df.to_csv(out_path, index=False)

#print(f"saved language scores to:\n{out_path}")

In [125]:
# Language scores: M & SD, n & %

# total sample size
N_TOTAL = len(demographics)  # should be 1003


def _summarize_score(col):
    """Build one table row (valid N, share of total sample, M, SD) for a score column."""
    # entries that cannot be parsed as numbers become NaN and are excluded
    values = pd.to_numeric(scores[col], errors="coerce")
    n_valid = values.notna().sum()
    return {
        "Score": col,
        "N": n_valid,
        "% of total sample": n_valid / N_TOTAL * 100,
        "M": values.mean(),
        "SD": values.std(),
    }


score_rows = [_summarize_score(col) for col in SCORES]

score_table = pd.DataFrame(score_rows)
score_table

Unnamed: 0,Score,N,% of total sample,M,SD
0,PictureNamingScore,990,98.703888,17.384848,2.753502
1,SemanticFluencyScore,1002,99.900299,20.760479,5.698425
2,PhonemicFluencyScore,1002,99.900299,15.239521,4.640364


### plot language score distributions

In [120]:
# Reuse the paths defined in the setup cell (scores_path, save_dir) instead of
# rebuilding them here, so the notebook has a single source for each location.
plot_score_distributions(
    scores=SCORES,
    scores_path=scores_path,
    outdir=save_dir,
)

saved score distribution panels to: /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/descriptives/score_distributions.png


## Spontaneous Speech Tasks

In [121]:
# create durations-csv (disabled; uncomment to regenerate data/audio_durations.csv from each subject's task recordings)

# valid_subjects = sorted(
#    [
#        name for name in os.listdir(DATA_DIRECTORY)
#        if os.path.isdir(os.path.join(DATA_DIRECTORY, name)) and name.isdigit()
#    ],
#    key=lambda x: int(x),
#)
#
#rows = []

#for subject_id in valid_subjects:
#    subject_folder = os.path.join(DATA_DIRECTORY, subject_id)
#    row = {ID_COL: subject_id}
#    for task in TASKS:
#        wav_path = load_audio_file(subject_folder, task)
#        if wav_path is not None:
#            duration = infer_wav_duration_seconds(wav_path)  # raw duration in seconds
#        else:
#            duration = None
#        row[task] = duration
#    rows.append(row)
#dur_df = pd.DataFrame(rows)

# save to CSV
#out_path = os.path.join(GIT_DIRECTORY, "data", "audio_durations.csv")
#os.makedirs(os.path.dirname(out_path), exist_ok=True)
#dur_df.to_csv(out_path, index=False)

#print(f"saved raw audio durations to:\n{out_path}")

In [127]:
# Spontaneous speech tasks: M & SD for duration and n_words; n & %

durations_path = os.path.join(GIT_DIRECTORY, "data", "audio_durations.csv")
features_dir = os.path.join(GIT_DIRECTORY, "results", "features")

# one row per subject, one duration column per task
dur_df = pd.read_csv(durations_path)

# total sample size
N_TOTAL = len(demographics)  # should be 1003

task_rows = []

for task in TASKS:
    df_features = pd.read_csv(task_paths[task])

    # per-task views: durations from the durations table, word counts from the feature file
    task_durations = dur_df[[ID_COL, task]].rename(columns={task: "duration_sec"})
    task_words = df_features[[ID_COL, "n_words"]].copy()

    # N = number of participants with a recorded duration for this task
    valid_durations = task_durations["duration_sec"].dropna()
    valid_words = task_words["n_words"].dropna()
    n_task = len(valid_durations)

    # share of the full sample; guarded against an empty demographics table
    if N_TOTAL > 0:
        pct_total = n_task / N_TOTAL * 100
    else:
        pct_total = 0.0

    task_rows.append({
        "task": task,
        "N": n_task,
        "% of total sample": round(pct_total, 1),
        "M_duration_sec": round(valid_durations.mean(), 2),
        "SD_duration_sec": round(valid_durations.std(), 2),
        "M_n_words": round(valid_words.mean(), 2),
        "SD_n_words": round(valid_words.std(), 2),
    })

spontaneous_tasks_table = pd.DataFrame(task_rows)
spontaneous_tasks_table


Unnamed: 0,task,N,% of total sample,M_duration_sec,SD_duration_sec,M_n_words,SD_n_words
0,picnicScene,1002,99.9,87.0,58.87,185.7,113.34
1,cookieTheft,1002,99.9,73.28,114.3,147.92,171.99
2,journaling,995,99.2,92.02,46.91,186.48,120.07


In [123]:
# Picture description feature sets: M & SD for duration and n_words; n & %


def _summarize_variant(label, path):
    """Build one table row (N subjects, M/SD of n_words and duration) for a feature file."""
    features = pd.read_csv(path)
    word_counts = features["n_words"]
    return {
        "Variant": label,
        "N": features[ID_COL].nunique(),  # distinct subject IDs in the file
        "M_n_words": word_counts.mean(),
        "SD_n_words": word_counts.std(),
        "M_duration_sec": features["duration_used_sec"].mean(),
        "SD_duration_sec": features["duration_used_sec"].std(),
    }


pd_rows = [
    _summarize_variant(label, path)
    for label, path in picture_description_paths.items()
]

pd_table = pd.DataFrame(pd_rows)
pd_table


Unnamed: 0,Variant,N,M_n_words,SD_n_words,M_duration_sec,SD_duration_sec
0,1min,1003,132.556331,230.243357,59.890963,1.425414
1,2min,1003,250.107677,231.169228,116.162607,12.379479
2,full,1003,325.1665,251.857323,151.691667,48.125654
