Recompute features for subjects that had missing values even though their transcriptions and audio files were present,
and determine why those values were missing in the first place.

In [34]:
# setup
import os, sys, shutil, datetime, wave, contextlib, traceback
import numpy as np
import pandas as pd

# project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import DATA_DIRECTORY, GIT_DIRECTORY

# features
from feature_extraction.features import (
    n_words, clean_text, tokenize, pos_ratios_spacy, filler_word_ratio,
    ttr, mattr, avg_word_length, light_verb_ratio, empty_word_ratio, nid_ratio,
    adjacent_repetitions, brunets_index, honores_statistic, guirauds_statistic,
    article_pause_contentword
)
from feature_extraction.features.psycholinguistic_features import (
    compute_avg_by_pos, load_aoa_lexicon, load_imageability_norms,
    load_familiarity_norms, load_frequency_norms, load_concreteness_lexicon
)
from feature_extraction.features.fluency_features import calculate_fluency_features
from feature_extraction.audio import extract_acoustic_features, extract_egemaps, VoiceActivityDetector

# Manually specified audio durations, keyed by (subject_id, task) with both parts as strings.
# load_audio_durations() checks this table first, so an entry here takes precedence over
# audio_durations.csv and over the duration inferred from the .wav file.
# NOTE(review): values are presumably seconds (the .wav fallback returns seconds) — confirm.
OVERRIDE_DURATIONS = {
    # cookieTheft
    ("56",   "cookieTheft"): 44,
    ("149",  "cookieTheft"): 99,
    ("1009", "cookieTheft"): 141
}


backup → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/cookieTheft.csv.bak.20250825-230945
backup → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/filtered/cookieTheft_filtered.csv.bak.20250825-230945
backup → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/journaling.csv.bak.20250825-230945
backup → /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/filtered/journaling_filtered.csv.bak.20250825-230945


In [17]:
def safe_upsert(existing: pd.DataFrame, new_rows: pd.DataFrame, key="Subject_ID",
                insert_missing: bool = False) -> pd.DataFrame:
    """
    Merge `new_rows` into `existing`, matching rows on `key` (compared as strings).

    For keys present in both frames, non-missing values from `new_rows` overwrite
    the values in `existing`; missing values in `new_rows` never erase existing
    data (this is how `DataFrame.update` behaves). Columns that only occur in
    `new_rows` are added to the result.

    Parameters
    ----------
    existing : DataFrame to update. It is not modified; a copy is returned.
        `None` is treated as an empty frame that only has the `key` column.
    new_rows : DataFrame with the new values. If it is `None` or empty,
        `existing` is returned unchanged (same object).
    key : name of the column that identifies a row.
    insert_missing : if False (default, the historical behaviour) rows of
        `new_rows` whose key does not occur in `existing` are ignored. If True
        they are appended after the existing rows, i.e. a true upsert.

    Returns
    -------
    DataFrame with `key` as the first column (values are strings).

    Raises
    ------
    KeyError if `key` is missing from either frame.
    """
    if new_rows is None or new_rows.empty:
        return existing

    incoming = new_rows.copy()
    incoming[key] = incoming[key].astype(str)
    # DataFrame.update aligns the incoming frame on its index and cannot do so
    # with duplicate labels; if a key was supplied twice, the last row wins.
    incoming = incoming.drop_duplicates(subset=key, keep="last").set_index(key)

    if existing is None:
        existing = pd.DataFrame(columns=[key])
    current = existing.copy()
    current[key] = current[key].astype(str)
    for col in incoming.columns:
        if col not in current.columns:
            current[col] = pd.NA
    current = current.set_index(key)

    # Only call update() when there is something to align on; this keeps the
    # "no overlapping keys" case a harmless no-op.
    shared = current.index.intersection(incoming.index)
    if len(shared) > 0:
        current.update(incoming.loc[shared])

    if insert_missing:
        fresh = incoming.loc[~incoming.index.isin(current.index)]
        if not fresh.empty:
            if current.empty:
                current = fresh.reindex(columns=current.columns)
            else:
                current = pd.concat([current, fresh])

    return current.reset_index()

def _infer_wav_duration_seconds(wav_path: str):
    try:
        with contextlib.closing(wave.open(wav_path, 'r')) as w:
            return w.getnframes() / float(w.getframerate())
    except Exception:
        return None

def load_audio_durations(subject_folder, task):
    """
    Return the audio duration for one subject/task as a float, or None if it
    cannot be determined.

    Sources are tried in this order:
      1. OVERRIDE_DURATIONS, keyed by (subject folder name, task);
      2. the "duration" column of <subject_folder>/audio_durations.csv for the
         rows whose "task" equals `task` — only a finite value > 0 is accepted;
      3. the duration inferred from <subject_folder>/<task>.wav.

    A missing (NaN), zero or negative CSV entry is treated as "not available"
    and falls through to step 3 instead of being returned: NaN is truthy, so
    callers that test `if duration:` would otherwise treat it as a valid value.
    """
    # 1) manual override
    # normpath drops a trailing separator, which would make basename() return "".
    subject_id = os.path.basename(os.path.normpath(subject_folder))
    key = (str(subject_id), str(task))
    if key in OVERRIDE_DURATIONS:
        return float(OVERRIDE_DURATIONS[key])

    # 2) subject CSV
    try:
        df = pd.read_csv(os.path.join(subject_folder, "audio_durations.csv"))
        for value in df.loc[df["task"] == task, "duration"]:
            duration = float(value)
            if np.isfinite(duration) and duration > 0:
                return duration
    except Exception:
        # Deliberately best-effort: a missing or malformed CSV just means we
        # fall back to reading the WAV header.
        pass

    # 3) fallback: infer from WAV
    wav_path = os.path.join(subject_folder, f"{task}.wav")
    return _infer_wav_duration_seconds(wav_path)

def load_transcription(subject_folder, task):
    """
    Return the transcript for `task` from <subject_folder>/ASR/transcriptions.csv.

    The text is taken from the "text_google" column of the first row whose
    "task" equals `task` and whose text is neither missing nor whitespace-only.
    An empty string is returned when the file cannot be read, the expected
    columns are absent, or no usable text exists.

    The result is always a `str`. An empty CSV cell is read back by pandas as
    float NaN, which is truthy; returning it would make the caller believe a
    transcript exists and then fail inside the text feature functions.
    """
    csv_path = os.path.join(subject_folder, "ASR", "transcriptions.csv")
    try:
        df = pd.read_csv(csv_path)
        candidates = df.loc[df["task"] == task, "text_google"].dropna()
    except Exception:
        # best-effort: unreadable file or unexpected layout means "no transcript"
        return ""

    for value in candidates:
        text = str(value)
        if text.strip():
            return text
    return ""

def load_audio_file(subject_folder, task):
    """Return the path <subject_folder>/<task>.wav, or None if nothing exists at that path."""
    candidate = os.path.join(subject_folder, f"{task}.wav")
    if not os.path.exists(candidate):
        return None
    return candidate

def _diagnose_inputs(subject_folder, task):
    """
    Load duration, transcript and audio path for one subject/task and report
    which of them are missing.

    Returns (issues, duration, text, wav_path), where `issues` is a list of
    human-readable strings (empty when all three inputs were found).
    """
    duration = load_audio_durations(subject_folder, task)
    transcript = load_transcription(subject_folder, task)
    audio_path = load_audio_file(subject_folder, task)

    # (is_missing, message) pairs, in the order the messages are reported
    checks = (
        (duration is None, "no audio duration"),
        (not transcript, "no transcription text"),
        (audio_path is None, "no audio file"),
    )
    problems = [message for is_missing, message in checks if is_missing]
    return problems, duration, transcript, audio_path


In [19]:
def _as_feature_dict(value):
    """Return `value` if it is a dict, else an empty dict (extractor produced nothing usable)."""
    return value if isinstance(value, dict) else {}


def _is_usable_duration(duration):
    """True if `duration` converts to a finite number greater than zero."""
    try:
        as_float = float(duration)
    except (TypeError, ValueError):
        return False
    return bool(np.isfinite(as_float) and as_float > 0)


def _compute_text_features(text, lexicons):
    """
    Compute every transcript-based feature for one text.

    `lexicons` is a sequence of (column_prefix, lexicon) pairs. Returns
    (text_feats, pos_ratios) as two dicts.
    """
    pos_groups = (
        ("content", ["NOUN", "VERB", "ADJ"]),
        ("nouns",   ["NOUN"]),
        ("verbs",   ["VERB"]),
    )

    feats = {}
    # core text features
    feats["n_words"] = n_words(text)
    feats["ttr"] = ttr(text)
    feats.update(mattr(text, window_sizes=[10, 20, 30, 40, 50]))
    feats["filler_word_ratio"] = filler_word_ratio(text)
    feats["average_word_length"] = avg_word_length(text)
    feats["brunets_index"] = brunets_index(text)
    feats["honores_statistic"] = honores_statistic(text)
    feats["guirauds_statistic"] = guirauds_statistic(text)
    feats["light_verb_ratio"] = light_verb_ratio(text)
    feats["empty_word_ratio"] = empty_word_ratio(text)
    feats["nid_ratio"] = nid_ratio(text)
    feats["adjacent_repetitions"] = adjacent_repetitions(text)

    # psycholinguistic aggregates: <prefix>_content / _nouns / _verbs per lexicon
    for prefix, lexicon in lexicons:
        for suffix, tags in pos_groups:
            feats[f"{prefix}_{suffix}"] = compute_avg_by_pos(text, lexicon, pos_tags=list(tags))

    pos_ratios = _as_feature_dict(pos_ratios_spacy(text))
    feats.update(calculate_fluency_features(text))
    return feats, pos_ratios


def process_features(task, subject_ids=None):
    """
    Compute features for specific task; if subject_ids specified, only process those.
    Upserts rows into results/features/{task}.csv.

    Behaviour:
      * Text features, acoustic features and eGeMAPS are extracted in separate
        stages. A failure in one stage is logged to
        results/features/{task}_recompute_errors.log and does not discard the
        features that the other stages produced for that subject.
      * Text features are only computed for a non-blank string transcript;
        audio features only when a WAV file exists and the duration is a
        finite number > 0.
      * Existing rows are updated through safe_upsert (missing values never
        overwrite existing data). Subjects that are not yet in the CSV are
        appended, so the operation is a real upsert.
      * If no features were extracted at all, the CSV is left untouched.

    Returns None.
    """
    base_dir = DATA_DIRECTORY
    out_dir  = os.path.join(GIT_DIRECTORY, "results/features")
    os.makedirs(out_dir, exist_ok=True)
    out_csv  = os.path.join(out_dir, f"{task}.csv")

    # select subjects (numeric folder names only, in numeric order)
    all_subjects = sorted(
        [s for s in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, s)) and s.isdigit()],
        key=lambda x: int(x),
    )
    if subject_ids is not None:
        keep = {str(s) for s in subject_ids}
        subjects = [s for s in all_subjects if s in keep]
        missing  = sorted(keep - set(subjects))
        if missing: print(f"[warn] subject folder(s) not found: {missing}")
    else:
        subjects = all_subjects

    if not subjects:
        print(f"[info] no subjects to process for task {task}; {out_csv} left unchanged")
        return

    # load lexicons once; (column prefix, lexicon) in the order the columns are emitted
    lexicons = [
        ("aoa",   load_aoa_lexicon()),
        ("fam",   load_familiarity_norms()),
        ("img",   load_imageability_norms()),
        ("freq",  load_frequency_norms()),
        ("concr", load_concreteness_lexicon()),
    ]

    rows, errors = [], {}
    for sid in subjects:
        subj_folder = os.path.join(base_dir, sid)
        print(f"Calculating features for subject {sid}...")
        issues, duration, text, wav_path = _diagnose_inputs(subj_folder, task)
        if issues: print(f"[diag] {sid} → {', '.join(issues)}")

        # NaN is truthy, so plain truthiness checks would let a missing
        # transcript/duration through; validate explicitly instead.
        has_text = isinstance(text, str) and bool(text.strip())
        has_duration = _is_usable_duration(duration)
        if not has_text and "no transcription text" not in issues:
            print(f"[diag] {sid} → transcription is not usable text")
        if duration is not None and not has_duration:
            print(f"[diag] {sid} → audio duration is not a positive number: {duration!r}")

        text_feats, pos_ratios, acoustic, eg = {}, {}, {}, {}
        failures = []  # (stage, traceback) for this subject

        if has_text:
            try:
                text_feats, pos_ratios = _compute_text_features(text, lexicons)
            except Exception:
                failures.append(("text", traceback.format_exc()))

        if wav_path and has_duration:
            try:
                acoustic = _as_feature_dict(
                    extract_acoustic_features(wav_path, text if has_text else "", duration)
                )
            except Exception:
                failures.append(("acoustic", traceback.format_exc()))
            try:
                eg = _as_feature_dict(extract_egemaps(wav_path))
            except Exception:
                failures.append(("egemaps", traceback.format_exc()))

        if failures:
            errors[sid] = "\n".join(f"[{stage}]\n{tb}" for stage, tb in failures)
            stages = ", ".join(stage for stage, _ in failures)
            print(f"[error] {sid}: exception during feature extraction ({stages})")

        features = {**text_feats, **pos_ratios, **acoustic, **eg}
        if any(value is not None for value in features.values()):
            rows.append({"Subject_ID": sid, **features})
        else:
            print(f"[skip] {sid}: no features extracted (missing inputs?)")

    if errors:
        log_path = os.path.join(out_dir, f"{task}_recompute_errors.log")
        with open(log_path, "w", encoding="utf-8") as f:
            for s, tb in errors.items(): f.write(f"--- {s} ---\n{tb}\n\n")
        print(f"[info] error log written to: {log_path}")

    new_df = pd.DataFrame(rows)
    if new_df.empty:
        # Nothing to upsert: do not rewrite (or create an empty) CSV.
        print(f"[info] no features extracted for task {task}; {out_csv} left unchanged")
        return
    new_df["Subject_ID"] = new_df["Subject_ID"].astype(str)

    # safe write
    old = None
    if os.path.exists(out_csv):
        try:
            old = pd.read_csv(out_csv)
        except pd.errors.EmptyDataError:
            old = None  # blank file: treat as "no previous results"

    if old is None:
        merged = new_df
    else:
        old["Subject_ID"] = old["Subject_ID"].astype(str)
        merged = safe_upsert(old, new_df, key="Subject_ID")
        # safe_upsert only updates keys that already exist; append the rest so
        # newly computed subjects are not silently lost.
        known = set(merged["Subject_ID"].astype(str))
        fresh = new_df[~new_df["Subject_ID"].isin(known)]
        if not fresh.empty:
            merged = pd.concat([merged, fresh], ignore_index=True)
            print(f"[info] added new subject row(s): {list(fresh['Subject_ID'])}")

    merged.to_csv(out_csv, index=False)
    print(f"[ok] updated {out_csv} (safe upsert)")

def backfill_egemaps_if_missing(df: pd.DataFrame, task: str) -> pd.DataFrame:
    """
    Fill missing eGeMAPS jitter/shimmer values from the subject's WAV file.

    For every row in which one of the two eGeMAPS columns is absent or NaN,
    extract_egemaps() is run on <DATA_DIRECTORY>/<Subject_ID>/<task>.wav (if
    that file exists) and only the missing cells are filled; existing values
    are never overwritten. Rows whose extraction fails are left as they are
    and a warning is printed.

    Returns a new DataFrame; `df` is not modified. Index, column order and
    dtypes of untouched columns are preserved. The frame is edited in place
    on a copy rather than rebuilt from `iterrows()`, because `iterrows()`
    upcasts each row to a single dtype and would turn an integer Subject_ID
    such as 56 into 56.0 (and so into the wrong folder name).
    """
    eg_cols = ["eGeMAPS_jitterLocal_sma3nz_amean", "eGeMAPS_shimmerLocaldB_sma3nz_amean"]
    result = df.copy()

    if all(c in result.columns for c in eg_cols):
        needs_backfill = result[eg_cols].isna().any(axis=1).to_numpy()
    else:
        # at least one column does not exist yet, so every row is incomplete
        needs_backfill = np.ones(len(result), dtype=bool)

    for pos in np.flatnonzero(needs_backfill):
        raw_id = result["Subject_ID"].iloc[pos]
        # a float-typed ID column (e.g. 56.0) must still map to folder "56"
        if isinstance(raw_id, (float, np.floating)) and float(raw_id).is_integer():
            sid = str(int(raw_id))
        else:
            sid = str(raw_id)

        wav = os.path.join(DATA_DIRECTORY, sid, f"{task}.wav")
        if not os.path.exists(wav):
            continue
        try:
            eg = extract_egemaps(wav)
        except Exception as exc:
            # best-effort backfill: report and keep the row unchanged
            print(f"[warn] eGeMAPS backfill failed for {sid}: {exc}")
            continue
        if not isinstance(eg, dict):
            continue

        for c in eg_cols:
            if c not in eg:
                continue
            if c not in result.columns:
                result[c] = np.nan
            col_pos = result.columns.get_loc(c)
            if pd.isna(result.iat[pos, col_pos]):
                result.iat[pos, col_pos] = eg[c]

    return result


In [None]:
def sync_to_clean_and_filtered(task, subject_ids=None):
    """
    Propagate values from results/features/{task}.csv into the derived files
    results/features/{task}_cleaned.csv and
    results/features/filtered/{task}_filtered.csv (each only if it exists).

    Rows are matched on Subject_ID via safe_upsert, so only subjects already
    present in a derived file are updated and missing source values never
    overwrite existing data. The filtered file only receives the columns it
    already has; the cleaned file receives every source column.

    Parameters
    ----------
    task : task name used to build the three file names.
    subject_ids : optional iterable of subject IDs. When given, only these
        subjects are synced, so values of all other subjects in the derived
        files stay untouched. With the default (None) every subject in the
        source file is synced, which is the historical behaviour.

    NOTE(review): with subject_ids=None every overlapping non-missing value in
    the derived files is replaced by the source value — confirm that this is
    intended if the cleaning/filtering steps modify values.
    """
    src_path      = os.path.join(GIT_DIRECTORY, "results/features", f"{task}.csv")
    cleaned_path  = os.path.join(GIT_DIRECTORY, "results/features", f"{task}_cleaned.csv")
    filtered_path = os.path.join(GIT_DIRECTORY, "results/features/filtered", f"{task}_filtered.csv")

    if not os.path.exists(src_path):
        print(f"[warn] missing {src_path}")
        return

    src = pd.read_csv(src_path)
    if "Subject_ID" not in src.columns:
        print("[warn] Subject_ID missing in source")
        return
    src["Subject_ID"] = src["Subject_ID"].astype(str)

    if subject_ids is not None:
        wanted = {str(s) for s in subject_ids}
        src = src[src["Subject_ID"].isin(wanted)]
        if src.empty:
            print(f"[warn] none of the requested subjects found in {src_path}; nothing synced")
            return

    if os.path.exists(cleaned_path):
        cleaned = pd.read_csv(cleaned_path)
        cleaned["Subject_ID"] = cleaned["Subject_ID"].astype(str)
        cleaned = safe_upsert(cleaned, src, key="Subject_ID")
        cleaned.to_csv(cleaned_path, index=False)
        print(f"synced → {cleaned_path}")

    if os.path.exists(filtered_path):
        filt = pd.read_csv(filtered_path)
        filt["Subject_ID"] = filt["Subject_ID"].astype(str)
        # build the lookup set once instead of once per source column
        filt_cols = set(filt.columns) | {"Subject_ID"}
        keep_cols = [c for c in src.columns if c in filt_cols]
        filt = safe_upsert(filt, src[keep_cols], key="Subject_ID")
        filt.to_csv(filtered_path, index=False)
        print(f"synced → {filtered_path}")

In [21]:
# Subjects whose features are recomputed, per task (processed in this order).
subjects_to_recompute = {
    "cookieTheft": [56, 149, 772, 1009],
    "journaling": [898, 1202],
}

for task_name, ids in subjects_to_recompute.items():
    print(f"\nprocessing selected subjects for task: {task_name}...\n")
    process_features(task_name, subject_ids=ids)

# keep the other CSVs consistent
for task_name in subjects_to_recompute:
    sync_to_clean_and_filtered(task_name)



processing selected subjects for task: cookieTheft...

Calculating features for subject 56...
Calculating features for subject 149...
Calculating features for subject 772...
Calculating features for subject 1009...
updated /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/cookieTheft.csv with extracted features (safe upsert)

processing selected subjects for task: journaling...

Calculating features for subject 898...
Calculating features for subject 1202...
updated /Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/results/features/journaling.csv with extracted features (safe upsert)
