Combined picture-description (cookieTheft + picnicScene) features, with the task order randomized per subject.
1 min, 2 min and ≥5 min versions.

In [1]:
import os, sys, contextlib, wave, tempfile, json
import random, hashlib
import pandas as pd
from pydub import AudioSegment

# project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

from config.constants import DATA_DIRECTORY, GIT_DIRECTORY

# linguistic features
from feature_extraction.features import (
    n_words, pos_ratios_spacy, filler_word_ratio, ttr, mattr, avg_word_length,
    light_verb_ratio, empty_word_ratio, nid_ratio, adjacent_repetitions,
    brunets_index, honores_statistic, guirauds_statistic
)
from feature_extraction.features.psycholinguistic_features import (
    compute_avg_by_pos, load_aoa_lexicon, load_imageability_norms,
    load_familiarity_norms, load_frequency_norms, load_concreteness_lexicon
)
from feature_extraction.features.fluency_features import calculate_fluency_features

# audio features
from feature_extraction.audio import (
    extract_acoustic_features as base_extract_acoustic_features,
    extract_egemaps as base_extract_egemaps,
)

# article-pause-contentword
from feature_extraction.features.article_pause_contentword import article_pause_contentword

# config

# Task names; each task's recording is looked up as "<task>.wav" inside a subject folder.
PICTURE_TASKS = ["cookieTheft", "picnicScene"]
# Root of the word-level timestamp CSVs, read as
# <root>/<task>/google/timestamps/<subject_id>.csv (hard-coded mounted volume).
TIMESTAMP_ROOT = "/Volumes/g_psyplafor_methlab$/Students/Gila/word_timestamps"
# Directory that receives the feature CSVs (and the "_checkpoints" subfolder).
OUT_DIR = os.path.join(GIT_DIRECTORY, "results", "features")

# Output file name -> cap in seconds applied to the combined recording.
# Files are processed in this insertion order for every subject.
OUTPUTS = {
    "picture_description.csv":        300,  # 5:00
    "picture_description_1min.csv":   60,   # 1:00
    "picture_description_2min.csv":   120,  # 2:00
}

# small helpers

def _infer_wav_duration_seconds(wav_path: str):
    """Return the length of a WAV file in seconds, or None if it cannot be read.

    Any failure while opening or reading the header (missing file, non-WAV
    content, zero frame rate, ...) is swallowed and reported as None.
    """
    try:
        reader = wave.open(wav_path, 'r')
        with contextlib.closing(reader):
            frame_count = reader.getnframes()
            frame_rate = float(reader.getframerate())
            return frame_count / frame_rate
    except Exception:
        return None

def load_transcriptions(subject_folder):
    """Read <subject_folder>/ASR/transcriptions.csv.

    Returns an empty DataFrame (no columns) when the file does not exist.
    """
    csv_path = os.path.join(subject_folder, "ASR", "transcriptions.csv")
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    return pd.read_csv(csv_path)

def load_audio_durations(subject_folder):
    """Read <subject_folder>/audio_durations.csv.

    Returns an empty DataFrame (no columns) when the file does not exist.
    """
    csv_path = os.path.join(subject_folder, "audio_durations.csv")
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    return pd.read_csv(csv_path)

def load_task_word_timestamps(subject_id: str, task: str) -> pd.DataFrame | None:
    p = os.path.join(TIMESTAMP_ROOT, task, "google", "timestamps", f"{subject_id}.csv")
    if not os.path.exists(p):
        return None
    df = pd.read_csv(p)
    need = ["word", "start", "end"]
    if not all(c in df.columns for c in need):
        return None
    df = df[need].copy()
    df["start"] = pd.to_numeric(df["start"], errors="coerce")
    df["end"]   = pd.to_numeric(df["end"], errors="coerce")
    df = df.dropna(subset=["word", "start", "end"])
    return df

def load_audio_file(subject_folder, task):
    """Return the path of <subject_folder>/<task>.wav, or None if it is absent."""
    wav_path = os.path.join(subject_folder, f"{task}.wav")
    if os.path.exists(wav_path):
        return wav_path
    return None

@contextlib.contextmanager
def _segment_as_temp_wav(audio_segment):
    """Export an in-memory segment to a uniquely named temp WAV, deleted on exit.

    A unique name (instead of one fixed file in the temp directory) keeps
    parallel runs from overwriting each other's audio, and the cleanup avoids
    leaving the last export behind.
    """
    fd, tmp_path = tempfile.mkstemp(prefix="tmp_picdesc_audio_", suffix=".wav")
    os.close(fd)  # only the path is needed; export() opens the file itself
    try:
        audio_segment.export(tmp_path, format="wav")
        yield tmp_path
    finally:
        with contextlib.suppress(OSError):
            os.remove(tmp_path)


def extract_acoustic_features(audio_path=None, text=None, duration=None, audio_segment=None):
    """Run the base acoustic feature extractor on a file or an in-memory segment.

    Args:
        audio_path: path of an audio file; used when ``audio_segment`` is None.
        text: transcript forwarded unchanged to the base extractor.
        duration: duration forwarded unchanged to the base extractor.
        audio_segment: object with an ``export(path, format="wav")`` method;
            when given it takes precedence over ``audio_path`` and is written
            to a temporary WAV for the duration of the call.

    Returns:
        Whatever ``base_extract_acoustic_features`` returns.
    """
    if audio_segment is not None:
        # NOTE(review): assumes the base extractor is done with the file once
        # it returns, since the temp WAV is removed right afterwards.
        with _segment_as_temp_wav(audio_segment) as tmp_path:
            return base_extract_acoustic_features(tmp_path, text, duration)
    return base_extract_acoustic_features(audio_path, text, duration)

def extract_egemaps(audio_path=None, audio_segment=None):
    """Run the base eGeMAPS extractor on a file or an in-memory segment.

    Args:
        audio_path: path of an audio file; used when ``audio_segment`` is None.
        audio_segment: object with an ``export(path, format="wav")`` method;
            when given it takes precedence over ``audio_path`` and is written
            to a temporary WAV for the duration of the call.

    Returns:
        Whatever ``base_extract_egemaps`` returns.
    """
    if audio_segment is not None:
        with _segment_as_temp_wav(audio_segment) as tmp_path:
            return base_extract_egemaps(tmp_path)
    return base_extract_egemaps(audio_path)

# append + checkpoint

def append_row_atomic(out_path: str, row_df: pd.DataFrame):
    """Append ``row_df`` to the CSV at ``out_path`` and force it onto disk.

    The header is written when the file does not exist yet or exists but is
    still empty (e.g. after an interrupted first write); otherwise only the
    data rows are appended. Despite the name, this is a plain append followed
    by flush + fsync, not an atomic replace.

    Args:
        out_path: destination CSV path.
        row_df: rows to append; the column order is taken as-is and is not
            checked against an existing header.
    """
    # a zero-byte file has no header yet, so it must be treated like a new file
    needs_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0
    # newline="" lets pandas control the line terminator, as it does when it
    # opens the path itself
    with open(out_path, "a", newline="", encoding="utf-8") as f:
        row_df.to_csv(f, header=needs_header, index=False)
        # flush and sync on the same handle that received the data
        f.flush()
        os.fsync(f.fileno())

def load_checkpoint(ckpt_path: str) -> set[str]:
    """Read the set of processed ids stored as JSON at ``ckpt_path``.

    A missing file, or any error while reading/decoding it, yields an empty
    set so the caller can fall back to another source.
    """
    processed: set[str] = set()
    if os.path.exists(ckpt_path):
        try:
            with open(ckpt_path, "r") as handle:
                processed = set(json.load(handle))
        except Exception:
            processed = set()
    return processed

def save_checkpoint(ckpt_path: str, processed_ids: set[str]):
    """Persist ``processed_ids`` as a sorted JSON list at ``ckpt_path``.

    The list is first written and fsynced to "<ckpt_path>.tmp", which then
    replaces the real checkpoint in a single rename.
    """
    staging_path = f"{ckpt_path}.tmp"
    with open(staging_path, "w") as handle:
        handle.write(json.dumps(sorted(processed_ids)))
        handle.flush()
        os.fsync(handle.fileno())
    # swap the finished file into place
    os.replace(staging_path, ckpt_path)

# reproducible random order

def task_order_for_subject(subject_id: str, base_tasks=None, seed: int | None = None):
    """
    Return a reproducible shuffle of tasks for a subject.
    """
    if base_tasks is None:
        base_tasks = PICTURE_TASKS
    tasks = list(base_tasks)

    env_seed = os.environ.get("PICTURE_TASKS_SEED")
    if env_seed is not None and env_seed.strip():
        seed = int(env_seed)

    if seed is None:
        # stable across runs/machines for this subject_id
        seed = int(hashlib.sha1(str(subject_id).encode()).hexdigest(), 16) % (10**8)

    rng = random.Random(seed)
    rng.shuffle(tasks)
    return tasks

# core assembly

def combine_picture_description_streams(subject_id: str, task_order=None):
    """Concatenate a subject's picture-description recordings into one stream.

    For every task in ``task_order`` (default: PICTURE_TASKS) that has a WAV
    file, the audio is appended to a running segment. Word timestamps, when
    available, are shifted by the length of the audio appended so far, so that
    words and audio share one timeline. Tasks without timestamps contribute
    their "text_google" transcript as a fallback.

    Returns:
        (combined_text, combined_audio, total_seconds, tasks_included,
        combined_words), where
        - combined_text is built from the timestamped words if any task had
          them, otherwise from the transcript fallbacks (transcript fallbacks
          are not mixed into a timestamp-based text),
        - total_seconds is the length of the combined audio,
        - combined_words is the shifted word table or None.
        (None, None, 0.0, [], None) when no text could be assembled.
    """
    subject_folder = os.path.join(DATA_DIRECTORY, subject_id)
    trans = load_transcriptions(subject_folder)

    if task_order is None:
        task_order = PICTURE_TASKS  # fallback to fixed order if not provided

    # load_transcriptions() returns a column-less frame when the CSV is
    # missing; indexing trans["task"] would then raise KeyError
    has_transcripts = {"task", "text_google"}.issubset(trans.columns)

    combined_audio = AudioSegment.silent(duration=0)
    total_offset = 0.0
    tasks_included, words_blocks, text_fallbacks = [], [], []

    for task in task_order:
        wav = load_audio_file(subject_folder, task)
        if wav is None:
            continue

        seg = AudioSegment.from_wav(wav)

        wdf = load_task_word_timestamps(subject_id, task)
        if wdf is not None and not wdf.empty:
            shifted = wdf.copy()
            shifted["start"] = shifted["start"].astype(float) + total_offset
            shifted["end"] = shifted["end"].astype(float) + total_offset
            words_blocks.append(shifted)
        elif has_transcripts:
            t = trans.loc[trans["task"] == task, "text_google"].values
            if len(t) and isinstance(t[0], str) and t[0].strip():
                text_fallbacks.append(t[0].strip())

        combined_audio += seg
        # The next task starts exactly where the concatenated audio ends.
        # Deriving the offset from the audio itself (rather than from an
        # external duration table) keeps word times aligned with the samples
        # that truncation later cuts, and cannot inject NaN into the offsets.
        total_offset = len(combined_audio) / 1000.0
        tasks_included.append(task)

    combined_words = pd.concat(words_blocks, ignore_index=True) if words_blocks else None
    if combined_words is not None and not combined_words.empty:
        # astype(str): join() rejects non-string entries (e.g. numeric tokens)
        combined_text = " ".join(combined_words["word"].astype(str))
    else:
        combined_text = " ".join(text_fallbacks).strip()

    if not combined_text:
        return None, None, 0.0, [], None

    return combined_text, combined_audio, total_offset, tasks_included, combined_words

def truncate_by_seconds(combined_text, combined_audio, combined_words, cap_seconds):
    """Cut the combined stream to its first ``cap_seconds`` seconds.

    Args:
        combined_text: full transcript.
        combined_audio: sliceable audio indexed in milliseconds, or None.
        combined_words: word table with "word" and "end" columns, or None.
        cap_seconds: cap in seconds; None disables truncation.

    Returns:
        (text, audio, words). With a word table, only words whose "end" is at
        or before the cap are kept and the text is rebuilt from them. Without
        one, the text is returned unchanged because there is no timing
        information to cut it by, and words is None.
    """
    if cap_seconds is None:
        return combined_text, combined_audio, combined_words
    ms = int(cap_seconds * 1000)
    audio_cut = combined_audio[:ms] if combined_audio is not None else None
    if combined_words is not None and not combined_words.empty:
        kept = combined_words[combined_words["end"] <= cap_seconds].copy()
        # astype(str): join() raises TypeError on non-string entries, which
        # happens when the word column was parsed as numbers
        text_cut = " ".join(kept["word"].astype(str))
        return text_cut, audio_cut, kept
    return combined_text, audio_cut, None

# Lexica are loaded on first use and then shared by all slices/subjects.
_LEXICA_CACHE: dict = {}


def _get_lexica() -> dict:
    """Return the psycholinguistic lexica keyed by feature prefix, loading them once."""
    if not _LEXICA_CACHE:
        # NOTE(review): assumes the loaders return read-only lookups that are
        # safe to reuse across calls — confirm against the loader module.
        _LEXICA_CACHE.update({
            "aoa":   load_aoa_lexicon(),
            "fam":   load_familiarity_norms(),
            "img":   load_imageability_norms(),
            "freq":  load_frequency_norms(),
            "concr": load_concreteness_lexicon(),
        })
    return _LEXICA_CACHE


def compute_features_for_slice(subject_id: str, text: str, audio_cut: AudioSegment, words_cut: pd.DataFrame | None):
    """Compute one row of linguistic, psycholinguistic and acoustic features.

    Args:
        subject_id: stored in the "Subject_ID" column.
        text: transcript of the slice, passed to all text-based features.
        audio_cut: audio of the slice; its length (ms) gives "duration_used_sec".
        words_cut: word table with "word", "start", "end" for the slice, or
            None / empty when no timestamps exist.

    Returns:
        A single-row DataFrame. "article_pause_contentword" is None when no
        word table is available or when its computation fails.
    """
    used_dur = len(audio_cut) / 1000.0

    feats = {
        "Subject_ID": subject_id,
        "duration_used_sec": used_dur,
        "n_words": n_words(text),
        "ttr": ttr(text),
        "average_word_length": avg_word_length(text),
        "filler_word_ratio": filler_word_ratio(text),
        "brunets_index": brunets_index(text),
        "honores_statistic": honores_statistic(text),
        "guirauds_statistic": guirauds_statistic(text),
        "light_verb_ratio": light_verb_ratio(text),
        "empty_word_ratio": empty_word_ratio(text),
        "nid_ratio": nid_ratio(text),
        "adjacent_repetitions": adjacent_repetitions(text),
    }
    feats.update(mattr(text, window_sizes=[10,20,30,40,50]))
    feats.update(pos_ratios_spacy(text))
    feats.update(calculate_fluency_features(text))

    # per lexicon: average over nouns, verbs, and all content words
    for name, lex in _get_lexica().items():
        feats[f"{name}_nouns"]   = compute_avg_by_pos(text, lex, ["NOUN"])
        feats[f"{name}_verbs"]   = compute_avg_by_pos(text, lex, ["VERB"])
        feats[f"{name}_content"] = compute_avg_by_pos(text, lex, ["NOUN","VERB","ADJ"])

    feats.update(extract_acoustic_features(audio_segment=audio_cut, text=text, duration=used_dur))
    feats.update(extract_egemaps(audio_segment=audio_cut))

    # article_pause_contentword on CUT timestamps (if available)
    apc = None
    if words_cut is not None and not words_cut.empty:
        # the helper takes a CSV path, so the cut table goes through a temp file
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmpf:
            tmp_path = tmpf.name
        try:
            # writing inside the try so the temp file is removed even if the
            # export itself fails
            words_cut[["word","start","end"]].to_csv(tmp_path, index=False)
            apc = article_pause_contentword(tmp_path)
        except Exception as e:
            # best effort: keep the row, but leave a trace of why the value is missing
            print(f"[warn] {subject_id}: article_pause_contentword failed: {e}")
            apc = None
        finally:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    feats["article_pause_contentword"] = apc

    return pd.DataFrame([feats])

# run

def main():
    """Extract features for every numeric subject folder and every output cap.

    For each subject with at least one picture-description WAV, the tasks are
    combined once (in a per-subject reproducible order) and then truncated to
    each cap in OUTPUTS. Each finished (subject, output) pair is appended to
    its CSV and recorded in a JSON checkpoint, so an interrupted run resumes
    where it stopped. A failure for one subject or slice is reported and
    skipped; it does not stop the run.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    ckpt_dir = os.path.join(OUT_DIR, "_checkpoints")
    os.makedirs(ckpt_dir, exist_ok=True)

    subjects = sorted(
        (
            s for s in os.listdir(DATA_DIRECTORY)
            if s.isdigit() and os.path.isdir(os.path.join(DATA_DIRECTORY, s))
        ),
        key=int,
    )

    # load processed sets from checkpoints if present; else from existing CSVs
    processed = {}
    for fname in OUTPUTS:
        ckpt_path = os.path.join(ckpt_dir, fname + ".json")
        done = {str(x) for x in load_checkpoint(ckpt_path)}
        if not done:
            out_path = os.path.join(OUT_DIR, fname)
            if os.path.exists(out_path):
                try:
                    # dtype=str keeps ids such as "041" intact; numeric parsing
                    # would turn them into "41", which never matches the folder
                    # name and causes the subject to be processed again
                    dfp = pd.read_csv(
                        out_path, usecols=["Subject_ID"], dtype={"Subject_ID": str}
                    )
                    done = set(dfp["Subject_ID"].dropna())
                except Exception:
                    done = set()
        processed[fname] = done

    for sid in subjects:
        subj_folder = os.path.join(DATA_DIRECTORY, sid)
        if not any(os.path.exists(os.path.join(subj_folder, f"{t}.wav")) for t in PICTURE_TASKS):
            print(f"[skip] {sid}: no picture-description audio")
            continue

        # decide what is left to do before touching any audio, so fully
        # processed subjects cost nothing on a resumed run
        pending = []
        for fname, cap in OUTPUTS.items():
            if sid in processed[fname]:
                print(f"[{fname}] already processed {sid}, skipping")
            else:
                pending.append((fname, cap))
        if not pending:
            continue

        # reproducible per-subject randomized order
        task_order = task_order_for_subject(sid)

        try:
            combo = combine_picture_description_streams(sid, task_order=task_order)
        except Exception as e:
            # same policy as for a failing slice: report and move on
            print(f"[skip] {sid}: ERROR while combining tasks: {e}")
            continue
        combined_text, combined_audio, _total, tasks_included, combined_words = combo
        if not tasks_included:
            print(f"[skip] {sid}: no picture-description data")
            continue

        # combined once above, then reused for each cap/file
        for fname, cap in pending:
            out_path = os.path.join(OUT_DIR, fname)
            ckpt_path = os.path.join(ckpt_dir, fname + ".json")

            print(f"[{fname}] processing {sid} (cap={cap}s)")
            try:
                text_cut, audio_cut, words_cut = truncate_by_seconds(
                    combined_text, combined_audio, combined_words, cap
                )
                df_slice = compute_features_for_slice(
                    sid, text=text_cut, audio_cut=audio_cut, words_cut=words_cut
                )
                # audit columns for order effects
                df_slice["first_task"] = tasks_included[0]
                df_slice["task_order"] = "|".join(tasks_included)

                append_row_atomic(out_path, df_slice)
                # checkpoint only after the row is safely on disk
                processed[fname].add(sid)
                save_checkpoint(ckpt_path, processed[fname])
            except Exception as e:
                print(f"[{fname}] ERROR {sid}: {e}")
                continue

    print("\ndone. wrote/updated:")
    for fname in OUTPUTS:
        print(" -", os.path.join(OUT_DIR, fname))

if __name__ == "__main__":
    main()


[nltk_data] Downloading package words to /Users/gilanorup/nltk_data...
[nltk_data]   Package words is already up-to-date!


[picture_description.csv] processing 41 (cap=300s)
[picture_description_1min.csv] processing 41 (cap=60s)
[picture_description_2min.csv] processing 41 (cap=120s)
[picture_description.csv] processing 43 (cap=300s)
[picture_description_1min.csv] processing 43 (cap=60s)
[picture_description_2min.csv] processing 43 (cap=120s)
[picture_description.csv] processing 44 (cap=300s)
[picture_description_1min.csv] processing 44 (cap=60s)
[picture_description_2min.csv] processing 44 (cap=120s)
[picture_description.csv] processing 46 (cap=300s)
[picture_description_1min.csv] processing 46 (cap=60s)
[picture_description_2min.csv] processing 46 (cap=120s)
[picture_description.csv] processing 49 (cap=300s)
[picture_description_1min.csv] processing 49 (cap=60s)
[picture_description_2min.csv] processing 49 (cap=120s)
[picture_description.csv] processing 50 (cap=300s)
[picture_description_1min.csv] processing 50 (cap=60s)
[picture_description_2min.csv] processing 50 (cap=120s)
[picture_description.csv] pr