Calculate linguistic and acoustic features across all three spontaneous speech tasks (cookieTheft, picnicScene, and journaling).

(Builds in part on Petti et al., 2023: how much speech is needed? -> Idea for later: compare features computed across all tasks, on individual tasks, and as the average over the tasks.)

In [16]:
# setup

import sys

# add the root of the project to the path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

import os
import pandas as pd
from pydub import AudioSegment

from feature_extraction.features import (
    n_words, clean_text, tokenize, pos_ratios_spacy, filler_word_ratio,
    ttr, mattr, avg_word_length,
    light_verb_ratio, empty_word_ratio, nid_ratio, adjacent_repetitions,
    brunets_index, honores_statistic, guirauds_statistic
)

from feature_extraction.features.psycholinguistic_features import (
    compute_avg_by_pos, load_aoa_lexicon, load_imageability_norms,
    load_familiarity_norms, load_frequency_norms, load_concreteness_lexicon
)
from feature_extraction.features.fluency_features import filled_pause_ratio, calculate_fluency_features

from feature_extraction.audio import (count_phonemes, VoiceActivityDetector)
from feature_extraction.audio import extract_acoustic_features as base_extract_acoustic_features
from feature_extraction.audio import extract_egemaps as base_extract_egemaps

from config.constants import DATA_DIRECTORY, GIT_DIRECTORY

In [17]:
# adapt some audio functions to accept audio_segment parameter
def extract_acoustic_features(audio_path=None, text=None, duration=None, audio_segment=None):
    """Run the base acoustic feature extractor on a file or an in-memory segment.

    Wrapper around ``base_extract_acoustic_features`` that additionally
    accepts an in-memory audio segment. The base extractor is called with a
    file path, so a segment is first exported to a temporary WAV file.

    Args:
        audio_path: Path of the audio file; only used when ``audio_segment``
            is None.
        text: Passed through unchanged to the base extractor.
        duration: Passed through unchanged to the base extractor.
        audio_segment: Optional object offering ``export(path, format=...)``
            (a pydub ``AudioSegment`` in this notebook). Takes precedence
            over ``audio_path`` when given.

    Returns:
        Whatever ``base_extract_acoustic_features`` returns.
    """
    if audio_segment is None:
        return base_extract_acoustic_features(audio_path, text, duration)

    import tempfile

    # Use a private temporary directory rather than a fixed "/tmp/..." path:
    # it works on every platform, cannot collide with another run (or with
    # the eGeMAPS wrapper), and is removed again once extraction finished.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "combined_audio.wav")
        # export() hands back the output file handle; close it so the data is
        # fully written and the directory can be cleaned up afterwards.
        handle = audio_segment.export(tmp_path, format="wav")
        if handle is not None:
            handle.close()
        return base_extract_acoustic_features(tmp_path, text, duration)

def extract_egemaps(audio_path=None, audio_segment=None):
    """Run the base eGeMAPS extractor on a file or an in-memory segment.

    Wrapper around ``base_extract_egemaps`` that additionally accepts an
    in-memory audio segment. The base extractor is called with a file path,
    so a segment is first exported to a temporary WAV file.

    Args:
        audio_path: Path of the audio file; only used when ``audio_segment``
            is None.
        audio_segment: Optional object offering ``export(path, format=...)``
            (a pydub ``AudioSegment`` in this notebook). Takes precedence
            over ``audio_path`` when given.

    Returns:
        Whatever ``base_extract_egemaps`` returns.
    """
    if audio_segment is None:
        return base_extract_egemaps(audio_path)

    import tempfile

    # Use a private temporary directory rather than a fixed "/tmp/..." path:
    # it works on every platform, cannot collide with another run (or with
    # the acoustic-feature wrapper), and is removed again afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "combined_audio.wav")
        # export() hands back the output file handle; close it so the data is
        # fully written and the directory can be cleaned up afterwards.
        handle = audio_segment.export(tmp_path, format="wav")
        if handle is not None:
            handle.close()
        return base_extract_egemaps(tmp_path)


In [18]:
# set up functions to load transcriptions and audio files

def load_audio_durations(subject_folder):
    """Read ``audio_durations.csv`` from *subject_folder*.

    Returns the parsed table, or an empty ``DataFrame`` when the file does
    not exist.
    """
    csv_file = os.path.join(subject_folder, "audio_durations.csv")
    if not os.path.exists(csv_file):
        return pd.DataFrame()
    return pd.read_csv(csv_file)

def load_transcriptions(subject_folder):
    """Read ``ASR/transcriptions.csv`` from *subject_folder*.

    Returns the parsed table, or an empty ``DataFrame`` when the file does
    not exist.
    """
    csv_file = os.path.join(subject_folder, "ASR", "transcriptions.csv")
    if not os.path.exists(csv_file):
        return pd.DataFrame()
    return pd.read_csv(csv_file)

def load_audio_file(subject_folder, task):
    """Return the path of ``<task>.wav`` inside *subject_folder*.

    Returns ``None`` when no such file exists; the audio itself is not read.
    """
    wav_file = os.path.join(subject_folder, f"{task}.wav")
    if os.path.exists(wav_file):
        return wav_file
    return None

In [19]:
# function to calculate features across combined audio files and transcriptions

def calculate_combined_features(Subject_ID):
    """Compute features on the concatenated material of all tasks of a subject.

    The transcripts and recordings of the tasks cookieTheft, picnicScene and
    journaling are concatenated in that order, and every feature is computed
    once on the combined text / combined audio.

    A task is only included when its WAV file exists and both a transcript
    (column ``text_google``) and a duration (column ``duration``) are
    available for it.

    Args:
        Subject_ID: Name of the subject's folder below ``DATA_DIRECTORY``.

    Returns:
        A one-row ``pandas.DataFrame`` holding all features plus bookkeeping
        columns (``tasks_included``, ``task_list``,
        ``total_duration_combined``), or ``None`` when no task could be
        included for this subject.
    """
    subject_folder = os.path.join(DATA_DIRECTORY, Subject_ID)
    tasks = ["cookieTheft", "picnicScene", "journaling"]

    transcriptions = load_transcriptions(subject_folder)
    durations = load_audio_durations(subject_folder)

    # The loaders return a column-less DataFrame when a CSV is missing;
    # filtering it by "task" would raise KeyError, so treat this the same
    # way as "no task exists".
    if transcriptions.empty or durations.empty:
        return None

    combined_text = ""
    combined_audio = AudioSegment.silent(duration=0)
    total_duration = 0
    tasks_included = []

    for task in tasks:
        audio_path = load_audio_file(subject_folder, task)
        if audio_path is None:
            continue

        task_text = transcriptions.loc[transcriptions["task"] == task, "text_google"].values
        task_duration = durations.loc[durations["task"] == task, "duration"].values
        if len(task_text) == 0 or len(task_duration) == 0:
            continue

        # An empty CSV cell is read back as NaN: a NaN transcript has no
        # .strip() and a NaN duration would turn the summed duration into
        # NaN, so such a task is skipped like a missing one.
        if pd.isna(task_text[0]) or pd.isna(task_duration[0]):
            continue

        combined_text += " " + str(task_text[0]).strip()
        combined_audio += AudioSegment.from_wav(audio_path)
        total_duration += float(task_duration[0])
        tasks_included.append(task)

    if not combined_text.strip():
        return None  # if no task exists

    # load resources for psycholinguistic features
    # NOTE(review): these are reloaded for every subject; if the loaders are
    # not cached, hoisting them out of the per-subject call would save time.
    concreteness_lexicon = load_concreteness_lexicon()
    aoa_lexicon = load_aoa_lexicon()
    frequency_lexicon = load_frequency_norms()
    familiarity_lexicon = load_familiarity_norms()
    imageability_lexicon = load_imageability_norms()

    features = {
        "Subject_ID": Subject_ID,
        "n_words": n_words(combined_text),
        "ttr": ttr(combined_text),
        "average_word_length": avg_word_length(combined_text),
        "filler_word_ratio": filler_word_ratio(combined_text),
        "brunets_index": brunets_index(combined_text),
        "honores_statistic": honores_statistic(combined_text),
        "guirauds_statistic": guirauds_statistic(combined_text),
        "light_verb_ratio": light_verb_ratio(combined_text),
        "empty_word_ratio": empty_word_ratio(combined_text),
        "nid_ratio": nid_ratio(combined_text),
        "adjacent_repetitions": adjacent_repetitions(combined_text),
        "tasks_included": len(tasks_included),
        "task_list": ",".join(tasks_included),
        "total_duration_combined": total_duration,
    }

    features.update(mattr(combined_text, window_sizes=[10, 20, 30, 40, 50]))
    features.update(pos_ratios_spacy(combined_text))
    features.update(calculate_fluency_features(combined_text))

    # one average per (norm, part-of-speech group), stored as "<norm>_<group>"
    lexicons = [
        ("aoa", aoa_lexicon),
        ("fam", familiarity_lexicon),
        ("img", imageability_lexicon),
        ("freq", frequency_lexicon),
        ("concr", concreteness_lexicon),
    ]
    pos_groups = [
        (["NOUN"], "nouns"),
        (["VERB"], "verbs"),
        (["NOUN", "VERB", "ADJ"], "content"),
    ]
    for name, lex in lexicons:
        for pos_list, label in pos_groups:
            features[f"{name}_{label}"] = compute_avg_by_pos(combined_text, lex, pos_list)

    features.update(extract_acoustic_features(audio_segment=combined_audio, text=combined_text, duration=total_duration))
    features.update(extract_egemaps(audio_segment=combined_audio))

    return pd.DataFrame([features])



In [20]:
# Test for one subject: run the combined feature extraction for the subject
# folder "1370" and print the result (a one-row DataFrame, or None if no task
# could be included).

df_test = calculate_combined_features("1370")
print(df_test)

  Subject_ID  n_words       ttr  average_word_length  filler_word_ratio  \
0       1370     8136  0.159169             4.163717           0.014135   

   brunets_index  honores_statistic  guirauds_statistic  light_verb_ratio  \
0      15.800294        1774.771667            14.35702          0.181707   

   empty_word_ratio  ...  eGeMAPS_slopeUV0-500_sma3nz_amean  \
0          0.000615  ...                           0.026775   

   eGeMAPS_slopeUV500-1500_sma3nz_amean  eGeMAPS_spectralFluxUV_sma3nz_amean  \
0                             -0.006255                              0.08044   

  eGeMAPS_loudnessPeaksPerSec  eGeMAPS_VoicedSegmentsPerSec  \
0                    1.323721                      0.182383   

   eGeMAPS_MeanVoicedSegmentLengthSec  eGeMAPS_StddevVoicedSegmentLengthSec  \
0                             0.28572                              0.285371   

   eGeMAPS_MeanUnvoicedSegmentLength  eGeMAPS_StddevUnvoicedSegmentLength  \
0                            0.29706       

In [32]:
# run for all subjects and save to csv
#
# Every subject is appended to the csv-file as soon as it is finished, so an
# interrupted run can simply be restarted: subjects already present in the
# file are skipped. The header row is written automatically by the first
# append into a missing or empty file.

output_path = os.path.join(GIT_DIRECTORY, "results", "features", "tasks_combined.csv")

# make sure the target folder exists before the first append
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# counter for the subjects written during THIS run (skipped ones not counted)
n_saved = 0
# subjects whose processing raised, reported again after the loop
failed_subjects = []

# numeric folder names only, in numeric order
all_subjects = sorted(
    (
        folder for folder in os.listdir(DATA_DIRECTORY)
        if os.path.isdir(os.path.join(DATA_DIRECTORY, folder)) and folder.isdigit()
    ),
    key=int,
)

# load already processed subjects from the csv-file (bc the server disconnected after 100 subjects)
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
    # read the IDs as strings so that they compare equal to the folder names
    # (an inferred integer dtype would drop leading zeros)
    processed_df = pd.read_csv(output_path, dtype={"Subject_ID": str})
    processed_ids = set(processed_df["Subject_ID"].astype(str))
else:
    processed_ids = set()

# process subjects
for Subject_ID in all_subjects:
    if Subject_ID in processed_ids:
        print(f"already processed subject {Subject_ID}, skipping")
        continue
    print(f"processing subject {Subject_ID}")
    try:
        df = calculate_combined_features(Subject_ID)
        if df is not None:
            # a header is needed whenever the file has no content yet
            write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
            # NOTE(review): appending assumes every subject yields the same
            # columns in the same order - confirm for the feature extractors.
            df.to_csv(output_path, mode='a', header=write_header, index=False)
            print(f"added subject {Subject_ID}")
            n_saved += 1
        else:
            print(f"skipped subject {Subject_ID}")
    except Exception as e:
        # best effort on purpose: one broken subject must not stop the batch
        print(f"error processing subject {Subject_ID}: {e}")
        failed_subjects.append(Subject_ID)


print(f"\n finished saving combined features for {n_saved} subjects to: {output_path}")
if failed_subjects:
    print(f" failed for {len(failed_subjects)} subjects: {', '.join(failed_subjects)}")


already processed subject 41, skipping
already processed subject 43, skipping
already processed subject 44, skipping
already processed subject 46, skipping
already processed subject 49, skipping
already processed subject 50, skipping
already processed subject 54, skipping
already processed subject 56, skipping
already processed subject 59, skipping
already processed subject 61, skipping
already processed subject 83, skipping
already processed subject 84, skipping
already processed subject 85, skipping
already processed subject 86, skipping
already processed subject 88, skipping
already processed subject 89, skipping
already processed subject 90, skipping
already processed subject 91, skipping
already processed subject 92, skipping
already processed subject 93, skipping
already processed subject 97, skipping
already processed subject 98, skipping
already processed subject 99, skipping
already processed subject 101, skipping
already processed subject 102, skipping
already processed subje