In [6]:
import pandas as pd

# Load the combined audio-durations table; the code below reads its "Subject_ID" column
# plus one duration column per task (presumably seconds, given the 300 -> ">5min" label — TODO confirm).
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
df = pd.read_csv("/Users/gilanorup/Desktop/Studium/MSc/MA/dataset/combined_audio_durations.csv")

# print which subjects talk for more than 5 minutes

def print_long_speakers(df, task_name, threshold=300):
    """Print the subjects whose recording for one task exceeds a duration threshold.

    Args:
        df: DataFrame with a "Subject_ID" column and a numeric column named
            ``task_name``.
        task_name: Name of the duration column to inspect.
        threshold: Cut-off compared against the ``task_name`` column with a strict
            ``>``. The header reports it as ``threshold / 60`` minutes, i.e. the
            value is treated as seconds.

    Returns:
        None. The header line and the matching rows are written to stdout.
    """
    long_speakers = df[df[task_name] > threshold]
    # Derive the label from `threshold` so the header stays truthful for non-default
    # cut-offs; ":g" renders 300 s as "5", matching the previous hard-coded ">5min".
    threshold_min = f"{threshold / 60:g}"
    print(f"\n{task_name} subjects with >{threshold_min}min ({len(long_speakers)} total):")
    print(long_speakers[["Subject_ID", task_name]])

# run the long-speaker report once for each of the three recording tasks
for task in ("cookieTheft", "picnicScene", "journaling"):
    print_long_speakers(df=df, task_name=task)



cookieTheft subjects with >5min (1 total):
      Subject_ID  cookieTheft
1002        1370       3587.4

picnicScene subjects with >5min (8 total):
      Subject_ID  picnicScene
36           122   321.429333
411          612   421.362667
530          757   422.220000
700          971   334.560000
772         1059   340.680000
825         1133   332.040000
903         1237   330.420000
1002        1370  1403.520000

journaling subjects with >5min (10 total):
      Subject_ID  journaling
126          242  448.080000
303          473  302.760000
343          523  422.890667
366          554  325.560000
536          764  328.852625
585          824  334.980000
695          966  334.770667
854         1175  346.980000
951         1304  376.980000
1002        1370  492.120000


In [7]:
# setup

import sys

# add the root of the project to the path
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

import os
import pandas as pd
from pydub import AudioSegment

from feature_extraction.features import (
    n_words, clean_text, tokenize, pos_ratios_spacy, filler_word_ratio,
    ttr, mattr, avg_word_length,
    light_verb_ratio, empty_word_ratio, nid_ratio, adjacent_repetitions,
    brunets_index, honores_statistic, guirauds_statistic
)

from feature_extraction.features.psycholinguistic_features import (
    compute_avg_by_pos, load_aoa_lexicon, load_imageability_norms,
    load_familiarity_norms, load_frequency_norms, load_concreteness_lexicon
)
from feature_extraction.features.fluency_features import filled_pause_ratio, calculate_fluency_features

from feature_extraction.audio import (
    count_phonemes, extract_acoustic_features,
    extract_egemaps, VoiceActivityDetector
)

from config.constants import DATA_DIRECTORY
from config.constants import GIT_DIRECTORY

[nltk_data] Downloading package words to /Users/gilanorup/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
# trim audio

def trim_audio(input_path, output_path, duration_sec=300):
    """Write a copy of a WAV file truncated to its first ``duration_sec`` seconds.

    Args:
        input_path: Path of the source WAV file.
        output_path: Path the trimmed WAV file is exported to.
        duration_sec: Maximum length to keep, in seconds. Must not be negative.

    Returns:
        Duration of the exported clip in seconds (``len(trimmed) / 1000``).

    Raises:
        ValueError: If ``duration_sec`` is negative.
    """
    # A negative slice bound counts from the END of the recording (it would drop the
    # tail instead of keeping the head), so reject it rather than silently mis-trimming.
    if duration_sec < 0:
        raise ValueError(f"duration_sec must be non-negative, got {duration_sec}")
    audio = AudioSegment.from_wav(input_path)
    trimmed = audio[:duration_sec * 1000]  # bc pydub uses ms
    trimmed.export(output_path, format="wav")
    return len(trimmed) / 1000  # return duration in seconds


In [9]:
# compute features for subjects for only five minutes of the audio-file

def calculate_features_exception(text, audio_path, total_duration, Subject_ID):
    """Build a one-row feature table for a single subject's recording.

    Combines the text-based features computed from ``text`` with the acoustic and
    eGeMAPS features extracted from ``audio_path``.

    Args:
        text: Transcription the linguistic features are computed on.
        audio_path: Path of the audio file handed to the acoustic extractors.
        total_duration: Duration value forwarded to ``extract_acoustic_features``.
        Subject_ID: Identifier stored in the leading "Subject_ID" column.

    Returns:
        pandas.DataFrame with exactly one row: "Subject_ID" followed by all features.
    """
    # norm lexicons are (re)loaded on every call
    concreteness_lexicon = load_concreteness_lexicon()
    aoa_lexicon = load_aoa_lexicon()
    frequency_lexicon = load_frequency_norms()
    familiarity_lexicon = load_familiarity_norms()
    imageability_lexicon = load_imageability_norms()

    # linguistic features: lexical richness and word-level ratios
    features = {
        "n_words": n_words(text),
        "ttr": ttr(text),
    }
    features.update(mattr(text, window_sizes=[10, 20, 30, 40, 50]))
    features.update({
        "filler_word_ratio": filler_word_ratio(text),
        "average_word_length": avg_word_length(text),
        "brunets_index": brunets_index(text),
        "honores_statistic": honores_statistic(text),
        "guirauds_statistic": guirauds_statistic(text),
        "light_verb_ratio": light_verb_ratio(text),
        "empty_word_ratio": empty_word_ratio(text),
        "nid_ratio": nid_ratio(text),
        "adjacent_repetitions": adjacent_repetitions(text),
    })

    # linguistic features: per-lexicon averages, each as (column, lexicon, POS tags)
    psycholinguistic_specs = [
        ("aoa_content", aoa_lexicon, ["NOUN", "VERB", "ADJ"]),
        ("aoa_nouns", aoa_lexicon, ["NOUN"]),
        ("aoa_verbs", aoa_lexicon, ["VERB"]),
        ("fam_content", familiarity_lexicon, ["NOUN", "VERB", "ADJ"]),
        ("fam_nouns", familiarity_lexicon, ["NOUN"]),
        ("fam_verbs", familiarity_lexicon, ["VERB"]),
        ("img_content", imageability_lexicon, ["NOUN", "VERB", "ADJ"]),
        ("img_nouns", imageability_lexicon, ["NOUN"]),
        ("img_verbs", imageability_lexicon, ["VERB"]),
        ("freq_content", frequency_lexicon, ["NOUN", "VERB", "ADJ"]),
        ("freq_nouns", frequency_lexicon, ["NOUN"]),
        ("freq_verbs", frequency_lexicon, ["VERB"]),
        ("concr_content", concreteness_lexicon, ["NOUN", "VERB", "ADJ"]),
        ("concr_nouns", concreteness_lexicon, ["NOUN"]),
        ("concr_verbs", concreteness_lexicon, ["VERB"]),
    ]
    for column, lexicon, pos_tags in psycholinguistic_specs:
        features[column] = compute_avg_by_pos(text, lexicon, pos_tags)

    features.update(pos_ratios_spacy(text))
    features.update(calculate_fluency_features(text))

    # acoustic features: both extractors run before either result is merged
    acoustic_values = extract_acoustic_features(audio_path, text, total_duration)
    egemaps_values = extract_egemaps(audio_path)
    features.update(acoustic_values)
    features.update(egemaps_values)

    single_row = {"Subject_ID": Subject_ID, **features}
    return pd.DataFrame([single_row])

In [12]:
# Loop through all shortened transcriptions: trim each recording, recompute its features,
# and replace the subject's row in the matching per-task feature CSV.
# NOTE(review): this cell rebinds the notebook-level name `df` (also assigned when the
# durations table is loaded) and overwrites the feature CSVs in place.

# one row per (subject, task) to redo; the columns read below are "Subject_ID", "task", "transcription"
shortened_df = pd.read_csv(os.path.join(GIT_DIRECTORY, "data", "shortened_transcriptions.csv"))

for idx, row in shortened_df.iterrows():
    Subject_ID = int(row["Subject_ID"])
    task_name = row["task"]
    shortened_transcription = row["transcription"]

    print(f"\nprocessing subject {Subject_ID} ({task_name})")

    # define original and trimmed paths
    # NOTE(review): trimmed WAVs are written under GIT_DIRECTORY/data — confirm that
    # participant audio is meant to live there (or is git-ignored).
    original_audio_path = os.path.join(DATA_DIRECTORY, str(Subject_ID), f"{task_name}.wav")
    trimmed_audio_path = os.path.join(GIT_DIRECTORY, "data", f"{Subject_ID}_{task_name}_trimmed.wav")

    # trim audio to at most 300 s; trim_audio returns the trimmed clip's duration in seconds
    total_duration = trim_audio(original_audio_path, trimmed_audio_path, duration_sec=300)

    # calculate features from the shortened transcription and the trimmed clip (one-row DataFrame)
    df_cut = calculate_features_exception(shortened_transcription, trimmed_audio_path, total_duration, Subject_ID)

    # update in respective feature-set .csv: drop any existing row for this subject,
    # append the recomputed row, re-sort by Subject_ID, and overwrite the file
    # (assumes the CSV's Subject_ID column compares equal to the int above — TODO confirm dtype)
    features_path = os.path.join(GIT_DIRECTORY, "results", "features", f"{task_name}.csv")
    df = pd.read_csv(features_path)
    df = df[df["Subject_ID"] != Subject_ID]
    df = pd.concat([df, df_cut], ignore_index=True).sort_values("Subject_ID")
    df.to_csv(features_path, index=False)
    print(f"updated features for subject {Subject_ID} ({task_name})")



processing subject 122 (picnicScene)
updated features for subject 122 (picnicScene)

processing subject 242 (journaling)
updated features for subject 242 (journaling)

processing subject 473 (journaling)
updated features for subject 473 (journaling)

processing subject 523 (journaling)
updated features for subject 523 (journaling)

processing subject 554 (journaling)
updated features for subject 554 (journaling)

processing subject 612 (picnicScene)
updated features for subject 612 (picnicScene)

processing subject 757 (picnicScene)
updated features for subject 757 (picnicScene)

processing subject 764 (journaling)
updated features for subject 764 (journaling)

processing subject 824 (journaling)
updated features for subject 824 (journaling)

processing subject 966 (journaling)
updated features for subject 966 (journaling)

processing subject 971 (picnicScene)
updated features for subject 971 (picnicScene)

processing subject 1059 (picnicScene)
updated features for subject 1059 (picni