Remove outliers: audio files longer than five minutes are trimmed to their first five minutes.

In [1]:
# setup

import sys

# add the root of the project to the path
# NOTE(review): this is an absolute, machine-specific path; the project-local imports below
# (feature_extraction, config) presumably only resolve on the author's machine — confirm and
# consider making the location configurable.
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")

import os
import pandas as pd
from pydub import AudioSegment

from feature_extraction.features import (
    n_words, clean_text, tokenize, pos_ratios_spacy, filler_word_ratio,
    ttr, mattr, avg_word_length,
    light_verb_ratio, empty_word_ratio, nid_ratio, adjacent_repetitions,
    brunets_index, honores_statistic, guirauds_statistic
)

from feature_extraction.features.psycholinguistic_features import (
    compute_avg_by_pos, load_aoa_lexicon, load_imageability_norms,
    load_familiarity_norms, load_frequency_norms, load_concreteness_lexicon
)
from feature_extraction.features.fluency_features import filled_pause_ratio, calculate_fluency_features

from feature_extraction.audio import (
    count_phonemes, extract_acoustic_features,
    extract_egemaps, VoiceActivityDetector
)

from config.constants import DATA_DIRECTORY
from config.constants import GIT_DIRECTORY


[nltk_data] Downloading package words to /Users/gilanorup/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# trim audio

def trim_audio(input_path, output_path, duration_sec=300):
    """Export a copy of a WAV file that is cut off after ``duration_sec`` seconds.

    Parameters
    ----------
    input_path : path of the WAV file to read.
    output_path : path the shortened recording is exported to (WAV format).
    duration_sec : amount of audio to keep, counted from the start, in seconds
        (default 300, i.e. five minutes).
    """
    recording = AudioSegment.from_wav(input_path)
    cutoff_ms = duration_sec * 1000  # pydub slices are indexed in milliseconds
    recording[:cutoff_ms].export(output_path, format="wav")

# destination of the shortened recording (inside the repository) and location of the raw one
trimmed_audio_path = os.path.join(GIT_DIRECTORY, "data", "1370_cookieTheft_trimmed.wav")
original_audio_path = os.path.join(DATA_DIRECTORY, "1370", "cookieTheft.wav")

# cut the raw recording of subject 1370 down to the default five minutes
trim_audio(input_path=original_audio_path, output_path=trimmed_audio_path)

In [3]:
# Measure the trimmed audio file: the duration is needed for the feature calculation later,
# and checking it here confirms that the trim actually took effect.
trimmed_audio = AudioSegment.from_wav(trimmed_audio_path)
total_duration = len(trimmed_audio) / 1000  # len() of an AudioSegment is in ms -> seconds

# trim_audio was called with its default cutoff of 300 s, so the result must not be longer
assert total_duration <= 300, (
    f"trimmed audio is {total_duration:.3f} s long, expected at most 300 s"
)


In [4]:
# compute features for subjects for only five minutes of the audio-file

def calculate_features_exception(text, audio_path, total_duration, Subject_ID = 1370):
    """Compute the feature row for one transcript / recording pair.

    Parameters
    ----------
    text : transcript that all text-based features are computed from.
    audio_path : path of the audio file handed to the acoustic extractors.
    total_duration : duration value forwarded to ``extract_acoustic_features``.
    Subject_ID : identifier stored in the ``Subject_ID`` column (default 1370).

    Returns
    -------
    pandas.DataFrame
        A single-row frame whose first column is ``Subject_ID``, followed by the
        linguistic features and then the acoustic / eGeMAPS features.
    """
    # lexicons / norms used for the psycholinguistic averages further down
    concreteness_lexicon = load_concreteness_lexicon()
    aoa_lexicon = load_aoa_lexicon()
    frequency_lexicon = load_frequency_norms()
    familiarity_lexicon = load_familiarity_norms()
    imageability_lexicon = load_imageability_norms()

    # --- linguistic features -------------------------------------------------
    features = {"n_words": n_words(text), "ttr": ttr(text)}
    features.update(mattr(text, window_sizes=[10, 20, 30, 40, 50]))

    # features that are a single value computed from the text alone
    text_only_features = (
        ("filler_word_ratio", filler_word_ratio),
        ("average_word_length", avg_word_length),
        ("brunets_index", brunets_index),
        ("honores_statistic", honores_statistic),
        ("guirauds_statistic", guirauds_statistic),
        ("light_verb_ratio", light_verb_ratio),
        ("empty_word_ratio", empty_word_ratio),
        ("nid_ratio", nid_ratio),
        ("adjacent_repetitions", adjacent_repetitions),
    )
    for feature_name, feature_func in text_only_features:
        features[feature_name] = feature_func(text)

    # per-lexicon averages over content words, nouns only and verbs only
    content_pos = ("NOUN", "VERB", "ADJ")
    noun_pos = ("NOUN",)
    verb_pos = ("VERB",)
    lexicon_features = (
        ("aoa_content", aoa_lexicon, content_pos),
        ("aoa_nouns", aoa_lexicon, noun_pos),
        ("aoa_verbs", aoa_lexicon, verb_pos),
        ("fam_content", familiarity_lexicon, content_pos),
        ("fam_nouns", familiarity_lexicon, noun_pos),
        ("fam_verbs", familiarity_lexicon, verb_pos),
        ("img_content", imageability_lexicon, content_pos),
        ("img_nouns", imageability_lexicon, noun_pos),
        ("img_verbs", imageability_lexicon, verb_pos),
        ("freq_content", frequency_lexicon, content_pos),
        ("freq_nouns", frequency_lexicon, noun_pos),
        ("freq_verbs", frequency_lexicon, verb_pos),
        ("concr_content", concreteness_lexicon, content_pos),
        ("concr_nouns", concreteness_lexicon, noun_pos),
        ("concr_verbs", concreteness_lexicon, verb_pos),
    )
    for feature_name, lexicon, pos_tags in lexicon_features:
        # hand over a fresh list on every call, exactly like a list literal would
        features[feature_name] = compute_avg_by_pos(text, lexicon, list(pos_tags))

    for multi_feature_func in (pos_ratios_spacy, calculate_fluency_features):
        features.update(multi_feature_func(text))

    # --- acoustic features ---------------------------------------------------
    features.update(extract_acoustic_features(audio_path, text, total_duration))
    features.update(extract_egemaps(audio_path))

    # Subject_ID goes first so that it becomes the leading column
    row = {"Subject_ID": Subject_ID}
    row.update(features)
    return pd.DataFrame([row])

In [5]:
# calculate for Subject 1370
# Transcript used together with the trimmed recording. The variable name suggests it was
# shortened to match the trimmed audio — NOTE(review): confirm how the cut-off point was chosen.
# The literal is fed to the feature functions as-is, including its line breaks and punctuation.
shortened_transcription = """i see a kitchen scene based on the style of hair of the woman doing the dishes, i would say it's from the 50s, uh, got a young boy standing on top of a stool that is tipping, reaching for a cookie jar, having what might be chocolate chip or raisin.
 oatmeal cookies based on the spots on the cookies and the cookie jar has the lid off and is on the top shelf of the upper kitchen cabinet, i see a little girl, i would presume to be his sister, standing next to the stool, reaching up, waiting for a cookie, the boy has one already in his left hand while he's reaching for another in his right hand with his right hand,
 "the girl has her right hand over her mouth and is smiling, the girl is wearing a dress with a short skirt and her shoes are, they look like mary jane's, the stool is a three-legged stool by the way, the boy is wearing, his shoes are..."
 yeah, well they it looks like he's just wearing socks, possibly because the toe and the heel are marked and the part covering the ankle is somewhat sagging. he's also wearing a collared shirt and short sleeve and the girls dresses short sleeve as well. the boys wearing collar shirt, short sleeve.
 with short pants that come down to mid thigh, the coup from which he is extracting cookies, has a hinge on the left side, it's one of three cupboards hanging on the wall which is directly opposite the view of the...
 three hanging on the wall with two below, the doors are simple rectangles with a bar handle protruding slightly, having two posts holding it out from the door to allow for gripping, the boy has one leg straight,
 on the centered more or less on the top of the stool, his right leg, his left leg is bent at the knee and standing with in place with the arch of the foot over the upper edge of the stool as it's tilting, the stool looks tilted enough to where he the boy is probably in the process.
 of falling, the mother has gin length hair, curled under at the bottom, curled under, curled under at the bottom, she's wearing a round neck, sleeveless dress with knee length, it's knee length, and a an apron,
 of the type just tied around her waist and going nearly as going down nearly as long as her dress, she has on uh some type of simple closed toe shoe with a low heel, oh just to go back to the girl's shoes, um, she's also wearing socks and the shoes she's wearing are closed toe.
 with a strap over the top of the arch and no visible heels, back to the woman, she's standing primarily with her weight on her right leg, which is pointed to the right, she's facing, or rather she is standing next to,
 the"""
# the acoustic features are computed on the trimmed recording, not on the original file
audio_path = trimmed_audio_path

# one-row DataFrame for subject 1370 (the function's default Subject_ID), printed for inspection
df_cut = calculate_features_exception(shortened_transcription, audio_path, total_duration=total_duration)
print(df_cut)

   Subject_ID  n_words       ttr  mattr_10  mattr_20  mattr_30  mattr_40  \
0        1370      498  0.407631  0.912679  0.841336  0.788628  0.752124   

   mattr_50  filler_word_ratio  average_word_length  ...  \
0  0.719866           0.014056             4.048193  ...   

   eGeMAPS_slopeUV0-500_sma3nz_amean  eGeMAPS_slopeUV500-1500_sma3nz_amean  \
0                           0.026431                             -0.006015   

   eGeMAPS_spectralFluxUV_sma3nz_amean  eGeMAPS_loudnessPeaksPerSec  \
0                             0.094161                     1.650055   

   eGeMAPS_VoicedSegmentsPerSec  eGeMAPS_MeanVoicedSegmentLengthSec  \
0                       1.72707                            0.294208   

   eGeMAPS_StddevVoicedSegmentLengthSec  eGeMAPS_MeanUnvoicedSegmentLength  \
0                              0.304693                           0.296849   

   eGeMAPS_StddevUnvoicedSegmentLength  eGeMAPS_equivalentSoundLevel_dBp  
0                             0.506396             

In [6]:
# Replace the row of subject 1370 in the cookieTheft feature table with the features
# computed from the trimmed recording, then write the table back to the same file.
features_csv_path = os.path.join(GIT_DIRECTORY, "results", "features", "cookieTheft.csv")

df = pd.read_csv(features_csv_path)
df = df.loc[df["Subject_ID"] != 1370]            # drop the outdated row
df = pd.concat([df, df_cut], ignore_index=True)  # append the recomputed row
df = df.sort_values("Subject_ID")
df.to_csv(features_csv_path, index=False)