In [18]:
import librosa
import os
import pandas as pd

def speaking_time_ratio(audio_path, total_duration, top_db=30, sr=16000):
    """
    Ratio of voiced (non-silent) time to total duration.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to analyse.
    total_duration : float or None
        Duration the voiced time is divided by. Must be in seconds, the unit
        the voiced time is computed in, for the result to be a ratio.
    top_db : float, optional
        Silence threshold handed to ``librosa.effects.split``: audio more than
        ``top_db`` dB below the reference level counts as silence (librosa's
        default reference is the signal's peak). Defaults to 30.
    sr : int, optional
        Sample rate in Hz the waveform is resampled to on load. Defaults to 16000.

    Returns
    -------
    float or None
        Voiced seconds divided by ``total_duration``, or None when
        ``total_duration`` is None, NaN, zero or negative.
    """
    # A missing duration arrives here as None, and `None > 0` raises TypeError.
    # Check before loading so an unusable duration costs no audio decoding.
    # `not (x > 0)` is also True for NaN.
    if total_duration is None or not total_duration > 0:
        return None

    # Re-bind sr to the rate librosa actually returned, which is the one the
    # sample indices below refer to.
    y, sr = librosa.load(audio_path, sr=sr)

    # One [start, end] pair per non-silent stretch, expressed in sample indices.
    intervals = librosa.effects.split(y, top_db=top_db)

    # Samples per stretch, summed, then divided by the sample rate -> seconds spoken.
    voiced_samples = sum((end - start) for start, end in intervals)
    voiced_duration = voiced_samples / sr
    return voiced_duration / total_duration


In [19]:

def load_audio_durations(subject_folder, task):
    """
    Look up the stored audio duration of one task for one subject.

    Reads ``audio_durations.csv`` inside ``subject_folder`` and returns the
    ``duration`` value of the first row whose ``task`` column equals ``task``.

    Parameters
    ----------
    subject_folder : str
        Directory expected to contain ``audio_durations.csv``.
    task : str
        Value to match against the CSV's ``task`` column.

    Returns
    -------
    The stored duration, or None when the file is missing, cannot be read,
    has no row for ``task``, or the matching ``duration`` cell is empty.
    A warning is printed for the missing-file and read-error cases.
    """
    duration_path = os.path.join(subject_folder, "audio_durations.csv")

    if not os.path.exists(duration_path):
        # Previously a silent no-op; report it so a missing file is visible
        # instead of surfacing later as an unexplained None.
        print(f"\u26a0 no audio_durations.csv found for subject {subject_folder}")
        return None

    try:
        duration_df = pd.read_csv(duration_path)
        # Keep only the duration cells of rows belonging to the requested task.
        match = duration_df.loc[duration_df["task"] == task, "duration"]
    except Exception as e:
        # Best-effort lookup: an unreadable or malformed CSV is reported, not raised.
        # The warning sign is written as an escape so it survives re-encoding.
        print(f"\u26a0 error reading duration for subject {subject_folder}: {e}")
        return None

    if match.empty:
        return None

    total_duration = match.values[0]
    # An empty CSV cell is read as NaN; treat it like any other missing duration.
    return None if pd.isna(total_duration) else total_duration


In [20]:
# Folder of the subject under inspection; audio_durations.csv is looked up inside it.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
subject_folder = "/Users/jheitz/git/luha-prolific-study/data/processed_combined/data/54"

In [21]:
# Task name; matched against the "task" column of audio_durations.csv.
task = "cookieTheft"

In [22]:
# Stored duration for this subject/task; None when the CSV is missing, unreadable, or has no row for the task.
total_duration =  load_audio_durations(subject_folder, task)

In [23]:
# Derive the recording path from subject_folder and task instead of repeating
# the literal path, so changing either of those cells cannot leave this one
# pointing at a different subject's or task's file.
audio_path = os.path.join(subject_folder, f"{task}.wav")
speaking_time_ratio(audio_path, total_duration)

[[ 55808  59392]
 [ 89600 108032]
 [117248 149504]
 [162304 187392]
 [192512 211968]
 [223744 228864]
 [229376 241664]
 [242176 250368]
 [251392 302080]
 [319488 371200]
 [380416 410624]
 [419328 461312]
 [469504 485888]
 [527872 560128]
 [573952 616960]
 [629248 636416]
 [646144 664576]
 [667136 671744]
 [672768 695808]
 [700416 731648]
 [739328 761856]
 [762368 766976]
 [841216 890880]
 [897536 915456]
 [921600 949760]]


0.589700440664446

In [31]:
# Repeat the silence detection outside the function to inspect the individual segments.
y, sr = librosa.load(audio_path, sr=16000)  # waveform resampled to 16 kHz
# [start, end] sample indices of every stretch louder than 30 dB below the reference level
intervals = librosa.effects.split(y, top_db=30)

# One row per non-silent stretch: sample indices -> seconds, plus the stretch length.
intervals_df = (
    pd.DataFrame(intervals, columns=['start', 'end'])
    .div(sr)
    .assign(duration=lambda seg: seg['end'] - seg['start'])
)
intervals_df

Unnamed: 0,start,end,duration
0,3.488,3.712,0.224
1,5.6,6.752,1.152
2,7.328,9.344,2.016
3,10.144,11.712,1.568
4,12.032,13.248,1.216
5,13.984,14.304,0.32
6,14.336,15.104,0.768
7,15.136,15.648,0.512
8,15.712,18.88,3.168
9,19.968,23.2,3.232


In [33]:
# Total voiced time in seconds: sum of the per-interval durations in intervals_df.
voiced_duration = intervals_df['duration'].sum()
voiced_duration

37.37600000000001