# Feature Extraction
- Utterance features
- OpenSmile features
- Librosa features

In [67]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

import opensmile

## 0. Meta information (including demographic and label information)

In [68]:
def load_and_preprocess_data(base_path: str, train_file: str, dev_file: str, test_file: str) -> pd.DataFrame:
    """
    Load the train/dev/test split CSVs and combine them into one DataFrame.

    Each split is tagged with a 'Split' column ('train', 'dev' or 'test').
    In the test split, the columns 'PHQ_Score' and 'PHQ_Binary' (if present)
    are renamed to 'PHQ8_Score' and 'PHQ8_Binary' so that they line up with
    the other splits. The combined frame is sorted by 'Participant_ID',
    re-indexed from 0, and its columns are reordered so that the key columns
    come first.

    Args:
        base_path (str): Base directory containing the CSV files.
        train_file (str): Filename for the training data.
        dev_file (str): Filename for the development data.
        test_file (str): Filename for the test data.

    Returns:
        pd.DataFrame: Combined DataFrame whose first columns are
            'Participant_ID', 'Split', 'Gender', 'PHQ8_Binary', 'PHQ8_Score',
            followed by every remaining column in order of first appearance.

    Raises:
        KeyError: If any of the leading key columns is missing from the
            combined data.
    """
    base_path = Path(base_path)

    # The test split uses different names for the score/label columns.
    test_renames = {"PHQ_Score": "PHQ8_Score", "PHQ_Binary": "PHQ8_Binary"}
    split_files = {'train': train_file, 'dev': dev_file, 'test': test_file}

    frames = []
    for split_name, file_name in split_files.items():
        frame = pd.read_csv(base_path / file_name)
        if split_name == 'test':
            # rename() skips labels that are absent (errors='ignore' is the
            # default), so no try/except is needed around it.
            frame = frame.rename(columns=test_renames)
        frames.append(frame.assign(Split=split_name))

    # Concatenate datasets and sort by Participant_ID
    combined_df = (
        pd.concat(frames)
        .sort_values(by='Participant_ID')
        .reset_index(drop=True)
    )

    # Reorder columns: key columns first, everything else in original order
    use_cols = ['Participant_ID', 'Split', 'Gender', 'PHQ8_Binary', 'PHQ8_Score']
    missing = [col for col in use_cols if col not in combined_df.columns]
    if missing:
        raise KeyError(f"Required columns missing from combined data: {missing}")
    columns_order = use_cols + [col for col in combined_df.columns if col not in use_cols]

    return combined_df[columns_order]

In [69]:
# Ensure the 'data' output folder exists (no error if it is already there)
os.makedirs("data", exist_ok=True)

In [70]:
# Location of the cached, combined metadata table
info_path = 'data/info_df.csv'

if not os.path.exists(info_path):
    print(f'{info_path} not found, loading and preprocessing data...')
    info_df = load_and_preprocess_data(
        base_path='downloads/',
        train_file='train_split_Depression_AVEC2017.csv',
        dev_file='dev_split_Depression_AVEC2017.csv',
        test_file='full_test_split.csv',
    )
    # Write to the same path that the existence check uses, so the cache
    # location is defined in exactly one place.
    info_df.to_csv(info_path, index=False)
else:
    print(f'{info_path} found, loading...')
    info_df = pd.read_csv(info_path)

data/info_df.csv found, loading...


In [71]:
info_df.head()
# Note: in the rows shown with Split == "test", the per-item PHQ8 columns
# (e.g., PHQ8_NoInterest, ...) are NaN, while PHQ8_Binary and PHQ8_Score are filled in.

Unnamed: 0,Participant_ID,Split,Gender,PHQ8_Binary,PHQ8_Score,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,300,test,1,0,2,,,,,,,,
1,301,test,1,0,3,,,,,,,,
2,302,dev,1,0,4,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,303,train,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,304,train,0,0,6,0.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0


## 1. Utterance features
- Input: transcript files (.csv)
- Output: utterance_features_df (.csv)

In [81]:
from src import extract_utterance_features

In [77]:
# Location of the cached utterance-level feature table
utt_path = Path("data/utterance_features.csv")

if not utt_path.exists():
    print(f'{utt_path} not found, extracting utterance features...')
    utterance_features_df = extract_utterance_features(transcript_dir="transcript_files/")
    # Write to the same path that the existence check uses, so the cache
    # location is defined in exactly one place.
    utterance_features_df.to_csv(utt_path, index=False)
else:
    print(f'{utt_path} found, loading...')
    utterance_features_df = pd.read_csv(utt_path)

data\utterance_features.csv not found, extracting utterance features...
[!] Error processing ._487_TRANSCRIPT.csv: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte


In [78]:
# Show the utterance feature table (the rich repr elides middle rows/columns of large frames)
utterance_features_df

Unnamed: 0,Participant_ID,Num_Utterances_Ellie,Num_Utterances_Participant,Total_Duration_Ellie,Total_Duration_Participant,Avg_Utterance_Duration_Ellie,Avg_Utterance_Duration_Participant,Total_Interview_Duration,Total_Word_Count_Ellie,Total_Word_Count_Participant,...,Long_Utterance_Speech_Rate,Total_Filler_Count,Avg_Fillers_per_Utterance,Total_Emotion_Cue_Count,Filler_to_Word_Ratio,Emotion_sigh,Emotion_yawn,Emotion_laughter,Emotion_sniffle,Emotion_clears throat
0,300,87,87,140.840,155.760,1.618851,1.790345,584.680,472,352,...,4.273504,31,0.356322,0,0.088068,0,0,0,0,0
1,301,77,104,97.950,475.440,1.272078,4.571538,774.400,379,1475,...,3.219407,20,0.192308,3,0.013559,0,0,3,0,0
2,302,89,97,113.393,208.933,1.274079,2.153948,676.970,431,614,...,3.233150,28,0.288660,1,0.045603,0,0,1,0,0
3,303,88,103,148.230,642.930,1.684432,6.242039,934.100,448,1965,...,3.229241,31,0.300971,4,0.015776,0,0,3,0,1
4,304,100,104,164.100,362.600,1.641000,3.486538,720.040,501,987,...,3.291115,30,0.288462,11,0.030395,0,0,11,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,488,64,138,135.227,422.490,2.112922,3.061522,855.426,524,1413,...,3.854602,39,0.282609,4,0.027601,4,0,0,0,0
185,489,85,117,163.259,168.810,1.920694,1.442821,659.400,647,491,...,3.153989,9,0.076923,11,0.018330,0,0,11,0,0
186,490,77,97,149.917,185.900,1.946974,1.916495,663.963,620,592,...,3.504983,24,0.247423,5,0.040541,3,0,1,1,0
187,491,85,146,163.156,413.580,1.919482,2.832740,811.140,645,1241,...,3.556755,34,0.232877,8,0.027397,3,1,4,0,0


In [79]:
# List every column name, since the table display above elides the middle columns
print(utterance_features_df.columns)

Index(['Participant_ID', 'Num_Utterances_Ellie', 'Num_Utterances_Participant',
       'Total_Duration_Ellie', 'Total_Duration_Participant',
       'Avg_Utterance_Duration_Ellie', 'Avg_Utterance_Duration_Participant',
       'Total_Interview_Duration', 'Total_Word_Count_Ellie',
       'Total_Word_Count_Participant', 'Avg_Word_Count_Ellie',
       'Avg_Word_Count_Participant', 'Num_Turns', 'Total_Turns',
       'Total_Silence_Duration', 'Avg_Silence_Duration',
       'Max_Silence_Duration', 'Std_Silence_Duration',
       'Silence_Duration_Ratio', 'Avg_Reaction_Time', 'Max_Reaction_Time',
       'Std_Reaction_Time', 'Long_Reaction_Times', 'Avg_Speech_Rate',
       'Std_Speech_Rate', 'Max_Speech_Rate', 'Min_Speech_Rate',
       'Short_Utterance_Speech_Rate', 'Long_Utterance_Speech_Rate',
       'Total_Filler_Count', 'Avg_Fillers_per_Utterance',
       'Total_Emotion_Cue_Count', 'Filler_to_Word_Ratio', 'Emotion_sigh',
       'Emotion_yawn', 'Emotion_laughter', 'Emotion_sniffle',
       'Emo

## 2. OpenSmile features
- Input: audio files (.wav)
- Output: smile_df (cached as data/smile_features.csv)

In [26]:
def extract_opensmile_features(df: pd.DataFrame, wav_dir: str = 'wav_files/') -> pd.DataFrame:
    """
    Extract eGeMAPSv02 functional features with openSMILE for every participant.

    For each value in ``df['Participant_ID']`` the audio file
    ``<wav_dir>/<Participant_ID>_AUDIO.wav`` is processed. ``df`` itself is
    not modified.

    Args:
        df (pd.DataFrame): Frame with a 'Participant_ID' column.
        wav_dir (str): Directory containing the ``*_AUDIO.wav`` files.
            Defaults to 'wav_files/'.

    Returns:
        pd.DataFrame: The openSMILE feature columns with 'Participant_ID'
            inserted as the first column and a fresh 0..n-1 index.
    """
    participant_ids = df['Participant_ID']
    wav_root = Path(wav_dir)
    files = [str(wav_root / f'{pid}_AUDIO.wav') for pid in participant_ids.astype(str)]

    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,  # other options: ComParE_2016, ...
        feature_level=opensmile.FeatureLevel.Functionals,  # functionals rather than low-level descriptors (LLD)
    )

    # process_files() indexes its result by (file, start, end); those index
    # levels are not needed downstream, so discard them in one step.
    features = smile.process_files(files).reset_index(drop=True)

    # NOTE(review): assumes one output row per input file, in input order
    # (the same assumption the positional ID assignment has always relied on).
    features.insert(0, 'Participant_ID', participant_ids.to_numpy())
    return features

In [27]:
smile_path = "data/smile_features.csv"

# Reuse the cached feature table when it exists; otherwise compute it once and cache it
if os.path.exists(smile_path):
    print(f'{smile_path} found, loading...')
    smile_df = pd.read_csv(smile_path)
else:
    print(f'{smile_path} not found, extracting OpenSMILE features...')
    smile_df = extract_opensmile_features(info_df)
    smile_df.to_csv(smile_path, index=False)

data/smile_features.csv found, loading...


In [28]:
# Show the openSMILE feature table (the rich repr elides middle rows/columns of large frames)
smile_df

Unnamed: 0,Participant_ID,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,300,28.989614,0.164996,25.515335,26.992317,33.286697,7.771362,212.50317,396.76877,62.987938,...,-0.019914,0.008097,0.069432,0.243643,1.131965,0.191049,0.241436,0.690924,1.544863,-31.463453
1,301,24.682632,0.212958,21.029116,22.728956,27.562181,6.533066,133.78280,261.45132,55.496384,...,-0.021514,0.005452,0.028502,1.320565,1.213828,0.206510,0.217221,0.418650,0.763230,-37.011110
2,302,25.595976,0.221090,22.035534,23.558725,27.024971,4.989437,141.37645,234.92667,80.336975,...,-0.024416,0.009156,0.022415,0.735381,1.018794,0.178758,0.177598,0.789830,1.752425,-46.299590
3,303,33.288822,0.112554,31.288510,33.143818,35.496540,4.208031,181.89624,327.01398,50.025394,...,-0.017102,0.004484,0.024622,1.636067,1.014981,0.189660,0.174769,0.288130,0.515600,-39.244053
4,304,34.866764,0.107737,32.686874,34.458084,37.560783,4.873909,172.44302,292.83514,77.229040,...,-0.019647,0.003905,0.022772,1.078742,1.261766,0.200920,0.210789,0.485150,0.907032,-41.975723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,488,33.347610,0.144575,31.812204,33.053932,36.228940,4.416735,260.52628,378.89267,81.803890,...,-0.012968,0.013727,0.007381,1.596809,1.130148,0.205130,0.182987,0.570030,1.956777,-50.160927
185,489,21.904884,0.119526,20.233719,21.309267,23.089685,2.855967,102.09488,249.69783,100.032430,...,-0.009124,0.013728,0.008082,0.736494,0.543540,0.223786,0.200095,1.604016,3.186081,-43.226524
186,490,23.067087,0.202900,20.124071,21.619263,24.328503,4.204432,203.31024,419.19650,131.684080,...,-0.008957,0.014794,0.006169,0.807187,0.684278,0.216364,0.175007,1.224820,3.106926,-46.802773
187,491,30.386635,0.150679,27.498314,29.908390,33.354590,5.856278,198.89561,368.56015,57.368860,...,-0.008072,0.012006,0.007786,0.392428,1.134250,0.226110,0.207638,0.655723,2.241796,-43.799267


## 3. Librosa features
- Input: audio files (.wav)
- Output: dataframe