# Feature Extraction
- Utterance features
- OpenSmile features
- Librosa features

In [43]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

import opensmile

## 0. Meta information (including demographic and label information)

In [59]:
def load_and_preprocess_data(base_path: str, train_file: str, dev_file: str, test_file: str) -> pd.DataFrame:
    """
    Load and preprocess data from multiple CSV files.

    Args:
        base_path (str): Base directory containing the CSV files.
        train_file (str): Filename for the training data.
        dev_file (str): Filename for the development data.
        test_file (str): Filename for the test data.

    Returns:
        pd.DataFrame: Preprocessed and combined DataFrame.
    """
    base_path = Path(base_path)

    # Load datasets
    train = pd.read_csv(base_path / train_file)
    dev = pd.read_csv(base_path / dev_file)
    test = pd.read_csv(base_path / test_file)

    # Rename columns in the test dataset
    try:
        test.rename(columns={"PHQ_Score": "PHQ8_Score", "PHQ_Binary": "PHQ8_Binary"}, inplace=True)
    except:
        pass
    # Add a 'Split' column to each dataset
    train['Split'] = 'train'
    dev['Split'] = 'dev'
    test['Split'] = 'test'

    # Concatenate datasets
    combined_df = pd.concat([train, dev, test])

    # Sort by Participant_ID
    combined_df = combined_df.sort_values(by='Participant_ID').reset_index(drop=True)

    # Reorder columns
    use_cols = ['Participant_ID', 'Split', 'Gender', 'PHQ8_Binary', 'PHQ8_Score']
    columns_order = use_cols + [col for col in combined_df.columns if col not in use_cols]
    combined_df = combined_df[columns_order]

    return combined_df

In [60]:
info_df = load_and_preprocess_data(
    base_path='downloads/',
    train_file='train_split_Depression_AVEC2017.csv',
    dev_file='dev_split_Depression_AVEC2017.csv',
    test_file='full_test_split.csv',
)

In [None]:
info_df.head()
# Split = "test" don't have PHQ8 related columns (e.g., PHQ8_NoInterest, ...)

Unnamed: 0,Participant_ID,Split,Gender,PHQ8_Binary,PHQ8_Score,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,300,test,1,0,2,,,,,,,,
1,301,test,1,0,3,,,,,,,,
2,302,dev,1,0,4,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,303,train,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,304,train,0,0,6,0.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0


In [62]:
info_df.to_csv('data/info_df.csv', index=False)

## 1. Utterance features
- Input: transcript files (.csv)
- Output: utterance_features_df (.csv)

In [44]:
def extract_silence_features(df: pd.DataFrame) -> dict:
    participant_df = df[df['speaker'] == 'Participant'].reset_index(drop=True)
    silence_durations = []

    for i in range(len(participant_df) - 1):
        prev_end = participant_df.loc[i, 'stop_time']
        next_start = participant_df.loc[i + 1, 'start_time']
        intervening = df[
            (df['start_time'] > prev_end) &
            (df['stop_time'] < next_start) &
            (df['speaker'] == 'Ellie')
        ]
        if intervening.empty:
            silence_durations.append(next_start - prev_end)

    return {
        "Num_Silences_Between_Participant_Utts": len(silence_durations),
        "Total_Silence_Duration": sum(silence_durations),
        "Avg_Silence_Duration": (
            sum(silence_durations) / len(silence_durations) if silence_durations else 0
        ),
        "Max_Silence_Duration": max(silence_durations) if silence_durations else 0
    }

def extract_utterance_features(transcript_dir: str, include_silence_features: bool = True) -> pd.DataFrame:
    transcrip_path = Path(transcript_dir)
    transcript_files = sorted(transcrip_path.glob('*_TRANSCRIPT.csv'))
    features = []

    for file_path in transcript_files:
        try:
            # Split the fields by tab and skip the first row
            df = pd.read_csv(file_path, sep='\t', header=None, names=["start_time", "stop_time", "speaker", "value"]).iloc[1:]

            # Transform the start_time and stop_time to float
            df["start_time"] = df["start_time"].astype(float)
            df["stop_time"] = df["stop_time"].astype(float)
            df["duration"] = df["stop_time"] - df["start_time"]

            # Extract features
            summary = {
                "Participant_ID": int(file_path.stem.split("_")[0]),
                "Num_Utterances_Ellie": (df['speaker'] == 'Ellie').sum(),
                "Num_Utterances_Participant": (df['speaker'] == 'Participant').sum(),
                "Total_Duration_Ellie": df.loc[df['speaker'] == 'Ellie', 'duration'].sum(),
                "Total_Duration_Participant": df.loc[df['speaker'] == 'Participant', 'duration'].sum(),
                "Avg_Utterance_Duration_Ellie": df.loc[df['speaker'] == 'Ellie', 'duration'].mean(),
                "Avg_Utterance_Duration_Participant": df.loc[df['speaker'] == 'Participant', 'duration'].mean(),
            }

            speakers = df['speaker'].values
            
            # Calculate the number of conversational turns between Ellie and the Participant.
            # A turn is defined as a transition from Ellie speaking to the Participant speaking.
            summary['Num_Turns'] = sum((speakers[i] == 'Ellie' and speakers[i+1] == 'Participant') for i in range(len(speakers)-1))

            if include_silence_features:
                silence_features = extract_silence_features(df)
                summary.update(silence_features)
            
            features.append(summary)

        except Exception as e:
            print(f"[!] Error processing {file_path.name}: {e}")

    return pd.DataFrame(features)


In [45]:
utterance_features_df = extract_utterance_features(transcript_dir="transcript_files/")

[!] Error processing ._487_TRANSCRIPT.csv: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte


In [47]:
utterance_features_df.to_csv("data/utterance_features.csv", index=False)

## 2. OpenSmile features
- Input: audio files (.wav)
- Output: dataframe

In [69]:
df.iloc[:3].wav_path

0    wav_files/300_AUDIO.wav
1    wav_files/301_AUDIO.wav
2    wav_files/302_AUDIO.wav
Name: wav_path, dtype: object

In [None]:
from glob import glob

wav_path = glob('wav_files/*')

## 수정 필요

In [65]:
def extract_opensmile_features(files):
    # Initialize OpenSMILE
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    
    # Extract features
    features = smile.process_file(files)
    
    # Convert to numpy array
    features_array = np.array(features)
    
    return features_array

In [39]:
df = pd.read_csv("data/df.csv")

In [40]:
df['wav_path'] = 'wav_files/' + df['Participant_ID'].astype(str) + '_AUDIO.wav'

In [41]:
df.head()

Unnamed: 0,Split,Participant_ID,Gender,PHQ8_Binary,PHQ8_Score,Num_Utterances_Ellie,Num_Utterances_Participant,Total_Duration_Ellie,Total_Duration_Participant,Avg_Utterance_Duration_Ellie,Avg_Utterance_Duration_Participant,Num_Turns,wav_path
0,test,300,1,0,2,87,87,140.84,155.76,1.618851,1.790345,58,wav_files/300_AUDIO.wav
1,test,301,1,0,3,77,104,97.95,475.44,1.272078,4.571538,49,wav_files/301_AUDIO.wav
2,dev,302,1,0,4,89,97,113.393,208.933,1.274079,2.153948,53,wav_files/302_AUDIO.wav
3,train,303,0,0,0,88,103,148.23,642.93,1.684432,6.242039,57,wav_files/303_AUDIO.wav
4,train,304,0,0,6,100,104,164.1,362.6,1.641,3.486538,75,wav_files/304_AUDIO.wav


In [None]:
def extract_opensmile_features(files):
    # Initialize OpenSMILE
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    
    # Extract features
    features = smile.process_file(files)
    
    # Convert to numpy array
    features_array = np.array(features)
    
    return features_array

In [20]:
class OpenSmileFeatures(object):
    def __init__(self, df, dir_name='data', save_dir_name='features'):
        self.df = df
        self.dir_name = dir_name
        self.save_dir_name = save_dir_name

    def create_features(self, save=True, filename='opensmile', load_feature=True, type_array=False):
        smile = opensmile.Smile(
                    feature_set = opensmile.FeatureSet.eGeMAPSv02, # eGeMAPSv02 does not use LLD, other options: ComParE_2016, ...
                    feature_level = opensmile.FeatureLevel.Functionals)

        # # Opensmile이 duration 0인 파일 처리하면 오류 발생하므로 0 초과하는 파일만 feature extraction
        # files = self.df.wav_path.loc[self.df.duration > 0]
        files = self.df.wav_path
        temp_features = smile.process_files(files)

        # 음성 파일 길이가 0인 파일의 features를 0으로 채움
        features = pd.merge(self.df, temp_features.reset_index(), how='left', left_on='wav_path', right_on='file')
        for col in temp_features.columns:
            features[col] = features[col].fillna(0)

        # features.drop(['file', 'start', 'end'], axis=1, inplace=True)
        features.drop(['file'], axis=1, inplace=True)

        if save:
            self.save_features(filename, features)

        if load_feature:
            return self.load_features(filename, type_array)

    def save_features(self, filename, features):
        filename = os.path.join(self.dir_name, self.save_dir_name, filename)
        with open(f'{filename}.pickle', 'wb') as f:
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)
        print(f'{filename} saved.')

    def load_features(self, filename='opensmile', type_array=False):
        filename = os.path.join(self.dir_name, self.save_dir_name, filename)
        with open(f'{filename}.pickle', 'rb') as f:
            features = pickle.load(f)  
        if type_array:
            features = features.iloc[:, self.df.shape[1]:]
            if 'start' in features.columns:
                features = features.drop(['start', 'end'], axis=1).values
            else:
                features = features.values
        return features

In [23]:
dir_name = 'data'
save_dir_name = 'smile_features'

In [24]:
Smile = OpenSmileFeatures(df, dir_name='data', save_dir_name='smile_features')

if not os.path.exists(os.path.join(dir_name, save_dir_name)):
    os.makedirs(os.path.join(dir_name, save_dir_name))
    features2 = Smile.create_features()
else:
    features2 = Smile.load_features(type_array=False)

print(features2.shape)

data\smile_features\opensmile saved.
(189, 103)


In [26]:
import librosa
import soundfile

class LibrosaFeatures(object):
    
    def __init__(self, df, dir_name='data', save_dir_name='features', win_length=2048):
        self.df = df
        self.dir_name = dir_name
        self.save_dir_name = save_dir_name
        self.win_length = win_length

    def create_features(self, save=True, filename='librosa', load_feature=True, type_array=False):
        features = []
        for _, row in self.df.iterrows():
            # if row.duration > 0:
            #     features.append(self.get_librosa_features(row['wav_path']))
            # else: features.append(np.zeros((180, )))
            features.append(self.get_librosa_features(row['wav_path']))
        features = np.array(features) # shape: (the number of utterances, 180)
        features = pd.concat([self.df, pd.DataFrame(features)], axis=1)
        if save:
            self.save_features(filename, features)
        if load_feature:
            return self.load_features(filename, type_array)

    def save_features(self, filename, features):
        filename = os.path.join(self.dir_name, self.save_dir_name, filename)
        with open(f'{filename}.pickle', 'wb') as f:
            pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)
        print(f'{filename} saved.')

    def load_features(self, filename='librosa', type_array=False):
        filename = os.path.join(self.dir_name, self.save_dir_name, filename)
        with open(f'{filename}.pickle', 'rb') as f:
            features = pickle.load(f)
        if type_array:
            features = features.iloc[:, self.df.shape[1]:].values
        return features

    def feature_chromagram(self, waveform, sample_rate):
        # STFT computed here explicitly; mel spectrogram and MFCC functions do this under the hood
        stft_spectrogram=np.abs(librosa.stft(waveform))
        # Produce the chromagram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
        chromagram=np.mean(librosa.feature.chroma_stft(S=stft_spectrogram, sr=sample_rate, win_length=self.win_length).T,axis=0)
        return chromagram

    def feature_melspectrogram(self, waveform, sample_rate):
        # Produce the mel spectrogram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
        # Using 8khz as upper frequency bound should be enough for most speech classification tasks
        melspectrogram=np.mean(librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128, fmax=8000, win_length=self.win_length).T,axis=0)
        return melspectrogram

    def feature_mfcc(self, waveform, sample_rate):
        # Compute the MFCCs for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
        # 40 filterbanks = 40 coefficients
        mfc_coefficients=np.mean(librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40, win_length=self.win_length).T, axis=0) 
        return mfc_coefficients
    
    def get_librosa_features(self, file):
        # load an individual soundfile
        with soundfile.SoundFile(file) as audio:
            waveform = audio.read(dtype="float32")
            sample_rate = audio.samplerate
            # compute features of soundfile
            chromagram = self.feature_chromagram(waveform, sample_rate)
            melspectrogram = self.feature_melspectrogram(waveform, sample_rate)
            mfc_coefficients = self.feature_mfcc(waveform, sample_rate)

            feature_matrix=np.array([])
            # use np.hstack to stack our feature arrays horizontally to create a feature matrix
            feature_matrix = np.hstack((chromagram, melspectrogram, mfc_coefficients))
            
            return feature_matrix

In [25]:
dir_name = 'data'
save_dir_name = 'librosa_features'

In [27]:
Librosa = LibrosaFeatures(df, dir_name=dir_name, save_dir_name=save_dir_name)

if not os.path.exists(os.path.join(dir_name, save_dir_name)):
    os.makedirs(os.path.join(dir_name, save_dir_name))
    features1 = Librosa.create_features()
else:
    features1 = Librosa.load_features(type_array=False)

print(features1.shape)

data\librosa_features\librosa saved.
(189, 193)


In [1]:
import pandas as pd
import numpy as np
import opensmile
import pickle
import os

In [51]:
import pickle
with open('data/smile_features/opensmile.pickle', 'rb') as file:
    df = pickle.load(file)

In [53]:
df.wav_path

0      wav_files/300_AUDIO.wav
1      wav_files/301_AUDIO.wav
2      wav_files/302_AUDIO.wav
3      wav_files/303_AUDIO.wav
4      wav_files/304_AUDIO.wav
                ...           
184    wav_files/488_AUDIO.wav
185    wav_files/489_AUDIO.wav
186    wav_files/490_AUDIO.wav
187    wav_files/491_AUDIO.wav
188    wav_files/492_AUDIO.wav
Name: wav_path, Length: 189, dtype: object

In [21]:
duration = df['end'] - df['start']
duration = duration.dt.total_seconds()
df.insert(5, 'duration(sec)', duration)

In [22]:
df

Unnamed: 0,Split,Participant_ID,Gender,PHQ8_Binary,PHQ8_Score,duration(sec),Num_Utterances_Ellie,Num_Utterances_Participant,Total_Duration_Ellie,Total_Duration_Participant,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,test,300,1,0,2,648.5,87,87,140.840,155.760,...,-0.019914,0.008097,0.069432,0.243643,1.131965,0.191049,0.241436,0.690924,1.544863,-31.463453
1,test,301,1,0,3,823.9,77,104,97.950,475.440,...,-0.021514,0.005452,0.028502,1.320565,1.213828,0.206510,0.217221,0.418650,0.763230,-37.011108
2,dev,302,1,0,4,758.8,89,97,113.393,208.933,...,-0.024416,0.009156,0.022415,0.735381,1.018794,0.178758,0.177598,0.789830,1.752425,-46.299591
3,train,303,0,0,0,985.3,88,103,148.230,642.930,...,-0.017102,0.004484,0.024622,1.636067,1.014981,0.189660,0.174769,0.288130,0.515600,-39.244053
4,train,304,0,0,6,792.6,100,104,164.100,362.600,...,-0.019647,0.003905,0.022772,1.078742,1.261766,0.200920,0.210789,0.485150,0.907032,-41.975723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,train,488,0,0,0,884.9,64,138,135.227,422.490,...,-0.012968,0.013727,0.007381,1.596809,1.130148,0.205130,0.182987,0.570030,1.956777,-50.160927
185,dev,489,1,0,3,704.7,85,117,163.259,168.810,...,-0.009124,0.013728,0.008082,0.736494,0.543540,0.223786,0.200095,1.604016,3.186081,-43.226524
186,dev,490,1,0,2,691.3,77,97,149.917,185.900,...,-0.008957,0.014794,0.006169,0.807187,0.684278,0.216364,0.175007,1.224820,3.106926,-46.802773
187,train,491,0,0,8,881.7,85,146,163.156,413.580,...,-0.008072,0.012006,0.007786,0.392428,1.134250,0.226110,0.207638,0.655723,2.241796,-43.799267


In [24]:
df.drop(['start', 'end'], axis=1, inplace=True)

In [31]:
info_features = df[['wav_path', 'Split', 'Participant_ID', 'Gender', 'PHQ8_Binary', 'PHQ8_Score']]

utterance_features = df[[
    'Participant_ID',
    'duration(sec)',
    'Num_Utterances_Ellie',
    'Num_Utterances_Participant',
    'Total_Duration_Ellie',
    'Total_Duration_Participant',
    'Avg_Utterance_Duration_Ellie',
    'Avg_Utterance_Duration_Participant',
    'Num_Turns'
    ]]

# eGeMAPS features
eGeMAPS_features = df[['Participant_ID'] + list(df.columns[14:])]

info_features.to_csv('data/info_features.csv', index=False)
utterance_features.to_csv('data/utterance_features.csv', index=False)
eGeMAPS_features.to_csv('data/eGeMAPS_features.csv', index=False)

In [33]:
import pickle
with open('data/librosa_features/librosa.pickle', 'rb') as file:
    librosa_df = pickle.load(file)

Split
Participant_ID
Gender
PHQ8_Binary
PHQ8_Score
Num_Utterances_Ellie
Num_Utterances_Participant
Total_Duration_Ellie
Total_Duration_Participant
Avg_Utterance_Duration_Ellie
Avg_Utterance_Duration_Participant
Num_Turns
wav_path
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
