In [57]:
import pandas as pd
import numpy as np

## Feature extraction notebook

In this notebook there the:
- extraction of the audios and the labels from the paths 
- filtering of the audios
- extraction of features in time-domain, frequency-domain, MFCCs, Chroma, fundamental frequency and amplitude envelope
- result a combined dataframe (raw df + feature extracted)
  

In [58]:
from utils import load_audio_files
from utils import extract_labels

train_path = '/Users/fabiodepetro/DSL_Project/ProvaProgetto/DSL_Winter_Project_2025/audios_development'
eval_path = '/Users/fabiodepetro/DSL_Project/ProvaProgetto/DSL_Winter_Project_2025/audios_evaluation'

signals_train, sr_train = load_audio_files(train_path)
signals_eval, sr_eval = load_audio_files(eval_path)

def get_ids(path, signals):
  ids = extract_labels(path)

  ids_signals = {}
  for i, signal in zip(ids, signals):
    ids_signals[i] = signal
  
  return ids_signals, ids

id_signal_dict_train, ids_train = get_ids(train_path, signals_train)
# id_signal_dict_train
id_signal_dict_eval, ids_eval = get_ids(eval_path, signals_eval )
ids_train = pd.to_numeric(ids_train)
ids_eval = pd.to_numeric(ids_eval)
ids_train_combaciati = [x - 1 for x in ids_train]
ids_eval_combaciati = [x - 1 for x in ids_eval]

In [59]:
from scipy.signal import butter, filtfilt

# Funzione per creare e applicare il filtro passa basso
def filter_signal(s, cutoff, sr, order=4):
  nyquist = 0.5 * sr
  normal_cutoff = cutoff / nyquist
  b, a = butter(order, normal_cutoff, btype='low', analog=False)
  filtered_signal = filtfilt(b, a, s)

  return filtered_signal

filtered_signals_train = [filter_signal(s, cutoff=4000, sr=sr_train) for s in signals_train]
filtered_signals_eval = [filter_signal(s, cutoff=4000, sr=sr_eval) for s in signals_eval]

In [60]:
time_len_train = [len(s) / sr_train for s in filtered_signals_train]
time_len_eval = [len(s) / sr_train for s in filtered_signals_eval]

In [61]:
df_time_len_train = pd.DataFrame(time_len_train)
df_time_len_train['Id'] = ids_train_combaciati

In [62]:
df_time_len_eval = pd.DataFrame(time_len_eval)
df_time_len_eval['Id'] = ids_eval_combaciati

In [63]:
def cut_signals(filtered_signals_train, filtered_signals_eval, sr_train, sr_eval, duration):

  num_samples_train = int(duration * sr_train)
  num_samples_eval = int(duration * sr_eval)

  cutted_filtered_signals_train = [s[:num_samples_train] for s in filtered_signals_train]
  cutted_filtered_signals_eval = [s[:num_samples_eval] for s in filtered_signals_eval]
  return cutted_filtered_signals_train, cutted_filtered_signals_eval


cutted_filtered_signals_train5s, cutted_filtered_signals_eval5s = cut_signals(filtered_signals_train, filtered_signals_eval, sr_train, sr_eval,  duration=5)


In [64]:
from scipy.stats import skew, kurtosis
import librosa

def extract_and_merge_features(filtered_signals_train, filtered_signals_eval, sr_train, sr_eval, ids_train_combaciati, ids_eval_combaciati, df_time_len_train, df_time_len_eval):
  def extract_time_features(s, sr):
    rms_energy = librosa.feature.rms(y=s)
    rms_energy_mean = np.mean(rms_energy)
    rms_energy_median = np.median(rms_energy)
    rms_energy_sd = np.std(rms_energy)
    time_mean = np.mean(np.abs(s))
    time_sd = np.std(np.abs(s))
    time_min = np.min(np.abs(s))
    time_max = np.max(np.abs(s))
    time_kurtosis = kurtosis(s)
    time_skew = skew(s)

    return {
      'rms_energy_mean': rms_energy_mean,
      'rms_energy_median': rms_energy_median,
      'rms_energy_sd': rms_energy_sd,
      'time_mean': time_mean,
      'time_sd': time_sd,
      'time_min': time_min,
      'time_max': time_max,
      'time_kurtosis': time_kurtosis,
      'time_skew': time_skew,
    }

  def extract_frequency_features(y, sr):
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=16384, hop_length=512)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=16384, hop_length=512)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=16384, hop_length=512)
    spectral_flatness = librosa.feature.spectral_flatness(y=y, n_fft=16384, hop_length=512)

    return {
      'spectral_rolloff_mean': np.mean(spectral_rolloff),
      'spectral_rolloff_median': np.median(spectral_rolloff),
      'spectral_rolloff_std': np.std(spectral_rolloff),
      'spectral_bandwidth_mean': np.mean(spectral_bandwidth),
      'spectral_bandwidth_median': np.median(spectral_bandwidth),
      'spectral_bandwidth_std': np.std(spectral_bandwidth),
      'spectral_centroid_mean': np.mean(spectral_centroid),
      'spectral_centroid_median': np.median(spectral_centroid),
      'spectral_centroid_std': np.std(spectral_centroid),
      'spectral_flatness_mean': np.mean(spectral_flatness),
      'spectral_flatness_median': np.median(spectral_flatness),
      'spectral_flatness_std': np.std(spectral_flatness),
    }

  def extract_mfcc(signal, sr, n_mfcc=13):
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=16384, hop_length=512)
    mfcc_mean = np.mean(mfccs, axis=1)
    mfcc_median = np.median(mfccs, axis=1)
    mfcc_std = np.std(mfccs, axis=1)

    features = {}
    for i in range(n_mfcc):
      features[f'MFCC{i+1}_mean'] = mfcc_mean[i]
      features[f'MFCC{i+1}_median'] = mfcc_median[i]
      features[f'MFCC{i+1}_std'] = mfcc_std[i]

    return features

  def extract_chroma(signal, sr, n_chroma=12, n_fft=16384, hop_length=512):
    chroma = librosa.feature.chroma_stft(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length)
    chroma_mean = np.mean(chroma, axis=1)
    chroma_median = np.median(chroma, axis=1)
    chroma_std = np.std(chroma, axis=1)

    features = {}
    for i in range(n_chroma):
      features[f'Chroma{i+1}_mean'] = chroma_mean[i]
      features[f'Chroma{i+1}_median'] = chroma_median[i]
      features[f'Chroma{i+1}_std'] = chroma_std[i]

    return features

  def extract_fundamental_frequencies(signal, sr, n_fft=8192, hop_length=256, fmin=25, fmax=500):
    f0, _, _ = librosa.pyin(signal, fmin=fmin, fmax=fmax, sr=sr, frame_length=n_fft, hop_length=hop_length)
    f0 = f0[~np.isnan(f0)]
    f0_mean = np.mean(f0)
    f0_median = np.median(f0)
    f0_std = np.std(f0)

    return {
      'fundamental_frequency_mean': f0_mean,
      'fundamental_frequency_median': f0_median,
      'fundamental_frequency_std': f0_std,
    }

  def extract_amplitude_envelope(signal, n_fft=8192, hop_length=512):
    frame_size = n_fft
    amplitude_envelope = [np.max(signal[i:i+frame_size]) for i in range(0, len(signal), hop_length)]
    amplitude_mean = np.mean(amplitude_envelope)
    amplitude_median = np.median(amplitude_envelope)
    amplitude_std = np.std(amplitude_envelope)

    return {
      'amplitude_envelope_mean': amplitude_mean,
      'amplitude_envelope_median': amplitude_median,
      'amplitude_envelope_std': amplitude_std,
    }

  time_features_train = [extract_time_features(s, sr_train) for s in filtered_signals_train]
  time_features_eval = [extract_time_features(s, sr_eval) for s in filtered_signals_eval]

  frequency_features_train = [extract_frequency_features(s, sr_train) for s in filtered_signals_train]
  frequency_features_eval = [extract_frequency_features(s, sr_eval) for s in filtered_signals_eval]

  mfcc_features_train = [extract_mfcc(s, sr_train) for s in filtered_signals_train]
  mfcc_features_eval = [extract_mfcc(s, sr_eval) for s in filtered_signals_eval]

  chroma_features_train = [extract_chroma(s, sr_train) for s in filtered_signals_train]
  chroma_features_eval = [extract_chroma(s, sr_eval) for s in filtered_signals_eval]

  fundamental_frequency_train = [extract_fundamental_frequencies(s, sr_train) for s in filtered_signals_train]
  fundamental_frequency_eval = [extract_fundamental_frequencies(s, sr_eval) for s in filtered_signals_eval]

  amplitude_envelope_train = [extract_amplitude_envelope(s, sr_train) for s in filtered_signals_train]
  amplitude_envelope_eval = [extract_amplitude_envelope(s, sr_eval) for s in filtered_signals_eval]

  df_time_features_train = pd.DataFrame(time_features_train)
  df_time_features_train['Id'] = ids_train_combaciati

  df_time_features_eval = pd.DataFrame(time_features_eval)
  df_time_features_eval['Id'] = ids_eval_combaciati

  df_frequency_features_train = pd.DataFrame(frequency_features_train)
  df_frequency_features_train['Id'] = ids_train_combaciati

  df_frequency_features_eval = pd.DataFrame(frequency_features_eval)
  df_frequency_features_eval['Id'] = ids_eval_combaciati

  df_mfccs_train = pd.DataFrame(mfcc_features_train)
  df_mfccs_train['Id'] = ids_train_combaciati

  df_mfccs_eval = pd.DataFrame(mfcc_features_eval)
  df_mfccs_eval['Id'] = ids_eval_combaciati

  df_chroma_train = pd.DataFrame(chroma_features_train)
  df_chroma_train['Id'] = ids_train_combaciati

  df_chroma_eval = pd.DataFrame(chroma_features_eval)
  df_chroma_eval['Id'] = ids_eval_combaciati

  df_fundamental_frequency_train = pd.DataFrame(fundamental_frequency_train)
  df_fundamental_frequency_train['Id'] = ids_train_combaciati

  df_fundamental_frequency_eval = pd.DataFrame(fundamental_frequency_eval)
  df_fundamental_frequency_eval['Id'] = ids_eval_combaciati

  df_amplitude_envelope_train = pd.DataFrame(amplitude_envelope_train)
  df_amplitude_envelope_train['Id'] = ids_train_combaciati

  df_amplitude_envelope_eval = pd.DataFrame(amplitude_envelope_eval)
  df_amplitude_envelope_eval['Id'] = ids_eval_combaciati


  df_train = pd.merge(df_time_features_train, df_frequency_features_train, on='Id', how='inner')
  df_train = pd.merge(df_train, df_mfccs_train, on='Id', how='inner')
  df_train = pd.merge(df_train, df_chroma_train, on='Id', how='inner')
  df_train = pd.merge(df_train, df_fundamental_frequency_train, on='Id', how='inner')
  df_train = pd.merge(df_train, df_amplitude_envelope_train, on='Id', how='inner')
  df_train = pd.merge(df_train, df_time_len_train, on='Id', how='inner')

  df_eval = pd.merge(df_time_features_eval, df_frequency_features_eval, on='Id', how='inner')
  df_eval = pd.merge(df_eval, df_mfccs_eval, on='Id', how='inner')
  df_eval = pd.merge(df_eval, df_chroma_eval, on='Id', how='inner')
  df_eval = pd.merge(df_eval, df_fundamental_frequency_eval, on='Id', how='inner')
  df_eval = pd.merge(df_eval, df_amplitude_envelope_eval, on='Id', how='inner')
  df_eval = pd.merge(df_eval, df_time_len_eval, on='Id', how='inner')

  df_combined = pd.concat([df_train, df_eval], axis=0)

  return df_combined


In [65]:
df_features = extract_and_merge_features(cutted_filtered_signals_train5s, cutted_filtered_signals_eval5s, sr_train, sr_eval, ids_train_combaciati, ids_eval_combaciati, df_time_len_train, df_time_len_eval)

  return pitch_tuning(


In [71]:
df_dev = pd.read_csv('/Users/fabiodepetro/DSL_Project/ProvaProgetto/DSL_Winter_Project_2025/development.csv')
df_eval = pd.read_csv('/Users/fabiodepetro/DSL_Project/ProvaProgetto/DSL_Winter_Project_2025/evaluation.csv')

In [72]:
df = pd.concat([df_dev, df_eval], sort=False)
len(df_dev), len(df_eval), len(df)

(2933, 691, 3624)

In [73]:
df_dev_features = df_features.iloc[:2933]
df_eval_features = df_features.tail(df_features.shape[0]-2933)
df_dev_total = pd.merge(df_dev, df_dev_features, on='Id', how='inner')
df_eval_total = pd.merge(df_eval, df_eval_features, on='Id', how='inner')

In [74]:
df_total = pd.concat([df_dev_total, df_eval_total], axis = 0)
df_total

Unnamed: 0,Id,sampling_rate,age,gender,ethnicity,mean_pitch,max_pitch,min_pitch,jitter,shimmer,...,Chroma12_mean,Chroma12_median,Chroma12_std,fundamental_frequency_mean,fundamental_frequency_median,fundamental_frequency_std,amplitude_envelope_mean,amplitude_envelope_median,amplitude_envelope_std,0
0,0,22050,24.0,female,arabic,1821.69060,3999.7170,145.43066,0.013795,0.082725,...,0.284390,0.159516,0.309165,280.201810,284.481204,76.990362,0.279256,0.234448,0.157617,35.095238
1,1,22050,22.5,female,hungarian,1297.81870,3998.8590,145.37268,0.025349,0.096242,...,0.235085,0.126637,0.276171,209.628549,204.674778,22.039430,0.384136,0.281566,0.122184,23.331293
2,2,22050,22.0,female,portuguese,1332.85240,3998.8025,145.42395,0.019067,0.119456,...,0.420667,0.275919,0.349890,178.202631,214.354693,83.780137,0.364011,0.296423,0.093786,21.667891
3,3,22050,22.0,female,english,1430.34990,3998.4510,147.98083,0.017004,0.102389,...,0.155145,0.111702,0.146220,228.292126,224.492410,49.859493,0.649081,0.684984,0.083701,22.476961
4,4,22050,22.0,male,dutch,1688.72340,3998.6113,145.44772,0.028027,0.124831,...,0.364694,0.258917,0.320842,121.120608,109.050773,29.340614,0.390123,0.346465,0.108384,19.090295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686,686,22050,,male,igbo,570.62740,3900.6730,145.67577,0.018842,0.079197,...,0.575447,0.671069,0.395700,108.426812,119.609612,40.071467,0.366293,0.383727,0.147121,2.024490
687,687,22050,,male,igbo,974.13965,3919.0024,145.90408,0.024367,0.117492,...,0.248571,0.178327,0.272512,61.593122,51.465112,28.159212,0.171333,0.089318,0.178207,4.947211
688,688,22050,,female,serbian,1113.27650,3999.3510,145.38307,0.020637,0.089355,...,0.206062,0.049989,0.295049,194.536542,183.400809,49.264104,0.532887,0.610292,0.115757,27.485306
689,689,22050,,male,spanish,1759.17420,3999.4610,145.56773,0.026118,0.106429,...,0.400657,0.308231,0.270338,143.914523,141.421356,50.750295,0.424355,0.381707,0.172068,21.632653


In [76]:
df_total.columns.values[122] = 'time_len'

In [78]:
df_total.to_csv('../datasets/df_features_extracted.csv', index=False)