# Audio Data Processing
### Goal: take a .wav file as input => get a list of features corresponding to emotion dataset
(I want to generate more data from existing audio files, using the same features the emotion-detection model was trained on, and see how the model performs on it.)

--- 

Feature names: for each feature they computed the mean of the mean, the mean of the std, the std of the mean, and the std of the std. I am assuming the inner statistic is computed over a window of the size given in the name (e.g. Mem40).

For example: 
- Mean_Acc1298_Mean_Mem40_Centroid numeric 
- Mean_Acc1298_Std_Mem40_Centroid numeric
- Std_Acc1298_Mean_Mem40_Centroid numeric
- Std_Acc1298_Std_Mem40_Centroid numeric


They do this for the following features:
1. Centroid (assuming this is the spectral centroid)
2. Rolloff
3. Flux
4. MFCC coefficients 1-12 (note: the code below generates MFCC_0 through MFCC_12, i.e. 13 coefficients)

I hope to do this and then plot the distributions to make sure that they fall into a similar distribution/range as the training data did (in case I'm accidentally scaling or not scaling something, etc.).
Other assumptions: I'm assuming that Mem40 means the window size for the inner aggregation is 40.

In [23]:
#imports
import librosa
import numpy as np

# Window size: number of consecutive values in each inner aggregation chunk.
# 40 is taken from the "Mem40" part of the dataset's feature names (an assumption).
mem_size = 40

In [24]:
# Three spectral features, followed by one entry per MFCC coefficient (MFCC_0 .. MFCC_12).
feature_names = ["Centroid", "Rolloff", "Flux"]
for i in range(13):
    feature_names.append(f"MFCC_{i}")
print(feature_names)

['Centroid', 'Rolloff', 'Flux', 'MFCC_0', 'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10', 'MFCC_11', 'MFCC_12']


In [31]:
# Frame-level features for a single example clip
y, sr = librosa.load("./data/movie_music/1.wav")

# Spectral flux: length of the change between consecutive columns of the
# normalized magnitude spectrogram. Because it is built from differences along
# axis 1, it has one entry fewer than the other features.
S = np.abs(librosa.stft(y))
S_norm = librosa.util.normalize(S, axis=0)
flux = np.sqrt(np.square(np.diff(S_norm, axis=1)).sum(axis=0))

# Both calls give back a 2-D array; row 0 holds the per-frame values.
centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
rolloffs = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)[0]

# One row per coefficient, 13 rows in total.
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

In [32]:
# Sanity check: print the shape of every extracted feature so the frame counts
# can be compared before aggregating.
print(centroids.shape)
print(rolloffs.shape)
# Index the first coefficient row explicitly instead of relying on a loop
# variable `i` left behind by another cell (every row has the same shape).
print(mfccs[0].shape)
print(mfccs.shape)
print(flux.shape)

(1445,)
(1445,)
(1445,)
(13, 1445)
(1444,)


In [35]:
# Build the dataset-style aggregate features for one frame-level series
def compute_feature_aggregations(feature_list, feature_name):
    """Summarize a frame-level feature with window-wise mean/std statistics.

    The series is cut into consecutive, non-overlapping windows of
    ``mem_size`` values (``mem_size`` is read from module scope). Values left
    over after the last full window are not used. The mean and standard
    deviation of each window are computed first, then the mean and standard
    deviation of those per-window numbers.

    Args:
        feature_list: Sequence of per-frame values; converted with ``np.array``.
        feature_name: Suffix appended to each of the four result keys.

    Returns:
        Dict with four float entries. In each key the first statistic is taken
        across windows and the second one inside each window.
    """
    values = np.array(feature_list)

    window_means = []
    window_stds = []
    last_start = len(values) - mem_size
    for start in range(0, last_start + 1, mem_size):
        window = values[start:start + mem_size]
        window_means.append(np.mean(window))
        window_stds.append(np.std(window))

    return {
        "Mean_Acc1298_Mean_Mem40_" + feature_name: float(np.mean(window_means)),
        "Mean_Acc1298_Std_Mem40_" + feature_name: float(np.mean(window_stds)),
        "Std_Acc1298_Mean_Mem40_" + feature_name: float(np.std(window_means)),
        "Std_Acc1298_Std_Mem40_" + feature_name: float(np.std(window_stds)),
    }

# Aggregate the three spectral features first ...
data = {}
data.update(compute_feature_aggregations(centroids, "Centroid"))
data.update(compute_feature_aggregations(rolloffs, "Rolloff"))
data.update(compute_feature_aggregations(flux, "Flux"))

# ... then place each MFCC coefficient's aggregates in front of everything
# collected so far, so the printed dict starts with MFCC_12 and ends with the
# spectral features.
for i in range(13):
    name = f"MFCC_{i}"
    data = dict(compute_feature_aggregations(mfccs[i], name), **data)
print(data)
    

{'Mean_Acc1298_Mean_Mem40_MFCC_12': -7.353248596191406, 'Mean_Acc1298_Std_Mem40_MFCC_12': 5.891010284423828, 'Std_Acc1298_Mean_Mem40_MFCC_12': 4.079969882965088, 'Std_Acc1298_Std_Mem40_MFCC_12': 1.3373233079910278, 'Mean_Acc1298_Mean_Mem40_MFCC_11': 2.949770927429199, 'Mean_Acc1298_Std_Mem40_MFCC_11': 5.783247947692871, 'Std_Acc1298_Mean_Mem40_MFCC_11': 5.9295268058776855, 'Std_Acc1298_Std_Mem40_MFCC_11': 1.0910663604736328, 'Mean_Acc1298_Mean_Mem40_MFCC_10': 6.195117473602295, 'Mean_Acc1298_Std_Mem40_MFCC_10': 5.600480079650879, 'Std_Acc1298_Mean_Mem40_MFCC_10': 3.6627042293548584, 'Std_Acc1298_Std_Mem40_MFCC_10': 0.9604876637458801, 'Mean_Acc1298_Mean_Mem40_MFCC_9': 11.387118339538574, 'Mean_Acc1298_Std_Mem40_MFCC_9': 6.32328987121582, 'Std_Acc1298_Mean_Mem40_MFCC_9': 3.769984722137451, 'Std_Acc1298_Std_Mem40_MFCC_9': 1.074137806892395, 'Mean_Acc1298_Mean_Mem40_MFCC_8': -2.333993911743164, 'Mean_Acc1298_Std_Mem40_MFCC_8': 5.693330764770508, 'Std_Acc1298_Mean_Mem40_MFCC_8': 2.66748666

In [None]:
# Put the whole pipeline together as reusable functions
def compute_feature_aggregations(feature_list, feature_name):
    """Summarize a frame-level feature with window-wise mean/std statistics.

    The series is cut into consecutive, non-overlapping windows of
    ``mem_size`` values (``mem_size`` is read from module scope). Values left
    over after the last full window are not used. The mean and standard
    deviation of each window are computed first, then the mean and standard
    deviation of those per-window numbers.

    Args:
        feature_list: Sequence of per-frame values; converted with ``np.array``.
        feature_name: Suffix appended to each of the four result keys.

    Returns:
        Dict with four float entries. In each key the first statistic is taken
        across windows and the second one inside each window.
    """
    values = np.array(feature_list)

    window_means = []
    window_stds = []
    last_start = len(values) - mem_size
    for start in range(0, last_start + 1, mem_size):
        window = values[start:start + mem_size]
        window_means.append(np.mean(window))
        window_stds.append(np.std(window))

    return {
        "Mean_Acc1298_Mean_Mem40_" + feature_name: float(np.mean(window_means)),
        "Mean_Acc1298_Std_Mem40_" + feature_name: float(np.mean(window_stds)),
        "Std_Acc1298_Mean_Mem40_" + feature_name: float(np.std(window_means)),
        "Std_Acc1298_Std_Mem40_" + feature_name: float(np.std(window_stds)),
    }

def extract_features_from_wav(filename):
    """Load one audio file and return its aggregated feature dictionary.

    Computes the spectral centroid, spectral rolloff, spectral flux and 13
    MFCC rows with librosa/numpy, then summarizes each series with
    ``compute_feature_aggregations``.

    Args:
        filename: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        Dict mapping feature-statistic names to values: four entries for each
        of Centroid, Rolloff, Flux and MFCC_0 .. MFCC_12. The MFCC entries come
        first (MFCC_12 down to MFCC_0), followed by the spectral features.
    """
    y, sr = librosa.load(filename)

    # Both calls give back a 2-D array; row 0 holds the per-frame values.
    centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    rolloffs = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)[0]
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    # Spectral flux: length of the change between consecutive columns of the
    # normalized magnitude spectrogram.
    S = np.abs(librosa.stft(y))
    S_norm = librosa.util.normalize(S, axis=0)
    flux = np.sqrt(np.sum(np.diff(S_norm, axis=1)**2, axis=0))

    data = {
        **compute_feature_aggregations(centroids, "Centroid"),
        **compute_feature_aggregations(rolloffs, "Rolloff"),
        **compute_feature_aggregations(flux, "Flux"),
    }

    # Each coefficient's aggregates are placed in front of the entries
    # collected so far.
    for i in range(13):
        name = "MFCC_"+str(i)
        data = {**compute_feature_aggregations(mfccs[i],name), **data}

    # Fix: the function used to end here without returning, so every caller
    # received None instead of the feature dictionary.
    return data

#also do it for a bunch of files in a folder
from os import listdir
import pandas as pd
def audio_directory_to_df(audio_dir):
    """Run feature extraction on every ``.wav`` file in a directory.

    Args:
        audio_dir: Directory to scan (not recursive). Only entries whose name
            ends with ``.wav`` are processed.

    Returns:
        ``pandas.DataFrame`` built from the list of per-file results of
        ``extract_features_from_wav``, one row per file, in sorted
        file-name order.
    """
    # Local import: only `listdir` is imported from `os` at file level.
    from os.path import join

    # listdir() yields bare entry names in arbitrary order. Sort them so the
    # row order is reproducible, and join each name with the directory so the
    # file can be opened regardless of the current working directory.
    wav_names = sorted(f for f in listdir(audio_dir) if f.endswith('.wav'))
    data = [extract_features_from_wav(join(audio_dir, f)) for f in wav_names]
    df = pd.DataFrame(data)
    return df