# Extracting Audio Features with Essentia Streaming

Essentia is an open-source C++ library for audio analysis and audio-based music information retrieval.
Documentation: http://essentia.upf.edu/documentation.html

In [2]:
import sys
sys.path.append('../')

import os
import essentia
import essentia.standard as esstd
import essentia.streaming as esstr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fnmatch import fnmatch
from pathlib import Path

from src.data.egfxset import load_egfxset
from src.data.springset import load_springset

In [3]:
# Project directories, all expressed relative to the notebook's folder.
DATA_DIR = Path('..', 'data', 'raw')          # raw datasets read by the loaders
MODELS_DIR = Path('..', 'models')
RESULTS_DIR = Path('..', 'data', 'features')  # extracted features are written here
PLOTS_DIR = Path('..', 'docs', 'plots')

## Basic set of features

In [4]:
def extract_basic_features_essentia(data, sample_rate, frame_size=2048, hop_size=1024):
    """Extract a basic set of audio descriptors with an Essentia streaming network.

    The signal is fed through a ``VectorInput``; ``ReplayGain`` and ``Leq``
    consume the whole stream, while loudness, zero-crossing rate, spectral
    centroid (time domain) and YIN pitch are computed per frame and then
    averaged with ``PoolAggregator(defaultStats=["mean"])``.

    Args:
        data: Audio samples handed to ``esstr.VectorInput``. Callers in this
            notebook pass 1-D float32 NumPy arrays.
        sample_rate: Sampling rate forwarded to the algorithms that take a
            ``sampleRate`` parameter.
        frame_size: Frame length in samples used by the frame cutter and by
            ``PitchYin`` (default 2048).
        hop_size: Hop between consecutive frames in samples (default 1024).

    Returns:
        dict: Maps each of ``'gain'``, ``'leq'``, ``'loudness.mean'``,
        ``'zcr.mean'``, ``'centroid.mean'``, ``'pitch.mean'`` and
        ``'confidence.mean'`` to a one-element list holding the value.
    """
    pool = essentia.Pool()

    # Instantiate the algorithms
    loader = esstr.VectorInput(data)
    fcut = esstr.FrameCutter(frameSize=frame_size, hopSize=hop_size)

    gain = esstr.ReplayGain(sampleRate=sample_rate)
    leq = esstr.Leq()
    loudness = esstr.Loudness()
    zero_crossing_rate = esstr.ZeroCrossingRate()
    centroid = esstr.SpectralCentroidTime(sampleRate=sample_rate)
    pitch = esstr.PitchYin(sampleRate=sample_rate, frameSize=frame_size)

    # Whole-signal descriptors read the raw stream directly
    loader.data >> fcut.signal
    loader.data >> gain.signal
    loader.data >> leq.signal

    # Frame-wise descriptors read the cut frames
    fcut.frame >> centroid.array
    fcut.frame >> loudness.signal
    fcut.frame >> zero_crossing_rate.signal
    fcut.frame >> pitch.signal

    # Route every output into the pool
    gain.replayGain >> (pool, 'gain')
    leq.leq >> (pool, 'leq')
    loudness.loudness >> (pool, 'loudness')
    zero_crossing_rate.zeroCrossingRate >> (pool, 'zcr')
    centroid.centroid >> (pool, 'centroid')
    pitch.pitch >> (pool, 'pitch')
    pitch.pitchConfidence >> (pool, 'confidence')

    # Run the network
    essentia.run(loader)

    aggrpool = esstd.PoolAggregator(defaultStats=["mean"])(pool)

    feature_names = ['gain', 'leq', 'loudness.mean', 'zcr.mean',
                     'centroid.mean', 'pitch.mean', 'confidence.mean']

    # Each value is wrapped in a one-element list, as callers expect.
    return {
        name: [np.array(aggrpool[name]).flatten()[0]]
        for name in feature_names
    }

## Larger set of features


To get help on an Essentia algorithm:
```python
help(esstr.SNR())
```

In [None]:
# `help` prints the documentation itself and returns None, so it is not wrapped
# in `print` (that would emit a stray "None" after the help text).
# Other algorithms can be inspected the same way, e.g. help(esstr.SNR()).
help(esstr.LowLevelSpectralExtractor())

In [6]:
def extract_more_features_essentia(data, sample_rate, frame_size=1024, hop_size=512):
    """Extract an extended set of audio descriptors using Essentia's streaming mode.

    On top of the whole-signal descriptors (``ReplayGain``, ``Leq``) and the
    frame-wise time-domain descriptors (loudness, zero-crossing rate, spectral
    centroid, YIN pitch and its confidence), each frame is windowed and
    transformed to a spectrum from which ``FlatnessDB`` and ``HFC`` are
    computed. Frame-wise values are averaged with
    ``PoolAggregator(defaultStats=["mean"])``.

    Args:
        data: Audio samples handed to ``esstr.VectorInput``. Callers in this
            notebook pass 1-D float32 NumPy arrays.
        sample_rate: Sampling rate forwarded to the algorithms that take a
            ``sampleRate`` parameter.
        frame_size: Frame length in samples used by the frame cutter, the
            spectrum and ``PitchYin`` (default 1024).
        hop_size: Hop between consecutive frames in samples (default 512).

    Returns:
        dict: Maps each of ``'gain'``, ``'leq'``, ``'loudness'``, ``'zcr'``,
        ``'centroid'``, ``'pitch'``, ``'confidence'``, ``'flatness'`` and
        ``'hfc'`` to a one-element list holding the value. The ``.mean``
        suffix added by the aggregator is stripped from the keys.
    """
    pool = essentia.Pool()

    # Instantiate the algorithms
    loader = esstr.VectorInput(data)
    fcut = esstr.FrameCutter(frameSize=frame_size, hopSize=hop_size)
    w = esstr.Windowing()
    spec = esstr.Spectrum(size=frame_size)

    gain = esstr.ReplayGain(sampleRate=sample_rate)
    leq = esstr.Leq()
    loudness = esstr.Loudness()
    zero_crossing_rate = esstr.ZeroCrossingRate()
    centroid = esstr.SpectralCentroidTime(sampleRate=sample_rate)
    pitch = esstr.PitchYin(sampleRate=sample_rate, frameSize=frame_size)

    # Spectrum-based descriptors
    flatness = esstr.FlatnessDB()
    hfc = esstr.HFC(sampleRate=sample_rate)

    # Whole-signal descriptors read the raw stream directly
    loader.data >> fcut.signal
    loader.data >> gain.signal
    loader.data >> leq.signal

    # Frame-wise time-domain descriptors
    fcut.frame >> centroid.array
    fcut.frame >> loudness.signal
    fcut.frame >> zero_crossing_rate.signal
    fcut.frame >> pitch.signal

    # Frame -> window -> spectrum -> spectral descriptors
    fcut.frame >> w.frame
    w.frame >> spec.frame
    spec.spectrum >> flatness.array
    spec.spectrum >> hfc.spectrum

    # Route every output into the pool
    gain.replayGain >> (pool, 'gain')
    leq.leq >> (pool, 'leq')
    loudness.loudness >> (pool, 'loudness')
    zero_crossing_rate.zeroCrossingRate >> (pool, 'zcr')
    centroid.centroid >> (pool, 'centroid')
    pitch.pitch >> (pool, 'pitch')
    pitch.pitchConfidence >> (pool, 'confidence')
    flatness.flatnessDB >> (pool, 'flatness')
    hfc.hfc >> (pool, 'hfc')

    # Run the network
    essentia.run(loader)

    aggrpool = esstd.PoolAggregator(defaultStats=["mean"])(pool)

    # 'centroid.mean' is included so that the centroid, which is computed and
    # pooled above, is actually returned instead of being silently dropped.
    feature_names = ['gain', 'leq', 'loudness.mean', 'zcr.mean', 'centroid.mean',
                     'pitch.mean', 'confidence.mean', 'flatness.mean', 'hfc.mean']

    # Strip the aggregator's '.mean' suffix and wrap each value in a
    # one-element list, as callers expect.
    return {
        name.replace('.mean', ''): [np.array(aggrpool[name]).flatten()[0]]
        for name in feature_names
    }

## EGFxSet

This may take a long time...

In [7]:
# Sample rate handed to the feature extractors for the EGFxSet cells.
# NOTE: `sample_rate` and the three loaders are rebound by the SpringSet
# section further down; re-run this cell before the EGFxSet extraction cells.
sample_rate = 48000

train_loader, valid_loader, test_loader = load_egfxset(
    datadir=DATA_DIR,
    batch_size=1,
    train_ratio=0.50,
    valid_ratio=0.25,
    test_ratio=0.25,
    num_workers=4,
)

In [14]:
# EGFxSet: train set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(train_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'egfxset_x_train.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'egfxset_y_train.json', orient='records', lines=True)

In [20]:
# EGFxSet: validation set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(valid_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'egfxset_x_valid.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'egfxset_y_valid.json', orient='records', lines=True)

In [8]:
# EGFxSet: test set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(test_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'egfxset_x_test.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'egfxset_y_test.json', orient='records', lines=True)

## SpringSet

In [9]:
# Sample rate handed to the feature extractors for the SpringSet cells.
# NOTE: this rebinds `sample_rate` and the three loaders that the EGFxSet
# section above also uses; do not run EGFxSet extraction cells after this one
# without re-running the EGFxSet loader cell first.
sample_rate = 16000

train_loader, valid_loader, test_loader = load_springset(
    datadir=DATA_DIR,
    batch_size=1,
    train_ratio=0.70,
    num_workers=4,
)

Found 4 files in ../data/raw/spring
Using dry_train.h5 and wet_train.h5 for train split.
Found 4 files in ../data/raw/spring
Using dry_val_test.h5 and wet_val_test.h5 for test split.


In [13]:
# SpringSet: train set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(train_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'springset_x_train.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'springset_y_train.json', orient='records', lines=True)

In [11]:
# SpringSet: validation set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(valid_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'springset_x_valid.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'springset_y_valid.json', orient='records', lines=True)

In [12]:
# SpringSet: test set

data_x = []
data_y = []

# The loop variables avoid the name `input`, which would shadow the builtin
# for the rest of the kernel session.
for idx, (input_batch, target_batch) in enumerate(test_loader):
    # Convert tensors to float32 NumPy arrays and drop singleton dimensions
    input_np = input_batch.numpy().squeeze().astype(np.float32)
    target_np = target_batch.numpy().squeeze().astype(np.float32)

    # Extract features from both the input and the target signal
    x_features = extract_more_features_essentia(input_np, sample_rate)
    y_features = extract_more_features_essentia(target_np, sample_rate)

    data_x.append({'idx': idx, **x_features})
    data_y.append({'idx': idx, **y_features})

# Convert to DataFrame (one row per example)
df_x = pd.DataFrame(data_x)
df_y = pd.DataFrame(data_y)

# Make sure the output directory exists, then save as JSON Lines
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_x.to_json(RESULTS_DIR / 'springset_x_test.json', orient='records', lines=True)
df_y.to_json(RESULTS_DIR / 'springset_y_test.json', orient='records', lines=True)