# Notebook 02: Acoustic Analysis & Feature Extraction

## Overview
This notebook performs comprehensive acoustic analysis of instrument families through feature extraction and visualization. It computes traditional audio features and provides visual insights into the acoustic properties that differentiate instrument families.

## Workflow
1. **RMS and Dynamics Analysis** — Compute RMS level, peak amplitude, and dynamic range for all audio clips
2. **Spectral Timbre Features** — Extract spectral centroid, bandwidth, rolloff, flatness, and zero-crossing rate
3. **Harmonic vs Percussive Separation (HPSS)** — Quantify harmonic and percussive energy components
4. **Mel-Spectrogram Examples** — Visual comparison of time-frequency representations across families
5. **Feature Space Visualization** — Dimensionality reduction (PCA and t-SNE) to visualize family separability
6. **Summary and Export** — Save acoustic features to CSV for downstream analysis

---

In [1]:
# --- Imports and configuration ---

import warnings
warnings.filterwarnings("ignore")

import os
from pathlib import Path

import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import librosa.display

from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot, column, row
from bokeh.models import HoverTool, ColumnDataSource, ColorBar, LinearColorMapper, BasicTicker
from bokeh.palettes import Category10_5, Viridis256
from bokeh.transform import dodge

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

output_notebook()

# Configuration
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's legacy global RNG

# Project root. Defaults to the original author's checkout; set the
# ELEC5305_PROJECT_ROOT environment variable to run the notebook from another
# machine without editing this cell.
PROJECT_ROOT = Path(os.environ.get(
    "ELEC5305_PROJECT_ROOT",
    "/Users/dghifari/02-University/SEM-2-2025/elec5305-project-520140154",
))
manifests_dir = PROJECT_ROOT / "Manifests"
manifest_path = manifests_dir / "manifest_master.csv"

# Load data
df = pd.read_csv(manifest_path)
FAMILY_COLNAME = "family_label"
# dropna(): an unlabeled clip would otherwise make sorted() compare NaN with str.
families = sorted(df[FAMILY_COLNAME].dropna().unique())

# Color palette. The palette has 5 entries; cycle through it instead of raising
# IndexError if the manifest ever contains more than 5 families.
COLORS = {fam: Category10_5[i % len(Category10_5)] for i, fam in enumerate(families)}

print(f"Dataset: {len(df)} clips across {len(families)} families")
print("\nClips per family:")
print(df[FAMILY_COLNAME].value_counts().sort_index())

# Audio loading utilities
def load_waveform(filepath: str, target_sr: int = 16000, target_num_samples: int = 48000) -> np.ndarray:
    """Read an audio file and return a fixed-length mono float32 signal.

    Multi-channel audio is averaged across channels, the signal is resampled
    to ``target_sr`` when the file's rate differs, and the result is either
    center-cropped or zero-padded at the end to ``target_num_samples``.
    """
    samples, native_sr = sf.read(filepath, dtype='float32')

    # Collapse channels (second axis) into a single mono track
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Bring the signal to the requested sample rate
    if native_sr != target_sr:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)

    # Positive surplus -> keep the middle section; negative -> append zeros
    surplus = len(samples) - target_num_samples
    if surplus > 0:
        offset = surplus // 2
        samples = samples[offset:offset + target_num_samples]
    elif surplus < 0:
        samples = np.pad(samples, (0, -surplus), mode='constant')

    return samples.astype(np.float32)

print("Audio utilities loaded")

Dataset: 874 clips across 5 families

Clips per family:
family_label
keyboards     214
percussion    143
strings       142
voice         170
winds         205
Name: count, dtype: int64
Audio utilities loaded


## 1. RMS and Dynamics Analysis
Compute RMS level, peak amplitude, and dynamic range for all clips.

In [2]:
# Per-clip level statistics, added as new columns on a copy of the manifest.
df_acoustic = df.copy()
rms_db_list, peak_db_list, dynamic_range_db_list = [], [], []
failed_dynamics = []  # (filepath, error) for every clip that could not be processed
n_clips = len(df_acoustic)

# Iterate by position rather than index label so the progress counter is
# correct even if the manifest does not carry a 0..N-1 index.
for i, filepath in enumerate(df_acoustic['filepath']):
    if i % 100 == 0:
        print(f"Processing {i+1}/{n_clips}...", end='\r')

    try:
        # Work in float64: float32 results show up as e.g. -27.370001 once the
        # per-family means are rounded and printed.
        x = load_waveform(filepath).astype(np.float64)

        # Compute RMS and peak (the 1e-12 offset keeps log10 finite for silence)
        rms = np.sqrt(np.mean(x**2))
        rms_db = 20 * np.log10(rms + 1e-12)
        peak = np.max(np.abs(x))
        peak_db = 20 * np.log10(peak + 1e-12)
        dynamic_range_db = peak_db - rms_db
    except Exception as e:
        # Best effort: keep the row (as NaN) but remember what went wrong so
        # the failure is reported below instead of being silently dropped.
        failed_dynamics.append((filepath, repr(e)))
        rms_db = peak_db = dynamic_range_db = np.nan

    rms_db_list.append(rms_db)
    peak_db_list.append(peak_db)
    dynamic_range_db_list.append(dynamic_range_db)

df_acoustic['rms_db'] = rms_db_list
df_acoustic['peak_db'] = peak_db_list
df_acoustic['dynamic_range_db'] = dynamic_range_db_list

print(f"\n✓ Computed dynamics for {n_clips - len(failed_dynamics)} clips")
if failed_dynamics:
    # warnings are globally silenced in this notebook, so report via print
    print(f"⚠ {len(failed_dynamics)} clip(s) failed and were filled with NaN; "
          f"first failure: {failed_dynamics[0]}")

Processing 801/874...
✓ Computed dynamics for 874 clips


In [3]:
# Summary statistics
# Cast to float64 before rounding: a float32 mean rounded to 2 decimals is
# printed by pandas as e.g. -27.370001 instead of -27.37.
print("Dynamics Summary by Family:")
print(
    df_acoustic.groupby(FAMILY_COLNAME)[['rms_db', 'peak_db', 'dynamic_range_db']]
    .agg(['mean', 'std'])
    .astype('float64')
    .round(2)
)

# Helper function for boxplots
def create_boxplot(df, column, title, ylabel, families, colors, family_col=None):
    """Build a Bokeh figure with one box-and-whisker glyph per family.

    Parameters
    ----------
    df : DataFrame holding the family column and the numeric ``column``.
    column : name of the numeric column to summarize.
    title, ylabel : figure title and y-axis label.
    families : ordered list of family names; used as the categorical x-range.
    colors : mapping from family name to fill/marker color.
    family_col : name of the family column; defaults to the notebook-level
        ``FAMILY_COLNAME`` when omitted.

    Returns
    -------
    The Bokeh figure (not shown). NaN values in ``column`` are ignored, and a
    family with no valid values is left blank instead of raising.
    """
    from bokeh.transform import jitter  # not part of the notebook's import cell

    if family_col is None:
        family_col = FAMILY_COLNAME

    p = figure(x_range=families, title=title, width=550, height=450,
               toolbar_location=None, tools="")

    for family in families:
        data = df.loc[df[family_col] == family, column].to_numpy(dtype=float)
        # Failed extractions are stored as NaN; a single NaN would turn every
        # percentile below into NaN and blank the whole box.
        data = data[~np.isnan(data)]
        if data.size == 0:
            continue

        q1, q2, q3 = np.percentile(data, [25, 50, 75])
        iqr = q3 - q1
        # Whiskers end at the 1.5*IQR fence or the data extreme, whichever is closer
        upper = min(q3 + 1.5*iqr, data.max())
        lower = max(q1 - 1.5*iqr, data.min())

        # Create data sources for all glyphs
        box_source = ColumnDataSource(data=dict(
            family=[family],
            q1=[q1],
            q2=[q2],
            q3=[q3],
            upper=[upper],
            lower=[lower]
        ))

        # Box
        p.vbar(x='family', width=0.6, bottom='q1', top='q3',
               source=box_source,
               fill_color=colors[family], alpha=0.7, line_color="black", line_width=1.5)

        # Median line - a near-zero-height rect so it can span the box width
        # on the categorical axis
        p.rect(x='family', y='q2', width=0.6, height=0.001,
               source=box_source,
               fill_color="black", line_color="black", line_width=2)

        # Whisker stems
        p.segment(x0='family', y0='q3', x1='family', y1='upper',
                  source=box_source,
                  line_color="black", line_width=1.5)
        p.segment(x0='family', y0='q1', x1='family', y1='lower',
                  source=box_source,
                  line_color="black", line_width=1.5)

        # Whisker caps - thin rects, each with its own data source
        upper_cap_source = ColumnDataSource(data=dict(family=[family], upper=[upper]))
        lower_cap_source = ColumnDataSource(data=dict(family=[family], lower=[lower]))

        p.rect(x='family', y='upper', width=0.15, height=0.001,
               source=upper_cap_source,
               fill_color="black", line_color="black")
        p.rect(x='family', y='lower', width=0.15, height=0.001,
               source=lower_cap_source,
               fill_color="black", line_color="black")

        # Raw observations, jittered horizontally so overlapping points stay visible
        point_source = ColumnDataSource(data=dict(
            x=[family] * len(data),
            y=data
        ))
        p.scatter(x=jitter('x', width=0.2, range=p.x_range), y='y',
                 source=point_source, size=4, color=colors[family], alpha=0.3)

    p.xaxis.major_label_orientation = 0.785  # radians (about 45 degrees)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_alpha = 0.5
    p.yaxis.axis_label = ylabel
    p.xaxis.axis_label = "Family"
    p.title.text_font_size = "13pt"

    return p

# Render the two dynamics boxplots next to each other
from bokeh.layouts import row as bokeh_row

p1, p2 = (
    create_boxplot(df_acoustic, col, heading, axis_text, families, COLORS)
    for col, heading, axis_text in (
        ('rms_db', 'RMS Level by Family', 'RMS (dB)'),
        ('dynamic_range_db', 'Dynamic Range by Family', 'Dynamic Range (dB)'),
    )
)

show(bokeh_row(p1, p2))

Dynamics Summary by Family:
                 rms_db       peak_db       dynamic_range_db      
                   mean   std    mean   std             mean   std
family_label                                                      
keyboards    -27.370001  5.36  -13.07  5.57            14.31  1.63
percussion   -23.450001  5.02   -4.62  3.96            18.83  3.02
strings      -23.570000  6.40   -9.44  6.03            14.13  2.31
voice        -20.799999  5.43   -8.55  5.07            12.25  3.71
winds        -25.900000  6.14  -12.76  5.43            13.13  2.69


## 2. Spectral Timbre Features
Extract spectral centroid, bandwidth, rolloff, flatness, and zero-crossing rate.

In [4]:
# Per-clip spectral descriptors: each feature is averaged over all analysis frames.
centroid_list, bandwidth_list, rolloff_list, flatness_list, zcr_list = [], [], [], [], []
failed_spectral = []  # (filepath, error) for every clip that could not be processed
sr = 16000  # sample rate the clips are loaded at (passed to load_waveform below)
n_clips = len(df_acoustic)

# Iterate by position rather than index label so the progress counter is
# correct for any DataFrame index.
for i, filepath in enumerate(df_acoustic['filepath']):
    if i % 100 == 0:
        print(f"Processing {i+1}/{n_clips}...", end='\r')

    try:
        x = load_waveform(filepath, target_sr=sr)

        # Compute everything first and append afterwards: appending inside the
        # try block would leave the lists with different lengths if a later
        # feature raised after an earlier one had already been stored.
        centroid = float(np.mean(librosa.feature.spectral_centroid(y=x, sr=sr)))
        bandwidth = float(np.mean(librosa.feature.spectral_bandwidth(y=x, sr=sr)))
        rolloff = float(np.mean(librosa.feature.spectral_rolloff(y=x, sr=sr, roll_percent=0.85)))
        flatness = float(np.mean(librosa.feature.spectral_flatness(y=x)))
        zcr = float(np.mean(librosa.feature.zero_crossing_rate(y=x)))
    except Exception as e:
        # Best effort: keep the row (as NaN) but record the failure for the report below
        failed_spectral.append((filepath, repr(e)))
        centroid = bandwidth = rolloff = flatness = zcr = np.nan

    centroid_list.append(centroid)
    bandwidth_list.append(bandwidth)
    rolloff_list.append(rolloff)
    flatness_list.append(flatness)
    zcr_list.append(zcr)

df_acoustic['centroid_hz'] = centroid_list
df_acoustic['bandwidth_hz'] = bandwidth_list
df_acoustic['rolloff_hz'] = rolloff_list
df_acoustic['flatness'] = flatness_list
df_acoustic['zcr'] = zcr_list

print(f"\n✓ Extracted spectral features for {n_clips - len(failed_spectral)} clips")
if failed_spectral:
    # warnings are globally silenced in this notebook, so report via print
    print(f"⚠ {len(failed_spectral)} clip(s) failed and were filled with NaN; "
          f"first failure: {failed_spectral[0]}")

Processing 801/874...
✓ Extracted spectral features for 874 clips


In [5]:
# Summary statistics
feature_cols = ['centroid_hz', 'bandwidth_hz', 'rolloff_hz', 'flatness', 'zcr']
print("Spectral Features Summary by Family:")
print(df_acoustic.groupby(FAMILY_COLNAME)[feature_cols].agg(['mean', 'std']).round(2))

# Create boxplots for each feature
from bokeh.transform import jitter  # kept so `jitter` stays available in the kernel namespace

feature_labels = {
    'centroid_hz': 'Spectral Centroid (Hz)',
    'bandwidth_hz': 'Spectral Bandwidth (Hz)',
    'rolloff_hz': 'Spectral Rolloff (Hz)',
    'flatness': 'Spectral Flatness',
    'zcr': 'Zero-Crossing Rate'
}

# Reuse the create_boxplot helper defined in the dynamics section rather than
# repeating its glyph code here; only the panel size and title font differ.
# (Scatter markers use the helper's marker size.)
plots = []
for feature in feature_cols:
    p = create_boxplot(df_acoustic, feature, feature_labels[feature],
                       feature_labels[feature], families, COLORS)
    # Smaller panels than the helper's default so three fit on one grid row
    p.width = 450
    p.height = 380
    p.title.text_font_size = "12pt"
    plots.append(p)

# Display in grid
grid = gridplot(plots, ncols=3, sizing_mode='fixed')
show(grid)

Spectral Features Summary by Family:
             centroid_hz         bandwidth_hz         rolloff_hz           \
                    mean     std         mean     std       mean      std   
family_label                                                                
keyboards        1466.48  671.36      1501.27  662.90    3080.96  1736.80   
percussion       2216.84  913.70      1947.84  442.52    4404.49  1406.10   
strings           940.20  532.59      1063.47  402.89    1754.53  1113.02   
voice            1646.08  719.59      1615.30  455.29    3150.62  1596.69   
winds            1345.18  477.30      1118.42  335.51    2282.27   885.95   

             flatness         zcr        
                 mean   std  mean   std  
family_label                             
keyboards        0.02  0.07  0.11  0.05  
percussion       0.09  0.10  0.16  0.13  
strings          0.03  0.14  0.07  0.05  
voice            0.07  0.13  0.11  0.06  
winds            0.03  0.09  0.11  0.04  




## 3. Harmonic vs Percussive Energy (HPSS)
Separate and quantify harmonic and percussive components.

In [6]:
# Share of spectral energy assigned to the harmonic vs. percussive component of each clip.
harmonic_ratio_list, percussive_ratio_list = [], []
failed_hpss = []  # (filepath, error) for every clip that could not be processed
n_clips = len(df_acoustic)

# Iterate by position rather than index label so the progress counter is
# correct for any DataFrame index.
for i, filepath in enumerate(df_acoustic['filepath']):
    if i % 100 == 0:
        print(f"Processing {i+1}/{n_clips}...", end='\r')

    try:
        x = load_waveform(filepath)

        # Separate harmonic and percussive components of the STFT
        D = librosa.stft(x)
        H, P = librosa.decompose.hpss(D)

        # Compute energy ratios; the 1e-9 offset avoids 0/0 on silent clips
        harmonic_energy = float(np.sum(np.abs(H)**2))
        percussive_energy = float(np.sum(np.abs(P)**2))
        total_energy = harmonic_energy + percussive_energy + 1e-9

        harmonic_ratio = harmonic_energy / total_energy
        percussive_ratio = percussive_energy / total_energy
    except Exception as e:
        # Best effort: keep the row (as NaN) but record the failure for the report below
        failed_hpss.append((filepath, repr(e)))
        harmonic_ratio = percussive_ratio = np.nan

    harmonic_ratio_list.append(harmonic_ratio)
    percussive_ratio_list.append(percussive_ratio)

df_acoustic['harmonic_ratio'] = harmonic_ratio_list
df_acoustic['percussive_ratio'] = percussive_ratio_list

print(f"\n✓ Computed HPSS for {n_clips - len(failed_hpss)} clips")
if failed_hpss:
    # warnings are globally silenced in this notebook, so report via print
    print(f"⚠ {len(failed_hpss)} clip(s) failed and were filled with NaN; "
          f"first failure: {failed_hpss[0]}")

Processing 801/874...
✓ Computed HPSS for 874 clips


In [7]:
# Per-family mean and standard deviation of the two HPSS energy shares
print("HPSS Summary by Family:")
hpss_by_family = df_acoustic.groupby(FAMILY_COLNAME)[['harmonic_ratio', 'percussive_ratio']]
print(hpss_by_family.agg(['mean', 'std']).round(3))

# Harmonic and percussive boxplots, shown side by side
from bokeh.layouts import row as bokeh_row
p1, p2 = (
    create_boxplot(df_acoustic, col, heading, axis_text, families, COLORS)
    for col, heading, axis_text in (
        ('harmonic_ratio', 'Harmonic Energy Ratio', 'Harmonic Ratio'),
        ('percussive_ratio', 'Percussive Energy Ratio', 'Percussive Ratio'),
    )
)

show(bokeh_row(p1, p2))

HPSS Summary by Family:
             harmonic_ratio        percussive_ratio       
                       mean    std             mean    std
family_label                                              
keyboards             0.925  0.067            0.075  0.067
percussion            0.361  0.302            0.639  0.302
strings               0.881  0.176            0.119  0.176
voice                 0.819  0.198            0.181  0.198
winds                 0.873  0.156            0.127  0.156


## 4. Mel-Spectrogram Examples
Visual comparison of timbre across families.

In [8]:
n_examples = 2
sr, n_mels, hop_length, fmax = 16000, 64, 512, 8000

# Dedicated generator: re-running this cell picks the same examples again
# instead of advancing the global NumPy random state.
example_rng = np.random.RandomState(RANDOM_SEED)
has_instrument_labels = 'instrument_label' in df_acoustic.columns

# The spectrogram rows are mel bands, i.e. equally spaced on the mel scale, not
# in Hz. The image is therefore drawn against the band index (0..n_mels) and
# selected ticks are relabelled with the matching frequency, so the axis reads
# in Hz at the correct positions. Band i is centred at fraction
# (i + 1) / (n_mels + 1) of the mel range, so axis position y corresponds to
# fraction (y + 0.5) / (n_mels + 1) (fmin is librosa's default of 0 Hz).
tick_bands = list(range(0, n_mels + 1, 8))
tick_hz = librosa.mel_to_hz(
    librosa.hz_to_mel(fmax) * (np.asarray(tick_bands) + 0.5) / (n_mels + 1)
)
tick_labels = {int(band): f"{hz:.0f}" for band, hz in zip(tick_bands, tick_hz)}

# Process each family separately and group visualizations
for family in families:
    print(f"\n{family.upper()} Family Examples")
    print("-" * 50)

    family_df = df_acoustic[df_acoustic[FAMILY_COLNAME] == family]

    # Prefer examples from different instruments; fall back to random clips
    # when labels are missing or the family has too few distinct instruments.
    instruments_in_family = (
        family_df['instrument_label'].dropna().unique() if has_instrument_labels else []
    )
    if len(instruments_in_family) >= n_examples:
        example_indices = []
        selected_instruments = example_rng.choice(instruments_in_family, size=n_examples, replace=False)
        for instrument in selected_instruments:
            # One random clip per selected instrument
            instrument_samples = family_df[family_df['instrument_label'] == instrument]
            example_indices.append(example_rng.choice(instrument_samples.index, size=1)[0])
    else:
        example_indices = example_rng.choice(
            family_df.index, size=min(n_examples, len(family_df)), replace=False
        ).tolist()

    family_plots = []

    for j, idx in enumerate(example_indices):
        filepath = df_acoustic.loc[idx, 'filepath']
        filename = Path(filepath).stem  # Get filename without extension

        # Randomly sampled clips may have a missing (NaN) instrument label,
        # which has no .capitalize(); fall back to a numbered title then.
        instrument = df_acoustic.loc[idx, 'instrument_label'] if has_instrument_labels else None
        if has_instrument_labels and pd.notna(instrument):
            instrument_name = str(instrument).capitalize()
            title = f"{family.capitalize()} — {instrument_name}"
            listing = f"  {j+1}. {instrument_name} — {filename}"
        else:
            title = f"{family.capitalize()} — Example {j+1}"
            listing = f"  {j+1}. {filename}"

        # Load and compute mel-spectrogram (dB relative to the clip's maximum)
        x = load_waveform(filepath)
        S = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=n_mels, hop_length=hop_length, fmax=fmax)
        S_db = librosa.power_to_db(S, ref=np.max)

        # Time axis: start time of each frame
        n_frames = S_db.shape[1]
        time_sec = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop_length)

        # Create Bokeh figure
        p = figure(
            title=title,
            x_axis_label="Time (s)",
            y_axis_label="Frequency (Hz, mel-spaced)",
            width=550,
            height=300,
            toolbar_location="right",
            tools="hover,save,reset",
            tooltips=[("Time", "$x{0.2f}s"), ("Mel band", "$y{0}"), ("Power", "@image{0.1f}dB")]
        )

        # Image drawn against the mel-band index; one unit of height per band
        color_mapper = LinearColorMapper(palette=Viridis256, low=-80, high=0)
        p.image(image=[S_db], x=0, y=0,
                dw=time_sec[-1], dh=n_mels,
                color_mapper=color_mapper)

        # Label the band-index axis in Hz
        p.yaxis.ticker = tick_bands
        p.yaxis.major_label_overrides = tick_labels

        # Add color bar
        color_bar = ColorBar(color_mapper=color_mapper,
                            ticker=BasicTicker(desired_num_ticks=8),
                            label_standoff=8,
                            border_line_color=None,
                            location=(0, 0),
                            title='dB')
        p.add_layout(color_bar, 'right')

        # Styling
        p.title.text_font_size = "12pt"
        p.title.text_font_style = "bold"
        p.xgrid.grid_line_alpha = 0.3
        p.ygrid.grid_line_alpha = 0.3
        p.background_fill_color = "#fafafa"

        family_plots.append(p)
        print(listing)

    # Display family's examples in a row
    show(row(family_plots))


KEYBOARDS Family Examples
--------------------------------------------------
  1. Organ — keyboards_organ_keyboards_organ_001_clip_008
  2. Synth — keyboards_synth_keyboards_synth_002_clip_005



PERCUSSION Family Examples
--------------------------------------------------
  1. Tambourine — percussion_tambourine_percussion_tambourine_002_clip_002
  2. Drum — percussion_drum_percussion_drum_005_clip_020



STRINGS Family Examples
--------------------------------------------------
  1. Violin — strings_violin_strings_violin_002_clip_008
  2. Cello — strings_cello_strings_cello_001_clip_003



VOICE Family Examples
--------------------------------------------------
  1. Spoken — voice_spoken_voice_spoken_001_clip_002
  2. Hum — voice_hum_voice_hum_002_clip_024



WINDS Family Examples
--------------------------------------------------
  1. Clarinet — winds_clarinet_winds_clarinet_001_clip_012
  2. Saxophone — winds_saxophone_winds_saxophone_001_clip_001


## 5. Feature Space Visualization (PCA / t-SNE)
Dimensionality reduction to visualize family separability in acoustic feature space.

In [9]:
# Columns that make up the acoustic feature space; clips missing any of them are excluded
feature_cols_for_viz = [
    'rms_db',
    'centroid_hz',
    'bandwidth_hz',
    'rolloff_hz',
    'flatness',
    'zcr',
    'harmonic_ratio',
]
df_viz = df_acoustic.dropna(subset=feature_cols_for_viz)

y_labels = df_viz[FAMILY_COLNAME].values
X_feat = df_viz[feature_cols_for_viz].values

# Zero-mean / unit-variance scaling so no single feature dominates the projections
scaler = StandardScaler()
scaler.fit(X_feat)
X_scaled = scaler.transform(X_feat)

print(f"Feature matrix: {X_scaled.shape[0]} samples × {X_scaled.shape[1]} features")

# Linear projection onto the first two principal components
pca = PCA(n_components=2, random_state=RANDOM_SEED)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA variance explained: {pca.explained_variance_ratio_.sum():.1%}")

# Non-linear 2-D embedding
print("Running t-SNE...")
tsne = TSNE(
    n_components=2,
    random_state=RANDOM_SEED,
    perplexity=30,
    max_iter=1000,
)
X_tsne = tsne.fit_transform(X_scaled)
print("✓ Dimensionality reduction complete")

Feature matrix: 874 samples × 7 features
PCA variance explained: 69.6%
Running t-SNE...
✓ Dimensionality reduction complete


In [10]:
# PCA and t-SNE scatter plots, drawn by one shared helper
from bokeh.layouts import row as bokeh_row


def _embedding_scatter(coords, labels, title, x_label, y_label):
    """Return a Bokeh scatter of a 2-D embedding with one colored glyph per family.

    ``coords`` is indexed as ``coords[mask, 0]`` / ``coords[mask, 1]`` and
    ``labels`` holds the family of each row. Families and colors come from
    the notebook-level ``families`` and ``COLORS``. Clicking a legend entry
    hides that family.
    """
    fig = figure(
        title=title,
        x_axis_label=x_label,
        y_axis_label=y_label,
        width=650,
        height=550,
        tools="pan,wheel_zoom,box_zoom,reset,hover",
        tooltips=[("Family", "@family")]
    )

    for family in families:
        mask = labels == family
        source = ColumnDataSource(data=dict(
            x=coords[mask, 0],
            y=coords[mask, 1],
            family=[family] * int(mask.sum())
        ))
        fig.scatter('x', 'y', source=source, size=8, alpha=0.6,
                    color=COLORS[family], legend_label=family)

    fig.legend.location = "top_right"
    fig.legend.click_policy = "hide"
    fig.grid.grid_line_alpha = 0.3
    fig.title.text_font_size = "13pt"
    return fig


# PCA plot
p1 = _embedding_scatter(
    X_pca, y_labels,
    title='PCA: Acoustic Feature Space',
    x_label=f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
    y_label=f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
)

# t-SNE plot
p2 = _embedding_scatter(
    X_tsne, y_labels,
    title='t-SNE: Acoustic Feature Space',
    x_label='t-SNE Dimension 1',
    y_label='t-SNE Dimension 2',
)

show(bokeh_row(p1, p2))

## 6. Summary
Save acoustic features and display summary statistics.

In [11]:
# Summary table
all_features = ['rms_db', 'centroid_hz', 'bandwidth_hz', 'rolloff_hz',
                'flatness', 'zcr', 'harmonic_ratio', 'percussive_ratio']
# Cast to float64 before rounding: a float32 mean rounded to 2 decimals is
# printed by pandas as e.g. -27.370001 instead of -27.37.
summary = (
    df_acoustic.groupby(FAMILY_COLNAME)[all_features]
    .mean()
    .astype('float64')
    .round(2)
)

print("Mean Acoustic Features by Family:")
print(summary)

Mean Acoustic Features by Family:
                 rms_db  centroid_hz  bandwidth_hz  rolloff_hz  flatness  \
family_label                                                               
keyboards    -27.370001      1466.48       1501.27     3080.96      0.02   
percussion   -23.450001      2216.84       1947.84     4404.49      0.09   
strings      -23.570000       940.20       1063.47     1754.53      0.03   
voice        -20.799999      1646.08       1615.30     3150.62      0.07   
winds        -25.900000      1345.18       1118.42     2282.27      0.03   

               zcr  harmonic_ratio  percussive_ratio  
family_label                                          
keyboards     0.11            0.93              0.07  
percussion    0.16            0.36              0.64  
strings       0.07            0.88              0.12  
voice         0.11            0.82              0.18  
winds         0.11            0.87              0.13  
