## Extracting the Dataset for CSV

In [4]:
import zipfile
from pathlib import Path

# --- SETTINGS ---
dataset_zip = Path("dataset/PAMANA_Dataset.zip")  # top-level zip file
extract_root = Path("input")                      # where to unpack
# parents=True so a nested extract_root (e.g. "data/input") can be created too.
extract_root.mkdir(parents=True, exist_ok=True)

# --- STEP 1: Extract the top-level zip ---
print(f"Extracting: {dataset_zip}")
with zipfile.ZipFile(dataset_zip, "r") as z:
    z.extractall(extract_root)

# --- STEP 2: Recursively extract nested zips and delete them afterward ---
# The tree is rescanned in passes until no unprocessed zip is left. Each pass
# works on a materialised, sorted snapshot: creating folders and deleting files
# while a lazy rglob() generator is still walking the tree makes it unspecified
# whether freshly extracted (deeper) zips are ever visited.
bad_zips: set[Path] = set()   # unreadable archives; remembered so passes terminate
extracted_count = 0
while True:
    pending = sorted(
        p for p in extract_root.rglob("*.zip")
        if p.is_file() and p not in bad_zips
    )
    if not pending:
        break

    for inner_zip in pending:
        target_dir = inner_zip.parent / inner_zip.stem
        target_dir.mkdir(exist_ok=True)
        print(f"Extracting {inner_zip} -> {target_dir}")

        try:
            with zipfile.ZipFile(inner_zip, "r") as z:
                z.extractall(target_dir)
        except zipfile.BadZipFile as e:
            print(f"⚠️ Skipped bad zip {inner_zip}: {e}")
            bad_zips.add(inner_zip)
            continue

        inner_zip.unlink()  # delete the zip file after successful extraction
        extracted_count += 1

# Only claim full success when nothing was skipped.
if bad_zips:
    print(
        f"\n⚠️ Extracted {extracted_count} nested zip(s); "
        f"{len(bad_zips)} bad zip(s) were skipped and left in place."
    )
else:
    print("\n✅ All nested zips extracted and originals deleted!")

Extracting: dataset\PAMANA_Dataset.zip
Extracting input\PAMANA_Dataset\Luzon\Ab-bew_audios.zip -> input\PAMANA_Dataset\Luzon\Ab-bew_audios
Extracting input\PAMANA_Dataset\Luzon\Abistung_audios.zip -> input\PAMANA_Dataset\Luzon\Abistung_audios
Extracting input\PAMANA_Dataset\Luzon\Bangsi_audios.zip -> input\PAMANA_Dataset\Luzon\Bangsi_audios
Extracting input\PAMANA_Dataset\Luzon\Gangsa (Besao Fieldwork)_audios.zip -> input\PAMANA_Dataset\Luzon\Gangsa (Besao Fieldwork)_audios
Extracting input\PAMANA_Dataset\Luzon\Kalaleng (Besao Fieldwork)_audios.zip -> input\PAMANA_Dataset\Luzon\Kalaleng (Besao Fieldwork)_audios
Extracting input\PAMANA_Dataset\Luzon\Kalaleng (Bontoc Fieldwork)_audios.zip -> input\PAMANA_Dataset\Luzon\Kalaleng (Bontoc Fieldwork)_audios
Extracting input\PAMANA_Dataset\Luzon\Labil_audios.zip -> input\PAMANA_Dataset\Luzon\Labil_audios
Extracting input\PAMANA_Dataset\Luzon\Patang-ug_audios.zip -> input\PAMANA_Dataset\Luzon\Patang-ug_audios
Extracting input\PAMANA_Dataset\Luz

## Reading the Dataset for CSV

In [3]:
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# --- SETTINGS ---
# Root folder containing the instrument folders; it is searched recursively.
DATA_DIR = Path("input")

# File the metadata table is written to.
OUTPUT_CSV = "pamana_dataset.csv"

# Lower-case file suffixes that are treated as audio.
AUDIO_EXTS = {
    ".aac",
    ".flac",
    ".m4a",
    ".mp3",
    ".ogg",
    ".wav",
    ".wma",
}

# --- Feature extraction helper ---
def extract_audio_metadata(file_path: Path) -> dict | None:
    """
    Load one audio file and return basic metadata:
    sample rate, duration, file type, and inferred instrument label.
    """
    try:
        # Load audio (mono)
        y, sr = librosa.load(file_path, sr=None, mono=True)
        duration = librosa.get_duration(y=y, sr=sr)

        # Infer instrument label from parent folder (e.g., 'Agung_audios' -> 'Agung')
        parent_folder = file_path.parent.name
        instrument_label = parent_folder.replace("_audios", "")

        return {
            "filepath": str(file_path),
            "filetype": file_path.suffix.lower(),
            "sample_rate": sr,
            "duration_sec": duration,
            "instrument_label": instrument_label,
        }

    except Exception as e:
        print(f"⚠️ Skipped {file_path}: {e}")
        return None


# --- Collect all supported audio files ---
# Keep regular files only (a directory could also carry an audio-like suffix)
# and sort them: rglob() order depends on the OS/filesystem, so sorting makes
# the CSV row order reproducible across runs and machines.
audio_files = sorted(
    f for f in DATA_DIR.rglob("*")
    if f.is_file() and f.suffix.lower() in AUDIO_EXTS
)
print(f"Found {len(audio_files)} audio files in {DATA_DIR.resolve()}.")

# --- Extract metadata with progress bar ---
records: list[dict] = []
for f in tqdm(audio_files, desc="Extracting metadata"):
    info = extract_audio_metadata(f)
    if info is not None:  # None means the file was skipped with a warning
        records.append(info)

# Summarise failures so they are not lost among the per-file warnings.
skipped = len(audio_files) - len(records)
if skipped:
    print(f"⚠️ {skipped} of {len(audio_files)} file(s) could not be read and were left out.")

# --- Save to CSV ---
df = pd.DataFrame(records)
if not df.empty:
    df.to_csv(OUTPUT_CSV, index=False)
    print("\n✅ Metadata extraction complete.")
    print(f"Saved {len(df)} rows to '{OUTPUT_CSV}'.\n")
    print("Instrument distribution:")
    print(df["instrument_label"].value_counts())
else:
    print("\n⚠️ No features extracted — check your input path and file formats.")

Found 670 audio files in C:\Users\creep\Documents\Programs\Third_Year\Project_PAMANA_dataset\input.


Extracting metadata: 100%|██████████| 670/670 [00:09<00:00, 73.48it/s] 


✅ Metadata extraction complete.
Saved 670 rows to 'pamana_dataset.csv'.

Instrument distribution:
instrument_label
Gabbang                        70
Faglung                        54
Labil                          39
Agung (Panay Bukidnon)         34
Tumpong                        31
Subing                         30
Bombo                          28
Tambor                         26
Kubing                         24
Tambol                         24
Agung (Subanen)                24
Kalaleng (Besao Fieldwork)     24
Tacombo                        24
Tanangong                      21
Ab-bew                         20
Kalaleng (Bontoc Fieldwork)    19
Tikumbo                        18
Abistung                       17
Agung (Mansaka)                17
Gimbao                         16
Paratikan                      16
Kuging                         16
Suganggang                     11
Patik-patik                    11
Subing (Panay Bukidnon)        10
Banjo                          10





In [27]:
import polars as pl

# Read the feature table from CSV into a polars DataFrame.
# NOTE(review): no cell visible here writes "audio_features.csv" (the cell above
# writes "pamana_dataset.csv") — confirm which step produces this file.
features_csv = "audio_features.csv"
df = pl.read_csv(features_csv)

# Last expression of the cell: render per-column summary statistics.
df.describe()

statistic,filepath,filetype,sample_rate,duration_sec,rms_energy,zcr,spectral_centroid,spectral_bandwidth,rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,instrument_label
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""count""","""670""","""670""",670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,670.0,"""670"""
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",,,44100.0,7.01957,0.029071,0.084694,3109.142782,3257.536043,6085.723374,-537.292611,82.167339,-13.745677,2.268664,-13.213705,-0.951005,-11.37975,-4.265865,-6.246474,-3.030142,-7.122061,-2.233258,-5.116345,
"""std""",,,0.0,20.930983,0.042135,0.058325,1365.363108,891.355553,2579.858768,145.916152,36.432381,45.944229,20.018244,23.328254,14.613191,17.164946,12.671156,14.667601,11.342897,12.582856,10.91895,10.019524,
"""min""","""input\PAMANA_Dataset\Luzon\Ab-…",""".mp3""",44100.0,0.287347,0.000383,0.013562,496.721754,642.585383,537.321553,-882.38586,-6.157698,-182.32399,-65.11822,-91.664215,-45.025852,-70.18855,-38.82878,-45.652367,-30.356543,-40.709927,-32.855774,-29.165678,"""Ab-bew"""
"""25%""",,,44100.0,0.992653,0.00518,0.04503,2127.755051,2657.391817,4012.323001,-632.4725,53.813793,-40.486237,-8.142176,-28.206844,-9.812379,-20.967497,-11.84029,-14.502827,-11.373013,-15.019673,-10.698123,-12.69113,
"""50%""",,,44100.0,1.306122,0.014968,0.065021,2990.918209,3346.769442,6239.332233,-522.3148,81.50444,0.025956,6.2607574,-7.080081,-0.369483,-7.574062,-4.430204,-5.546705,-2.880387,-7.294196,-1.399989,-4.240933,
"""75%""",,,44100.0,3.526531,0.033431,0.113155,3808.66583,3892.246141,7813.548101,-438.87396,107.85301,21.870789,16.84386,4.779669,7.764397,0.419275,4.023712,2.2458737,3.968629,1.2052195,4.7461767,1.3652027,
"""max""","""input\PAMANA_Dataset\Visayas\T…",""".mp3""",44100.0,250.514286,0.25868,0.36233,8175.161119,5359.078821,12911.403245,-91.37293,199.40674,77.72598,43.102238,32.928738,48.648666,35.951878,36.16436,39.1544,35.456444,32.105907,37.01128,33.930573,"""Tumpong"""
