# In [None]:
import os
import glob
import numpy as np
import pandas as pd
import librosa
import warnings

# Silence library warnings so the console output stays readable
warnings.filterwarnings("ignore")

# --- CONFIGURATION ---
# Root folder that holds one sub-folder per genre.
# REPLACE with your actual folder path.
DATASET_PATH = "../data/audio/BanglaBeats"

# Genre sub-folder names; each one is also used as the row label.
GENRES = [
    "Adhunik",
    "Folk",
    "Hiphop",
    "Indie",
    "Islamic",
    "Metal",
    "Pop",
    "Rock",
]

# Sampling rate (Hz) every clip is loaded at.
SAMPLE_RATE = 22050

# Number of 3s clips to make one 30s clip
DURATION_CHUNKS = 10

def extract_features(y, sr, filename, label):
    """
    Build one GTZAN-style feature row for an audio signal.

    Args:
        y: Audio samples, passed straight to the librosa feature functions.
        sr: Sampling rate of ``y``.
        filename: Value stored under the "filename" key of the row.
        label: Value stored under the "label" key of the row.

    Returns:
        A dict whose keys are, in order: "filename", "length" (number of
        samples in ``y``), a "<feature>_mean" / "<feature>_var" pair for each
        spectral, harmonic and percussive feature, "tempo",
        "mfcc<k>_mean" / "mfcc<k>_var" for k = 1..20, and "label".
    """
    # Single source of truth for the MFCC count, so the librosa call and the
    # generated column names cannot drift apart.
    n_mfcc = 20

    # 1. Basic Audio Processing
    # Harmonic and Percussive components
    y_harm, y_perc = librosa.effects.hpss(y)

    # 2. Spectral features plus the harmonic/percussive signals. The dict
    # keys are the column-name prefixes, and insertion order fixes the order
    # of the resulting row.
    summarised = {
        "chroma_stft": librosa.feature.chroma_stft(y=y, sr=sr),
        "rms": librosa.feature.rms(y=y),
        "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr),
        "spectral_bandwidth": librosa.feature.spectral_bandwidth(y=y, sr=sr),
        "rolloff": librosa.feature.spectral_rolloff(y=y, sr=sr),
        "zero_crossing_rate": librosa.feature.zero_crossing_rate(y),
        "harmony": y_harm,
        "perceptr": y_perc,
    }

    # 3. Tempo
    # Depending on the librosa version the tempo comes back as a scalar or as
    # an array. atleast_1d + ravel handles a scalar, a 0-d array and a
    # 1-element array alike (plain tempo[0] raises IndexError on a 0-d array).
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo).ravel()[0])

    # 4. MFCCs
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # --- BUILD FEATURE DICTIONARY ---
    # Every feature is reduced to its mean and variance.
    features = {
        "filename": filename,
        "length": len(y),
    }
    for prefix, values in summarised.items():
        features[f"{prefix}_mean"] = np.mean(values)
        features[f"{prefix}_var"] = np.var(values)

    features["tempo"] = tempo

    # One mean/var pair per MFCC coefficient, numbered from 1.
    for number, coefficient in enumerate(mfcc, start=1):
        features[f"mfcc{number}_mean"] = np.mean(coefficient)
        features[f"mfcc{number}_var"] = np.var(coefficient)

    features["label"] = label
    return features

def main():
    """
    Build the 30-second feature CSV for the dataset.

    For every genre in GENRES, the ``*.wav`` files found in
    ``DATASET_PATH/<genre>`` are sorted, grouped into consecutive runs of
    DURATION_CHUNKS files, and each complete run is loaded at SAMPLE_RATE,
    concatenated into one signal and passed to ``extract_features``. The
    resulting rows are written to ``banglabeats_features_30sec.csv`` in the
    current working directory.

    Missing genre folders and trailing incomplete runs are reported and
    skipped. If no row was produced at all, a message is printed and no file
    is written.
    """
    data_rows = []

    print(f"Processing dataset at: {DATASET_PATH}...")

    for genre in GENRES:
        genre_path = os.path.join(DATASET_PATH, genre)

        if not os.path.exists(genre_path):
            print(f"Skipping {genre} (Folder not found)")
            continue

        print(f"Processing genre: {genre}")

        # Get all wav files
        files = glob.glob(os.path.join(genre_path, "*.wav"))

        # CRITICAL: Sort files numerically (1.wav, 2.wav, ... 10.wav)
        # Otherwise '10.wav' would come before '2.wav' in a string sort.
        try:
            files.sort(key=lambda f: int(os.path.basename(f).split('.')[0]))
        except ValueError:
            print(f"Warning: Non-numeric filenames found in {genre}. Sorting alphabetically.")
            files.sort()

        # Group the sorted files into consecutive runs of DURATION_CHUNKS:
        # files 1-10 -> song 0, files 11-20 -> song 1, ...
        for start in range(0, len(files), DURATION_CHUNKS):
            chunk_files = files[start : start + DURATION_CHUNKS]

            # Only a full run makes a complete clip; drop a short tail.
            if len(chunk_files) < DURATION_CHUNKS:
                print(f"  Skipping incomplete chunk at index {start} in {genre}")
                continue

            # Load every part at the common sampling rate and stitch them
            # into one array.
            y_30s = np.concatenate(
                [librosa.load(wav_file, sr=SAMPLE_RATE)[0] for wav_file in chunk_files]
            )

            # The CSV filename is built from the genre name as-is and the
            # zero-based position of the run within the genre,
            # e.g. "Adhunik.00000.wav".
            song_id = start // DURATION_CHUNKS
            csv_filename = f"{genre}.{song_id:05d}.wav"

            # Extract Features
            data_rows.append(
                extract_features(y_30s, SAMPLE_RATE, csv_filename, genre)
            )

    # Without any rows the DataFrame has no columns, and selecting the
    # expected ones below would raise KeyError. Report and stop instead.
    if not data_rows:
        print("No complete songs were found; nothing to export.")
        return

    # --- EXPORT TO CSV ---
    # Exact column order of the output file.
    columns = [
        "filename", "length",
        "chroma_stft_mean", "chroma_stft_var",
        "rms_mean", "rms_var",
        "spectral_centroid_mean", "spectral_centroid_var",
        "spectral_bandwidth_mean", "spectral_bandwidth_var",
        "rolloff_mean", "rolloff_var",
        "zero_crossing_rate_mean", "zero_crossing_rate_var",
        "harmony_mean", "harmony_var",
        "perceptr_mean", "perceptr_var",
        "tempo",
    ]
    # Add MFCC columns dynamically to ensure correct order 1-20
    for i in range(1, 21):
        columns.append(f"mfcc{i}_mean")
        columns.append(f"mfcc{i}_var")

    columns.append("label")

    df = pd.DataFrame(data_rows)

    # Reorder columns to match the list above strictly
    df = df[columns]

    output_file = "banglabeats_features_30sec.csv"
    df.to_csv(output_file, index=False)
    print(f"Done! Saved features to {output_file}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()