In [1]:
import os

import librosa
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [2]:
def process_single_file(document, sample_rate=22050, n_mfcc=13):
    """
    Process a single audio file from a MongoDB document.

    The audio is loaded at a fixed sampling rate, normalized, trimmed of
    leading/trailing silence, and summarized as one flat feature vector built
    from, in this order: MFCC means, MFCC variances, mel-spectrogram means,
    mel-spectrogram variances, chroma means and spectral-contrast means
    (each statistic is taken along the time axis, i.e. axis=1).

    Args:
        document (dict): A MongoDB document containing 'path' to the audio file
            and, optionally, 'genre'.
        sample_rate (int): Sampling rate every file is resampled to on load, so
            that features are comparable across files. Defaults to 22050 Hz.
        n_mfcc (int): Number of MFCC coefficients to compute. Defaults to 13.

    Returns:
        dict | None: ``{'features': list, 'genre': <value of 'genre'>}`` on
        success, or ``None`` if the document has no usable path or the file
        could not be processed. Failures are reported with ``print`` rather
        than raised, so a batch run can continue past bad files.
    """
    audio_path = document.get('path')
    genre = document.get('genre')

    # Fail fast with a readable message instead of letting the loader choke on None/''.
    if not audio_path:
        print(f"Error processing file {audio_path}: document has no 'path' value")
        return None

    try:
        print(f"Processing file: {audio_path}")

        # Load the audio file with a consistent sampling rate
        y, sr = librosa.load(audio_path, sr=sample_rate)

        # Preprocessing
        # Normalize the audio
        y = librosa.util.normalize(y)

        # Trim silence
        y, _ = librosa.effects.trim(y)

        # Nothing left to analyze: surface a clear reason instead of an
        # obscure failure inside the feature extractors.
        if y.size == 0:
            raise ValueError("no audio samples left after trimming silence")

        # Feature Extraction
        # MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_var = np.var(mfcc, axis=1)

        # Mel Spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
        mel_mean = np.mean(mel_spectrogram, axis=1)
        mel_var = np.var(mel_spectrogram, axis=1)

        # Chroma Features
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)

        # Spectral Contrast
        spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        spectral_contrast_mean = np.mean(spectral_contrast, axis=1)

        # Combine all features into a single vector; the order here defines the
        # feature_<i> column order downstream, so it must stay stable.
        features = np.concatenate([mfcc_mean, mfcc_var, mel_mean, mel_var, chroma_mean, spectral_contrast_mean])

        # Return features and genre
        return {'features': features.tolist(), 'genre': genre}

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch.
        print(f"Error processing file {audio_path}: {e}")
        return None

In [3]:
def process_all_files_to_csv(connection_string, database_name, collection_name, output_csv):
    """
    Process all audio files in the MongoDB collection and save the results to a CSV file.

    Every document in the collection is passed to ``process_single_file``. Each
    successful result becomes one CSV row with columns ``feature_0 .. feature_N``
    followed by ``genre``. Documents that fail are reported and skipped.

    Args:
        connection_string (str): MongoDB connection string.
        database_name (str): MongoDB database name.
        collection_name (str): MongoDB collection name.
        output_csv (str): Path to the output CSV file. Missing parent
            directories are created.

    Returns:
        None. Errors are printed rather than raised. If no document could be
        processed, nothing is written, so an existing CSV is not overwritten
        with an empty one.
    """
    try:
        # Initialize list for processed data
        processed_data = []

        # Using the client as a context manager guarantees it is closed even
        # if iterating the cursor or processing a file raises.
        with MongoClient(connection_string) as client:
            collection = client[database_name][collection_name]

            # Iterate through all documents in the collection
            for document in collection.find():
                result = process_single_file(document)
                if result:
                    # One flat row per file: feature_0..feature_N, then genre
                    row = {f"feature_{i}": val for i, val in enumerate(result['features'])}
                    row["genre"] = result['genre']
                    processed_data.append(row)
                else:
                    print(f"Processing failed for document: {document.get('_id')}")

        if not processed_data:
            print(f"No files were processed successfully; {output_csv} was not written")
            return

        # Make sure the target directory exists so the (expensive) extraction
        # work is not lost to a missing folder at write time.
        output_dir = os.path.dirname(output_csv)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Convert processed data to a DataFrame
        df = pd.DataFrame(processed_data)

        # Save DataFrame to CSV
        df.to_csv(output_csv, index=False)
        print(f"Processed data saved to {output_csv}")

    except Exception as e:
        print(f"Error: {e}")

In [4]:
# SECURITY: database credentials must never be hard-coded in a notebook. The full
# connection string (user name and password included) is read from the
# MONGODB_URI environment variable instead.
# NOTE(review): a password was previously committed in this cell and should be
# rotated, since it remains in the file's history.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string")

database_name = "Music_Genre"
collection_name = "Music_Genre_Classifier"
output_csv = "Data/processed_music_data.csv"

process_all_files_to_csv(connection_string, database_name, collection_name, output_csv)

Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00000.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00001.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00002.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00003.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00004.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00005.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\blues\blues.00006.wav
Processing file: C:\Users\bkhuu\Portfolio\projec

  y, sr = librosa.load(audio_path, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00056.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00057.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00058.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00059.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00060.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00061.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_projects\Music_genre_classifier\Data\genres_original\jazz\jazz.00062.wav
Processing file: C:\Users\bkhuu\Portfolio\projects\portfolio_p