<a href="https://colab.research.google.com/github/fireHedgehog/music-intrument-OvA-model/blob/main/nsynth_noise_and_EMR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable
# through ordinary filesystem paths.
drive.mount('/content/drive')


Step 2: Load Pretrained Models
Assuming the models are saved in a specific directory within your Google Drive, you can create a model cache like this:

In [None]:
import os

import librosa
import numpy as np
from tensorflow.keras.models import load_model

# Directory (on the mounted Drive) that holds the per-instrument classifiers.
model_dir = '/content/drive/My Drive/path_to_your_models'  # Update this path

# Family id -> instrument name; id 9 has no entry.
label_map = {
    0: 'bass',
    1: 'brass',
    2: 'flute',
    3: 'guitar',
    4: 'keyboard',
    5: 'mallet',
    6: 'organ',
    7: 'reed',
    8: 'string',
    10: 'vocal',
}

# Instrument name -> loaded Keras model, filled only for files that exist.
model_cache = {}
for label, instrument_name in label_map.items():
    model_path = os.path.join(model_dir, f'{instrument_name}_classifier.h5')
    if not os.path.exists(model_path):
        # Report the gap and move on; the cache simply lacks this instrument.
        print(f"Model for {instrument_name} not found.")
        continue
    model_cache[instrument_name] = load_model(model_path)


def audio_to_spectrogram(audio_sample, sr=16000, n_fft=2048, hop_length=512):
    """Return the dB-scaled magnitude spectrogram of ``audio_sample``.

    The short-time Fourier transform is computed with ``n_fft`` and
    ``hop_length``, its magnitude is taken, and the result is converted to
    decibels. ``sr`` is accepted for signature compatibility but is not used
    by the computation.
    """
    stft_magnitude = abs(
        librosa.stft(audio_sample, n_fft=n_fft, hop_length=hop_length)
    )
    return librosa.amplitude_to_db(stft_magnitude)


Step 3: Prepare Dataset Samples


In [None]:
import tensorflow_datasets as tfds

def get_samples_for_each_family(n_samples=100):
    """Collect up to ``n_samples`` raw audio clips for every family in ``label_map``.

    The splits are read in the order validation, test, train, and reading
    stops as soon as every family has reached its quota. Each split is
    scanned at most once: examples are dropped into per-family buckets as
    they stream past, instead of re-scanning the data once per family.

    Parameters:
    - n_samples: Maximum number of clips to keep per family.

    Returns:
    - (samples, labels): two parallel lists. ``samples`` holds the
      ``example['audio']`` arrays grouped by family in ``label_map`` order;
      ``labels`` holds the matching family ids.
    """
    buckets = {family_id: [] for family_id in label_map}

    def _all_quotas_met():
        return all(len(bucket) >= n_samples for bucket in buckets.values())

    for split in ('validation', 'test', 'train'):
        if _all_quotas_met():
            break
        # Splits are loaded lazily so later ones are never opened when the
        # earlier ones already satisfy every quota.
        dataset = tfds.load('nsynth/gansynth_subset', split=split, shuffle_files=True)
        for example in tfds.as_numpy(dataset):
            bucket = buckets.get(int(example['instrument']['family']))
            # Skip families outside label_map and families already full.
            if bucket is None or len(bucket) >= n_samples:
                continue
            bucket.append(example['audio'])
            if _all_quotas_met():
                break

    samples, labels = [], []
    for family_id, bucket in buckets.items():
        samples.extend(bucket)
        labels.extend([family_id] * len(bucket))

    return samples, labels


Step 4: Predict and Analyze


In [None]:
def predict_and_analyze(samples, labels):
    """Classify each audio sample with the one-vs-all models and print metrics.

    Every model in ``model_cache`` scores the sample's spectrogram; the
    family whose model returns the highest score is the prediction.
    Predictions are expressed as family ids (the keys of ``label_map``) so
    they are directly comparable with ``labels``. The position of the
    winning model is not a valid label, because the ids are not contiguous
    and some models may be missing from the cache.

    Parameters:
    - samples: Iterable of raw audio arrays.
    - labels: True family ids, parallel to ``samples``.

    Raises:
    - ValueError: if ``model_cache`` is empty.
    """
    from sklearn.metrics import classification_report, confusion_matrix
    import numpy as np

    if not model_cache:
        raise ValueError("model_cache is empty; load the instrument models first.")

    # Translate "position of the winning model" into its family id.
    family_id_by_name = {name: family_id for family_id, name in label_map.items()}
    model_family_ids = [family_id_by_name[name] for name in model_cache]

    predictions = []
    for sample in samples:
        spectrogram = audio_to_spectrogram(sample)
        # Add batch and channel axes to match the model's expected input.
        model_input = spectrogram[np.newaxis, ..., np.newaxis]

        # Assumes each model emits a single positive-class score per sample
        # -- TODO confirm against the saved classifiers.
        scores = [
            float(np.ravel(model.predict(model_input))[0])
            for model in model_cache.values()
        ]
        predictions.append(model_family_ids[int(np.argmax(scores))])

    # Pin the label order so report rows and matrix axes always line up with
    # the names, even when a family is absent from the data or predictions.
    family_ids = sorted(label_map)
    target_names = [label_map[family_id] for family_id in family_ids]
    print(classification_report(labels, predictions, labels=family_ids,
                                target_names=target_names))
    print(confusion_matrix(labels, predictions, labels=family_ids))

# Gather the per-family audio samples (default: up to 100 each), then score
# them with the cached models and print the report and confusion matrix.
samples, true_labels = get_samples_for_each_family()
predict_and_analyze(samples, true_labels)


Noise:
To extend the code for these experiments, you'll need to overlay your NSynth dataset samples with different types of background noise (dog barks, traffic noise, nature sounds, and human noise) and then use your pre-trained models to predict the instrument classes in these modified audio samples. This process involves a few steps, including acquiring the noise samples, overlaying these noises onto your test samples, and then conducting predictions with your models.

Step 1: Acquire Noise Samples
You'll need to find or record a single sample of each noise type. There are many free sound databases online, such as Freesound, where you might find suitable samples. Make sure any samples you use are free to use, i.e. they carry no copyright restrictions and require no attribution.

Step 2: Overlay Noise on Test Samples
This step involves mixing your noise sample with each of the test samples from the NSynth dataset. The following is a generic function to overlay noise on an audio sample. This function assumes you've loaded your noise samples into variables such as dog_bark_sample, traffic_noise_sample, nature_noise_sample, and human_noise_sample.

In [None]:
def overlay_noise(audio_sample, noise_sample, noise_level=0.5):
    """
    Overlays a noise sample onto an audio sample at a specified level.

    The noise is truncated, or repeated end-to-end, so that it covers exactly
    the length of the audio before the two are summed.

    Parameters:
    - audio_sample: The original audio sample (1-D array-like).
    - noise_sample: The noise sample to overlay (1-D array-like, non-empty).
    - noise_level: The volume level of the noise relative to the audio sample.

    Returns:
    - The audio sample with the noise overlay, as a NumPy array.

    Raises:
    - ValueError: if ``noise_sample`` is empty (it cannot be repeated to fit).
    """
    audio = np.asarray(audio_sample)
    noise = np.asarray(noise_sample)

    if len(noise) == 0:
        raise ValueError("noise_sample must contain at least one value.")

    target_len = len(audio)
    if len(noise) < target_len:
        # Ceiling division: the fewest whole repeats that cover the audio.
        repeat_times = -(-target_len // len(noise))
        noise = np.tile(noise, repeat_times)

    # Mix the audio sample with the length-matched noise.
    return audio + noise_level * noise[:target_len]


Overlay Noise and Prepare Data
First, ensure you have your pre-trained models loaded and accessible in the code. Also, make sure you have your noise samples (dog_bark_sample, traffic_noise_sample, nature_noise_sample, human_noise_sample) ready.

In [None]:
# Stand-in loader for a noise clip - replace the body with real loading code.
def load_noise_sample(file_path):
    """Return a noise sample for ``file_path`` (placeholder implementation).

    ``file_path`` is currently ignored: the function returns 64000 values
    drawn from a standard normal distribution instead of reading the file.
    """
    placeholder_noise = np.random.normal(loc=0, scale=1, size=(64000,))
    return placeholder_noise

# Load your noise samples here.
# NOTE: load_noise_sample is currently a placeholder that ignores the path
# and returns random noise, so none of these files is actually read yet.
dog_bark_sample = load_noise_sample('path/to/dog_bark_sample.wav')
traffic_noise_sample = load_noise_sample('path/to/traffic_noise_sample.wav')
nature_noise_sample = load_noise_sample('path/to/nature_noise_sample.wav')
human_noise_sample = load_noise_sample('path/to/human_noise_sample.wav')

def get_test_samples(n_samples=100):
    """Collect up to ``n_samples`` raw audio clips per instrument family.

    The clips are returned as raw audio, not spectrograms: the noise-overlay
    step mixes 1-D noise into each sample and ``make_predictions`` performs
    the spectrogram conversion itself, so converting here would break the
    overlay and transform the data twice.

    Splits are read in the order train, validation, test, and reading stops
    as soon as every family in ``label_map`` has reached its quota.

    Parameters:
    - n_samples: Maximum number of clips to keep per family.

    Returns:
    - (samples, labels): NumPy arrays. ``samples`` stacks the audio clips
      grouped by family in ``label_map`` order; ``labels`` holds the
      matching family ids.
    """
    audio_per_family = {family_id: [] for family_id in label_map}

    def _all_quotas_met():
        return all(len(clips) >= n_samples for clips in audio_per_family.values())

    for split in ('train', 'validation', 'test'):
        if _all_quotas_met():
            break  # Stop if we've already collected enough samples for each family
        dataset = tfds.load('nsynth/gansynth_subset', split=split, shuffle_files=True)

        for example in tfds.as_numpy(dataset):
            clips = audio_per_family.get(int(example['instrument']['family']))
            # Skip families missing from label_map (e.g., synth_lead) and
            # families that already have enough clips.
            if clips is None or len(clips) >= n_samples:
                continue
            clips.append(example['audio'])
            # Leave the split early instead of streaming the rest of it.
            if _all_quotas_met():
                break

    # Flatten the per-family buckets into parallel sample/label sequences.
    aggregated_samples = []
    aggregated_labels = []
    for family_id, clips in audio_per_family.items():
        aggregated_samples.extend(clips)
        aggregated_labels.extend([family_id] * len(clips))

    return np.array(aggregated_samples), np.array(aggregated_labels)

# Collect up to 100 samples per instrument family; the result is balanced
# when every family has at least that many examples available.
test_samples, test_labels = get_test_samples(n_samples=100)

# Build a noisy copy of a dataset by overlaying one noise clip on every sample.
def prepare_dataset_with_noise(test_samples, noise_sample):
    """Return a list with ``noise_sample`` overlaid on each of ``test_samples``.

    Each sample is mixed with the noise via ``overlay_noise`` at its default
    noise level; the output preserves the input order.
    """
    return [overlay_noise(clean, noise_sample) for clean in test_samples]

# One noisy copy of the test set per noise type, keyed by the noise name.
_noise_sources = (
    ("dog_bark", dog_bark_sample),
    ("traffic_noise", traffic_noise_sample),
    ("nature_noise", nature_noise_sample),
    ("human_noise", human_noise_sample),
)
noisy_datasets = {
    noise_name: prepare_dataset_with_noise(test_samples, noise_clip)
    for noise_name, noise_clip in _noise_sources
}


Predictions and Analysis

In [None]:
def make_predictions(noisy_samples, model_cache):
    """Predict an instrument family id for every noisy audio sample.

    Each model in ``model_cache`` scores the sample's spectrogram and the
    family whose model scores highest is the prediction. The result is
    expressed as family ids (the keys of ``label_map``) rather than the
    position of the winning model, because the ids are not contiguous and
    the cache may not hold a model for every family.

    Parameters:
    - noisy_samples: Iterable of raw audio arrays.
    - model_cache: Mapping of instrument name -> model exposing ``predict``.

    Returns:
    - A list with one predicted family id per sample.

    Raises:
    - ValueError: if ``model_cache`` is empty.
    """
    if not model_cache:
        raise ValueError("model_cache is empty; cannot make predictions.")

    # Map each model's position to its family id; names unknown to label_map
    # fall back to their position.
    family_id_by_name = {name: family_id for family_id, name in label_map.items()}
    model_family_ids = [
        family_id_by_name.get(name, position)
        for position, name in enumerate(model_cache)
    ]

    predictions = []
    for sample in noisy_samples:
        spectrogram = audio_to_spectrogram(sample)  # Convert audio sample to spectrogram
        # Add batch and channel axes to match the model's expected input.
        model_input = spectrogram[np.newaxis, ..., np.newaxis]

        # Assumes each model emits a single positive-class score per sample
        # -- TODO confirm against the saved classifiers.
        pred_probs = np.array([
            float(np.ravel(model.predict(model_input))[0])
            for model in model_cache.values()
        ])

        predictions.append(model_family_ids[int(np.argmax(pred_probs))])

    return predictions

def evaluate_predictions(predictions, true_labels, label_map):
    """Print a classification report and confusion matrix for ``predictions``.

    Parameters:
    - predictions: Predicted numeric labels.
    - true_labels: True numeric labels, parallel to ``predictions``.
    - label_map: Mapping of numeric label -> class name.
    """
    # Imported locally: no module-level import of these names exists in the
    # notebook, so relying on one would raise NameError here.
    from sklearn.metrics import classification_report, confusion_matrix

    # Pass the label ids explicitly so each row is matched to the right name
    # even when a class never appears in the labels or the predictions.
    class_ids = sorted(label_map)
    target_names = [label_map[class_id] for class_id in class_ids]

    print(classification_report(true_labels, predictions, labels=class_ids,
                                target_names=target_names))
    conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    print(conf_matrix)

# `test_labels` holds the numeric family ids (keys of `label_map`) returned by
# get_test_samples, and `noisy_datasets` maps each noise type to its list of
# noisy samples.
# Score every noisy variant of the test set and print its metrics.
for noise_type, noisy_samples in noisy_datasets.items():
    print(f"Evaluating with {noise_type} noise...")
    predictions = make_predictions(noisy_samples, model_cache)
    evaluate_predictions(predictions, test_labels, label_map)


Calculation of the number of EMR matrix data samples:

In [None]:
from math import comb

# Number of samples generated for each distinct instrument combination.
samples_per_class = 10
# Number of instrument families that can take part in a mix.
num_instruments = 10

# A group of size k contains C(num_instruments, k) distinct combinations,
# each contributing samples_per_class samples. Every count is derived from
# the two settings above so changing either one keeps the totals consistent.
solo_samples = comb(num_instruments, 1) * samples_per_class
duo_samples = comb(num_instruments, 2) * samples_per_class
trio_samples = comb(num_instruments, 3) * samples_per_class
quartet_samples = comb(num_instruments, 4) * samples_per_class
quintet_samples = comb(num_instruments, 5) * samples_per_class
sextet_samples = comb(num_instruments, 6) * samples_per_class
septet_samples = comb(num_instruments, 7) * samples_per_class
octet_samples = comb(num_instruments, 8) * samples_per_class
nonet_samples = comb(num_instruments, 9) * samples_per_class
# All instruments together: a single combination.
all_instruments_samples = comb(num_instruments, num_instruments) * samples_per_class

# The "no instrument" case: the single empty combination.
no_instrument_samples = comb(num_instruments, 0) * samples_per_class

# Total number of samples across every group.
total_samples_reduced = sum((
    solo_samples, duo_samples, trio_samples, quartet_samples,
    quintet_samples, sextet_samples, septet_samples, octet_samples,
    nonet_samples, all_instruments_samples, no_instrument_samples,
))

total_samples_reduced

In [None]:
import numpy as np
import librosa
import tensorflow_datasets as tfds
import itertools

# Family names listed by numeric label (tuple index == label). 'synth_lead'
# (label 9) is kept in the tuple only so the indices line up; it is excluded
# from the resulting mapping.
_FAMILY_NAMES_BY_ID = (
    'bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet',
    'organ', 'reed', 'string', 'synth_lead', 'vocal',
)
label_map = {
    family_id: family_name
    for family_id, family_name in enumerate(_FAMILY_NAMES_BY_ID)
    if family_name != 'synth_lead'
}

def audio_to_spectrogram(audio_sample, sr=16000, n_fft=2048, hop_length=512):
    """Convert an audio signal into a decibel-scaled magnitude spectrogram.

    Runs a short-time Fourier transform with the given ``n_fft`` and
    ``hop_length``, takes the magnitude and converts it to dB. The ``sr``
    argument is part of the signature but does not affect the result.
    """
    complex_stft = librosa.stft(audio_sample, n_fft=n_fft, hop_length=hop_length)
    return librosa.amplitude_to_db(abs(complex_stft))

def mix_audios(audios):
    """Return the element-wise sum of a sequence of equally shaped audios.

    No rescaling is applied, so the mix can exceed the amplitude range of
    the individual inputs.
    """
    return np.sum(audios, axis=0)

def load_audio_sample_for_family(family_id, nsynth_dataset):
    """
    Return an ``(audio, sample_rate)`` pair for the requested family.

    Placeholder implementation: both ``family_id`` and ``nsynth_dataset`` are
    ignored and the audio is random noise. Replace the body with logic that
    selects a real clip of the given family from the dataset.
    """
    sample_rate = 16000
    # 64000 values of standard-normal noise = 4 seconds at 16 kHz.
    placeholder_audio = np.random.normal(loc=0, scale=1, size=(64000,))
    return placeholder_audio, sample_rate

def generate_samples_labels(all_sample_size=10):
    """Build spectrograms and multi-hot labels for every instrument combination.

    Three groups are generated, in this order:
    1. "no instrument": ``all_sample_size`` copies of one random-noise
       spectrogram, labelled all zeros.
    2. solo: ``all_sample_size`` samples for each family in ``label_map``.
    3. mixes: one sample for every combination of 2 up to all families.

    Label vectors have one slot per entry of ``label_map``; slot ``i``
    belongs to the i-th smallest family id. Audio is requested by the real
    family id, so the non-contiguous ids (9 is absent, 10 is present) are
    handled correctly instead of assuming ids ``0..len(label_map) - 1``.

    Reads the module-level ``nsynth_dataset`` and passes it on to
    ``load_audio_sample_for_family``.

    Parameters:
    - all_sample_size: Samples per class for the no-instrument and solo groups.

    Returns:
    - (samples, labels): NumPy arrays of spectrograms and multi-hot labels.
    """
    family_ids = sorted(label_map)
    num_classes = len(family_ids)

    multiple_validate_sample = []
    multiple_validate_label = []

    # No instruments scenario: the source audio is identical for every copy,
    # so its spectrogram is computed once rather than once per copy.
    no_instruments_audio = np.random.normal(0, 1, (64000,))  # Assuming 4 seconds of audio at 16kHz
    no_instruments_spectrogram = audio_to_spectrogram(no_instruments_audio)
    for _ in range(all_sample_size):
        multiple_validate_sample.append(no_instruments_spectrogram)
        multiple_validate_label.append([0] * num_classes)

    # Solo instruments: one-hot label at the family's slot.
    for slot, family_id in enumerate(family_ids):
        for _ in range(all_sample_size):
            audio_sample, _sr = load_audio_sample_for_family(family_id, nsynth_dataset)
            label = [0] * num_classes
            label[slot] = 1
            multiple_validate_sample.append(audio_to_spectrogram(audio_sample))
            multiple_validate_label.append(label)

    # Combinations of instruments, from duos up to all families at once.
    # NOTE(review): only one mix is generated per combination, whereas the
    # groups above get all_sample_size samples each -- confirm this is intended.
    for num_instruments in range(2, num_classes + 1):
        for combo in itertools.combinations(range(num_classes), num_instruments):
            audios = []
            label = [0] * num_classes
            for slot in combo:
                audio_sample, _sr = load_audio_sample_for_family(family_ids[slot], nsynth_dataset)
                audios.append(audio_sample)
                label[slot] = 1
            mixed_audio = mix_audios(audios)
            multiple_validate_sample.append(audio_to_spectrogram(mixed_audio))
            multiple_validate_label.append(label)

    return np.array(multiple_validate_sample), np.array(multiple_validate_label)

# Load the NSynth test split. generate_samples_labels reads this module-level
# `nsynth_dataset` and hands it to load_audio_sample_for_family.
nsynth_dataset = tfds.load('nsynth/gansynth_subset', split='test', shuffle_files=True)

# Generate samples and labels (10 samples for each no-instrument/solo class).
samples, labels = generate_samples_labels(all_sample_size=10)