In [1]:
# !pip install tensorflow librosa pydub

# Notes About this file
This notebook builds on an existing model and example code that I found online. I wasn't sure how to use TensorFlow with audio, so I worked through that example and modified it as I came to understand it.
The main purpose of this notebook is to create a saved `trained_model.pkl` that I can load and use to classify instruments. I explored existing Hugging Face models for instrument recognition, but wanted a better understanding of the process.

In [2]:
import os
import librosa
import numpy as np
import csv
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile
from scipy import signal
from pydub import AudioSegment
from IPython import display

from sklearn.preprocessing import MultiLabelBinarizer

import shutil
# source tf-env/bin/activate


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Root directories used throughout the notebook (relative to the working directory).
# output_dir receives every generated artifact (labels, converted audio, embeddings);
# input_dir is where the MusicNet label CSVs and WAV files are read from.
# NOTE: the cell that follows empties output_dir.
output_dir = './output'
input_dir = './data/instrument/musicnet'

In [None]:
# Start from an empty output directory so artifacts from earlier runs cannot leak in.
# The directory is created first: on a fresh checkout it does not exist yet and
# os.listdir would raise FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
for entry_name in os.listdir(output_dir):
    entry_path = os.path.join(output_dir, entry_name)
    if os.path.isdir(entry_path) and not os.path.islink(entry_path):
        shutil.rmtree(entry_path)  # Remove directory and its contents
    else:
        # Regular files and symlinks (rmtree refuses to operate on a symlink)
        os.remove(entry_path)

In [None]:
# Load the pretrained YAMNet model from TensorFlow Hub.
# NOTE: `model` refers to YAMNet for the rest of the notebook; the classifier trained later is `clf`.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
# Utility function for loading a WAV file and making sure the sample rate is correct.
# NOTE(review): a second, librosa-based load_wav_16k_mono is defined further down in this
# notebook; once that cell has run, the name refers to the later definition and this
# TensorFlow version is no longer reachable.
@tf.function
def load_wav_16k_mono(filename):
    """Load a WAV file as a single-channel float tensor resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The decoded waveform with the channel axis removed, resampled to 16000 Hz.
    """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    # Drop the trailing channel axis that desired_channels=1 leaves behind.
    wav = tf.squeeze(wav, axis=-1)
    # The sample rate is cast to int64 before it is handed to the tfio resampler.
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

In [None]:
# Read the class map that ships with the YAMNet model and collect its display names.
class_map_path = model.class_map_path().numpy().decode('utf-8')
class_map = pd.read_csv(class_map_path)
sound_names = list(class_map['display_name'])

# Preview the first 20 sound names.
for display_name in sound_names[:20]:
    print(display_name)
print('...')

Speech
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
...


In [None]:
# Mapping of MusicNet instrument indexes to their instrument names
musicnet_instruments_map = {
1: "Acoustic Grand Piano",
2: "Bright Acoustic Piano",
3: "Electric Grand Piano",
4: "Honky-tonk Piano",
5: "Electric Piano 1",
6: "Electric Piano 2",
7: "Harpsichord",
8: "Clavi",
9: "Celesta",
10: "Glockenspiel",
11: "Music Box",
12: "Vibraphone",
13: "Marimba",
14: "Xylophone",
15: "Tubular Bells",
16: "Dulcimer",
17: "Drawbar Organ",
18: "Percussive Organ",
19: "Rock Organ",
20: "Church Organ",
21: "Reed Organ",
22: "Accordion",
23: "Harmonica",
24: "Tango Accordion",
25: "Acoustic Guitar (nylon)",
26: "Acoustic Guitar (steel)",
27: "Electric Guitar (jazz)",
28: "Electric Guitar (clean)",
29: "Electric Guitar (muted)",
30: "Overdriven Guitar",
31: "Distortion Guitar",
32: "Guitar harmonics",
33: "Acoustic Bass",
34: "Electric Bass (finger)",
35: "Electric Bass (pick)",
36: "Fretless Bass",
37: "Slap Bass 1",
38: "Slap Bass 2",
39: "Synth Bass 1",
40: "Synth Bass 2",
41: "Violin",
42: "Viola",
43: "Cello",
44: "Contrabass",
45: "Tremolo Strings",
46: "Pizzicato Strings",
47: "Orchestral Harp",
48: "Timpani",
49: "String Ensemble 1",
50: "String Ensemble 2",
51: "SynthStrings 1",
52: "SynthStrings 2",
53: "Choir Aahs",
54: "Voice Oohs",
55: "Synth Voice",
56: "Orchestra Hit",
57: "Trumpet",
58: "Trombone",
59: "Tuba",
60: "Muted Trumpet",
61: "French Horn",
62: "Brass Section",
63: "SynthBrass 1",
64: "SynthBrass 2",
65: "Soprano Sax",
66: "Alto Sax",
67: "Tenor Sax",
68: "Baritone Sax",
69: "Oboe",
70: "English Horn",
71: "Bassoon",
72: "Clarinet",
73: "Piccolo",
74: "Flute",
75: "Recorder",
76: "Pan Flute",
77: "Blown Bottle",
78: "Shakuhachi",
79: "Whistle",
80: "Ocarina",
81: "Lead 1 (square)",
82: "Lead 2 (sawtooth)",
83: "Lead 3 (calliope)",
84: "Lead 4 (chiff)",
85: "Lead 5 (charang)",
86: "Lead 6 (voice)",
87: "Lead 7 (fifths)",
88: "Lead 8 (bass + lead)",
89: "Pad 1 (new age)",
90: "Pad 2 (warm)",
91: "Pad 3 (polysynth)",
92: "Pad 4 (choir)",
93: "Pad 5 (bowed)",
94: "Pad 6 (metallic)",
95: "Pad 7 (halo)",
96: "Pad 8 (sweep)",
97: "FX 1 (rain)",
98: "FX 2 (soundtrack)",
99: "FX 3 (crystal)",
100: "FX 4 (atmosphere)",
101: "FX 5 (brightness)",
102: "FX 6 (goblins)",
103: "FX 7 (echoes)",
104: "FX 8 (sci-fi)",
105: "Sitar",
106: "Banjo",
107: "Shamisen",
108: "Koto",
109: "Kalimba",
110: "Bag pipe",
111: "Fiddle",
112: "Shanai",
113: "Tinkle Bell",
114: "Agogo",
115: "Steel Drums",
116: "Woodblock",
117: "Taiko Drum",
118: "Melodic Tom",
119: "Synth Drum",
120: "Reverse Cymbal",
121: "Guitar Fret Noise",
122: "Breath Noise",
123: "Seashore",
124: "Bird Tweet",
125: "Telephone Ring",
126: "Helicopter",
127: "Applause",
128: "Gunshot"
}

In [None]:
def update_unique_instrument_indexs(file_directory, unique_instruments):
    """Add every instrument id found in a directory of label CSVs to a set.

    Args:
        file_directory: Directory whose ``.csv`` files are read; other files are ignored.
        unique_instruments: Set that is updated in place with the distinct values of
            each file's ``instrument`` column.

    Returns:
        None. The result is delivered through ``unique_instruments``.
    """
    csv_names = [name for name in os.listdir(file_directory) if name.endswith('.csv')]
    for csv_name in csv_names:
        label_frame = pd.read_csv(os.path.join(file_directory, csv_name))
        unique_instruments.update(label_frame['instrument'].unique())

# Collect the instrument ids that occur in the train and test label files.
train_labels_directory = f"{input_dir}/train_labels"
test_labels_directory = f"{input_dir}/test_labels"

unique_instrument_indexes = set()
for labels_directory in (train_labels_directory, test_labels_directory):
    update_unique_instrument_indexs(labels_directory, unique_instrument_indexes)

print(unique_instrument_indexes)

{1, 69, 7, 72, 41, 42, 43, 71, 74, 44, 61}


In [None]:
# Keep only the instruments whose index actually occurs in the label files.
filtered_musicnet_instruments_map = {
    instrument_id: instrument_name
    for instrument_id, instrument_name in musicnet_instruments_map.items()
    if instrument_id in unique_instrument_indexes
}

print(filtered_musicnet_instruments_map)

{1: 'Acoustic Grand Piano', 7: 'Harpsichord', 41: 'Violin', 42: 'Viola', 43: 'Cello', 44: 'Contrabass', 61: 'French Horn', 69: 'Oboe', 71: 'Bassoon', 72: 'Clarinet', 74: 'Flute'}


In [None]:
# Simplify the piano label so that it can match the corresponding YAMNet sound name
filtered_musicnet_instruments_map[1] = 'Piano'

# For every instrument, take the first YAMNet sound name that contains the
# instrument name (case-insensitive). Instruments without a match are left out.
mapped_instruments = {}
for instrument_id, instrument_name in filtered_musicnet_instruments_map.items():
    needle = instrument_name.lower()
    first_match = next(
        (candidate for candidate in sound_names if needle in candidate.lower()),
        None,
    )
    if first_match is not None:
        mapped_instruments[instrument_id] = first_match

print("Mapped Instruments:")
for instrument_id, yamnet_name in mapped_instruments.items():
    print(f"{instrument_id}: {yamnet_name}")
print(mapped_instruments)

Mapped Instruments:
1: Piano
7: Harpsichord
41: Violin, fiddle
43: Cello
61: French horn
72: Clarinet
74: Flute
{1: 'Piano', 7: 'Harpsichord', 41: 'Violin, fiddle', 43: 'Cello', 61: 'French horn', 72: 'Clarinet', 74: 'Flute'}


In [None]:
# Manually map the instruments whose names do not match any YAMNet sound name
mapped_instruments.update({
    42: 'Violin, fiddle',
    44: 'Double bass',
    69: 'Wind instrument, woodwind instrument',
    71: 'Wind instrument, woodwind instrument',
})

# Print the resulting mapping
print("Mapped Sounds:")
for instrument_id, yamnet_name in mapped_instruments.items():
    print(f"{instrument_id}: {yamnet_name}")

Mapped Sounds:
1: Piano
7: Harpsichord
41: Violin, fiddle
43: Cello
61: French horn
72: Clarinet
74: Flute
42: Violin, fiddle
44: Double bass
69: Wind instrument, woodwind instrument
71: Wind instrument, woodwind instrument


In [None]:
def process_csv_files(input_directory, output_directory, mapped_instruments):
    """Copy label CSVs to another directory with an added ``target`` column.

    Args:
        input_directory: Directory whose ``.csv`` files are read; other files are ignored.
        output_directory: Directory the augmented files are written to under the same
            file names. It is created if it does not exist.
        mapped_instruments: Mapping from the values of the ``instrument`` column to the
            name stored in ``target``. Values without an entry become missing (NaN).

    Returns:
        None.
    """
    os.makedirs(output_directory, exist_ok=True)

    csv_names = [name for name in os.listdir(input_directory) if name.endswith('.csv')]
    for csv_name in csv_names:
        label_frame = pd.read_csv(os.path.join(input_directory, csv_name))
        label_frame['target'] = label_frame['instrument'].map(mapped_instruments)
        label_frame.to_csv(os.path.join(output_directory, csv_name), index=False)

# Write the label files, with the added target column, for both splits.
train_labels_directory = f"{input_dir}/train_labels"
test_labels_directory = f"{input_dir}/test_labels"

output_train_labels_directory = f"{output_dir}/labels/train_labels"
output_test_labels_directory = f"{output_dir}/labels/test_labels"

for source_directory, destination_directory in (
    (train_labels_directory, output_train_labels_directory),
    (test_labels_directory, output_test_labels_directory),
):
    process_csv_files(source_directory, destination_directory, mapped_instruments)

In [None]:
def convert_to_pcm(input_path, output_path):
    """Re-encode one audio file as a 16 kHz, mono, 16-bit WAV file.

    Conversion is best-effort: any exception is reported with ``print`` and
    swallowed so that a batch run can continue with the next file.

    Args:
        input_path: Audio file to read.
        output_path: Destination of the converted WAV file.

    Returns:
        ``output_path`` on success, ``None`` if the conversion failed.
    """
    try:
        segment = AudioSegment.from_file(input_path)

        # 16 kHz frame rate, one channel, 2 bytes (16 bits) per sample
        segment = segment.set_frame_rate(16000)
        segment = segment.set_channels(1)
        segment = segment.set_sample_width(2)

        segment.export(output_path, format="wav")
        print(f"Successfully converted {input_path} to {output_path}")
    except Exception as e:
        print(f"Error converting {input_path}: {e}")
        return None
    else:
        return output_path

def batch_convert_to_pcm(input_dir, output_dir):
    """Convert every ``.wav`` file of a directory with ``convert_to_pcm``.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        output_dir: Directory the converted files are written to as
            ``<original stem>_pcm.wav``. It is created if it does not exist.

    Returns:
        None. Files that fail to convert are reported and skipped.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    wav_names = [name for name in os.listdir(input_dir) if name.endswith('.wav')]
    for wav_name in wav_names:
        stem, _ = os.path.splitext(wav_name)
        source_path = os.path.join(input_dir, wav_name)
        target_path = os.path.join(output_dir, f"{stem}_pcm.wav")

        # convert_to_pcm signals failure by returning None
        if convert_to_pcm(source_path, target_path) is None:
            print(f"Skipping file: {wav_name} due to conversion error.")
                
# Convert the test split
input_test_directory = f"{input_dir}/test_data"
output_test_directory = f"{output_dir}/converted_audio/test_data"
batch_convert_to_pcm(input_test_directory, output_test_directory)

# Convert the training split
input_train_directory = f"{input_dir}/train_data"
output_train_directory = f"{output_dir}/converted_audio/training_data"
batch_convert_to_pcm(input_train_directory, output_train_directory)

Successfully converted ../instrument/musicnet/test_data/2416.wav to ./output/converted_audio/test_data/2416_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2628.wav to ./output/converted_audio/test_data/2628_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2303.wav to ./output/converted_audio/test_data/2303_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2298.wav to ./output/converted_audio/test_data/2298_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2106.wav to ./output/converted_audio/test_data/2106_pcm.wav
Successfully converted ../instrument/musicnet/test_data/1819.wav to ./output/converted_audio/test_data/1819_pcm.wav
Successfully converted ../instrument/musicnet/test_data/1759.wav to ./output/converted_audio/test_data/1759_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2382.wav to ./output/converted_audio/test_data/2382_pcm.wav
Successfully converted ../instrument/musicnet/test_data/2556.wav to ./ou

In [None]:
def split_audio_into_chunks(audio, chunk_size=3, sample_rate=16000):
    """Split audio into consecutive chunks of ``chunk_size`` seconds.

    Every chunk holds exactly ``chunk_size * sample_rate`` samples, except the
    last one, which holds whatever remains. (The previous implementation used
    ``np.array_split(audio, n + 1)``, which produces equally sized pieces that are
    shorter than requested, e.g. three 2-second pieces for 6 seconds of audio.)

    Args:
        audio: One-dimensional sequence of samples.
        chunk_size: Chunk duration in seconds; fractional values are allowed.
        sample_rate: Number of samples per second.

    Returns:
        A list of NumPy arrays in playback order. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size * sample_rate`` rounds to less than one sample.
    """
    audio = np.asarray(audio)
    samples_per_chunk = int(round(chunk_size * sample_rate))
    if samples_per_chunk <= 0:
        raise ValueError(
            f"chunk_size * sample_rate must be at least one sample, got {chunk_size} * {sample_rate}"
        )
    return [
        audio[start:start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]

In [None]:
def extract_embeddings(audio_chunk, yamnet_model):
    """Run the YAMNet model on one chunk of audio.

    Args:
        audio_chunk: Samples of the chunk; converted to a float32 tensor as-is.
        yamnet_model: Callable model that returns three outputs, of which the
            first two are used as scores and embeddings.

    Returns:
        A tuple ``(embeddings, scores)`` of NumPy arrays (note the order).
    """
    waveform = tf.convert_to_tensor(audio_chunk, dtype=tf.float32)
    # The third model output is not needed here.
    class_scores, frame_embeddings, _unused = yamnet_model(waveform)
    return frame_embeddings.numpy(), class_scores.numpy()

In [None]:
def load_wav_16k_mono(filename):
    """Load an audio file with librosa as mono audio at 16 kHz.

    Args:
        filename: Path of the audio file.

    Returns:
        The waveform returned by ``librosa.load``; the sample rate is discarded.
    """
    waveform, _ = librosa.load(filename, sr=16000, mono=True)
    return waveform

def process_and_save_embeddings(dataset_path, yamnet_model, output_dir, chunk_size=3):
    """Compute YAMNet embeddings for every WAV file of a directory and pickle them.

    For each ``<name>.wav`` in ``dataset_path`` a file ``<name>.wav_embeddings.pkl``
    is written to ``output_dir``. It contains a list with one dict per non-empty
    chunk, with the keys ``chunk_index``, ``embeddings`` and ``scores``.

    Args:
        dataset_path: Directory that is scanned (non-recursively) for ``.wav`` files.
        yamnet_model: Model passed through to ``extract_embeddings``.
        output_dir: Destination directory; created if it does not exist.
        chunk_size: Chunk duration in seconds passed to ``split_audio_into_chunks``.

    Returns:
        None.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    wav_paths = [
        os.path.join(dataset_path, name)
        for name in os.listdir(dataset_path)
        if name.endswith('.wav')
    ]

    for wav_path in wav_paths:
        print(f"Processing {wav_path}...")
        waveform = load_wav_16k_mono(wav_path)  # Use the librosa-based function

        per_chunk_results = []
        for position, chunk in enumerate(split_audio_into_chunks(waveform, chunk_size=chunk_size)):
            if len(chunk) == 0:
                continue
            chunk_embeddings, chunk_scores = extract_embeddings(chunk, yamnet_model)
            per_chunk_results.append({
                "chunk_index": position,
                "embeddings": chunk_embeddings,
                "scores": chunk_scores,
            })

        pickle_name = f"{os.path.basename(wav_path)}_embeddings.pkl"
        with open(os.path.join(output_dir, pickle_name), 'wb') as handle:
            pickle.dump(per_chunk_results, handle)

In [None]:

# Converted audio (input) and embeddings (output) locations for both splits
training_data_path = f"{output_dir}/converted_audio/training_data"
training_output_dir = f"{output_dir}/embeddings/training_data"

test_data_path = f"{output_dir}/converted_audio/test_data"
test_output_dir = f"{output_dir}/embeddings/test_data"

for split_audio_dir, split_embeddings_dir in (
    (training_data_path, training_output_dir),
    (test_data_path, test_output_dir),
):
    process_and_save_embeddings(split_audio_dir, model, split_embeddings_dir, chunk_size=3)

Processing ./output/converted_audio/training_data/2319_pcm.wav...
Processing ./output/converted_audio/training_data/2424_pcm.wav...
Processing ./output/converted_audio/training_data/2117_pcm.wav...
Processing ./output/converted_audio/training_data/2213_pcm.wav...
Processing ./output/converted_audio/training_data/2203_pcm.wav...
Processing ./output/converted_audio/training_data/1859_pcm.wav...
Processing ./output/converted_audio/training_data/1931_pcm.wav...
Processing ./output/converted_audio/training_data/2138_pcm.wav...
Processing ./output/converted_audio/training_data/2336_pcm.wav...
Processing ./output/converted_audio/training_data/1771_pcm.wav...
Processing ./output/converted_audio/training_data/2573_pcm.wav...
Processing ./output/converted_audio/training_data/2295_pcm.wav...
Processing ./output/converted_audio/training_data/2285_pcm.wav...
Processing ./output/converted_audio/training_data/1876_pcm.wav...
Processing ./output/converted_audio/training_data/2501_pcm.wav...
Processing

In [None]:
def align_embeddings_with_labels(embeddings, labels, target_column, class_list):
    """Pair per-chunk mean embeddings with label rows and one-hot encode the labels.

    Chunk ``i`` is paired with row ``i`` of ``labels``; whichever of the two is
    longer is truncated to the length of the shorter one.
    NOTE(review): the pairing is purely positional. Whether label row ``i`` really
    describes chunk ``i`` depends on the layout of the label files - confirm.

    Args:
        embeddings: Sequence of 2-D arrays, one per chunk; each is averaged over axis 0.
        labels: DataFrame holding ``target_column``.
        target_column: Name of the column to encode.
        class_list: Classes, in the column order of the returned label matrix.
            Classes missing from ``labels`` produce all-zero columns.

    Returns:
        A tuple ``(features, label_matrix)``: one mean embedding per row and the
        matching 0/1 integer indicator matrix with ``len(class_list)`` columns.
    """
    # One feature row per chunk
    per_chunk_means = [chunk.mean(axis=0) for chunk in embeddings]
    feature_matrix = np.vstack(per_chunk_means)

    # Truncate both sides to the common length
    n_rows = min(len(feature_matrix), len(labels))
    feature_matrix = feature_matrix[:n_rows]
    kept_labels = labels.iloc[:n_rows]

    # One indicator column per class of the unified class list
    indicator_frame = pd.get_dummies(kept_labels[target_column])
    indicator_frame = indicator_frame.reindex(columns=class_list, fill_value=0)

    return feature_matrix, indicator_frame.astype(int).values

def _recording_id_from_embedding_file(filename):
    """Return the recording id of an embeddings pickle, e.g. '1759_pcm.wav_embeddings.pkl' -> '1759'."""
    stem = filename
    # Strip, in order, the suffixes added by the embedding and PCM-conversion steps.
    for suffix in ('_embeddings.pkl', '.pkl', '.wav', '_pcm'):
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
    return stem


# Preprocess all data
def preprocess_yamnet_data(embeddings_dir, labels_dir, target_column='instrument', class_list=None):
    """Build the feature and label matrices for a whole split.

    Embedding pickles and label CSVs are paired by recording id
    (``<id>_pcm.wav_embeddings.pkl`` with ``<id>.csv``) rather than by their
    position in two separately sorted listings, so a missing file on either side
    can no longer shift every following pair. Recordings that lack one of the two
    files are reported and skipped.

    Within a recording, chunk ``i`` is paired with label row ``i`` and both sides
    are truncated to the shorter length.

    Args:
        embeddings_dir: Directory with the ``.pkl`` files written by
            ``process_and_save_embeddings``.
        labels_dir: Directory with the label ``.csv`` files.
        target_column: Label column to one-hot encode.
        class_list: Column order of the label matrix. When ``None`` it is built
            from the sorted distinct values of ``target_column`` in all label files.

    Returns:
        A tuple ``(X, y, class_list)``: stacked mean embeddings, the matching 0/1
        integer label matrix, and the class list that defines the columns of ``y``.

    Raises:
        ValueError: If no recording has both an embedding file and a label file.
    """
    embedding_by_id = {
        _recording_id_from_embedding_file(name): os.path.join(embeddings_dir, name)
        for name in os.listdir(embeddings_dir)
        if name.endswith('.pkl')
    }
    label_by_id = {
        os.path.splitext(name)[0]: os.path.join(labels_dir, name)
        for name in os.listdir(labels_dir)
        if name.endswith('.csv')
    }

    # If no class_list is provided (for training), create it
    if class_list is None:
        class_set = set()
        for lbl_file in label_by_id.values():
            class_set.update(pd.read_csv(lbl_file)[target_column].unique())
        class_list = sorted(class_set)  # Sort for consistent ordering

    unmatched_ids = sorted(set(embedding_by_id) ^ set(label_by_id))
    if unmatched_ids:
        print(f"Skipping recordings without both an embedding file and a label file: {unmatched_ids}")

    all_embeddings = []
    all_labels = []
    for recording_id in sorted(set(embedding_by_id) & set(label_by_id)):
        # SECURITY: pickle.load executes code from the file; only load pickles
        # produced by this notebook.
        with open(embedding_by_id[recording_id], 'rb') as f:
            embeddings = pickle.load(f)
        embedding_chunks = [chunk['embeddings'] for chunk in embeddings]
        if not embedding_chunks:
            print(f"Skipping recording {recording_id}: its embedding file contains no chunks.")
            continue

        # Pair chunk i with label row i and truncate to the common length
        labels = pd.read_csv(label_by_id[recording_id])
        combined_embeddings = np.vstack([chunk.mean(axis=0) for chunk in embedding_chunks])
        min_length = min(len(combined_embeddings), len(labels))
        combined_embeddings = combined_embeddings[:min_length]
        labels = labels.iloc[:min_length]

        # Create binary label matrix aligned with the class_list (0/1 integers,
        # consistent with align_embeddings_with_labels)
        label_binarized = pd.get_dummies(labels[target_column])
        label_binarized = label_binarized.reindex(columns=class_list, fill_value=0).astype(int).values

        all_embeddings.append(combined_embeddings)
        all_labels.append(label_binarized)

    if not all_embeddings:
        raise ValueError(
            f"No matching embedding/label pairs found in {embeddings_dir!r} and {labels_dir!r}"
        )

    # Combine all processed data
    X = np.vstack(all_embeddings)
    y = np.vstack(all_labels)
    return X, y, class_list

In [None]:
# Build the training feature/label matrices from the saved embeddings and label files.
# (The embeddings path previously contained a stray double slash.)
train_embeddings = output_dir+'/embeddings/training_data'
train_labels = output_dir+'/labels/train_labels'
X_train, y_train, classes = preprocess_yamnet_data(train_embeddings, train_labels)

print(f"Feature matrix shape: {X_train.shape}")
print(f"Number of labels: {len(y_train)}")
print(f"Classes: {classes}")

Feature matrix shape: (40625, 1024)
Number of labels: 40625
Classes: [1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74]


In [None]:
# Build the test matrices with the training class order so that the label columns
# of y_test line up with those of y_train.
test_embeddings = output_dir+'/embeddings/test_data'
test_labels = output_dir+'/labels/test_labels'
X_test, y_test,classes_test = preprocess_yamnet_data(test_embeddings, test_labels, target_column='instrument', class_list=classes)

print(f"Feature matrix shape: {X_test.shape}")
print(f"Number of labels: {len(y_test)}")
print(f"Classes: {classes_test}")

# preprocess_yamnet_data already orders the label columns by `classes`, so no
# re-alignment is needed here (the previous DataFrame/reindex round trip was a no-op
# and raised IndexError on an empty test set). Verify the invariant instead.
assert y_test.shape[1] == len(classes), "y_test columns do not match the training classes"

print(f"Aligned Feature matrix shape: {X_test.shape}")
print(f"Aligned Label matrix shape: {y_test.shape}")
print(f"Classes (aligned to training): {classes_test}")

Feature matrix shape: (499, 1024)
Number of labels: 499
Classes: [1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74]
Aligned Feature matrix shape: (499, 1024)
Aligned Label matrix shape: (499, 11)
Classes (aligned to training): [1, 7, 41, 42, 43, 44, 61, 69, 71, 72, 74]


In [None]:
# Cache the processed training arrays on disk so they need not be recomputed each time
save_dir = output_dir+'/processed_embeddings/'

# Ensure the directory exists
os.makedirs(save_dir, exist_ok=True)

for array_file_name, array_to_save in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('classes.npy', classes),
):
    np.save(save_dir + array_file_name, array_to_save)

In [None]:
# Cache the processed test arrays as well, to avoid recomputing them each time.
# Relies on save_dir, which is defined (and created) in the previous cell.
np.save(save_dir + 'X_test.npy', X_test)
np.save(save_dir + 'y_test.npy', y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# One random forest per label column, wrapped in a one-vs-rest classifier
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
clf = OneVsRestClassifier(base_forest)

# The label matrix may contain booleans; fit on integers instead
y_train = np.asarray(y_train).astype(int)
clf.fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


In [24]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report

# Column j of y_test / y_pred corresponds to classes[j]. The report names therefore
# have to follow `classes`; the insertion order of mapped_instruments is different
# (the manually mapped ids were appended last), which mislabelled the report rows.
# The id is included because several instruments share one YAMNet name.
class_names_array = list(classes)
report_names = [f"{mapped_instruments.get(cls, 'unmapped')} ({cls})" for cls in class_names_array]
print(mapped_instruments)
# Predict
y_pred = clf.predict(X_test)

# Ensure consistent shapes
print(f"Aligned y_test shape: {y_test.shape}")
print(f"y_pred shape: {y_pred.shape}")

y_test = y_test.astype(int)

print("Unique values in y_test:", np.unique(y_test))
print("Unique values in y_pred:", np.unique(y_pred))

# Hamming Loss
print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.2f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=report_names))

{1: 'Piano', 7: 'Harpsichord', 41: 'Violin, fiddle', 43: 'Cello', 61: 'French horn', 72: 'Clarinet', 74: 'Flute'}


NameError: name 'clf' is not defined

In [None]:
# Save the trained classifier so that it does not have to be retrained each time;
# it is reloaded further down to show how that works.
with open("trained_model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [61]:
# Inspect the predicted label matrix.
print(y_pred)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [62]:
def preprocess_single_yamnet_file(embedding_file, label_file, target_column='instrument', class_list=None):
    """Build the feature and label matrices for a single recording.

    Uses the modules imported at the top of the notebook; the function-local
    re-imports (one of them unused) were removed.

    Args:
        embedding_file: Pickle written by ``process_and_save_embeddings``: a list of
            dicts that each hold an ``embeddings`` array.
        label_file: Label CSV of the same recording.
        target_column: Label column to one-hot encode.
        class_list: Column order of the label matrix. When ``None`` it is inferred
            from the sorted distinct values in ``label_file``; pass the training
            classes to obtain columns that match the trained classifier.

    Returns:
        A tuple ``(features, label_matrix, class_list)`` as produced by
        ``align_embeddings_with_labels``, plus the class list that was used.
    """
    # SECURITY: pickle.load executes code from the file; only load pickles
    # produced by this notebook.
    with open(embedding_file, 'rb') as f:
        embeddings = pickle.load(f)
    embedding_chunks = [chunk['embeddings'] for chunk in embeddings]

    # Load labels
    labels = pd.read_csv(label_file)

    # If no class_list is provided, infer it from the label file
    if class_list is None:
        class_list = sorted(labels[target_column].unique())  # Sort for consistent ordering

    # Align embeddings with labels
    combined_embeddings, label_binarized = align_embeddings_with_labels(
        embedding_chunks, labels, target_column, class_list
    )

    return combined_embeddings, label_binarized, class_list

# Example: classify the chunks of one test recording (1759) with the trained classifier.
fileName = output_dir+'/embeddings/test_data/1759_pcm.wav_embeddings.pkl'
labelFileName = output_dir+'/labels/test_labels/1759.csv'
# The training `classes` are passed so that the label columns match the classifier's outputs.
x_values, y_values, class_list = preprocess_single_yamnet_file(fileName,labelFileName, 'instrument', classes)

y_pred = clf.predict(x_values)

In [63]:
def get_instruments(y_values, classes, mapped_instruments):
    """Translate rows of a binary label matrix into instrument names.

    Args:
        y_values: Iterable of rows; a value of 1 in column ``j`` marks class
            ``classes[j]`` as present.
        classes: Instrument ids, in the column order of ``y_values``.
        mapped_instruments: Mapping from instrument id to display name. Ids that
            are present in a row but missing here raise ``KeyError``.

    Returns:
        A tuple ``(sample_by_sample, aggregated_instruments)``: a list with one
        NumPy array of sorted, de-duplicated names per row, and the sorted list of
        all names that occur in any row.
    """
    per_row_names = []
    seen_names = set()

    for indicator_row in y_values:
        # Columns flagged as present in this row
        active_columns = np.where(indicator_row == 1)[0]

        # Column -> instrument id -> name; np.unique sorts and removes duplicates
        row_names = np.unique([mapped_instruments[classes[column]] for column in active_columns])

        per_row_names.append(row_names)
        seen_names.update(row_names)

    return per_row_names, sorted(seen_names)

sample_by_sample_predicted, aggregated_predicted_instruments = get_instruments(y_pred,classes,mapped_instruments)
sample_by_sample_values, aggregated_instruments = get_instruments(y_values,classes,mapped_instruments)

print("Predicted instruments:", ', '.join(aggregated_predicted_instruments))
print("Correct instruments:", ', '.join(aggregated_instruments))


for idx, (predicted, correct) in enumerate(zip(sample_by_sample_predicted, sample_by_sample_values)):
    # A chunk can carry several labels, so show all of them rather than only the first.
    # '; ' is used as separator because instrument names themselves contain commas.
    predicted_value = '; '.join(predicted) if predicted.size > 0 else "No instrument"
    correct_value = '; '.join(correct) if correct.size > 0 else "No instrument"

    print(f"Chunk {idx + 1}: Predicted: {predicted_value} | Correct: {correct_value}")

Pedicted instruments: Piano
Correct instriments: Piano
Chunk 1: Predicted: Piano | Correct: Piano
Chunk 2: Predicted: Piano | Correct: Piano
Chunk 3: Predicted: Piano | Correct: Piano
Chunk 4: Predicted: Piano | Correct: Piano
Chunk 5: Predicted: Piano | Correct: Piano
Chunk 6: Predicted: Piano | Correct: Piano
Chunk 7: Predicted: Piano | Correct: Piano
Chunk 8: Predicted: Piano | Correct: Piano
Chunk 9: Predicted: Piano | Correct: Piano
Chunk 10: Predicted: Piano | Correct: Piano
Chunk 11: Predicted: Piano | Correct: Piano
Chunk 12: Predicted: Piano | Correct: Piano
Chunk 13: Predicted: Piano | Correct: Piano
Chunk 14: Predicted: Piano | Correct: Piano
Chunk 15: Predicted: Piano | Correct: Piano
Chunk 16: Predicted: Piano | Correct: Piano
Chunk 17: Predicted: Piano | Correct: Piano
Chunk 18: Predicted: Piano | Correct: Piano
Chunk 19: Predicted: Piano | Correct: Piano
Chunk 20: Predicted: Piano | Correct: Piano
Chunk 21: Predicted: Piano | Correct: Piano
Chunk 22: Predicted: Piano | C

In [64]:
# Load the saved model.
# Only unpickle files created by this notebook: pickle.load can execute arbitrary code.
with open("trained_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)


Now that the model is trained, we can pickle and save it so that it can be reused without repeating the training process from the start.
Below I show how the reloaded model can be used on different test files.

In [None]:
# Recording id of the test file to classify with the reloaded model
num = '1819'
fileName = output_dir+f'/embeddings/test_data/{num}_pcm.wav_embeddings.pkl'
labelFileName = output_dir+f'/labels/test_labels/{num}.csv'

#preprocess
x_values, y_values, class_list = preprocess_single_yamnet_file(fileName,labelFileName, 'instrument', classes)

#predict
y_pred = loaded_model.predict(x_values)

#preview
sample_by_sample_predicted, aggregated_predicted_instruments = get_instruments(y_pred,classes,mapped_instruments)
sample_by_sample_values, aggregated_instruments = get_instruments(y_values,classes,mapped_instruments)

print("Predicted instruments:", ', '.join(aggregated_predicted_instruments))
print("Correct instruments:", ', '.join(aggregated_instruments))


In [None]:
# use the model to predict the instrument of a single audio file
def predict_instrument(file_path, model, yamnet_model, mapped_instruments, chunk_size=3, class_list=None):
    """Predict the instruments heard in one audio file.

    Args:
        file_path: Audio file to classify.
        model: Trained classifier with a ``predict`` method.
        yamnet_model: YAMNet model used to compute the embeddings.
        mapped_instruments: Mapping from instrument id to display name.
        chunk_size: Chunk duration in seconds.
        class_list: Instrument ids in the column order of the classifier's output.
            Defaults to the notebook-level ``classes`` from training, which was
            previously the only (implicit) option.

    Returns:
        A tuple ``(sample_by_sample_predicted, aggregated_predicted_instruments)``
        as returned by ``get_instruments``.

    Raises:
        ValueError: If the file yields no non-empty audio chunk.
    """
    if class_list is None:
        class_list = classes  # training class order defined earlier in the notebook

    # Load and preprocess the audio file
    audio = load_wav_16k_mono(file_path)
    chunks = split_audio_into_chunks(audio, chunk_size=chunk_size)

    # Extract one mean embedding per non-empty chunk
    all_embeddings = []
    for chunk in chunks:
        if len(chunk) == 0:
            continue
        embeddings, _ = extract_embeddings(chunk, yamnet_model)
        all_embeddings.append(embeddings.mean(axis=0))  # Average across time

    if not all_embeddings:
        # np.vstack would otherwise fail with an unrelated-looking message
        raise ValueError(f"No audio samples found in {file_path}")

    # Convert to numpy array
    combined_embeddings = np.vstack(all_embeddings)

    # Predict using the loaded model
    y_pred = model.predict(combined_embeddings)

    # Get instrument names
    sample_by_sample_predicted, aggregated_predicted_instruments = get_instruments(
        y_pred, class_list, mapped_instruments
    )

    return sample_by_sample_predicted, aggregated_predicted_instruments
# Example usage
file_path = output_dir+'/converted_audio/test_data/1819_pcm.wav'
predicted_instruments, aggregated_predicted_instruments = predict_instrument(
    file_path, loaded_model, model, mapped_instruments
)
print("Predicted instruments:", ', '.join(aggregated_predicted_instruments))

# Print the first instrument predicted for the first chunk, if there is one
first_chunk_prediction = predicted_instruments[0]
if first_chunk_prediction.size > 0:
    predicted_value = first_chunk_prediction[0]
else:
    predicted_value = "No instrument"
print(f"Predicted: {predicted_value}")

In [None]:
# The trained classifier (`clf`) is already saved to trained_model.pkl in an earlier cell.
# This cell used to call model.save(...) and to pickle `scaler` and `label_encoder`,
# but `model` is the YAMNet module loaded from TF Hub and neither `scaler` nor
# `label_encoder` is defined anywhere in this notebook, so it failed on Run All.
#
# What a consumer of trained_model.pkl does need is the label metadata used to decode
# its predictions: the column order (`classes`) and the id -> name mapping.
with open('label_metadata.pkl', 'wb') as f:
    pickle.dump(
        {
            'classes': [int(cls) for cls in classes],
            'mapped_instruments': mapped_instruments,
        },
        f,
    )