In [1]:
# Import libraries
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input, BatchNormalization, Add
from tqdm import tqdm
import gc

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import shuffle

# Configure GPU memory growth *before* anything initializes the GPU runtime:
# TensorFlow only accepts this setting while the GPUs are still uninitialized.
# list_physical_devices() merely enumerates devices and does not initialize them.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpus))
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        # Raised when the GPUs were already initialized (e.g. this cell is
        # re-run in a live kernel); the setting can no longer be changed.
        print(e)

# Listing logical devices initializes the runtime, so it must come after the
# memory-growth configuration above. An empty name is printed when no GPU is
# visible, which is what tf.test.gpu_device_name() reported in that case.
logical_gpus = tf.config.list_logical_devices('GPU')
print("TensorFlow is using:", logical_gpus[0].name if logical_gpus else "")

2025-03-24 20:04:24.382320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742871864.458297     647 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742871864.482069     647 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-24 20:04:24.656216: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available: 1
TensorFlow is using: /device:GPU:0
GPU memory growth enabled.


I0000 00:00:1742871868.569401     647 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1742871868.571969     647 gpu_device.cc:2022] Created device /device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [3]:
# Data preparation: load paired spectrogram / label arrays from disk
def load_data(spectrogram_dir, label_dir):
    """Load and stack every spectrogram/label file pair from two directories.

    Files are paired by position after sorting each directory listing, so both
    directories must contain the same number of files and their names must
    sort into the same order.

    Args:
        spectrogram_dir: Directory whose files are read with ``np.load`` and
            stacked row-wise into ``X``.
        label_dir: Directory whose files are read with ``np.load``; each array
            is transposed before being stacked row-wise into ``Y``.

    Returns:
        Tuple ``(X, Y)`` of the vertically stacked arrays, or a pair of empty
        arrays when the directories contain no files.

    Raises:
        ValueError: If the two directories hold a different number of files.
            Pairing them positionally would otherwise silently drop files and
            could attach labels to the wrong spectrograms.
    """
    spectrogram_files = sorted(os.listdir(spectrogram_dir))
    label_files = sorted(os.listdir(label_dir))

    # zip() stops at the shorter listing, so a mismatch must be caught up front.
    if len(spectrogram_files) != len(label_files):
        raise ValueError(
            f"File count mismatch: {len(spectrogram_files)} file(s) in "
            f"'{spectrogram_dir}' vs {len(label_files)} file(s) in '{label_dir}'"
        )

    spectrograms = []
    labels = []

    # NOTE: the progress-bar label is kept unchanged; the function itself
    # loads every file in the given directories, not a class-specific subset.
    for spec_file, label_file in tqdm(zip(spectrogram_files, label_files),
                                      total=len(spectrogram_files),
                                      desc="Loading Rare-Class Data",
                                      unit="file"):
        spectrograms.append(np.load(os.path.join(spectrogram_dir, spec_file)))
        # Labels are transposed on load so they can be stacked row-wise.
        labels.append(np.load(os.path.join(label_dir, label_file)).T)

    # Nothing to stack: np.vstack would raise on an empty list.
    if not spectrograms:
        return np.array([]), np.array([])

    X = np.vstack(spectrograms)
    Y = np.vstack(labels)
    return X, Y

In [4]:
# Read the training spectrograms/labels and append a trailing channel axis
X_train, Y_train = load_data("spectrograms_train", "labels_train")
X_train = X_train[..., np.newaxis]
print("Base X_train:", X_train.shape, "Base Y_train:", Y_train.shape)

Loading Rare-Class Data: 100%|██████████| 1289/1289 [00:05<00:00, 249.68file/s]


Base X_train: (639664, 32, 40, 1) Base Y_train: (639664, 5)


In [5]:
# Per-class label totals and their share of all training samples
total_samples = Y_train.shape[0]
class_counts = Y_train.sum(axis=0)

for i in range(len(class_counts)):
    count = class_counts[i]
    print(f"Class {i}: {count} occurrences ({count / total_samples:.2%} of the data)")

Class 0: 423890 occurrences (66.27% of the data)
Class 1: 463402 occurrences (72.44% of the data)
Class 2: 495511 occurrences (77.46% of the data)
Class 3: 289316 occurrences (45.23% of the data)
Class 4: 506235 occurrences (79.14% of the data)


In [14]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialization and the other random operations used during training are
# repeatable on "Restart & Run All". (GPU kernels may still introduce small
# run-to-run differences.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

INPUT_SHAPE = (32, 40, 1)  # (height, width, channels) of one sample, as printed for X_train
NUM_CLASSES = 5            # number of label columns, as printed for Y_train

# Four conv/pool stages followed by a small dense head.
model = Sequential([
    Input(shape=INPUT_SHAPE),
    Conv2D(16, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # Independent sigmoid per class: a sample may carry several labels at once
    # (the per-class occurrence shares add up to far more than 100%).
    Dense(NUM_CLASSES, activation='sigmoid', dtype='float32')
])

# binary_crossentropy / binary_accuracy score each label column independently.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()


In [15]:
# Fit the network on the full training set
BATCH_SIZE = 128
EPOCHS = 10
history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)

# Report the metrics logged for the last epoch
training_log = history.history
final_loss, final_accuracy = training_log['loss'][-1], training_log['binary_accuracy'][-1]
print(f"Final Loss: {final_loss:.4f}, Final Accuracy: {final_accuracy:.4f}")

Epoch 1/10


I0000 00:00:1742874298.225678    2655 service.cc:148] XLA service 0x1ba7b9b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1742874298.226280    2655 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2025-03-24 20:44:58.284397: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  33/4998[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 5ms/step - binary_accuracy: 0.6602 - loss: 0.7729

I0000 00:00:1742874301.009677    2655 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5ms/step - binary_accuracy: 0.8235 - loss: 0.4060
Epoch 2/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8689 - loss: 0.3035
Epoch 3/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8779 - loss: 0.2839
Epoch 4/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8827 - loss: 0.2743
Epoch 5/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8865 - loss: 0.2661
Epoch 6/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8888 - loss: 0.2611
Epoch 7/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_accuracy: 0.8907 - loss: 0.2571
Epoch 8/10
[1m4998/4998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - binary_acc

In [16]:
# Predicted per-class probabilities for every training sample
Y_pred = model.predict(X_train)

# Binarize: a label counts as present when its probability exceeds the cut-off
threshold = 0.5
Y_pred_binary = np.greater(Y_pred, threshold).astype(int)

# Sample-averaged precision, recall and F1 from a single scorer call
# (the fourth element of the result, support, is not needed here)
precision, recall, f1 = precision_recall_fscore_support(
    Y_train, Y_pred_binary, average='samples', zero_division=0
)[:3]

# Report the scores
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

[1m19990/19990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step
Precision: 0.8689
Recall: 0.8921
F1-Score: 0.8697


In [17]:
# Load the held-out set and give it the same trailing channel axis as training
X_test, Y_test = load_data("spectrograms_test", "labels_test")
X_test = X_test[..., np.newaxis]

# Predict, then binarize at the 0.5 cut-off
Y_pred_test = model.predict(X_test)
Y_pred_test_binary = np.greater(Y_pred_test, 0.5).astype(int)

# Sample-averaged precision, recall and F1 from a single scorer call
precision_test, recall_test, f1_test = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average='samples', zero_division=0
)[:3]

print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall: {recall_test:.4f}")
print(f"Test F1-Score: {f1_test:.4f}")

Loading Rare-Class Data: 100%|██████████| 151/151 [00:00<00:00, 802.66file/s]


[1m2470/2470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
Test Precision: 0.8355
Test Recall: 0.8717
Test F1-Score: 0.8377


In [18]:
# Per-class test metrics: average=None yields one value per label column.
# The fourth returned array (support) is discarded.
precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average=None, zero_division=0
)

# Label totals come from the TRAINING labels; they are printed next to each
# class's test metrics below.
total_samples = Y_train.shape[0]
class_counts = Y_train.sum(axis=0)

num_classes = Y_test.shape[1]  # Number of instrument classes
for i in range(num_classes):
    occurrences = class_counts[i]
    metrics_part = (f"Class {i}: Precision={precision_per_class[i]:.4f}, "
                    f"Recall={recall_per_class[i]:.4f}, F1={f1_per_class[i]:.4f}")
    counts_part = f"Occurrences={int(occurrences)} ({occurrences / total_samples:.2%})"
    print(f"{metrics_part} | {counts_part}")

Class 0: Precision=0.7688, Recall=0.9323, F1=0.8427 | Occurrences=423890 (66.27%)
Class 1: Precision=0.8638, Recall=0.9189, F1=0.8905 | Occurrences=463402 (72.44%)
Class 2: Precision=0.9289, Recall=0.9554, F1=0.9420 | Occurrences=495511 (77.46%)
Class 3: Precision=0.8625, Recall=0.8475, F1=0.8549 | Occurrences=289316 (45.23%)
Class 4: Precision=0.9681, Recall=0.9195, F1=0.9432 | Occurrences=506235 (79.14%)


In [19]:
# Print the model's layer-by-layer summary again (it is also printed in the
# cell that builds and compiles the model).
model.summary()

In [20]:
# Persist the trained network and export a TensorFlow Lite version of it.
KERAS_MODEL_PATH = "instrument_classifier.h5"
TFLITE_MODEL_PATH = "instrument_classifier.tflite"

model.save(KERAS_MODEL_PATH)

# Reload the saved file under a separate name. Rebinding `model` to this
# uncompiled copy (compile=False) would leave the notebook's `model` unable to
# run fit()/evaluate() if an earlier cell is executed again.
export_model = tf.keras.models.load_model(KERAS_MODEL_PATH, compile=False)

# Convert the reloaded model, so the .tflite file is built from exactly what
# was written to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(export_model)
tflite_model = converter.convert()

with open(TFLITE_MODEL_PATH, "wb") as f:
    f.write(tflite_model)



INFO:tensorflow:Assets written to: /tmp/tmpuvpmf_09/assets


INFO:tensorflow:Assets written to: /tmp/tmpuvpmf_09/assets


Saved artifact at '/tmp/tmpuvpmf_09'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 32, 40, 1), dtype=tf.float32, name='input_layer_3')
Output Type:
  TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)
Captures:
  140103781968256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103781973360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103781960688: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140104423013040: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782011952: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782012656: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782009840: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782015120: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782069552: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140103782067440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140104423461408

W0000 00:00:1742875013.770337     647 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1742875013.770383     647 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-03-24 20:56:53.770793: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpuvpmf_09
2025-03-24 20:56:53.771591: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-03-24 20:56:53.771602: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpuvpmf_09
I0000 00:00:1742875013.777529     647 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled
2025-03-24 20:56:53.778843: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-03-24 20:56:53.816662: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpuvpmf_09
2025-03-24 20:56:53.828498: I tensorflow/cc/saved_model/loader.cc:466] SavedModel 