In [1]:
# Import libraries
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

from tqdm import tqdm

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

# Configure memory growth BEFORE any call that instantiates the GPU device.
# `tf.test.gpu_device_name()` creates the device (see the "Created device"
# log line), so it must run after `set_memory_growth`, not before it.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        # Raised when the runtime was already initialised (e.g. the cell is
        # re-run on a live kernel); report it and carry on with the defaults.
        print(e)

# Reporting only — safe to touch the device now that it is configured.
print("Num GPUs Available:", len(gpus))
print("TensorFlow is using:", tf.test.gpu_device_name())

2025-06-15 18:17:45.885160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750036665.998390     697 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750036666.025760     697 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-15 18:17:46.259030: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available: 1
TensorFlow is using: /device:GPU:0
GPU memory growth enabled.


I0000 00:00:1750036670.603343     697 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1750036670.604509     697 gpu_device.cc:2022] Created device /device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [2]:
def load_data(spectrogram_dir, label_dir, label_columns=(0, 1, 2, 3, 4)):
    """Load paired spectrogram/label ``.npy`` files and stack them row-wise.

    Both directory listings are sorted and paired by position, so the i-th
    spectrogram file is matched with the i-th label file.

    Args:
        spectrogram_dir: Directory of spectrogram arrays; each file is expected
            to hold ``n`` samples along axis 0 (the notebook uses (n, 128, 42)).
        label_dir: Directory of label arrays; each file is transposed after
            loading so that samples end up along axis 0.
        label_columns: Column indices to keep from the transposed label array.
            Defaults to the five columns the notebook has always used.

    Returns:
        ``(X, Y)`` where ``X`` is every non-empty spectrogram file stacked along
        axis 0 and ``Y`` the matching labels. Two empty arrays are returned when
        no file contributed any sample.

    Raises:
        ValueError: If the directories contain a different number of files, or
            a spectrogram file and its label file disagree on the sample count.
            Either case would otherwise yield silently misaligned X / Y.
    """
    spectrogram_files = sorted(os.listdir(spectrogram_dir))
    label_files = sorted(os.listdir(label_dir))

    # zip() would silently drop the surplus files and shift nothing into view;
    # fail fast instead of training on a partial or mispaired dataset.
    if len(spectrogram_files) != len(label_files):
        raise ValueError(
            f"File count mismatch: {len(spectrogram_files)} spectrogram files in "
            f"'{spectrogram_dir}' vs {len(label_files)} label files in '{label_dir}'"
        )

    label_columns = list(label_columns)
    spectrograms = []
    labels = []

    for spec_file, label_file in tqdm(zip(spectrogram_files, label_files),
                                      total=len(spectrogram_files),
                                      desc="Loading Data",
                                      unit="file"):
        spec_data = np.load(os.path.join(spectrogram_dir, spec_file))  # (n, 128, 42)

        # Empty files contribute nothing; skip before touching the labels so a
        # degenerate label file for an empty spectrogram cannot raise.
        if len(spec_data) == 0:
            continue

        label_data = np.load(os.path.join(label_dir, label_file)).T    # (n, n_labels)
        label_data = label_data[:, label_columns]                      # (n, len(label_columns))

        # Every spectrogram needs exactly one label row.
        if len(label_data) != len(spec_data):
            raise ValueError(
                f"Sample count mismatch between '{spec_file}' ({len(spec_data)}) "
                f"and '{label_file}' ({len(label_data)})"
            )

        spectrograms.append(spec_data)
        labels.append(label_data)

    if not spectrograms:
        return np.array([]), np.array([])

    return np.vstack(spectrograms), np.vstack(labels)

In [3]:
# Read the training split, then append a trailing channel axis for Conv2D.
X_train, Y_train = load_data(spectrogram_dir="spectrograms_train", label_dir="labels_train")
X_train = X_train[..., np.newaxis]
print("Final X_train:", X_train.shape, "Final Y_train:", Y_train.shape)

Loading Data:   0%|          | 0/1289 [00:00<?, ?file/s]

Loading Data: 100%|██████████| 1289/1289 [00:14<00:00, 90.11file/s]


Final X_train: (319508, 128, 42, 1) Final Y_train: (319508, 5)


In [4]:
# Positive-label count per class and its share of all training samples
total_samples = Y_train.shape[0]
class_counts = Y_train.sum(axis=0)

for i in range(len(class_counts)):
    count = class_counts[i]
    print(f"Class {i}: {count} occurrences ({count / total_samples:.2%} of the data)")

Class 0: 128695 occurrences (40.28% of the data)
Class 1: 246807 occurrences (77.25% of the data)
Class 2: 257671 occurrences (80.65% of the data)
Class 3: 155212 occurrences (48.58% of the data)
Class 4: 271620 occurrences (85.01% of the data)


In [5]:
# CNN over (128, 42, 1) spectrogram patches: three conv + max-pool stages with
# doubling filter counts, then a regularised dense head with five independent
# sigmoid outputs (multi-label).
model = Sequential()
model.add(Input(shape=(128, 42, 1)))

for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu', padding='same',
                     kernel_regularizer=l2(0.0001)))
    model.add(MaxPooling2D((2, 2)))

model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.0001)))
model.add(Dropout(0.4))
model.add(Dense(5, activation='sigmoid', dtype='float32'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
model.summary()

I0000 00:00:1750036964.825789     697 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [6]:
# Fit on the full training set (no validation split is used here)
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    batch_size=64,
    verbose=1,
)

# Pull the last-epoch value of each tracked quantity out of the history
epoch_logs = history.history
final_loss = epoch_logs['loss'][-1]
final_accuracy = epoch_logs['binary_accuracy'][-1]
print(f"Final Loss: {final_loss:.4f}, Final Accuracy: {final_accuracy:.4f}")

2025-06-15 18:22:57.050230: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6870700032 exceeds 10% of free system memory.
2025-06-15 18:23:31.619058: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6870700032 exceeds 10% of free system memory.


Epoch 1/5


I0000 00:00:1750037035.965708    1829 service.cc:148] XLA service 0x7188180068a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750037035.968084    1829 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2025-06-15 18:23:56.414318: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1750037036.869519    1829 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   7/4993[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:38[0m 20ms/step - binary_accuracy: 0.6240 - loss: 2.8499

I0000 00:00:1750037042.263544    1829 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 19ms/step - binary_accuracy: 0.8393 - loss: 0.4037
Epoch 2/5
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 18ms/step - binary_accuracy: 0.8944 - loss: 0.2811
Epoch 3/5
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 18ms/step - binary_accuracy: 0.9034 - loss: 0.2658
Epoch 4/5
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 18ms/step - binary_accuracy: 0.9074 - loss: 0.2596
Epoch 5/5
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 18ms/step - binary_accuracy: 0.9094 - loss: 0.2562
Final Loss: 0.2569, Final Accuracy: 0.9096


In [7]:
# Predicted probabilities for every training sample
Y_pred = model.predict(X_train)

# Binarise: a label is predicted when its probability is strictly above 0.5
threshold = 0.5
Y_pred_binary = (Y_pred > threshold).astype(int)

# Sample-averaged precision / recall / F1 from a single pass; these are the same
# values precision_score / recall_score / f1_score return individually.
sample_avg_scores = precision_recall_fscore_support(
    Y_train, Y_pred_binary, average='samples', zero_division=0
)
precision, recall, f1 = sample_avg_scores[:3]

# Report
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

2025-06-15 18:32:17.244901: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6870700032 exceeds 10% of free system memory.
2025-06-15 18:32:26.806880: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6870700032 exceeds 10% of free system memory.


[1m9985/9985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 5ms/step
Precision: 0.9047
Recall: 0.9020
F1-Score: 0.8953


In [8]:
# Read the held-out split, add the channel axis, and score it with the model.
X_test, Y_test = load_data(spectrogram_dir="spectrograms_test", label_dir="labels_test")
X_test = X_test[..., np.newaxis]
Y_pred_test = model.predict(X_test)

Loading Data: 100%|██████████| 151/151 [00:01<00:00, 114.31file/s]


[1m1234/1234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step


In [25]:
# --- Threshold Adjustment ---
num_classes = Y_test.shape[1]
thresholds = np.full(num_classes, 0.5)  # Default 0.5
# Per-class decision thresholds, listed in class order (edit as needed).
# NOTE(review): these appear to be tuned against the test metrics printed below;
# consider choosing them on a validation split instead — confirm intent.
for class_idx, class_threshold in enumerate((0.5, 0.6, 0.5, 0.3, 0.5)):
    thresholds[class_idx] = class_threshold

# Broadcast the per-class thresholds across all rows of probabilities
Y_pred_test_binary = (Y_pred_test >= thresholds).astype(int)
# --- End Threshold Adjustment ---

# Sample-averaged scores from a single pass; identical to what the individual
# precision_score / recall_score / f1_score helpers return.
sample_avg_test = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average='samples', zero_division=0
)
precision_test, recall_test, f1_test = sample_avg_test[:3]

print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall: {recall_test:.4f}")
print(f"Test F1-Score: {f1_test:.4f}")

# Per-class precision, recall and F1 (the returned support is not used)
precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average=None, zero_division=0
)

# Occurrences are counted on Y_train (the training labels), not on Y_test,
# so this cell needs the training split to be loaded.
total_samples = len(Y_train)
class_counts = np.sum(Y_train, axis=0)

for i in range(num_classes):
    print(f"Class {i} | Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, "
          f"F1={f1_per_class[i]:.4f} | Thresh={thresholds[i]:.1f}, Occurrences={int(class_counts[i])} ({class_counts[i] / total_samples:.2%})")

Test Precision: 0.9047
Test Recall: 0.8781
Test F1-Score: 0.8809
Class 0 | Precision=0.8702, Recall=0.6942, F1=0.7723 | Thresh=0.5, Occurrences=128695 (40.28%)
Class 1 | Precision=0.9161, Recall=0.9204, F1=0.9182 | Thresh=0.6, Occurrences=246807 (77.25%)
Class 2 | Precision=0.9526, Recall=0.9846, F1=0.9683 | Thresh=0.5, Occurrences=257671 (80.65%)
Class 3 | Precision=0.9602, Recall=0.8053, F1=0.8760 | Thresh=0.3, Occurrences=155212 (48.58%)
Class 4 | Precision=0.9663, Recall=0.9876, F1=0.9769 | Thresh=0.5, Occurrences=271620 (85.01%)


In [25]:
# Print the layer-by-layer summary of the current `model` again
# (the same call is already made right after the model is compiled).
model.summary()

In [26]:
# Persist the trained network in HDF5 format.
model.save("instrument_classifier.h5")

# Round-trip through the saved file so the TFLite model is built from exactly
# what is on disk. The reload goes into its own name: rebinding `model` to this
# uncompiled copy (compile=False) would break any later or re-run fit/evaluate
# cell that expects the compiled, trained `model`.
exported_model = tf.keras.models.load_model("instrument_classifier.h5", compile=False)

# Convert to a TFLite flatbuffer with default converter settings.
converter = tf.lite.TFLiteConverter.from_keras_model(exported_model)
tflite_model = converter.convert()

with open("instrument_classifier.tflite", "wb") as f:
    f.write(tflite_model)



INFO:tensorflow:Assets written to: /tmp/tmpmqp6c_c_/assets


INFO:tensorflow:Assets written to: /tmp/tmpmqp6c_c_/assets


Saved artifact at '/tmp/tmpmqp6c_c_'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 128, 42, 1), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)
Captures:
  124830677862528: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830677867280: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678306304: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678307712: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678318448: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678315104: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678347168: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678345232: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830678351216: TensorSpec(shape=(), dtype=tf.resource, name=None)
  124830676438352: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1750037852.353901     697 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1750037852.353999     697 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-06-15 18:37:32.355571: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpmqp6c_c_
2025-06-15 18:37:32.356157: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-06-15 18:37:32.356170: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpmqp6c_c_
I0000 00:00:1750037852.361946     697 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled
2025-06-15 18:37:32.363191: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-06-15 18:37:32.404743: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpmqp6c_c_
2025-06-15 18:37:32.413552: I tensorflow/cc/saved_model/loader.cc:466] SavedModel 