In [1]:
# Import libraries
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

from tqdm import tqdm

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

# Memory growth has to be configured before anything initializes the GPUs, so
# it is done first. Device queries such as tf.test.gpu_device_name() create the
# device and would otherwise do so with the default allocation settings.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        # Raised when the devices were already initialized (for example when
        # this cell is re-run on a live kernel); report it and carry on.
        print(e)

# Report what TensorFlow can see only after the configuration above is in place.
print("Num GPUs Available:", len(gpus))
print("TensorFlow is using:", tf.test.gpu_device_name())

2025-03-30 21:51:28.335467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743396688.425733     693 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743396688.457926     693 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-30 21:51:28.690122: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available: 1
TensorFlow is using: /device:GPU:0
GPU memory growth enabled.


I0000 00:00:1743396695.403866     693 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1743396695.406381     693 gpu_device.cc:2022] Created device /device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [3]:
def load_data(spectrogram_dir, label_dir):
    """Load paired spectrogram/label arrays and stack them along the sample axis.

    Files are paired purely by position after sorting each directory listing,
    so both directories must hold the same number of files and their names
    must sort into the same order.

    Args:
        spectrogram_dir: Directory of arrays readable by ``np.load``. The first
            axis of each array is treated as the sample axis (the original
            inline comment gives the shape as ``(n, 64, 42)``).
        label_dir: Directory of label arrays. Each one is transposed after
            loading (``(n, 8)`` per the original inline comment) and only
            columns ``[0, 3, 4, 5, 6, 7]`` are kept.

    Returns:
        Tuple ``(X, Y)`` of vertically stacked spectrograms and labels, or two
        empty arrays when no non-empty file was found.

    Raises:
        ValueError: If the two directories contain a different number of
            files, or a spectrogram file and its label file disagree on the
            number of samples.
    """
    # Keep regular files only: np.load cannot read sub-directories
    # (e.g. a stray ".ipynb_checkpoints" folder) and would abort the load.
    spectrogram_files = sorted(
        name for name in os.listdir(spectrogram_dir)
        if os.path.isfile(os.path.join(spectrogram_dir, name))
    )
    label_files = sorted(
        name for name in os.listdir(label_dir)
        if os.path.isfile(os.path.join(label_dir, name))
    )

    # zip() would silently drop the unmatched tail and may pair the wrong
    # files, so refuse to continue when the listings differ in length.
    if len(spectrogram_files) != len(label_files):
        raise ValueError(
            f"File count mismatch: {len(spectrogram_files)} spectrogram files in "
            f"'{spectrogram_dir}' vs {len(label_files)} label files in '{label_dir}'"
        )

    spectrograms = []
    labels = []

    for spec_file, label_file in tqdm(zip(spectrogram_files, label_files),
                                      total=len(spectrogram_files),
                                      desc="Loading Data",
                                      unit="file"):
        spec_data = np.load(os.path.join(spectrogram_dir, spec_file))
        label_data = np.load(os.path.join(label_dir, label_file)).T

        # Keep only the label columns used downstream.
        label_data = label_data[:, [0, 3, 4, 5, 6, 7]]

        # Files without any samples contribute nothing.
        if len(spec_data) == 0:
            continue

        # Rows of X and Y must stay aligned one-to-one after stacking.
        if len(spec_data) != len(label_data):
            raise ValueError(
                f"Sample count mismatch: '{spec_file}' has {len(spec_data)} samples "
                f"but '{label_file}' has {len(label_data)} label rows"
            )

        spectrograms.append(spec_data)
        labels.append(label_data)

    if not spectrograms:
        return np.array([]), np.array([])

    return np.vstack(spectrograms), np.vstack(labels)

In [4]:
# Read the training set and append a trailing channel axis for the Conv2D input.
X_train, Y_train = load_data("spectrograms_train", "labels_train")
X_train = X_train[..., np.newaxis]
print("Final X_train:", X_train.shape, "Final Y_train:", Y_train.shape)

Loading Data:   0%|          | 0/1289 [00:00<?, ?file/s]

Loading Data: 100%|██████████| 1289/1289 [00:02<00:00, 491.87file/s]


Final X_train: (319508, 64, 42, 1) Final Y_train: (319508, 6)


In [5]:
# Column sums of the training labels, reported per class as a share of all rows.
total_samples = len(Y_train)
class_counts = Y_train.sum(axis=0)

for i, count in enumerate(class_counts):
    print(f"Class {i}: {count} occurrences ({count / total_samples:.2%} of the data)")

Class 0: 128695 occurrences (40.28% of the data)
Class 1: 125932 occurrences (39.41% of the data)
Class 2: 171444 occurrences (53.66% of the data)
Class 3: 257671 occurrences (80.65% of the data)
Class 4: 155212 occurrences (48.58% of the data)
Class 5: 271620 occurrences (85.01% of the data)


In [6]:
# Feature extractor: three identical conv -> max-pool stages that differ only
# in the number of filters.
conv_stages = []
for n_filters in (32, 64, 128):
    conv_stages.append(
        Conv2D(n_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.0001))
    )
    conv_stages.append(MaxPooling2D((2, 2)))

# Classifier head: one sigmoid output per label column, trained with binary
# cross-entropy so every output is scored independently.
model = Sequential([
    Input(shape=(64, 42, 1)),
    *conv_stages,
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.0001)),
    Dropout(0.4),
    Dense(6, activation='sigmoid', dtype='float32'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()

I0000 00:00:1743396741.521929     693 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [7]:
# Fit on the full training arrays; no validation data is passed to fit().
history = model.fit(X_train, Y_train, batch_size=64, epochs=10, verbose=1)

# Report the values Keras logged for the last epoch.
training_log = history.history
final_loss = training_log['loss'][-1]
final_accuracy = training_log['binary_accuracy'][-1]
print(f"Final Loss: {final_loss:.4f}, Final Accuracy: {final_accuracy:.4f}")

2025-03-30 21:52:26.197642: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3435350016 exceeds 10% of free system memory.
2025-03-30 21:52:28.570454: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3435350016 exceeds 10% of free system memory.


Epoch 1/10


I0000 00:00:1743396750.917403    1416 service.cc:148] XLA service 0x7fbf70005290 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743396750.918383    1416 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2025-03-30 21:52:30.978378: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743396751.185765    1416 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  21/4993[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m40s[0m 8ms/step - binary_accuracy: 0.6121 - loss: 1.2963

I0000 00:00:1743396754.070579    1416 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 9ms/step - binary_accuracy: 0.7909 - loss: 0.4573
Epoch 2/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 9ms/step - binary_accuracy: 0.8663 - loss: 0.3342
Epoch 3/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 9ms/step - binary_accuracy: 0.8796 - loss: 0.3153
Epoch 4/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 9ms/step - binary_accuracy: 0.8845 - loss: 0.3072
Epoch 5/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 9ms/step - binary_accuracy: 0.8881 - loss: 0.3019
Epoch 6/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 9ms/step - binary_accuracy: 0.8901 - loss: 0.2990
Epoch 7/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 15ms/step - binary_accuracy: 0.8921 - loss: 0.2963
Epoch 8/10
[1m4993/4993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5ms/step - binary_ac

In [8]:
# Score the model on the data it was trained on.
Y_pred = model.predict(X_train)

# Binarize the sigmoid outputs with one threshold shared by every class.
threshold = 0.5
Y_pred_binary = (Y_pred > threshold).astype(int)

# One pass yields all three sample-averaged scores; the fourth return value
# (support) is not used.
precision, recall, f1, _support = precision_recall_fscore_support(
    Y_train, Y_pred_binary, average='samples', zero_division=0
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

2025-03-30 21:59:35.975492: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3435350016 exceeds 10% of free system memory.
2025-03-30 21:59:37.383716: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3435350016 exceeds 10% of free system memory.


[1m9985/9985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step
Precision: 0.8944
Recall: 0.8730
F1-Score: 0.8735


In [9]:
# Read the held-out set, append the channel axis, and collect raw sigmoid outputs.
X_test, Y_test = load_data("spectrograms_test", "labels_test")
X_test = X_test[..., np.newaxis]
Y_pred_test = model.predict(X_test)

Loading Data: 100%|██████████| 151/151 [00:00<00:00, 703.84file/s]


[1m1234/1234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


In [11]:
# --- Threshold Adjustment ---
# Start every class at 0.5, then apply the hand-picked per-class cut-offs.
num_classes = Y_test.shape[1]
thresholds = np.full(num_classes, 0.5)
for class_idx, class_threshold in enumerate((0.3, 0.3, 0.4, 0.5, 0.3, 0.5)):
    thresholds[class_idx] = class_threshold

# Broadcasting compares each output column against its own threshold.
Y_pred_test_binary = (Y_pred_test >= thresholds).astype(int)
# --- End Threshold Adjustment ---

# Sample-averaged scores from a single call (support is unused).
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average='samples', zero_division=0
)

print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall: {recall_test:.4f}")
print(f"Test F1-Score: {f1_test:.4f}")

# Per-class precision, recall and F1 (average=None keeps one value per column).
precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average=None, zero_division=0
)

# The occurrence figures come from the TRAINING labels, not the test labels,
# so Y_train must already be loaded in the kernel.
class_counts = Y_train.sum(axis=0)
total_samples = Y_train.shape[0]

for i in range(num_classes):
    print(f"Class {i} | Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, "
          f"F1={f1_per_class[i]:.4f} | Thresh={thresholds[i]:.1f}, Occurrences={int(class_counts[i])} ({class_counts[i] / total_samples:.2%})")

Test Precision: 0.8439
Test Recall: 0.8788
Test F1-Score: 0.8472
Class 0 | Precision=0.7103, Recall=0.8442, F1=0.7715 | Thresh=0.3, Occurrences=128695 (40.28%)
Class 1 | Precision=0.8019, Recall=0.8384, F1=0.8198 | Thresh=0.3, Occurrences=125932 (39.41%)
Class 2 | Precision=0.7790, Recall=0.8693, F1=0.8217 | Thresh=0.4, Occurrences=171444 (53.66%)
Class 3 | Precision=0.9425, Recall=0.9867, F1=0.9641 | Thresh=0.5, Occurrences=257671 (80.65%)
Class 4 | Precision=0.9450, Recall=0.7905, F1=0.8609 | Thresh=0.3, Occurrences=155212 (48.58%)
Class 5 | Precision=0.9547, Recall=0.9917, F1=0.9728 | Thresh=0.5, Occurrences=271620 (85.01%)


In [12]:
# Print the layer-by-layer architecture summary of the current `model` again
# (the same call already ran right after the model was compiled).
model.summary()

In [13]:
# Persist the trained model, then reload the saved file under its own name.
# Reloading with compile=False drops the optimizer/loss, so assigning the
# result back to `model` would leave the notebook holding an uncompiled model
# and break any earlier fit/evaluate cell that is re-run afterwards.
model.save("instrument_classifier.h5")
exported_model = tf.keras.models.load_model("instrument_classifier.h5", compile=False)

# Convert the reloaded copy, so the TFLite file reflects exactly what was
# written to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(exported_model)
tflite_model = converter.convert()

with open("instrument_classifier.tflite", "wb") as f:
    f.write(tflite_model)



INFO:tensorflow:Assets written to: /tmp/tmp_t4bt446/assets


INFO:tensorflow:Assets written to: /tmp/tmp_t4bt446/assets


Saved artifact at '/tmp/tmp_t4bt446'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 64, 42, 1), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 6), dtype=tf.float32, name=None)
Captures:
  140461936728320: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461936726912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540058992: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540063920: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540183728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540181616: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540196048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540103568: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540112192: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140461540110256: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1743397876.808108     693 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1743397876.808130     693 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-03-30 22:11:16.808419: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmp_t4bt446
2025-03-30 22:11:16.808999: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-03-30 22:11:16.809009: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmp_t4bt446
I0000 00:00:1743397876.813134     693 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled
2025-03-30 22:11:16.813947: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-03-30 22:11:16.841367: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmp_t4bt446
2025-03-30 22:11:16.849391: I tensorflow/cc/saved_model/loader.cc:466] SavedModel 