In [1]:
import numpy as np
import pandas as pd 
import os

In [2]:
wav_files = []
for root, dirs, files in os.walk('/kaggle/input'):
    for file in files:
        if file.endswith('wav'):
            wav_files.append(os.path.join(root, file))

In [3]:
df_audio = pd.DataFrame({'file_path': wav_files})

In [4]:
df_audio['label'] = df_audio['file_path'].apply(lambda x: os.path.basename(os.path.dirname(x)))

In [5]:
import librosa

In [6]:
def audio_process(file_path, target_frames):
    y, sr = librosa.load(file_path)
    mel = librosa.feature.melspectrogram(y=y)
    mel_db = librosa.power_to_db(mel)

    min_val = np.min(mel_db)
    max_val = np.max(mel_db)

    if (max_val - min_val) == 0:
        norm_mel = np.zeros_like(mel_db)
    else:
        norm_mel = (mel_db - min_val) / (max_val - min_val)

    current_frames = norm_mel.shape[1]
    if current_frames > target_frames:
        processed_spectrogram = norm_mel[:, :target_frames]
    elif current_frames < target_frames:
        padding = target_frames - current_frames
        processed_spectrogram = np.pad(norm_mel, ((0, 0), (0, padding)), mode = 'constant')
    else:
        processed_spectrogram = norm_mel

    return processed_spectrogram

In [7]:
df_audio.head()

Unnamed: 0,file_path,label
0,/kaggle/input/human-screaming-detection-datase...,Screaming
1,/kaggle/input/human-screaming-detection-datase...,Screaming
2,/kaggle/input/human-screaming-detection-datase...,Screaming
3,/kaggle/input/human-screaming-detection-datase...,Screaming
4,/kaggle/input/human-screaming-detection-datase...,Screaming


In [8]:
OUTPUT_FOLDER_NAME = 'processed_spectrogram'

In [9]:
processed_data = []
os.makedirs(OUTPUT_FOLDER_NAME, exist_ok=True)
for i, row in df_audio.iterrows():
    audio_path = row['file_path']
    label = row['label']
    spectrogram_data = audio_process(audio_path, 500)

    output_filename = f"{os.path.splitext(os.path.basename(audio_path))[0]}.npy"
    output_filepath = os.path.join(OUTPUT_FOLDER_NAME, output_filename)
    np.save(output_filepath, spectrogram_data)

    processed_data.append({
        'spectrogram_path': output_filepath,
        'label': label
    })

In [10]:
processed_df = pd.DataFrame(processed_data)

In [11]:
processed_df.head()

Unnamed: 0,spectrogram_path,label
0,processed_spectrogram/nIFbKv1qjfw_out.npy,Screaming
1,processed_spectrogram/d4v3_z0ISrM_out.npy,Screaming
2,processed_spectrogram/9AZZncb_yek_out.npy,Screaming
3,processed_spectrogram/IdenFdkeASo_out.npy,Screaming
4,processed_spectrogram/LY90s5AgkWM_out.npy,Screaming


In [12]:
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, optimizers

X = []
y = []

for index, row in processed_df.iterrows():
    spectrogram = np.load(row['spectrogram_path'])
    X.append(spectrogram)
    y.append(row['label'])

X = np.array(X)
y = np.array(y)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

2025-07-24 01:04:35.050368: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753319075.214152      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753319075.258513      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
X.shape

(3493, 128, 500)

In [14]:
y_encoded.shape

(3493,)

In [15]:
if len(X.shape) == 3:
    X = np.expand_dims(X, axis=-1)

In [16]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        print("GPU is available and configured for use.")
    except RuntimeError as e:
        print(e)
        print("GPU is available but could not be configured. Falling back to CPU.")
else:
    print("No GPU detected. TensorFlow will run on CPU.")

1 Physical GPUs, 1 Logical GPUs
GPU is available and configured for use.


I0000 00:00:1753319087.939910      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [17]:
def create_cnn_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),

        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),

        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),

        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

input_shape = X.shape[1:]
print(f"Input shape untuk model: {input_shape}")

Input shape untuk model: (128, 500, 1)


In [18]:
n_splits = 5

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_accuracies = []
fold_losses = []

for fold, (train_index, val_index) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold+1}/{n_splits}")

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(32).prefetch(tf.data.AUTOTUNE)

    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

    model = create_cnn_model(input_shape, num_classes)

    history = model.fit(train_dataset,
                        epochs=20,
                        validation_data=val_dataset,
                        verbose=1)

    loss, accuracy = model.evaluate(val_dataset, verbose=0)
    print(f"Fold {fold+1} Validation Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

    fold_losses.append(loss)
    fold_accuracies.append(accuracy)

print(f"Average Validation Accuracy across {n_splits} folds: {np.mean(fold_accuracies):.4f} +/- {np.std(fold_accuracies):.4f}")
print(f"Average Validation Loss across {n_splits} folds: {np.mean(fold_losses):.4f} +/- {np.std(fold_losses):.4f}")


Fold 1/5
Epoch 1/20


I0000 00:00:1753319095.890230      69 service.cc:148] XLA service 0x78c9e8004710 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753319095.891195      69 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1753319096.340800      69 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 5/88[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 38ms/step - accuracy: 0.5706 - loss: 42.7209

I0000 00:00:1753319102.026016      69 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 108ms/step - accuracy: 0.6625 - loss: 13.8978 - val_accuracy: 0.2475 - val_loss: 59.4878
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.7495 - loss: 0.6948 - val_accuracy: 0.5036 - val_loss: 11.2499
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.8028 - loss: 0.4590 - val_accuracy: 0.7325 - val_loss: 13.3531
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.7943 - loss: 0.4300 - val_accuracy: 0.7468 - val_loss: 9.3976
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.8037 - loss: 0.4426 - val_accuracy: 0.7554 - val_loss: 2.6835
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.7748 - loss: 0.4691 - val_accuracy: 0.7997 - val_loss: 0.6725
Epoch 7/20
[1m88/88[0m [32m━━━━━━━━━

In [19]:
full_dataset = tf.data.Dataset.from_tensor_slices((X, y_encoded))
full_dataset = full_dataset.shuffle(buffer_size=len(X)).batch(32).prefetch(tf.data.AUTOTUNE)

final_model = create_cnn_model(input_shape, num_classes)

history_final = final_model.fit(full_dataset,
                                epochs=30,
                                verbose=1)

Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 65ms/step - accuracy: 0.7204 - loss: 11.6653
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7687 - loss: 0.6592
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7765 - loss: 0.5314
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.8053 - loss: 0.4512
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7954 - loss: 0.4558
Epoch 6/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7969 - loss: 0.4487
Epoch 7/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.8130 - loss: 0.4072
Epoch 8/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7930 - loss: 0.4772
Epoch 9/30
[1m110/110[0m [3

In [20]:
import shutil

MODEL_SAVE_PATH = 'final_audio_classification_model'
tf.saved_model.save(final_model, MODEL_SAVE_PATH)

ZIP_FILE_NAME = 'final_audio_classification_model.zip'
shutil.make_archive(MODEL_SAVE_PATH, 'zip', MODEL_SAVE_PATH)

'/kaggle/working/final_audio_classification_model.zip'

In [21]:
converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_SAVE_PATH)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

tflite_model = converter.convert()

TFLITE_MODEL_NAME = 'audio_classification_model.tflite'
with open(TFLITE_MODEL_NAME, 'wb') as f:
    f.write(tflite_model)

W0000 00:00:1753319662.864790      19 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1753319662.864830      19 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
I0000 00:00:1753319662.878607      19 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


In [22]:
import pickle

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)