# EchoCare: Cry Detection Model Training
## Stage 1 of Two-Stage Pipeline

This notebook trains a binary classification model that distinguishes infant cries from non-cry sounds.

**Dataset:**
- Cry sounds: Baby Chillanto Database (normal, hungry, pain)
- Non-cry sounds: ESC-50 Dataset

**Target Performance:** >85% accuracy for cry detection

**Architecture:** MobileNetV2 (lightweight for Raspberry Pi deployment) with custom classification head

## 1. Import Libraries

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import cv2
import json
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Print the TensorFlow and Keras versions in use for this run.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

TensorFlow version: 2.20.0
Keras version: 3.12.0


## 2. Configuration

In [20]:
# Paths
# Every location is derived from a single project root, so relocating the
# project only requires editing `project_root`.
project_root = Path("C:/Users/danel/FYP/echocare-infant-cry-classification")
spectrogram_root = project_root / "dataset/processed/mel-spectrograms/cry_detection"

train_dir = spectrogram_root / "train"
val_dir = spectrogram_root / "validate"
test_dir = spectrogram_root / "test"
save_dir = project_root / "model/cry_detection"

# Model hyperparameters
img_size = (224, 224)  # MobileNetV2 input size
batch_size = 32
epochs = 30
learning_rate = 0.0001
dropout_rate = 0.4

# Class information
# List index is the integer label: 0 = non-cry, 1 = cry.
class_names = ['non-cry', 'cry']
# Derived from class_names so the two can never disagree.
num_classes = len(class_names)

## 3. Data Loading Functions

In [22]:
def load_spectrograms(data_dir, verbose=True):
    """
    Load .npy mel-spectrograms from a cry / non-cry directory structure.

    ``data_dir`` must contain two subfolders named ``cry`` and ``non-cry``.
    Every ``*.npy`` file directly inside them is loaded as one sample.
    Files are read in sorted filename order so repeated runs return the
    samples in the same order regardless of filesystem listing order.

    Args:
        data_dir: Path object or string path to the directory containing the
            ``cry`` and ``non-cry`` folders
        verbose: Print loading progress

    Returns:
        spectrograms: numpy array of spectrograms (cry samples first, then non-cry)
        labels: numpy array of labels (0=non-cry, 1=cry), aligned with spectrograms
    """
    # Accept plain strings as documented; the `/` operator below needs a Path.
    data_dir = Path(data_dir)

    spectrograms = []
    labels = []

    # (subfolder name, integer label) — cry first to keep the original ordering
    for folder, label in (('cry', 1), ('non-cry', 0)):
        files = sorted((data_dir / folder).glob('*.npy'))

        spectrograms.extend(np.load(file) for file in files)
        labels.extend([label] * len(files))

        if verbose:
            print(f"Loaded {len(files)} {folder} spectrograms")

    if verbose:
        print(f"Total samples: {len(spectrograms)}")

    return np.array(spectrograms), np.array(labels)

## 4. Prepare Mel-Spectrograms for MobileNetV2 Architecture

In [None]:
def prepare_for_mobilenet(spectrograms, target_size=(224, 224)):
    """
    Prepare mel-spectrograms for MobileNetV2 input.

    Converts each grayscale mel-spectrogram (documented input: 128 mel bands x
    32 time frames) into a 3-channel image of ``target_size``, the layout
    MobileNetV2 expects.

    Process (shapes shown for the default target size):
    1. Input: (128, 32) - 128 mel bands x 32 time frames
    2. Resize: (224, 224) - match MobileNetV2 input size
    3. Add channel: (224, 224, 1) - add dimension for grayscale
    4. Repeat channels: (224, 224, 3) - convert to RGB by duplicating grayscale values

    Args:
        spectrograms: numpy array of mel-spectrograms with shape (n_samples, 128, 32)
        target_size: tuple (height, width) for resizing, default (224, 224)

    Returns:
        prepared: float32 numpy array with shape (n_samples, height, width, 3);
            an empty input yields shape (0, height, width, 3)
    """
    height, width = target_size

    # With no samples, np.array([]) would collapse to shape (0,); return an
    # empty batch that still has the documented 4-D layout instead.
    if len(spectrograms) == 0:
        return np.empty((0, height, width, 3), dtype=np.float32)

    prepared = []

    for spec in spectrograms:
        # Resize to the requested input size.
        # cv2.resize takes dsize as (width, height), the reverse of the
        # (height, width) order used by target_size, so swap explicitly.
        # With the defaults: width stretches 32 -> 224 (7x) and height
        # 128 -> 224 (~1.75x).
        resized = cv2.resize(spec, (width, height))

        # cv2.resize returns a 2-D array for single-channel input, so restore
        # the trailing channel axis: (height, width) -> (height, width, 1)
        if resized.ndim == 2:
            resized = np.expand_dims(resized, axis=-1)

        # Convert grayscale (1 channel) to "fake RGB" (3 channels) by
        # duplicating the same values across R, G, B. This lets MobileNetV2
        # (trained on colour images) process the grayscale spectrograms.
        if resized.shape[-1] == 1:
            resized = np.repeat(resized, 3, axis=-1)

        prepared.append(resized)

    return np.array(prepared, dtype=np.float32)