In [63]:
import numpy as np
import pandas as pd
import librosa
import os
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning
import gzip

warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [64]:
def make_key_invariant(chromagram):
    """Returns a key-invariant chromagram."""
    maj_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
    min_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
    avg_chroma = np.mean(chromagram, axis=1)
    maj_corrs = np.correlate(avg_chroma, maj_profile, mode='same')
    min_corrs = np.correlate(avg_chroma, min_profile, mode='same')
    key_shift = np.argmax(np.concatenate((maj_corrs, min_corrs))) % 12
    return np.roll(chromagram, -key_shift, axis=0)

def segment_by_beats(feature_array, beat_frames):
    """Segments a 2D array of audio features by beat frames."""
    beat_frames = np.append(beat_frames, feature_array.shape[1])
    return [feature_array[:, beat_frames[i]:beat_frames[i + 1]] for i in range(len(beat_frames) - 1)]

def map_labels_to_beats(df_labels, beat_times):
    """Map chorus labels to beat-synced data."""
    labels = np.zeros(len(beat_times) - 1)
    for _, row in df_labels.iterrows():
        indices = np.where((beat_times[:-1] >= row['start_time']) & (beat_times[:-1] < row['end_time']))[0]
        labels[indices] = 1 if row['label'] == 'chorus' else 0
    return labels

def save_compressed_pickle(file_path, data):
    """Save data to a compressed pickle file."""
    with gzip.open(file_path, 'wb') as f:
        pickle.dump(data, f)

In [70]:
# Ensure required directories exist
os.makedirs("../data/pkl/segments", exist_ok=True)
os.makedirs("../data/pkl/labels", exist_ok=True)

# Constants
df = pd.read_csv('../data/dataframes/clean_labeled.csv')
TARGET_SR = 12000 # Target sample rate chosen to be 1/4 of the original 48kHz.
HOP_LENGTH = 128  # Hop length for short-time Fourier transform. Hop length of 128 at 12kHz gives a similar frame rate to a hop length of 512 at 48kHz.

# Process each song in the dataset
for song_id in tqdm(df['SongID'].unique(), desc="Processing..."):
    # Load the audio file
    audio_path = f'../data/audio_files/processed/{song_id}.mp3'
    y, _ = librosa.load(audio_path, sr=TARGET_SR)
    
    # Harmonic-percussive source separation
    y_harm, y_perc = librosa.effects.hpss(y)
    
    # Compute onset envelope from the percussive component
    onset_env = librosa.onset.onset_strength(y=y_perc, sr=TARGET_SR, hop_length=HOP_LENGTH)

    # Beat tracking
    _, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH)
    beat_times = librosa.frames_to_time(beats, sr=TARGET_SR, hop_length=HOP_LENGTH)
    
    # Map labels to beats
    df_labels = df[df['SongID'] == song_id][['start_time', 'end_time', 'label']]
    labels = map_labels_to_beats(df_labels, beat_times)
    
    # Compute RMS energy from spectrogram to give a more accurate representation of energy over time because its frames can be windowed
    S = np.abs(librosa.stft(y, hop_length=HOP_LENGTH))
    rms = librosa.feature.rms(S=S)
    
    # Compute Mel Spectrogram and decompose into 4 components (4 chosen from EDA)
    mel = librosa.feature.melspectrogram(y=y, sr=TARGET_SR, n_mels=128, hop_length=HOP_LENGTH)
    mel_acts = librosa.decompose.decompose(mel, n_components=4, sort=True)[1]
    
    # Compute chromagram, make it key invariant, and decompose 
    chromagram = librosa.feature.chroma_cqt(y=y_harm, sr=TARGET_SR, hop_length=HOP_LENGTH)
    chroma_ki = make_key_invariant(chromagram)
    chroma_acts = librosa.decompose.decompose(chroma_ki, n_components=3, sort=True)[1]
    
    # Compute tempogram, ensure non-negative, and decompose 
    tempogram = np.clip(librosa.feature.tempogram(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH), 0, None)
    tempogram_acts = librosa.decompose.decompose(tempogram, n_components=3, sort=True)[1]
    
    # Compute MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SR, n_mfcc=13, hop_length=HOP_LENGTH)
    
    # Standardize features, stack, and segment by beats
    features = [rms, mel_acts, chroma_acts, tempogram_acts, mfccs]
    total_inv_dim = sum(1.0 / dim for dim in dims.values()) # Calculate the total sum of inverse dimensions to normalize weights
    weights = {feature: (1.0 / dims[feature]) / total_inv_dim for feature in dims} # Normalize weights so each feature weighs the same despite dimensionality
    # Apply StandardScaler and weights to each feature
    standardized_weighted_features = [StandardScaler().fit_transform(feature.T).T * weights[feature_name]
                                      for feature, feature_name in zip(features, dims)]
    concat_features = np.vstack(standardized_weighted_features)
    segments = segment_by_beats(concat_features, beats)

    # Save results with compression
    save_compressed_pickle(f"../data/pkl/segments/{song_id}_beats.pkl.gz", segments)
    save_compressed_pickle(f"../data/pkl/labels/{song_id}_labels.pkl.gz", labels)

Processing...: 100%|██████████| 332/332 [1:06:43<00:00, 12.06s/it]


# Start here after processing data

---

In [193]:
import os
import gzip
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pickle
from tqdm.auto import tqdm
import tensorflow as tf
import math

In [266]:
# Load the data
with gzip.open(os.path.join(save_path, 'beat_data.pkl.gz'), 'rb') as f:
    X_dict, y_dict = pickle.load(f)

song_data = list(X_dict.values())
song_labels = list(y_dict.values())

## Data Structure Overview

- `song_data`: A list where each element corresponds to an individual song's feature data.
- `song_data[0]`: The feature data for the first song in the dataset.
- `song_data[i]`: The feature data for the i-th song in the dataset.
- `song_data[i][j]`: The j-th beat segment's feature data within the i-th song.
- Each `song_data[i][j]` is structured as a 2D array:
  - The first dimension has a fixed size of 24, representing the number of features.
  - The second dimension has a variable size y, representing the number of frames in the beat segment.
- The feature count is consistent across the dataset, with each beat segment containing 24 features.
- `song_labels`: A list containing the corresponding labels for each song's beat segments.
- `song_labels[i]`: An array of labels for each beat segment within the i-th song.

---

## Padding Process for Beat Segments and Labels

### Step 1: Find max beat and max frames in a beat
- Find maximum number of beats in any song (`max_beats`).
- Find maximum number of frames in any beat segment across all songs (`max_frames`).

### Step 2: Pad Data using -1
- Pad songs to have max beats, pad beats to have max frames, ensuring that newly added beats have max frames.
- Pad using -1

### Step 3: Pad Labels using -1
- Pad the array of labels for each song to match `max_beats`. Use `-1` as the padding value to denote labels for dummy beats.

### Implementation Notes

- This process assumes that your dataset's current structure allows iteration over songs and their corresponding beat segments and labels.
- Padding with `-1` is a common approach when the padded values need to be easily distinguished from valid data. Ensure that your model or subsequent processing steps can handle this special value appropriately.


In [269]:
# Find max beat and frames across entire dataset
max_beats = 0 
max_frames = 0  
for song in song_data:
    max_beats = max(max_beats, len(song))
    for beat_segment in song:
        max_frames = max(max_frames, len(beat_segment[0])) 

# Initialize a new list for the padded song data
padded_song_data = []

# Iterate over each song in song_data
for song in song_data:
    # Pad each beat segment within the song to have max_frames
    # Using -1 instead of 0
    padded_beats = [np.pad(beat, ((0, 0), (0, max_frames - beat.shape[1])), 'constant', constant_values=-1) for beat in song]
    
    # If the song has fewer beats than max_beats, we create empty beats with -1
    if len(padded_beats) < max_beats:
        # Empty beats are 2D arrays filled with -1 with dimensions (24, max_frames)
        empty_beats = np.full((max_beats - len(padded_beats), 24, max_frames), -1)
        # Combine the existing padded beats with the empty beats
        padded_song = np.vstack((padded_beats, empty_beats))
    else:
        padded_song = np.array(padded_beats)
    
    # Add the padded song to the new list
    padded_song_data.append(padded_song)        

MemoryError: Unable to allocate 398. MiB for an array with shape (803, 24, 2705) and data type float64

In [242]:
# To load the data back:
with gzip.open(os.path.join(save_path, 'beat_data.pkl.gz'), 'rb') as f:
    X_dict, y_dict = pickle.load(f)

song_data = list(X_dict.values())
song_labels = list(y_dict.values())

In [249]:
# Step 1: Determine Maximum Number of Beats and Frames
max_beats = max(len(beats) for beats in X_dict.values())
max_frames = max(len(beat) for beats in X_dict.values() for beat in beats)

# Step 2: Pad Beat Segments (Data)
for song_id, beats in X_dict.items():
    # Pad each beat segment to have max_frames
    X_dict[song_id] = [np.pad(beat, (0, max_frames - len(beat)), 'constant', constant_values=-1) for beat in beats]
    # Pad the list of beat segments to have max_beats
    X_dict[song_id] = np.pad(X_dict[song_id], (0, max_beats - len(X_dict[song_id])), 'constant', constant_values=(-1, -1))

# Step 3: Pad Labels
for song_id, labels in y_dict.items():
    # Pad the array of labels for each song to match max_beats
    y_dict[song_id] = np.pad(labels, (0, max_beats - len(labels)), 'constant', constant_values=-1)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (543, 24) + inhomogeneous part.

In [243]:
# Determine maximum number of beats and frames
max_beats = 0
max_frames = 0
for song_id, beat_segments in X_dict.items():
    max_beats = max(max_beats, len(beat_segments))
    for beat_segment in beat_segments:
        max_frames = max(max_frames, beat_segment.shape[0])

# Pad beat segments
for song_id, beat_segments in X_dict.items():
    padded_beat_segments = []
    for beat_segment in beat_segments:
        padded_beat_segment = np.pad(beat_segment, ((0, max_frames - beat_segment.shape[0]), (0, 0)), mode='constant', constant_values=-1)
        padded_beat_segments.append(padded_beat_segment)
    padded_beat_segments = np.pad(padded_beat_segments, ((0, max_beats - len(beat_segments)), (0, 0), (0, 0)), mode='constant', constant_values=-1)
    X_dict[song_id] = padded_beat_segments

# Pad labels
for song_id, labels in y_dict.items():
    padded_labels = np.pad(labels, (0, max_beats - len(labels)), mode='constant', constant_values=-1)
    y_dict[song_id] = padded_labels

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (543, 24) + inhomogeneous part.

In [254]:
type(beat_segments)

list

In [187]:

type(X_train), len(X_train), type(X_train[0]), len(X_train[0]), type(X_train[0][0]),(X_train[0][0].shape)

(list, 232, list, 386, numpy.ndarray, (24, 41))

In [189]:
type(y_train), len(y_train), type(y_train[0]), len(y_train[0]), type(y_train[0][0])

(list, 232, numpy.ndarray, 385, numpy.float64)

In [183]:
# Convert dictionaries to lists for further processing
X = list(beat_segments.values())
y = list(beat_labels.values())

# Split the data in 70/15/15 train/val/test splits
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train is a list of songs. 
Each song in X_train is a list of beats. 
Each beat is a 2d array of (24 features, num_frames)

y_train is a list of song labels
each song in y_train is an array of beat labels (0,1)

In [190]:
# Initialize empty lists for batched data
batched_X_train = []
batched_y_train = []

# Iterate over songs in X_train
for song, label in zip(X_train, y_train):
    # Flatten the song to a 1D array of beats
    flat_song = np.array(song).flatten()
    
    # Calculate the number of batches for the song
    num_batches = len(flat_song) // batch_size
    
    # Split the flattened song into batches
    batches = np.array_split(flat_song, num_batches)
    
    # Reshape the batches to have the same shape as the original beats
    reshaped_batches = [batch.reshape((24, -1)) for batch in batches]
    
    # Append the batches to the batched_X_train list
    batched_X_train.extend(reshaped_batches)
    
    # Repeat the song label for each batch
    batched_y_train.extend([label] * num_batches)

# Convert the batched data to numpy arrays
batched_X_train = np.array(batched_X_train)
batched_y_train = np.array(batched_y_train)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (386, 24) + inhomogeneous part.

In [169]:
# Convert data to numpy arrays for batching
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Define batch size
batch_size = 16

# Calculate the number of batches for train, validation, and test sets
num_train_batches = len(X_train) // batch_size
num_val_batches = len(X_val) // batch_size
num_test_batches = len(X_test) // batch_size

# Split train, validation, and test sets into batches
train_batches = np.array_split(X_train, num_train_batches)
train_labels_batches = np.array_split(y_train, num_train_batches)
val_batches = np.array_split(X_val, num_val_batches)
val_labels_batches = np.array_split(y_val, num_val_batches)
test_batches = np.array_split(X_test, num_test_batches)
test_labels_batches = np.array_split(y_test, num_test_batches)

232

In [166]:
for i in range(0, len(X_train), batch_size):
    X_train_batches = X_train[i:i + batch_size]
    label_batch = labels[i:i + batch_size]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


### In progress..

In [161]:
def get_positional_encoding(position, d_model):
    """Compute the sinusoidal encoding for a given position and model size."""
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model))
    angle_rads = position[:, np.newaxis] * angle_rates
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

def add_positional_encoding(padded_sequences):
    """Add the positional encoding to each sequence."""
    encoded_sequences = []
    for sequence in padded_sequences:
        encoded_seq = []
        for sub_seq in sequence:
            # Get the sequence length and feature dimension
            sequence_length = sub_seq.shape[0]
            feature_dim = sub_seq.shape[1]

            # Compute and add the positional encoding
            pos_encoding = get_positional_encoding(np.arange(sequence_length), feature_dim)
            encoded_sub_seq = sub_seq + pos_encoding

            encoded_seq.append(encoded_sub_seq)
        encoded_sequences.append(encoded_seq)

    return encoded_sequences

def pad_data(data_batches, padding='post', value=-1):
    """Pad beats within songs in data_batches to have the same number of frames,
    and ensure all songs have beats padded to the same length."""
    
    # First, find the maximum number of frames across all beats in the batch
    max_frames = max([beat.shape[1] for batch in data_batches for song in batch for beat in song])
    
    # Next, find the maximum number of beats in a song across the entire batch
    max_beats = max([len(song) for batch in data_batches for song in batch])
    
    padded_batches = []
    for batch in data_batches:
        padded_batch = []
        for song in batch:
            padded_song = []
            for beat in song:
                # Pad each beat to have the same number of frames
                padded_beat = np.pad(beat, ((0, 0), (0, max_frames - beat.shape[1]), (0, 0)), mode='constant', constant_values=value)
                padded_song.append(padded_beat)
            
            # Now pad the song to have the same number of beats
            while len(padded_song) < max_beats:
                # Create a dummy beat with the correct number of frames and features, but no actual data
                dummy_beat = np.full((1, max_frames, beat.shape[2]), value)
                padded_song.append(dummy_beat)
            
            padded_batch.append(np.array(padded_song))
        padded_batches.append(padded_batch)
    
    return padded_batches

def pad_labels(labels, padding='post', value=-1):
    """Pad label sequences to the same length."""
    max_length = max([len(batch) for batch in labels])  # Assuming labels are simpler and directly reflect the batch size
    padded = []
    for batch in labels:
        padded_batch = np.full((max_length,), value, dtype=np.int32)  # Create a padded batch with the default value
        padded_batch[:len(batch)] = batch  # Fill in the actual labels
        padded.append(padded_batch)
    return padded

def create_batches_pad_encode(data, labels, batch_size, padding_value=-1):
    """Create and pad batches of sequences and labels."""
    padded_batches = []
    for i in range(0, len(data), batch_size):
        data_batch = data[i:i + batch_size]
        label_batch = labels[i:i + batch_size]
        # Pad sequences and labels
        padded_data = pad_data(data_batch, padding='post', value=padding_value)
        padded_labels = pad_labels(label_batch, padding='post', value=padding_value)
        # Positionally encode sequences
        padded_data = add_positional_encoding(padded_data)
        padded_batches.append((padded_data, padded_labels))
    return padded_batches

batch_size = 16
train_dataset = create_batches_pad_encode(X_train, y_train, batch_size, padding_value=-1)
val_dataset = create_batches_pad_encode(X_val, y_val, batch_size, padding_value=-1)
test_dataset = create_batches_pad_encode(X_test, y_test, batch_size, padding_value=-1)

# [batch_size, num_sequences, max_sequence_length, feature_dim]

IndexError: tuple index out of range

In [171]:
def inspect_dataset_structure(dataset):
    # Define a generic function to print the properties of an object
    def print_properties(obj, name):
        print(f"{name}: Type={type(obj)}", end="")
        if hasattr(obj, '__len__'):
            print(f", Length={len(obj)}", end="")
        if isinstance(obj, tf.Tensor):
            print(f", Shape={obj.shape}", end="")
        print()  # Newline
    
    print_properties(dataset, "Dataset")
    print_properties(dataset[0], "Dataset[0]")
    print_properties(dataset[0][0], "Dataset[0][0]")
    print_properties(dataset[0][0][0], "Dataset[0][0][0]")
    print_properties(dataset[0][0][0][0], "Dataset[0][0][0][0]")
inspect_dataset_structure(X_train)

Dataset: Type=<class 'list'>, Length=232
Dataset[0]: Type=<class 'list'>, Length=386
Dataset[0][0]: Type=<class 'numpy.ndarray'>, Length=24
Dataset[0][0][0]: Type=<class 'numpy.ndarray'>, Length=41
Dataset[0][0][0][0]: Type=<class 'numpy.float64'>


datasets are a list of tuples, where each tuple contains a batch of padded and positionally encoded sequences (songs) (`padded_sequences`) and their corresponding padded labels (`padded_labels`):

Each sequence batch contains 16 song, each song is has the following shape and structure:

Shape: [batch_size, num_beats, max_beat_length, feature_dim]


In [170]:
def inspect_dataset_structure(dataset):
    # Define a generic function to print the properties of an object
    def print_properties(obj, name):
        print(f"{name}: Type={type(obj)}", end="")
        if hasattr(obj, '__len__'):
            print(f", Length={len(obj)}", end="")
        if isinstance(obj, tf.Tensor):
            print(f", Shape={obj.shape}", end="")
        print()  # Newline
    
    print_properties(dataset, "Dataset")
    print_properties(dataset[0], "Dataset[0]")
    print_properties(dataset[0][0], "Dataset[0][0]")
    print_properties(dataset[0][0][0], "Dataset[0][0][0]")
    print_properties(dataset[0][0][0][0], "Dataset[0][0][0][0]")

    # Now, let's inspect how the tensor shape changes for the first 5 beats
    for i in range(5):  # Assuming there are at least 5 beats in the first song
        beat = dataset[0][0][0][i]
        song_label_len = len(dataset[0][1][i])
        print_properties(beat, f"Beat {i+1}")
inspect_dataset_structure(X_train)

Dataset: Type=<class 'list'>, Length=232
Dataset[0] (data, labels): Type=<class 'list'>, Length=386
Dataset[0][0]: Type=<class 'numpy.ndarray'>, Length=24
Dataset[0][0][0]: Type=<class 'numpy.ndarray'>, Length=41
Dataset[0][0][0][0]: Type=<class 'numpy.float64'>
Dataset[0][1] (labels): Type=<class 'numpy.ndarray'>, Length=24
Dataset[0][1][0]: Type=<class 'numpy.ndarray'>, Length=45
Beat 1: Type=<class 'numpy.float64'>
Song label length 45: Type=<class 'int'>
Beat 2: Type=<class 'numpy.float64'>
Song label length 45: Type=<class 'int'>
Beat 3: Type=<class 'numpy.float64'>
Song label length 45: Type=<class 'int'>
Beat 4: Type=<class 'numpy.float64'>
Song label length 45: Type=<class 'int'>
Beat 5: Type=<class 'numpy.float64'>
Song label length 45: Type=<class 'int'>


In [None]:
def custom_binary_crossentropy(y_true, y_pred):
    """Custom binary cross-entropy loss to handle -1 labels, which are used for padding and should be ignored during loss calculation."""
    y_true = tf.cast(y_true, tf.float32)
    bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    loss = bce * mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def custom_accuracy(y_true, y_pred):
    """Custom accuracy metric to handle -1 labels, which are used for padding and should be ignored during accuracy calculation."""
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    correct_predictions = tf.equal(tf.cast(tf.round(y_pred), tf.float32), y_true)
    masked_correct_predictions = tf.cast(correct_predictions, tf.float32) * mask
    accuracy = tf.reduce_sum(masked_correct_predictions) / tf.reduce_sum(mask)
    return accuracy


def compile_model(max_frames, max_freq_bins, n_features, custom_binary_crossentropy, custom_accuracy, max_beats):
    """
    Define and compile a CNN model with considerations for environmental sound classification.
    """
    frame_input = layers.Input(shape=(max_frames, max_freq_bins, n_features))
    
    # Assuming Mx3 filter means spanning the entire frequency bins with a width of 3.
    conv1 = layers.Conv2D(filters=180, kernel_size=(max_freq_bins, 3), activation='relu', padding='same')(frame_input)
    pool1 = layers.MaxPooling2D(pool_size=(4, 4), padding='same')(conv1)
    dropout1 = layers.Dropout(0.5)(pool1)  # Applying 50% dropout after pooling
    
    conv2 = layers.Conv2D(filters=180, kernel_size=(1, 3), activation='relu', padding='same')(dropout1) # Changed to 1x3 to not reduce the frequency dimension further
    pool2 = layers.MaxPooling2D(pool_size=(4, 4), padding='same')(conv2)
    dropout2 = layers.Dropout(0.5)(pool2)  # Applying 50% dropout after pooling
    
    conv3 = layers.Conv2D(filters=180, kernel_size=(1, 3), activation='relu', padding='same')(dropout2) # Changed to 1x3 for consistency
    pool3 = layers.MaxPooling2D(pool_size=(4, 4), padding='same')(conv3)
    dropout3 = layers.Dropout(0.5)(pool3)  # Applying 50% dropout after pooling
    
    frame_features = layers.Flatten()(dropout3)
    frame_feature_model = models.Model(inputs=frame_input, outputs=frame_features)

    measure_input = layers.Input(shape=(max_beats, max_frames, max_freq_bins, n_features))
    time_distributed = layers.TimeDistributed(frame_feature_model)(measure_input)
    masking_layer = layers.Masking(mask_value=-1)(time_distributed)
    lstm_out = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(masking_layer)
    output = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(lstm_out)
    model = models.Model(inputs=measure_input, outputs=output)
    model.compile(optimizer='adam', loss=custom_binary_crossentropy, metrics=[custom_accuracy])
    
    return model

In [None]:
# 5-fold cross-validation with stratified splitting, batching, and positional encoding
def get_positional_encoding(max_len, d_model):
    """Generates sinusoidal positional encodings."""
    pos = np.arange(max_len)[:, np.newaxis]
    i = np.arange(d_model // 2)[np.newaxis, :]
    angles = pos / np.power(10000, (2 * i) / d_model)
    sin_enc = np.sin(angles)
    cos_enc = np.cos(angles)
    encoding = np.concatenate([sin_enc, cos_enc[:, :d_model // 2]], axis=-1)
    return encoding

def apply_positional_encoding(features, pos_encoding):
    """Applies positional encoding to the input features."""
    return features + pos_encoding[:features.shape[1], :]


def run_cross_validation(n_splits=5, batch_size=32, feat_dim=128):
    """Runs cross-validation with stratified splitting, batching, and positional encoding."""
    song_ids = df['SongID'].unique()
    all_labels = np.concatenate([padded_beat_labels[id] for id in song_ids])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(song_ids, all_labels), start=1):
        print(f"Fold {fold}/{n_splits}")

        train_ids, val_ids = song_ids[train_idx], song_ids[val_idx]
        train_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim), 
                          get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(train_ids, batch_size)]
        val_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim), 
                        get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(val_ids, batch_size)]

        # Train and evaluate model
        model = train_model(train_batches)
        val_score = evaluate_model(model, val_batches)
        fold_scores.append(val_score)

    avg_score = np.mean(fold_scores)
    print(f"Average Cross-Validation Score: {avg_score}")

def train_model(train_batches):
    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('..', 'checkpoints', 'CRNN')
    model_dir = os.path.join('..', 'models', 'CRNN')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    callbacks = [
        ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            monitor='val_custom_accuracy',  # Use 'val_custom_accuracy' for validation custom accuracy
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        EarlyStopping(
            monitor='val_loss',  
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        ReduceLROnPlateau(
            monitor='val_loss',  
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    # Train the model with the simplified callbacks list
    history = model.fit(
        train_dataset,
        epochs=10,
        validation_data=val_dataset,
        callbacks=callbacks
    )

def evaluate_model(model, val_batches):
    # Implement model evaluation here
    pass