In [None]:
import numpy as np
import pandas as pd
import librosa
import os
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning
import gzip

warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
def make_key_invariant(chromagram):
    """Returns a key-invariant chromagram.

    The key is estimated by comparing the time-averaged chroma vector with the
    major and minor key profiles rotated to each of the 12 pitch classes. The
    chromagram is then rolled so the best-matching tonic ends up in row 0.

    Args:
        chromagram: 2D array with 12 pitch-class rows and one column per frame.

    Returns:
        A rolled copy of ``chromagram`` with the same shape. A chromagram whose
        average chroma is flat scores 0 for every key and is returned unshifted.
    """
    # Key profiles (these values look like the Krumhansl-Kessler major/minor profiles).
    maj_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
    min_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
    avg_chroma = np.mean(chromagram, axis=1)
    centered_chroma = avg_chroma - avg_chroma.mean()

    def rotation_scores(profile):
        # Score for rotation k: centred dot product with the profile whose tonic sits at
        # pitch class k. Dividing by the profile norm makes major and minor scores
        # comparable; the chroma norm is a factor shared by all 24 scores and is dropped.
        centered_profile = profile - profile.mean()
        raw = np.array([np.dot(centered_chroma, np.roll(centered_profile, shift))
                        for shift in range(12)])
        return raw / np.linalg.norm(centered_profile)

    # np.correlate(..., mode='same') slides the profile linearly with zero padding, so its
    # output index is a lag rather than a pitch class. Keys wrap around the octave, so the
    # comparison has to be circular.
    scores = np.concatenate((rotation_scores(maj_profile), rotation_scores(min_profile)))
    key_shift = np.argmax(scores) % 12
    return np.roll(chromagram, -key_shift, axis=0)

def segment_by_beats(feature_array, beat_frames):
    """Split a 2D feature array into per-beat slices along its frame axis.

    Args:
        feature_array: 2D array whose second axis indexes frames.
        beat_frames: Frame indices at which each beat starts.

    Returns:
        A list with one slice per entry of ``beat_frames``. Slice ``i`` covers the
        frames from beat ``i`` up to (not including) beat ``i + 1``; the last slice
        runs to the final frame. Frames before the first beat are not included.
    """
    total_frames = feature_array.shape[1]
    # Close the last beat with the end of the array.
    boundaries = np.append(beat_frames, total_frames)
    segments = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        segments.append(feature_array[:, start:stop])
    return segments

def map_labels_to_beats(df_labels, beat_times):
    """Map chorus labels to beat-synced data.

    Consecutive entries of ``beat_times`` delimit the beats, so ``n`` beat times
    give ``n - 1`` beats. A beat is labelled by its start time: it gets 1 when the
    start falls inside a row labelled 'chorus' and 0 otherwise.

    Args:
        df_labels: DataFrame with 'start_time', 'end_time' and 'label' columns.
        beat_times: 1D sequence of beat times, in the same units as the
            start/end columns.

    Returns:
        Float array of 0/1 labels with ``max(len(beat_times) - 1, 0)`` entries.
    """
    beat_times = np.asarray(beat_times)
    # Zero or one beat time delimits no beat; avoid asking numpy for a negative-length array.
    n_beats = max(len(beat_times) - 1, 0)
    labels = np.zeros(n_beats)
    if n_beats == 0:
        return labels

    beat_starts = beat_times[:-1]
    for _, row in df_labels.iterrows():
        in_section = (beat_starts >= row['start_time']) & (beat_starts < row['end_time'])
        # Later rows overwrite earlier ones where sections overlap.
        labels[in_section] = 1 if row['label'] == 'chorus' else 0
    return labels

def save_compressed_pickle(file_path, data):
    """Pickle ``data`` and write it gzip-compressed to ``file_path``.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Any picklable object.
    """
    with gzip.open(file_path, mode='wb') as archive:
        pickle.Pickler(archive).dump(data)

In [None]:
# Ensure required directories exist
os.makedirs("../data/pkl/segments", exist_ok=True)
os.makedirs("../data/pkl/labels", exist_ok=True)

# Labelled sections for every song (the SongID, start_time, end_time and label columns are used below)
df = pd.read_csv('../data/dataframes/clean_labeled.csv')

# Constants
TARGET_SR = 12000 # Target sample rate chosen to be 1/4 of the original 48kHz.
HOP_LENGTH = 128  # Hop length for short-time Fourier transform. Hop length of 128 at 12kHz gives a similar frame rate to a hop length of 512 at 48kHz.

# Process each song in the dataset
for song_id in tqdm(df['SongID'].unique(), desc="Processing..."):
    # Load the audio file
    audio_path = f'../data/audio_files/processed/{song_id}.mp3'
    y, _ = librosa.load(audio_path, sr=TARGET_SR)

    # Harmonic-percussive source separation
    y_harm, y_perc = librosa.effects.hpss(y)

    # Compute onset envelope from the percussive component
    onset_env = librosa.onset.onset_strength(y=y_perc, sr=TARGET_SR, hop_length=HOP_LENGTH)

    # Beat tracking
    _, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH)
    beat_times = librosa.frames_to_time(beats, sr=TARGET_SR, hop_length=HOP_LENGTH)

    # Map labels to beats. segment_by_beats() below yields one segment per detected beat (the last
    # one runs to the end of the track), while map_labels_to_beats() yields one label per pair of
    # consecutive times. Appending the track end as a closing edge gives one label per segment.
    track_end = len(y) / TARGET_SR
    df_labels = df[df['SongID'] == song_id][['start_time', 'end_time', 'label']]
    labels = map_labels_to_beats(df_labels, np.append(beat_times, track_end))

    # Compute RMS energy from spectrogram to give a more accurate representation of energy over time because its frames can be windowed
    S = np.abs(librosa.stft(y, hop_length=HOP_LENGTH))
    rms = librosa.feature.rms(S=S)

    # Compute Mel Spectrogram and decompose into 4 components (4 chosen from EDA)
    mel = librosa.feature.melspectrogram(y=y, sr=TARGET_SR, n_mels=128, hop_length=HOP_LENGTH)
    mel_acts = librosa.decompose.decompose(mel, n_components=4, sort=True)[1]

    # Compute chromagram, make it key invariant, and decompose
    chromagram = librosa.feature.chroma_cqt(y=y_harm, sr=TARGET_SR, hop_length=HOP_LENGTH)
    chroma_ki = make_key_invariant(chromagram)
    chroma_acts = librosa.decompose.decompose(chroma_ki, n_components=3, sort=True)[1]

    # Compute tempogram, ensure non-negative b/c tempograms are finicky, and decompose
    tempogram = np.clip(librosa.feature.tempogram(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH), 0, None)
    tempogram_acts = librosa.decompose.decompose(tempogram, n_components=3, sort=True)[1]

    # Compute MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SR, n_mfcc=13, hop_length=HOP_LENGTH)

    # Standardize features, stack, and segment by beats.
    # Features are kept by name so every weight is looked up for the array it belongs to.
    features = {
        'rms': rms,
        'mel_acts': mel_acts,
        'chroma_ki': chroma_ki,
        'chroma_acts': chroma_acts,
        'tempogram_acts': tempogram_acts,
        'mfccs': mfccs,
    }
    # `dims` was referenced but never defined; take each feature's row count from the array itself.
    dims = {name: feature.shape[0] for name, feature in features.items()}
    total_inv_dim = sum(1.0 / dim for dim in dims.values()) # Calculate the total sum of inverse dimensions to normalize weights
    weights = {name: (1.0 / dims[name]) / total_inv_dim for name in dims} # Normalize weights so each feature weighs the same despite dimensionality
    # Apply StandardScaler (per feature row, across frames) and weights to each feature
    standardized_weighted_features = [StandardScaler().fit_transform(feature.T).T * weights[name]
                                      for name, feature in features.items()]
    # NOTE(review): this should stack 36 rows (1 + 4 + 12 + 3 + 3 + 13) - confirm that matches the
    # feature count the downstream model expects.
    concat_features = np.vstack(standardized_weighted_features)
    segments = segment_by_beats(concat_features, beats)

    # Save results with compression
    save_compressed_pickle(f"../data/pkl/segments/{song_id}_beats.pkl.gz", segments)
    save_compressed_pickle(f"../data/pkl/labels/{song_id}_labels.pkl.gz", labels)

# Start here after processing data

---

In [None]:
# Standard library imports
import os
import gzip
import pickle
import math

# Third-party imports for numerical operations and machine learning
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split

# Progress bar
from tqdm.auto import tqdm

# Deep learning frameworks
import torch
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
from tensorflow.keras import layers, models, metrics

In [None]:
save_path = '../data/pkl'
# Load the data: the archive holds a (features, labels) pair of dicts.
# NOTE(review): presumably both dicts are keyed by song in the same order - confirm, since the
# two value lists are paired by position below.
beat_data_file = os.path.join(save_path, 'beat_data.pkl.gz')
with gzip.open(beat_data_file, 'rb') as f:
    X_dict, y_dict = pickle.load(f)

song_data = [*X_dict.values()]
song_labels = [*y_dict.values()]

## Data Structure Overview

- `song_data`: A list where each element corresponds to an individual song's feature data.
- `song_data[0]`: The feature data for the first song in the dataset.
- `song_data[i]`: The feature data for the i-th song in the dataset.
- `song_data[i][j]`: The j-th beat segment's feature data within the i-th song.
- Each `song_data[i][j]` is structured as a 2D array:
  - The first dimension has a fixed size of 24, representing the number of features.
  - The second dimension has a variable size y, representing the number of frames in the beat segment.
- The feature count is consistent across the dataset, with each beat segment containing 24 features.
- `song_labels`: A list containing the corresponding labels for each song's beat segments.
- `song_labels[i]`: An array of labels for each beat segment within the i-th song.

---

### 1) 70/15/15 train/val/test split

In [None]:
# Split by song: 70% train, then the remaining 30% is halved into validation and test.
SPLIT_SEED = 42

X_train, X_temp, y_train, y_temp = train_test_split(
    song_data,
    song_labels,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [None]:
# Number of songs in the train, validation and test splits
tuple(map(len, (X_train, X_val, X_test)))

### 2) Generate batches with batch_size = 32

---

### 1: Find max beat and max frames in a beat
- Find maximum number of beats in any song in the batch (`max_beats`).
- Find maximum number of frames in any beat segment across all songs in a batch (`max_frames`).

### 2: Pad Data using -1
- Pad every song to `max_beats` beats and every beat segment to `max_frames` frames; the beats added as padding are themselves full `max_frames`-frame segments.
- Pad using -1

### 3: Pad Labels using -1
- Pad the array of labels for each song to match `max_beats`. Use `-1` as the padding value to denote labels for dummy beats.

### Implementation Notes
- Padding with `-1` is a common approach when the padded values need to be easily distinguished from valid data. Ensure that your model or subsequent processing steps can handle this special value appropriately.

In [None]:
def song_data_generator_with_pos_encoding(song_data, song_labels, pos_encoding_matrix):
    """Yield ``(features, label)`` pairs with positional encoding added to the features.

    The encoding row for position ``p`` is added to everything at index ``p`` along
    the first axis of a song's features. The stored songs are left untouched, so
    the generator can be iterated repeatedly (e.g. once per epoch).

    Args:
        song_data: Iterable of per-song feature arrays. The first axis is the
            position in the song and the last axis has the encoding's width.
        song_labels: Iterable of per-song label arrays, paired with ``song_data``.
        pos_encoding_matrix: 2D array-like of shape (max_len, d_model).

    Yields:
        Tuples ``(encoded_features, label)``; ``encoded_features`` is a new array.
    """
    encoding = np.asarray(pos_encoding_matrix)
    for features, label in zip(song_data, song_labels):
        features = np.asarray(features)
        seq_length = features.shape[0]
        song_encoding = encoding[:seq_length, :]
        if features.ndim > 2:
            # Keep the position on axis 0 and the feature width on the last axis. Without the
            # singleton axes in between, broadcasting would align positions with axis -2.
            middle_axes = (1,) * (features.ndim - 2)
            song_encoding = song_encoding.reshape(
                (song_encoding.shape[0],) + middle_axes + (song_encoding.shape[1],))
        # Out-of-place add: `features += ...` would write into the caller's arrays and pile
        # the encoding up again on every pass over the data.
        yield features + song_encoding, label

def positional_encoding(max_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Even columns hold sines and odd columns hold cosines. Columns ``2k`` and
    ``2k + 1`` share the angle ``pos / 10000 ** (2k / d_model)``.

    Args:
        max_len: Number of positions (rows).
        d_model: Encoding width (columns).

    Returns:
        A float32 ``tf.constant`` of shape (max_len, d_model).
    """
    positions = np.arange(max_len)[:, np.newaxis]
    dim_index = np.arange(d_model)[np.newaxis, :]
    # `2 * (dim_index // 2)` equals i for even columns and i - 1 for odd ones. The earlier
    # `2 * i` (with i already the raw column index) doubled the exponent of the standard formula.
    exponent = (2 * (dim_index // 2)) / d_model
    angles = positions / np.power(10000.0, exponent)
    pos_enc = np.where(dim_index % 2 == 0, np.sin(angles), np.cos(angles))
    return tf.constant(pos_enc, dtype=tf.float32)

def make_tf_dataset_with_pos_encoding(song_data, song_labels, pos_encoding_matrix, d_model, batch_size=32):
    """Wrap the songs in a padded, batched ``tf.data.Dataset``.

    Songs are drawn from ``song_data_generator_with_pos_encoding`` and batched with
    ``padded_batch``; features and labels are both padded with -1 up to the
    largest size in each batch.

    Args:
        song_data: Per-song feature arrays; each is declared as rank 3 with
            ``d_model`` as its last dimension.
        song_labels: Per-song 1D label arrays.
        pos_encoding_matrix: Positional-encoding table passed to the generator.
        d_model: Size of the last feature dimension.
        batch_size: Number of songs per batch (defaults to the previously
            hard-coded 32).

    Returns:
        A dataset yielding ``(features, labels)`` float32 batches.
    """
    output_signature = (
        tf.TensorSpec(shape=(None, None, d_model), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )

    dataset = tf.data.Dataset.from_generator(
        lambda: song_data_generator_with_pos_encoding(song_data, song_labels, pos_encoding_matrix),
        output_signature=output_signature
    )

    # -1 marks padding in both features and labels; the masked loss/metric ignore -1 labels.
    padded_shapes = ([None, None, d_model], [None])
    padding_values = (tf.constant(-1, dtype=tf.float32), tf.constant(-1, dtype=tf.float32))
    dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes, padding_values=padding_values)

    return dataset

# The encoding table must cover the longest song, whether measured by features or by labels.
longest_feature_seq = max(len(song) for song in song_data)
longest_label_seq = max(len(song) for song in song_labels)
max_length = max(longest_feature_seq, longest_label_seq)
d_model = 24
pos_encoding_matrix = positional_encoding(max_length, d_model)

# Create TensorFlow datasets (train, validation, test - in that order)
train_dataset_tf, val_dataset_tf, test_dataset_tf = (
    make_tf_dataset_with_pos_encoding(split_features, split_labels, pos_encoding_matrix, d_model)
    for split_features, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
def custom_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy averaged over non-padding labels only.

    Labels equal to -1 mark padding: their loss terms are zeroed and they are not
    counted in the average.

    Args:
        y_true: Labels (0, 1, or -1 for padding).
        y_pred: Predicted probabilities, broadcast-compatible with ``y_true``.

    Returns:
        Scalar mean loss over the real labels; 0 when there are none.
    """
    y_true = tf.cast(y_true, tf.float32)
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    masked_total = tf.reduce_sum(bce * mask)
    # divide_no_nan yields 0 rather than NaN when a batch holds no real labels.
    return tf.math.divide_no_nan(masked_total, tf.reduce_sum(mask))


class MaskedF1Score(tf.keras.metrics.Metric):
    """F1 score that ignores entries whose label is -1 (padding).

    Precision and recall are tracked by two wrapped Keras metrics. Padded entries
    get a sample weight of 0, so they add nothing to either.
    """

    def __init__(self, name='masked_f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate precision/recall statistics for one batch.

        Args:
            y_true: Labels (0, 1, or -1 for padding).
            y_pred: Predicted probabilities.
            sample_weight: Optional weights, broadcast to ``y_true``'s shape and
                multiplied into the padding mask.
        """
        # Create a mask for all values not equal to -1
        mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)

        # If sample_weight is provided, combine it with the mask
        if sample_weight is not None:
            sample_weight = tf.cast(sample_weight, tf.float32)
            # Ensure sample_weight has the same shape as y_true and y_pred
            sample_weight = tf.broadcast_to(sample_weight, tf.shape(y_true))
            # Combine the existing mask with the sample_weight
            mask *= sample_weight

        # Use the resulting mask as the sample_weight in update_state calls
        self.precision.update_state(y_true, y_pred, sample_weight=mask)
        self.recall.update_state(y_true, y_pred, sample_weight=mask)

    def result(self):
        """Return the harmonic mean of the accumulated precision and recall."""
        p = self.precision.result()
        r = self.recall.result()
        # epsilon keeps the ratio finite when precision and recall are both 0
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    @staticmethod
    def _reset(metric):
        # Newer Keras names the hook reset_state; older releases only have reset_states.
        reset = getattr(metric, 'reset_state', None) or metric.reset_states
        reset()

    def reset_state(self):
        """Clear the accumulated precision and recall statistics."""
        self._reset(self.precision)
        self._reset(self.recall)

    def reset_states(self):
        """Backward-compatible alias for ``reset_state``."""
        self.reset_state()

def create_model(max_beats, n_features, max_frames):
    """
    Build and compile the chorus identification model.

    A small CNN summarises each beat segment, a bidirectional LSTM reads the
    per-beat summaries in order, and a sigmoid unit scores every beat.

    This is the single definition of ``create_model``. It keeps the parameter
    order of the definition that was previously in effect (the later of two
    duplicates); callers in this notebook pass the arguments by keyword.

    Args:
        max_beats: Number of beat segments per song in the model input.
        n_features: Number of feature values per frame.
        max_frames: Number of frames per beat segment.

    Returns:
        The compiled model 'ChorusIdentificationModel'. It takes input of shape
        (batch, max_beats, max_frames, n_features, 1) and outputs one sigmoid
        score per beat, shape (batch, max_beats, 1).
    """
    # Per-beat feature extractor: one (max_frames, n_features) map with a single channel
    frame_input = layers.Input(shape=(max_frames, n_features, 1), name='FrameInput')

    # Convolutional block with 2D convolution and pooling
    conv = layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same')(frame_input)
    pool = layers.MaxPooling2D(pool_size=(2, 2), padding='same')(conv)
    dropout = layers.Dropout(0.3)(pool)

    beat_features = layers.Flatten()(dropout)
    beat_feature_model = models.Model(inputs=frame_input, outputs=beat_features, name='BeatFeatureModel')

    # Input shape for sequences of beat segments; the CNN is applied to every beat independently.
    # (Applying Conv2D to the whole song tensor and reshaping to (max_beats, max_frames, 64) cannot
    # work: the convolution keeps the n_features axis, so the element counts differ.)
    beat_input = layers.Input(shape=(max_beats, max_frames, n_features, 1), name='BeatInput')
    time_distributed = layers.TimeDistributed(beat_feature_model)(beat_input)

    # NOTE(review): Masking compares the CNN output with -1.0, but ReLU/max-pool outputs are
    # non-negative, so padded beats are probably never masked here - confirm, and derive the
    # mask from the raw input if padding must be skipped by the LSTM.
    masking_layer = layers.Masking(mask_value=-1.0)(time_distributed)
    lstm_out = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(masking_layer)
    output = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'), name='Output')(lstm_out)

    model = models.Model(inputs=beat_input, outputs=output, name='ChorusIdentificationModel')

    # Note: Using custom_binary_crossentropy as the loss function.
    # NOTE(review): only the loss and MaskedF1Score ignore the -1 padding labels; the built-in
    # accuracy/precision/recall/AUC metrics below receive them unmasked.
    model.compile(optimizer='adam',
                  loss=custom_binary_crossentropy,
                  metrics=[metrics.BinaryAccuracy(name='accuracy'),
                           MaskedF1Score(),
                           metrics.Precision(name='precision'),
                           metrics.Recall(name='recall'),
                           metrics.AUC(name='auc')])

    return model

def _max_frames_and_beats(songs):
    """Return (largest len() of any segment, largest number of segments per song) over ``songs``."""
    # NOTE(review): len(segment) is the size of a segment's first axis - confirm that axis is frames.
    frames = max(max(len(segment) for segment in song) for song in songs)
    beats = max(len(song) for song in songs)
    return frames, beats

# Find the maximum dimensions of frames and beats in the training, validation and testing data
max_frames_train, max_beats_train = _max_frames_and_beats(X_train)
max_frames_val, max_beats_val = _max_frames_and_beats(X_val)
max_frames_test, max_beats_test = _max_frames_and_beats(X_test)

# Create the model (sized from the training split only)
model = create_model(max_beats=max_beats_train, n_features=24, max_frames=max_frames_train)
model.summary()

In [None]:
# Debugging aid: echo the held-out test features (this displays the whole object, so the output can be large)
X_test

In [None]:
# Show the training-split maxima that were used to size the model input
max_frames_train, max_beats_train

In [None]:
# Largest per-segment len() and largest number of segments per song, for each split
# Training data
max_frames_train = max(max(map(len, song)) for song in X_train)
max_beats_train = max(map(len, X_train))

# Validation data
max_frames_val = max(max(map(len, song)) for song in X_val)
max_beats_val = max(map(len, X_val))

# Testing data
max_frames_test = max(max(map(len, song)) for song in X_test)
max_beats_test = max(map(len, X_test))

# Create the model (sized from the training split only)
model = create_model(
    max_frames=max_frames_train,
    n_features=24,
    max_beats=max_beats_train,
)
model.summary()

In [None]:
def train_model(model, train_dataset, val_dataset, epochs=10):
    """Fit ``model`` with checkpointing, early stopping and learning-rate decay.

    Args:
        model: Compiled Keras model. It must report ``masked_f1_score`` for the
            checkpoint callback to have something to monitor.
        train_dataset: Training data accepted by ``model.fit``.
        val_dataset: Validation data accepted by ``model.fit``.
        epochs: Maximum number of epochs (defaults to the previously hard-coded 10).

    Returns:
        The ``History`` object returned by ``model.fit``.

    Side effects:
        Creates ``checkpoints/ChorusIdentificationModel`` and
        ``models/ChorusIdentificationModel``; writes the best model (by validation
        masked F1) to ``best_model.h5`` in the latter.
    """
    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('checkpoints', 'ChorusIdentificationModel')
    model_dir = os.path.join('models', 'ChorusIdentificationModel')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            monitor='val_masked_f1_score',  # Monitor custom metric
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        # The LR patience is shorter than the early-stopping patience, so the rate drops before training stops
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    # Train the model with the callbacks list
    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        callbacks=callbacks
    )

    return history

def evaluate_model(model, test_dataset):
    """Evaluate ``model`` on the test set, print each metric and return them.

    Args:
        model: Compiled model whose ``evaluate`` returns six values in the order
            loss, accuracy, F1, precision, recall, AUC.
        test_dataset: Data accepted by ``model.evaluate``.

    Returns:
        Dict with keys 'loss', 'accuracy', 'f1_score', 'precision', 'recall'
        and 'auc'.
    """
    # Evaluate the model on the test set
    test_loss, test_accuracy, test_f1_score, test_precision, test_recall, test_auc = model.evaluate(test_dataset)

    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("Test F1 Score:", test_f1_score)
    print("Test Precision:", test_precision)
    print("Test Recall:", test_recall)
    print("Test AUC:", test_auc)

    # Returned as well as printed so callers (e.g. cross-validation) can aggregate the scores
    return {
        'loss': test_loss,
        'accuracy': test_accuracy,
        'f1_score': test_f1_score,
        'precision': test_precision,
        'recall': test_recall,
        'auc': test_auc,
    }
    
def plot_training_history(history):
    """Plot training vs. validation curves for accuracy and loss, one figure each.

    Args:
        history: Object with a ``history`` dict containing 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series (as returned by
            ``model.fit``).
    """
    # Imported here because the notebook's import cell does not import matplotlib;
    # without this, `plt` is undefined.
    import matplotlib.pyplot as plt

    # (history key, y-axis label) for each figure
    for metric, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
        plt.plot(history.history[metric])
        plt.plot(history.history[f'val_{metric}'])
        plt.title(f'Model {metric}')
        plt.ylabel(axis_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()

In [None]:
# Train the model on the training split, validating on the validation split
history = train_model(model, train_dataset_tf, val_dataset_tf)

In [None]:
# Train the model on the training split, validating on the validation split
history = train_model(model, train_dataset_tf, val_dataset_tf)

# Evaluate the model on the test dataset
evaluate_model(model, test_dataset_tf)

# Visualize training history
plot_training_history(history)

In [None]:
def train_model(model, train_dataset, val_dataset, epochs=10):
    """Fit ``model`` with checkpointing, early stopping and learning-rate decay.

    Args:
        model: Compiled Keras model. It must report ``masked_f1_score`` for the
            checkpoint callback to have something to monitor.
        train_dataset: Training data accepted by ``model.fit``.
        val_dataset: Validation data accepted by ``model.fit``.
        epochs: Maximum number of epochs (defaults to the previously hard-coded 10).

    Returns:
        The ``History`` object returned by ``model.fit``.

    Side effects:
        Creates ``checkpoints/ChorusIdentificationModel`` and
        ``models/ChorusIdentificationModel``; writes the best model (by validation
        masked F1) to ``best_model.h5`` in the latter.
    """
    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('checkpoints', 'ChorusIdentificationModel')
    model_dir = os.path.join('models', 'ChorusIdentificationModel')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            monitor='val_masked_f1_score',  # Monitor custom metric
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    # Train the model with the callbacks list
    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        callbacks=callbacks
    )

    # Callers assign the result and pass it to plot_training_history, so it must be returned
    return history
    
def plot_training_history(history):
    """Plot training vs. validation curves for accuracy and loss, one figure each.

    Args:
        history: Object with a ``history`` dict containing 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series (as returned by
            ``model.fit``).
    """
    # Imported here because the notebook's import cell does not import matplotlib;
    # without this, `plt` is undefined.
    import matplotlib.pyplot as plt

    # (history key, y-axis label) for each figure
    for metric, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
        plt.plot(history.history[metric])
        plt.plot(history.history[f'val_{metric}'])
        plt.title(f'Model {metric}')
        plt.ylabel(axis_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()

# Train the model and get the history
history = train_model(model, train_dataset=train_dataset_tf, val_dataset=val_dataset_tf)

# Visualize training history
plot_training_history(history)

# Evaluate the model on the test set; evaluate() returns the loss followed by the compiled metrics
test_metrics = model.evaluate(test_dataset_tf)
test_loss, test_accuracy, test_f1_score, test_precision, test_recall, test_auc = test_metrics

metric_captions = ("Test Loss:", "Test Accuracy:", "Test F1 Score:",
                   "Test Precision:", "Test Recall:", "Test AUC:")
for caption, value in zip(metric_captions, test_metrics):
    print(caption, value)

In [None]:
def train_model(model, train_dataset, val_dataset, epochs=10):
    """Fit ``model`` with checkpointing, early stopping and learning-rate decay.

    Args:
        model: Compiled Keras model. It must report ``masked_f1_score`` for the
            checkpoint callback to have something to monitor.
        train_dataset: Training data accepted by ``model.fit``.
        val_dataset: Validation data accepted by ``model.fit``.
        epochs: Maximum number of epochs (defaults to the previously hard-coded 10).

    Returns:
        The ``History`` object returned by ``model.fit``.

    Side effects:
        Creates ``checkpoints/ChorusIdentificationModel`` and
        ``models/ChorusIdentificationModel``; writes the best model (by validation
        masked F1) to ``best_model.h5`` in the latter.
    """
    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('checkpoints', 'ChorusIdentificationModel')
    model_dir = os.path.join('models', 'ChorusIdentificationModel')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            monitor='val_masked_f1_score',  # Monitor custom metric
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    # Train the model with the callbacks list
    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        callbacks=callbacks
    )

    # Without this the function returned None and the training curves were lost
    return history

In [None]:
def define_model(max_frames, max_freq_bins, n_features, custom_binary_crossentropy, custom_accuracy, max_beats):
    """
    Define a CNN + bidirectional-LSTM model (returned uncompiled).

    Three conv / max-pool / dropout stages turn each beat segment into a feature
    vector; a bidirectional LSTM reads the vectors in order and a sigmoid unit
    scores every beat.

    Args:
        max_frames: Height of one beat segment.
        max_freq_bins: Width of one beat segment; also the height of the first
            convolution kernel.
        n_features: Number of channels per beat segment.
        custom_binary_crossentropy: Accepted but not used by this function.
        custom_accuracy: Accepted but not used by this function.
        max_beats: Number of beat segments per song.

    Returns:
        An uncompiled Keras model taking input of shape
        (batch, max_beats, max_frames, max_freq_bins, n_features).
    """
    segment_input = layers.Input(shape=(max_frames, max_freq_bins, n_features))

    # First kernel: assuming Mx3 filter means spanning the entire frequency bins with a width of 3.
    # Later kernels are 1x3 so the frequency dimension is not reduced further.
    kernel_sizes = ((max_freq_bins, 3), (1, 3), (1, 3))

    stage = segment_input
    for kernel_size in kernel_sizes:
        stage = layers.Conv2D(filters=180, kernel_size=kernel_size, activation='relu', padding='same')(stage)
        stage = layers.MaxPooling2D(pool_size=(4, 4), padding='same')(stage)
        stage = layers.Dropout(0.5)(stage)  # 50% dropout after pooling

    segment_vector = layers.Flatten()(stage)
    segment_encoder = models.Model(inputs=segment_input, outputs=segment_vector)

    song_input = layers.Input(shape=(max_beats, max_frames, max_freq_bins, n_features))
    encoded_beats = layers.TimeDistributed(segment_encoder)(song_input)
    masked_beats = layers.Masking(mask_value=-1)(encoded_beats)
    recurrent = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(masked_beats)
    beat_scores = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(recurrent)

    return models.Model(inputs=song_input, outputs=beat_scores)

In [None]:
# 5-fold cross-validation with stratified splitting, batching, and positional encoding
def get_positional_encoding(max_len, d_model):
    """Generates sinusoidal positional encodings.

    The sine columns come first and the cosine columns follow (the two halves
    are concatenated, not interleaved). Column ``k`` of each half uses the
    angle ``pos / 10000 ** (2k / d_model)``.

    Args:
        max_len: Number of positions (rows).
        d_model: Encoding width (columns); may be odd.

    Returns:
        Array of shape (max_len, d_model).
    """
    pos = np.arange(max_len)[:, np.newaxis]
    # For odd d_model the sine half takes the extra column, so the two halves always add up
    # to exactly d_model columns (previously an odd width came back one column short).
    n_sin = (d_model + 1) // 2
    n_cos = d_model // 2
    i = np.arange(n_sin)[np.newaxis, :]
    angles = pos / np.power(10000, (2 * i) / d_model)
    encoding = np.concatenate([np.sin(angles), np.cos(angles[:, :n_cos])], axis=-1)
    return encoding

def apply_positional_encoding(features, pos_encoding):
    """Add the first ``features.shape[1]`` rows of ``pos_encoding`` to ``features``.

    Args:
        features: Array whose second axis is the sequence axis.
        pos_encoding: 2D encoding table with at least that many rows.

    Returns:
        ``features`` plus the truncated table (broadcast over the leading axis).
    """
    seq_len = features.shape[1]
    encoding_window = pos_encoding[:seq_len, :]
    return features + encoding_window


def run_cross_validation(n_splits=5, batch_size=32, feat_dim=128):
    """Runs cross-validation with stratified splitting, batching, and positional encoding.

    Songs are split into ``n_splits`` folds. For each fold the training and
    validation songs are grouped into batches, a model is trained on the training
    batches and scored on the validation batches.

    Relies on notebook-level names that this cell does not define: ``df``,
    ``padded_beat_segments``, ``padded_beat_labels``, ``create_batches``,
    ``get_batch_segments`` and ``get_batch_labels``.

    Args:
        n_splits: Number of folds.
        batch_size: Number of songs per batch.
        feat_dim: Feature dimension forwarded to ``get_batch_segments``.

    Returns:
        The mean of the per-fold validation scores.
    """
    song_ids = df['SongID'].unique()

    def chorus_ratio(beat_labels):
        # Share of real beats labelled 1. -1 is assumed to mark padding, as elsewhere in this
        # notebook - TODO confirm for padded_beat_labels.
        beat_labels = np.asarray(beat_labels, dtype=float)
        real = beat_labels[beat_labels != -1]
        return float(np.mean(real == 1)) if real.size else 0.0

    # StratifiedKFold needs exactly one target per sample (song). Concatenating every beat label
    # gives one entry per beat, which split() rejects as inconsistent with song_ids. Instead,
    # stratify each song by whether its chorus share is above the median.
    chorus_ratios = np.array([chorus_ratio(padded_beat_labels[song_id]) for song_id in song_ids])
    strata = (chorus_ratios > np.median(chorus_ratios)).astype(int)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(song_ids, strata), start=1):
        print(f"Fold {fold}/{n_splits}")

        train_ids, val_ids = song_ids[train_idx], song_ids[val_idx]
        train_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim),
                          get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(train_ids, batch_size)]
        val_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim),
                        get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(val_ids, batch_size)]

        # Train and evaluate model
        model = train_model(train_batches)
        val_score = evaluate_model(model, val_batches)
        fold_scores.append(val_score)

    avg_score = np.mean(fold_scores)
    print(f"Average Cross-Validation Score: {avg_score}")
    return avg_score

def train_model(train_batches, val_batches=None, epochs=10):
    """Fit the notebook-level ``model`` on pre-built batches and return it.

    Args:
        train_batches: Non-empty list of ``(inputs, targets)`` batches.
        val_batches: Optional list of ``(inputs, targets)`` validation batches.
            Without it the callbacks that monitor ``val_*`` quantities have
            nothing to watch.
        epochs: Maximum number of epochs.

    Returns:
        The trained model (the cross-validation loop passes it to
        ``evaluate_model``).

    Raises:
        ValueError: If ``train_batches`` is empty.

    Side effects:
        Creates ``../checkpoints/CRNN`` and ``../models/CRNN``; may write
        ``best_model.h5`` in the latter.
    """
    if not train_batches:
        raise ValueError("train_batches must contain at least one batch")

    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('..', 'checkpoints', 'CRNN')
    model_dir = os.path.join('..', 'models', 'CRNN')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    # The callback classes are referenced through tf.keras.callbacks because the notebook
    # never imports them by their bare names.
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            # NOTE(review): requires the model to be compiled with a metric named
            # 'custom_accuracy' - confirm, otherwise no checkpoint is written.
            monitor='val_custom_accuracy',
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    def cycle(batches):
        # model.fit cannot take a list of (inputs, targets) tuples directly, so feed them
        # through an endless generator and bound each epoch with an explicit step count.
        while True:
            yield from batches

    fit_kwargs = {}
    if val_batches:
        fit_kwargs['validation_data'] = cycle(val_batches)
        fit_kwargs['validation_steps'] = len(val_batches)

    # NOTE(review): `model` is the notebook-level model, so its weights carry over between
    # cross-validation folds - build a fresh model per fold if the folds must be independent.
    model.fit(
        cycle(train_batches),
        steps_per_epoch=len(train_batches),
        epochs=epochs,
        callbacks=callbacks,
        **fit_kwargs
    )

    return model

def evaluate_model(model, val_batches):
    """Placeholder for per-fold evaluation: not implemented yet, so it returns None."""
    # Implement model evaluation here
    return None