In [63]:
import numpy as np
import pandas as pd
import librosa
import os
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning
import gzip

warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [64]:
def make_key_invariant(chromagram):
    """Returns a key-invariant chromagram.

    The time-averaged chroma vector is compared against the major and minor
    key profiles at each of the 12 possible circular transpositions. The
    best-matching transposition is taken as the tonic pitch class and the
    chromagram is rotated so that this pitch class ends up in row 0.

    Parameters
    ----------
    chromagram : np.ndarray
        Array with 12 rows (pitch classes) and one column per frame.

    Returns
    -------
    np.ndarray
        The chromagram circularly shifted along axis 0. A chromagram whose
        average is flat (no key evidence) is returned unshifted.
    """
    # Key profiles (Krumhansl-Kessler style weights), index 0 = tonic.
    maj_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
    min_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])

    avg_chroma = np.mean(chromagram, axis=1)
    centered_chroma = avg_chroma - avg_chroma.mean()

    def _transposition_scores(profile):
        # Correlation (up to the constant chroma norm) between the chroma
        # vector and the profile rotated to each candidate tonic. A circular
        # rotation is required here: np.correlate would slide the profile
        # linearly with zero padding, which does not model transposition.
        unit_profile = profile - profile.mean()
        unit_profile = unit_profile / np.linalg.norm(unit_profile)
        return np.array([np.dot(centered_chroma, np.roll(unit_profile, shift)) for shift in range(12)])

    scores = np.concatenate((_transposition_scores(maj_profile), _transposition_scores(min_profile)))
    # Indices 0-11 are major keys, 12-23 minor keys; modulo 12 gives the tonic.
    key_shift = int(np.argmax(scores)) % 12
    return np.roll(chromagram, -key_shift, axis=0)

def segment_by_beats(feature_array, beat_frames):
    """Split a 2D feature array into per-beat chunks along its column axis.

    The total number of columns is appended as a closing boundary, so the
    final chunk runs from the last beat frame to the end of the array.
    Columns before the first beat frame are not part of any chunk.
    """
    boundaries = np.append(beat_frames, feature_array.shape[1])
    chunks = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        chunks.append(feature_array[:, start:stop])
    return chunks

def map_labels_to_beats(df_labels, beat_times):
    """Map chorus labels to beat-synced data.

    Each pair of consecutive beat times defines one interval. An interval is
    labelled 1 when its start time falls inside a row of `df_labels` whose
    label is 'chorus', and 0 otherwise. Rows are applied in order, so a later
    row overrides an earlier one on overlapping intervals.

    Parameters
    ----------
    df_labels : pd.DataFrame
        Must provide 'start_time', 'end_time' and 'label' columns.
    beat_times : array-like
        Beat times, in the same unit as the start/end columns.

    Returns
    -------
    np.ndarray
        Float array of length max(len(beat_times) - 1, 0).
    """
    beat_times = np.asarray(beat_times, dtype=float)
    # Fewer than two beats define no interval; avoid np.zeros(-1) raising.
    labels = np.zeros(max(len(beat_times) - 1, 0))
    interval_starts = beat_times[:-1]

    # Plain column iteration is much cheaper than DataFrame.iterrows().
    for start, end, label in zip(df_labels['start_time'], df_labels['end_time'], df_labels['label']):
        inside = (interval_starts >= start) & (interval_starts < end)
        labels[inside] = 1 if label == 'chorus' else 0
    return labels

def save_compressed_pickle(file_path, data):
    """Pickle `data` and write it gzip-compressed to `file_path`.

    An existing file at `file_path` is overwritten.
    """
    with gzip.open(file_path, mode='wb') as archive:
        pickle.dump(data, archive)

In [70]:
# Ensure required directories exist
os.makedirs("../data/pkl/segments", exist_ok=True)
os.makedirs("../data/pkl/labels", exist_ok=True)

# Constants
df = pd.read_csv('../data/dataframes/clean_labeled.csv')
TARGET_SR = 12000 # Target sample rate chosen to be 1/4 of the original 48kHz.
HOP_LENGTH = 128  # Hop length for short-time Fourier transform. Hop length of 128 at 12kHz gives a similar frame rate to a hop length of 512 at 48kHz.

# Process each song in the dataset: extract frame-level features, cut them into
# beat-synchronous segments, and store one label per segment.
for song_id in tqdm(df['SongID'].unique(), desc="Processing..."):
    # Load the audio file (resampled to TARGET_SR)
    audio_path = f'../data/audio_files/processed/{song_id}.mp3'
    y, _ = librosa.load(audio_path, sr=TARGET_SR)

    # Harmonic-percussive source separation
    y_harm, y_perc = librosa.effects.hpss(y)

    # Compute onset envelope from the percussive component
    onset_env = librosa.onset.onset_strength(y=y_perc, sr=TARGET_SR, hop_length=HOP_LENGTH)

    # Beat tracking
    _, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH)
    beat_times = librosa.frames_to_time(beats, sr=TARGET_SR, hop_length=HOP_LENGTH)

    # Map labels to beats. segment_by_beats() yields one segment per beat (the
    # last one runs from the final beat to the end of the track), while
    # map_labels_to_beats() yields one label per interval between consecutive
    # times. Closing the final interval with the track duration keeps labels
    # and segments the same length, and also covers tracks with no beats.
    track_end = len(y) / TARGET_SR
    df_labels = df[df['SongID'] == song_id][['start_time', 'end_time', 'label']]
    labels = map_labels_to_beats(df_labels, np.append(beat_times, track_end))

    # Compute RMS energy from spectrogram to give a more accurate representation of energy over time because its frames can be windowed
    S = np.abs(librosa.stft(y, hop_length=HOP_LENGTH))
    rms = librosa.feature.rms(S=S)

    # Compute Mel Spectrogram and decompose into 4 components (4 chosen from EDA)
    mel = librosa.feature.melspectrogram(y=y, sr=TARGET_SR, n_mels=128, hop_length=HOP_LENGTH)
    mel_acts = librosa.decompose.decompose(mel, n_components=4, sort=True)[1]

    # Compute chromagram, make it key invariant, and decompose
    chromagram = librosa.feature.chroma_cqt(y=y_harm, sr=TARGET_SR, hop_length=HOP_LENGTH)
    chroma_ki = make_key_invariant(chromagram)
    chroma_acts = librosa.decompose.decompose(chroma_ki, n_components=3, sort=True)[1]

    # Compute tempogram, ensure non-negative (required by the decomposition), and decompose
    tempogram = np.clip(librosa.feature.tempogram(onset_envelope=onset_env, sr=TARGET_SR, hop_length=HOP_LENGTH), 0, None)
    tempogram_acts = librosa.decompose.decompose(tempogram, n_components=3, sort=True)[1]

    # Compute MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SR, n_mfcc=13, hop_length=HOP_LENGTH)

    # Standardize each feature row over time, stack all rows, and segment by beats
    features = [rms, mel_acts, chroma_acts, tempogram_acts, mfccs]
    standardized_features = [StandardScaler().fit_transform(feature.T).T for feature in features]
    concat_features = np.vstack(standardized_features)
    segments = segment_by_beats(concat_features, beats)

    # Every saved segment must have exactly one label
    if len(segments) != len(labels):
        raise ValueError(
            f"Song {song_id}: {len(segments)} beat segments but {len(labels)} labels"
        )

    # Save results with compression
    save_compressed_pickle(f"../data/pkl/segments/{song_id}_beats.pkl.gz", segments)
    save_compressed_pickle(f"../data/pkl/labels/{song_id}_labels.pkl.gz", labels)

Processing...: 100%|██████████| 332/332 [1:06:43<00:00, 12.06s/it]


In [72]:
import os
import gzip
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pickle
from tqdm.auto import tqdm

In [73]:
# Per-song containers, keyed by song id, for beat segments and beat labels
beat_segments, beat_labels = {}, {}

# Directories holding the gzip-compressed pickles (one file per song)
segments_base_path = "../data/pkl/segments/"
labels_base_path = "../data/pkl/labels/"

for song_id in tqdm(df['SongID'].unique(), desc="Unloading data...beep boop I am a robot"):
    # Resolve this song's two input files
    segments_path = os.path.join(segments_base_path, f"{song_id}_beats.pkl.gz")
    labels_path = os.path.join(labels_base_path, f"{song_id}_labels.pkl.gz")

    # Read segments first, then labels, into their respective dictionaries
    for source_path, target in ((segments_path, beat_segments), (labels_path, beat_labels)):
        with gzip.open(source_path, 'rb') as f:
            target[song_id] = pickle.load(f)

# Flatten to parallel lists; both dictionaries share the same insertion order
X = [*beat_segments.values()]
y = [*beat_labels.values()]

# 70/15/15 train/val/test: hold out 30%, then halve the hold-out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

Unloading data...beep boop I am a robot:   0%|          | 0/332 [00:00<?, ?it/s]

### In progress...

In [None]:
def create_batches_and_pad(sequences, labels, batch_size, padding_value=-1, add_pos_encoding=False):
    """Create and pad batches of sequences and labels.

    `sequences` and `labels` are sliced into consecutive chunks of at most
    `batch_size` items. Within each chunk, sequences and labels are
    post-padded with `padding_value`.

    Parameters
    ----------
    sequences : sequence
        One entry per example.
    labels : sequence
        One entry per example, aligned with `sequences`.
    batch_size : int
        Maximum number of examples per batch.
    padding_value : scalar, default -1
        Value written into padded positions.
    add_pos_encoding : bool, default False
        When True, the padded sequences are passed through
        `add_positional_encoding`.

    Returns
    -------
    list of (padded_sequences, padded_labels) tuples, one per batch.
    """
    # NOTE(review): assumes `pad_sequences` is the Keras utility and that
    # `add_positional_encoding` is defined elsewhere -- neither is imported or
    # defined in the visible cells; confirm.
    padded_batches = []
    for i in range(0, len(sequences), batch_size):
        batch_sequences = sequences[i:i + batch_size]
        batch_labels = labels[i:i + batch_size]

        # Pad sequences and labels. Keras' pad_sequences defaults to
        # dtype='int32', which would truncate real-valued features, so the
        # feature sequences are padded as float32 explicitly.
        padded_sequences = pad_sequences(batch_sequences, padding='post', value=padding_value, dtype='float32')
        padded_labels = pad_sequences(batch_labels, padding='post', value=padding_value)

        if add_pos_encoding:
            padded_sequences = add_positional_encoding(padded_sequences)

        padded_batches.append((padded_sequences, padded_labels))

    return padded_batches


def custom_binary_crossentropy(y_true, y_pred):
    """Custom binary cross-entropy loss to handle -1 labels, which are used for padding and should be ignored during loss calculation.

    The element-wise cross-entropy is multiplied by a 0/1 mask that is 0
    wherever `y_true` equals -1, then averaged over the unmasked positions.
    Returns 0 (instead of NaN) when every position is padding.
    """
    y_true = tf.cast(y_true, tf.float32)
    bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    loss = bce * mask
    # divide_no_nan guards the 0/0 case of a batch made up entirely of padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

def custom_accuracy(y_true, y_pred):
    """Custom accuracy metric to handle -1 labels, which are used for padding and should be ignored during accuracy calculation.

    Predictions are rounded to 0/1 and compared with `y_true`. Positions where
    `y_true` equals -1 are excluded from both the numerator and the
    denominator. Returns 0 (instead of NaN) when every position is padding.
    """
    # Cast like the loss does, so integer label tensors compare cleanly
    # against the float predictions.
    y_true = tf.cast(y_true, tf.float32)
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
    correct_predictions = tf.equal(tf.cast(tf.round(y_pred), tf.float32), y_true)
    masked_correct_predictions = tf.cast(correct_predictions, tf.float32) * mask
    # divide_no_nan guards the 0/0 case of a batch made up entirely of padding.
    accuracy = tf.math.divide_no_nan(tf.reduce_sum(masked_correct_predictions), tf.reduce_sum(mask))
    return accuracy


def compile_model(max_frames, max_freq_bins, n_features, custom_binary_crossentropy, custom_accuracy, max_beats):
    """
    Build and compile a convolutional-recurrent model that emits one sigmoid
    output per beat.

    A per-beat CNN (three Conv2D -> MaxPooling2D -> Dropout stages followed by
    Flatten) is applied to every beat through TimeDistributed. Its outputs go
    through a Masking layer and a bidirectional LSTM, and a TimeDistributed
    Dense(1, sigmoid) produces the per-beat prediction. The model is compiled
    with the Adam optimizer, the given loss, and the given metric.

    Parameters: `max_frames`, `max_freq_bins`, `n_features` define the shape of
    one beat's input; `max_beats` is the sequence length; the two callables are
    used as loss and metric.
    """
    # NOTE(review): with input shape (max_frames, max_freq_bins, n_features) the
    # first kernel (max_freq_bins, 3) spans max_freq_bins steps along the first
    # (frames) axis and 3 along the second -- confirm this orientation is intended.
    stage_specs = (
        (128, (max_freq_bins, 3)),
        (256, (1, 3)),
        (256, (1, 3)),
    )

    frame_input = layers.Input(shape=(max_frames, max_freq_bins, n_features))
    x = frame_input
    for n_filters, kernel in stage_specs:
        x = layers.Conv2D(filters=n_filters, kernel_size=kernel, activation='relu', padding='same')(x)
        x = layers.MaxPooling2D(pool_size=(4, 4), padding='same')(x)
        x = layers.Dropout(0.5)(x)  # 50% dropout after each pooling stage
    frame_features = layers.Flatten()(x)
    frame_feature_model = models.Model(inputs=frame_input, outputs=frame_features)

    # Sequence model over beats
    measure_input = layers.Input(shape=(max_beats, max_frames, max_freq_bins, n_features))
    per_beat_features = layers.TimeDistributed(frame_feature_model)(measure_input)
    # NOTE(review): the mask is computed on the CNN outputs, not on the raw
    # padded input -- verify that padded beats really yield -1 at this point.
    masked_features = layers.Masking(mask_value=-1)(per_beat_features)
    recurrent_out = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(masked_features)
    output = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(recurrent_out)

    model = models.Model(inputs=measure_input, outputs=output)
    model.compile(optimizer='adam', loss=custom_binary_crossentropy, metrics=[custom_accuracy])
    return model

In [None]:
# 5-fold cross-validation with stratified splitting, batching, and positional encoding
def get_positional_encoding(max_len, d_model):
    """Generates sinusoidal positional encodings.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    d_model : int
        Encoding width (columns).

    Returns
    -------
    np.ndarray of shape (max_len, d_model)
        The sine terms occupy the leading columns and the cosine terms the
        trailing ones (concatenated, not interleaved). For odd `d_model` there
        is one more sine column than cosine columns, so the width is always
        exactly `d_model`.
    """
    n_sin = (d_model + 1) // 2  # ceil(d_model / 2)
    n_cos = d_model // 2        # floor(d_model / 2)

    pos = np.arange(max_len)[:, np.newaxis]
    i = np.arange(n_sin)[np.newaxis, :]
    angles = pos / np.power(10000, (2 * i) / d_model)

    sin_enc = np.sin(angles)
    cos_enc = np.cos(angles[:, :n_cos])
    return np.concatenate([sin_enc, cos_enc], axis=-1)

def apply_positional_encoding(features, pos_encoding):
    """Add the leading rows of `pos_encoding` to `features`.

    As many encoding rows are used as `features` has entries along axis 1; the
    sum relies on broadcasting over any leading axes of `features`.
    """
    seq_len = features.shape[1]
    encoding_slice = pos_encoding[:seq_len, :]
    return features + encoding_slice


def _song_chorus_fraction(labels):
    """Fraction of non-padding (!= -1) entries in `labels` that equal 1."""
    labels = np.asarray(labels)
    valid = labels != -1
    n_valid = np.count_nonzero(valid)
    if n_valid == 0:
        return 0.0
    return np.count_nonzero(labels[valid] == 1) / n_valid


def run_cross_validation(n_splits=5, batch_size=32, feat_dim=128):
    """Runs cross-validation with stratified splitting, batching, and positional encoding.

    Songs are split into `n_splits` folds. StratifiedKFold needs exactly one
    target per sample (here: per song), so each song is reduced to a binary
    label: whether its share of chorus beats is above the median across songs.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    batch_size : int
        Number of songs per batch handed to `create_batches`.
    feat_dim : int
        Forwarded to `get_batch_segments`.

    Returns
    -------
    float
        Mean validation score over all folds (also printed).
    """
    # NOTE(review): relies on `padded_beat_segments`, `padded_beat_labels`,
    # `get_batch_segments`, `get_batch_labels` and `create_batches`, none of
    # which are defined in the visible cells -- confirm they exist before running.
    song_ids = df['SongID'].unique()

    # One stratification label per song. Concatenating the per-beat labels
    # would produce an array whose length differs from len(song_ids), which
    # StratifiedKFold.split rejects.
    chorus_fractions = np.array([_song_chorus_fraction(padded_beat_labels[song_id]) for song_id in song_ids])
    strat_labels = (chorus_fractions > np.median(chorus_fractions)).astype(int)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(song_ids, strat_labels), start=1):
        print(f"Fold {fold}/{n_splits}")

        train_ids, val_ids = song_ids[train_idx], song_ids[val_idx]
        train_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim),
                          get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(train_ids, batch_size)]
        val_batches = [(get_batch_segments(ids, padded_beat_segments, padded_beat_labels, feat_dim),
                        get_batch_labels(ids, padded_beat_labels)) for ids in create_batches(val_ids, batch_size)]

        # Train and evaluate model
        model = train_model(train_batches)
        val_score = evaluate_model(model, val_batches)
        fold_scores.append(val_score)

    avg_score = np.mean(fold_scores)
    print(f"Average Cross-Validation Score: {avg_score}")
    return avg_score

def train_model(train_batches):
    """Fit the model with checkpointing, early stopping and LR scheduling.

    Creates the checkpoint and model directories if needed, trains for up to
    10 epochs, and saves the best model (by validation custom accuracy) to
    '../models/CRNN/best_model.h5'.

    Parameters
    ----------
    train_batches : list
        Currently unused -- see the review note below.

    Returns
    -------
    The fitted model, so callers can pass it on to evaluation.
    """
    # NOTE(review): `model`, `train_dataset` and `val_dataset` are read from the
    # enclosing (global) scope and are not defined in the visible cells, while
    # the `train_batches` argument is ignored -- confirm the intended data flow.

    # Define the directories for checkpoints and models
    checkpoint_dir = os.path.join('..', 'checkpoints', 'CRNN')
    model_dir = os.path.join('..', 'models', 'CRNN')

    # Ensure the checkpoint and model directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Define the checkpoint path for the best model
    best_model_filepath = os.path.join(model_dir, 'best_model.h5')

    callbacks = [
        ModelCheckpoint(
            filepath=best_model_filepath,
            save_weights_only=False,
            monitor='val_custom_accuracy',  # Use 'val_custom_accuracy' for validation custom accuracy
            mode='max',
            save_best_only=True,
            verbose=1
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=3,
            verbose=1,
            restore_best_weights=True
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=2,
            verbose=1,
            min_delta=0.0001,
            min_lr=0.00001
        )
    ]

    # Train the model with the callbacks list
    model.fit(
        train_dataset,
        epochs=10,
        validation_data=val_dataset,
        callbacks=callbacks
    )

    # The caller assigns this function's result and evaluates it, so hand the
    # trained model back instead of implicitly returning None.
    return model

def evaluate_model(model, val_batches):
    """Placeholder for evaluating `model` on `val_batches`.

    Not implemented yet: the body is a bare `pass`, so the function currently
    returns None.
    """
    # TODO: Implement model evaluation here
    pass