# Musical Note Classifier
Dongim Lee, Sally Lee, Zara Coakley

### A CNN model that can detect the pitch and length of a single note on a staff

This notebook contains a walkthrough of using our code to generate note data, augment the data, and train our CNN model on that data.

## Data Generation

We used Music21 to generate our note data. To use this part of the code you will need to download LilyPond: https://lilypond.org/doc/v2.23/Documentation/web/download

This code generates 1,785 images of notes ranging from A3 to C6. The possible lengths are whole, half, quarter, eighth, sixteenth. To add variation into the model, it also generates distracting symbols, including sharps, flats, and dots. Notably, our model itself doesn't take these symbols into account. It does not differentiate between G, G sharp, and G flat but instead calls them all G. It also doesn't take into account dotted notes when classifying length.

In [None]:
"""Generate single music notes"""

import os
from music21 import note, stream
from PIL import Image


def generate_note_image_with_variations(
    pitch, name, note_length, filename, variations, beams=False
):
    """
    Render one note (or a beamed pair of identical notes) to an image under ``temp/``.
    No cropping happens here; the full rendered image is written as-is.
    :param pitch: Pitch of the note (e.g., 'C4', 'D#5')
    :param name: Name of the note length (e.g., 'quarter', 'eighth', '16th'); used to
        decide whether beaming applies and as the beam level
    :param note_length: Duration of the note, passed to music21 as ``quarterLength``
    :param filename: Output file name (joined onto the ``temp`` directory)
    :param variations: Dict of variations; only the 'accidental' and 'articulation'
        keys are read, and falsy values are ignored
    :param beams: When true and ``name`` is 'eighth' or '16th', render two beamed notes
    """
    score = stream.Stream()
    beamed = beams and name in ("eighth", "16th")

    if beamed:
        # Two identical notes so that a beam can be drawn between them.
        first = note.Note(pitch, quarterLength=note_length)
        second = note.Note(pitch, quarterLength=note_length)
        first.beams.fill(name, type="start")
        second.beams.fill(name, type="stop")
    else:
        first = note.Note(pitch)
        first.quarterLength = note_length

    # Variations are only ever applied to the first (or only) note.
    accidental = variations.get("accidental")
    if accidental:
        first.pitch.accidental = accidental
    articulation = variations.get("articulation")
    if articulation:
        first.articulations.append(articulation)

    if beamed:
        score.append([first, second])
    else:
        score.append(first)

    # Save to PNG
    score.write("lily.png", fp=os.path.join("temp", filename))

    # Crop the image to focus on the note
    # crop_note(f"{temp_filename}.png", os.path.join("data", f"{filename}.png"))


def crop_note(input_filename, output_filename):
    """
    Cut a fixed rectangle out of a rendered note image and save it.
    :param input_filename: Full-sized image filename
    :param output_filename: Cropped image filename
    """
    # (left, top, right, bottom) in pixels; adjust based on staff size.
    crop_box = (120, 15, 150, 70)
    with Image.open(input_filename) as source:
        source.crop(crop_box).save(output_filename)
        print(f"Cropped and saved {output_filename}")


if __name__ == "__main__":
    # Rendered images are written into ./temp.
    os.makedirs("temp", exist_ok=True)
    # os.makedirs("data", exist_ok=True)

    # Natural pitches from A3 up to C6, in ascending order.
    pitches = "A3 B3 C4 D4 E4 F4 G4 A4 B4 C5 D5 E5 F5 G5 A5 B5 C6".split()
    # Note-length name -> duration handed to the generator as a quarter length.
    lengths = {"whole": 4.0, "half": 2.0, "quarter": 1.0, "eighth": 0.5, "16th": 0.25}

    # Define all variations
    articulation_variations = [None]
    accidental_variations = [None, "sharp", "flat"]
    is_beams = [True, False]  # kept for reference; the loop below does not read it

    # One image per (pitch, length, articulation, accidental); eighth and 16th
    # notes additionally get a beamed rendering right after the unbeamed one.
    for pitch in pitches:
        for length, length_num in lengths.items():
            beam_options = (False, True) if length in ("eighth", "16th") else (False,)
            variation_id = 0  # restarts for every (pitch, length) pair
            for art in articulation_variations:
                for acc in accidental_variations:
                    variations = {"articulation": art, "accidental": acc}
                    for use_beams in beam_options:
                        filename = f"note_{pitch}_{length}_variation_{variation_id}"
                        generate_note_image_with_variations(
                            pitch,
                            length,
                            length_num,
                            filename,
                            variations,
                            beams=use_beams,
                        )
                        variation_id += 1

## Data Augmentation

We then augmented the generated note data to add more variation to our dataset. If you generated raw data using the code above, use these functions to augment the data. If you're just using the dataset from the github/huggingface (https://huggingface.co/dongim04/musical-note-classifier), you don't need to do this.

Augmentations:
- random cropping
- rotation by up to three degrees
- shearing
- zooming
- flipping horizontally

In [None]:
"""Augment original data by cropping in different places"""

import os
from PIL import Image
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
from tqdm import tqdm


def crop_note_with_variation(input_filename, output_filename, crop_id=0):
    """
    Crop a 40x40 pixel window out of a rendered note image and save it.
    :param input_filename: Full-sized image filename
    :param output_filename: Cropped image filename
    :param crop_id: Selects the window position: 0 = base window, 1 = shifted up,
        2 = shifted down, 3 = shifted left, anything else = shifted right
    """
    # crop_id -> (left, top, right, bottom); adjust based on staff size.
    crop_boxes = {
        0: (115, 10, 155, 50),  # base
        1: (115, 5, 155, 45),  # up
        2: (115, 15, 155, 55),  # down
        3: (110, 10, 150, 50),  # left
    }
    shifted_right = (120, 10, 160, 50)
    crop_box = crop_boxes.get(crop_id, shifted_right)

    with Image.open(input_filename) as source:
        source.crop(crop_box).save(output_filename)
        print(f"Cropped and saved {output_filename}")

def _parse_note_labels(file_name):
    """
    Extract the pitch and length labels from a note image file name.
    :param file_name: File name shaped like ``note_<pitch>_<length>_variation_<...>.png``
    :return: ``(pitch, length)``, or ``None`` when the name does not follow that pattern
    """
    parts = file_name.split("_")
    # Require a field after <length> so the length is fully delimited by "_".
    if len(parts) < 4 or parts[0] != "note":
        return None
    return parts[1], parts[2]


def augment_data(input_foldername, output_foldername):
    """
    Load the note images in a folder, write randomly augmented copies of each one
    to another folder, and return the combined dataset.
    Every image is converted to grayscale, resized to 64x64 and scaled by 1/255.
    Labels are read from the file names (``note_<pitch>_<length>_...``) and one-hot
    encoded over the combined "<pitch><length>" class. Files whose name does not
    match the pattern, or that cannot be opened as images, are reported and skipped.
    :param input_foldername: Folder containing the source note images
    :param output_foldername: Folder the augmented PNG files are written to
        (created if missing)
    :return: Tuple ``(all_images, all_labels)``: the original images followed by the
        augmented ones, and their one-hot labels in the same order
    :raises ValueError: If no image could be loaded from ``input_foldername``
    """
    augmentations_per_image = 30

    images = []
    pitch_labels = []
    length_labels = []
    stems = []  # file name without extension, reused as prefix for augmented files

    # Sorted so the dataset order does not depend on the directory listing order.
    for file_name in tqdm(sorted(os.listdir(input_foldername)), desc="Processing images"):
        parsed = _parse_note_labels(file_name)
        if parsed is None:
            print(f"Skipping {file_name}: name does not match note_<pitch>_<length>_...")
            continue

        path = os.path.join(input_foldername, file_name)
        try:
            with Image.open(path) as image:
                # 'L' for grayscale
                image_array = np.array(image.convert("L").resize((64, 64))) / 255.0
        except Exception as e:  # best effort: report the file and keep going
            print(f"Error loading image {path}: {e}")
            continue

        # Record image and labels together so a skipped file can never shift
        # the labels relative to the images.
        images.append(image_array)
        pitch_labels.append(parsed[0])
        length_labels.append(parsed[1])
        stems.append(os.path.splitext(file_name)[0])

    if not images:
        raise ValueError(f"No note images could be loaded from {input_foldername}")

    images = np.expand_dims(np.array(images), axis=-1)  # Shape: (n, 64, 64, 1)

    num_pitch_classes = len(set(pitch_labels))  # (A3 to C6)
    num_length_classes = len(set(length_labels))  # (whole, half, quarter, eighth, 16th)
    num_classes = num_pitch_classes * num_length_classes

    combined_labels = [p + l for p, l in zip(pitch_labels, length_labels)]
    encoder = LabelEncoder()
    combined_labels_encoded = encoder.fit_transform(combined_labels)

    labels = to_categorical(combined_labels_encoded, num_classes=num_classes)  # Shape: (n, num_classes)

    os.makedirs(output_foldername, exist_ok=True)

    datagen = ImageDataGenerator(
        rotation_range=3,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode="nearest",
    )

    augmented_images = []
    augmented_labels = []

    # Generate augmented images
    for image, label, stem in zip(images, labels, stems):
        flow = datagen.flow(
            np.expand_dims(image, axis=0),  # Shape: (1, 64, 64, 1)
            batch_size=1,
            save_to_dir=output_foldername,
            save_prefix=stem,
            save_format="png",
        )
        # The flow never ends on its own, so stop after a fixed number of
        # batches counted for this image only.
        for produced, batch in enumerate(flow, start=1):
            augmented_images.append(batch[0])
            augmented_labels.append(label)
            if produced >= augmentations_per_image:
                break

    all_images = np.concatenate((images, np.array(augmented_images)), axis=0)
    all_labels = np.concatenate((labels, np.array(augmented_labels)), axis=0)
    return all_images, all_labels

## Combined Pitch and Length Model

Next, we defined our model architecture and parameters and trained our model on the dataset.

 If you download the dataset from the github/huggingface you can start running the code here. You don't have to do any of the augmentation.

First, import necessary libraries.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, Input, Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import os
from PIL import Image
from tqdm import tqdm  # for progress bar

Split the data into training (70%), validation (15%), and test (15%) sets.

In [None]:
# 70% of the samples go to training; the remaining 30% are held out.
train_images, holdout_images, train_labels, holdout_labels = train_test_split(
    all_images,
    all_labels,
    test_size=0.3,
    random_state=42,
)

# Halve the held-out samples into a validation set and a test set.
val_images, test_images, val_labels, test_labels = train_test_split(
    holdout_images,
    holdout_labels,
    test_size=0.5,
    random_state=42,
)

Define the model architecture.

In [None]:
def create_advanced_cnn_model_with_l2(input_shape, num_classes):
    """
    Build the (uncompiled) CNN classifier.
    Layout: three double-convolution blocks (32, 64, 128 filters), a
    squeeze-and-excitation style channel gate, one single-convolution block
    (256 filters), global average pooling, two dense layers (256, 128) and a
    softmax output. Every Conv2D and Dense kernel carries an L2 penalty of 0.001.
    :param input_shape: Shape of one input image, e.g. (64, 64, 1)
    :param num_classes: Number of units in the softmax output layer
    :return: The Keras model mapping images to class probabilities
    """
    weight_decay = 0.001

    def conv_bn(tensor, filters):
        # 3x3 same-padded ReLU convolution followed by batch normalization.
        tensor = layers.Conv2D(
            filters,
            (3, 3),
            activation='relu',
            padding='same',
            kernel_regularizer=l2(weight_decay),
        )(tensor)
        return layers.BatchNormalization()(tensor)

    def pool_and_drop(tensor):
        # 2x2 max pooling followed by 25% dropout closes every conv block.
        tensor = layers.MaxPooling2D((2, 2))(tensor)
        return layers.Dropout(0.25)(tensor)

    def dense(tensor, units, activation, **extra):
        return layers.Dense(
            units,
            activation=activation,
            kernel_regularizer=l2(weight_decay),
            **extra,
        )(tensor)

    inputs = layers.Input(shape=input_shape)

    # Blocks 1-3: two conv+BN stages each, then pooling and dropout.
    x = inputs
    for filters in (32, 64, 128):
        x = conv_bn(x, filters)
        x = conv_bn(x, filters)
        x = pool_and_drop(x)

    # Attention Mechanism (SE Block): per-channel weights from pooled features.
    se = layers.GlobalAveragePooling2D()(x)
    se = dense(se, 128 // 16, 'relu')  # Bottleneck
    se = dense(se, 128, 'sigmoid')
    x = layers.multiply([x, se])  # Scale the feature maps

    # Block 4: a single conv+BN stage.
    x = pool_and_drop(conv_bn(x, 256))

    # Global Average Pooling and Fully Connected Layers
    x = layers.GlobalAveragePooling2D()(x)
    for units in (256, 128):
        x = dense(x, units, 'relu')
        x = layers.Dropout(0.5)(x)

    # Output
    outputs = dense(x, num_classes, 'softmax', name='output')

    return models.Model(inputs, outputs)

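Callbacks: early stopping to prevent overfitting, and a learning-rate scheduler that halves the learning rate when the validation loss plateaus. (L2 regularization is applied inside the model's layers rather than as a callback.)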

In [None]:
# Stop training once the validation loss has gone 10 epochs without improving,
# and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

# Multiply the learning rate by 0.5 whenever the validation loss has gone
# 5 epochs without improving (half the early-stopping patience).
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1
)

Create the model.

In [None]:
# Images are 64x64 with a single grayscale channel.
input_shape = (64, 64, 1)
# `num_classes` is only a local variable inside augment_data, so it is not
# available at notebook scope; take it from the width of the one-hot labels.
num_classes = train_labels.shape[1]
model = create_advanced_cnn_model_with_l2(input_shape, num_classes)
# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Train the model.

In [None]:
# `datagen` is only a local variable inside augment_data, so it is not available
# at notebook scope; build it here with the same settings so this cell can run.
datagen = ImageDataGenerator(
    rotation_range=3,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest",
)

# Training batches are augmented on the fly; validation data is passed
# unmodified as a plain (images, labels) tuple.
train_generator = datagen.flow(train_images, train_labels, batch_size=32)
val_generator = (val_images, val_labels)

# 500 epochs is an upper bound; early stopping normally ends training sooner.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=500,
    callbacks=[early_stopping, lr_scheduler],
)

Plot losses.

In [None]:
import matplotlib.pyplot as plt

# Per-epoch losses recorded by model.fit.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Number epochs from 1 for the x-axis.
epochs = range(1, len(train_loss) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
for series, series_label in (
    (train_loss, 'Training Loss'),
    (val_loss, 'Validation Loss'),
):
    ax.plot(epochs, series, label=series_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
ax.grid(True)
plt.show()

Evaluate the model.

In [None]:
# Score the trained model on the held-out test split; evaluate returns the loss
# followed by the metrics given to compile (here: accuracy).
test_loss, test_accuracy = model.evaluate(test_images, test_labels, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Calculate metrics. If you would like to actually see the metrics from our model, visit our website! https://musical-note-classifier.onrender.com/ (It might take 15 sec or so to load)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Turn the softmax outputs and the one-hot targets into class indices.
predicted_probs = model.predict(test_images)
predicted_classes = predicted_probs.argmax(axis=1)
true_classes = test_labels.argmax(axis=1)

# test_loss / test_accuracy are the values computed by the evaluation cell.
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

# Per-class precision, recall and F1.
print(
    "\nClassification Report:",
    classification_report(true_classes, predicted_classes, digits=4),
    sep="\n",
)

# Overall accuracy recomputed from the predicted class indices.
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"Accuracy: {accuracy}")


# Import the Model

If you would like to import the model instead of training it yourself, use the code below.

First, download the model and encoder from our github and upload them here.

Load the model and encoder.

In [None]:
import tensorflow as tf
import pickle

# Load the trained Keras model from the downloaded .h5 file
# (replace the placeholder path with the real location).
model = tf.keras.models.load_model('path/to/the/model_weights.h5')

# Restore the pickled label encoder; it is used below to map predicted class
# indices back to label strings via encoder.inverse_transform.
# NOTE(review): pickle.load can execute arbitrary code while loading. Only open
# an encoder.pkl that comes from a source you trust.
with open('path/to/the/encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)

Run the model on your images.

In [None]:
import os
import numpy as np
from PIL import Image

examples_folder = 'folder containing the images you want to test the model on'

# Sorted so the output order is stable between runs.
for file_name in sorted(os.listdir(examples_folder)):
    file_path = os.path.join(examples_folder, file_name)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories

    # Same preprocessing as training: grayscale, 64x64, scaled by 1/255.
    # The context manager closes the file handle; unreadable or non-image
    # files are reported and skipped instead of aborting the whole loop.
    try:
        with Image.open(file_path) as image:
            image_array = np.array(image.convert('L').resize((64, 64))) / 255.0
    except OSError as e:
        print(f"Error loading image {file_path}: {e}")
        continue

    # Add batch and channel axes: (64, 64) -> (1, 64, 64, 1)
    image_array = image_array[np.newaxis, ..., np.newaxis]

    predictions = model.predict(image_array)
    predicted_class_index = int(np.argmax(predictions[0]))
    predicted_label = encoder.inverse_transform([predicted_class_index])
    confidence = predictions[0][predicted_class_index]

    # The file name without its extension serves as the expected label.
    print(f"Actual label: {os.path.splitext(file_name)[0]}")
    print(f"Predicted label: {predicted_label[0]}")
    print(f"Confidence: {confidence:.2f}")
    print('-' * 40)

# Generate a summary of the model

Important characteristics:
- 696,255 parameters
- Dropout layers (25% after each convolutional block, 50% after each dense layer)
- ReLU activation
- Batch normalization
- Max pooling
- L2 regularization
- Early stopping and learning rate scheduler

In [None]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()