In [1]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from google.colab import drive

# Mount Google Drive so the dataset archive stored there is reachable from the Colab VM.
drive.mount('/content/drive')

# Location of the MIDI dataset archive on Drive and the local folder it is extracted into.
zip_path = '/content/drive/My Drive/AAI511_ML/midi_classic_music.zip'
extract_path = '/content/midi_classic_music/'

# Unzip the archive: -q = quiet, -n = never overwrite files that already exist
# (so re-running the cell is harmless), -d = destination directory.
!unzip -q -n "{zip_path}" -d "{extract_path}"

Mounted at /content/drive


# Keep only the composers mentioned in the final project

Please make predictions only for the composers listed below; select the required composers from the dataset given above.

1-Bach

2-Beethoven

3-Chopin

4-Mozart

In [2]:
import shutil

# Folder names (exact, case-sensitive) of the composers to keep.
composers = ['Bach', 'Beethoven', 'Chopin', 'Mozart']


def _remove_loose_midi_files(directory, label):
    """Delete MIDI files that sit directly in `directory` (subfolders are not touched).

    `label` is only used to build the progress message.
    """
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        # Case-insensitive so '.mid', '.MID' and mixed-case variants are all caught.
        if os.path.isfile(item_path) and item.lower().endswith('.mid'):
            os.remove(item_path)
            print(f"Remove file at {label}: {item}")


def _contains_composer_dir(directory):
    """Return True if any folder somewhere below `directory` is named after a kept composer."""
    return any(
        name in composers
        for _, subdirs, _ in os.walk(directory)
        for name in subdirs
    )


# Loose MIDI files at these two levels do not belong to any composer folder.
_remove_loose_midi_files(extract_path, 'root')

midi_classic_path = os.path.join(extract_path, 'midiclassics')
_remove_loose_midi_files(midi_classic_path, 'midi_classic_path')

# Remove every folder that is neither a kept composer folder nor an ancestor of one.
# The walk is top-down and never descends into a composer folder, so subfolders
# *inside* a kept composer (e.g. per-collection folders) are preserved. A bottom-up
# walk that tests each folder name in isolation would delete them.
for root, dirs, _files in os.walk(extract_path, topdown=True):
    dirs_to_visit = []
    for dir_name in dirs:
        if dir_name in composers:
            continue  # Keep this folder and everything below it.

        full_path = os.path.join(root, dir_name)
        if _contains_composer_dir(full_path):
            # An ancestor of a composer folder: keep it, but prune inside it.
            dirs_to_visit.append(dir_name)
        else:
            shutil.rmtree(full_path)
            print(f"Removed: {full_path}")

    # Editing `dirs` in place tells os.walk which subfolders to visit next.
    dirs[:] = dirs_to_visit

Remove file at root: Tchaikovsky Lake Of The Swans Act 2 14mov.mid
Remove file at root: Tchaicovsky Waltz of the Flowers.MID
Remove file at root: Sibelius Kuolema Vals op44.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 2mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 7-8movs.mid
Remove file at root: Rothchlid Symphony Rmw12 3mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 2 12mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 2 10mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 5mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 2 13mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 6mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 2 11mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 1mov.mid
Remove file at root: Tchaikovsky Lake Of The Swans Act 1 3mov.mid
Remove file at root: Wagner Ride of the valkyries.mid
Remove file at root: Rothchild Symphony Rmw12 2

# Pre-processing

Convert the musical scores into a format suitable for deep learning models. This involves converting the musical scores into MIDI files and applying data augmentation techniques.

In [4]:
# Install the MIDI libraries imported by the preprocessing cell below.
!pip install music21
!pip install mido

Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido
Successfully installed mido-1.3.3


In [7]:
import numpy as np
import os
from collections import Counter
from pathlib import Path
import music21 # Still needed for pitch conversion
import mido    # The fast MIDI parser
from multiprocessing import Pool, cpu_count

# --- Function 1: Get Song Paths ---
def get_songs_by_composer(root_dir):
    """Map each composer folder under `root_dir` to the MIDI files it contains.

    Args:
        root_dir: Directory whose immediate subfolders are named after composers.

    Returns:
        dict mapping the lower-cased folder name to a sorted list of `Path`
        objects, one per `.mid` / `.midi` file found at any depth below that
        folder. A folder without MIDI files maps to an empty list. Files that
        sit directly in `root_dir` are ignored.
    """
    # Compare suffixes case-insensitively: a plain '*.mid*' glob is case-sensitive
    # on Linux and would silently skip files named '*.MID'.
    midi_suffixes = {'.mid', '.midi'}
    songs_dict = {}
    for composer_path in Path(root_dir).iterdir():
        if not composer_path.is_dir():
            continue
        # Sorted so the file order (and everything derived from it) is reproducible.
        songs_dict[composer_path.name.lower()] = sorted(
            path
            for path in composer_path.rglob('*')
            if path.is_file() and path.suffix.lower() in midi_suffixes
        )
    return songs_dict

# --- Function 2: mido-based note/chord parser ---
def parse_notes_from_file_mido(file_path):
    """Turn one MIDI file into a list of note/chord tokens.

    All tracks are merged into a single message stream. Sounding `note_on`
    messages (velocity > 0) are collected until a message with a positive
    `time` arrives; the collected pitches are then sorted by name and joined
    with '.' into one token (a single note yields a token without dots).

    Args:
        file_path: Path of the MIDI file to read.

    Returns:
        List of token strings such as 'C4' or 'C4.E4.G4'. If anything raises
        while reading or converting, the error is printed and whatever was
        collected up to that point (possibly nothing) is returned.
    """
    tokens = []
    try:
        midi = mido.MidiFile(file_path)
        pending = []  # pitch names gathered since the last positive-time message
        for message in mido.merge_tracks(midi.tracks):
            # A positive time closes the group gathered so far.
            if message.time > 0 and pending:
                tokens.append('.'.join(sorted(pending)))
                pending = []
            # note_on with velocity 0 is skipped; only real note starts are kept.
            if message.type == 'note_on' and message.velocity > 0:
                pending.append(music21.pitch.Pitch(midi=message.note).nameWithOctave)
        # Emit whatever is still pending at the end of the stream.
        if pending:
            tokens.append('.'.join(sorted(pending)))
    except Exception as e:
        print(f"  - Could not parse {file_path} with mido: {e}")
    return tokens

if __name__ == '__main__':
    MIDI_ROOT_DIR = '/content/midi_classic_music/midiclassics/'

    # ---- Step 1: parse every file in worker processes and build the vocabulary ----
    print(f"Step 1: Building vocabulary in parallel using {cpu_count()} cores...")
    songs_dict = get_songs_by_composer(MIDI_ROOT_DIR)

    # One flat list of paths, composer by composer.
    all_files = []
    for files_list in songs_dict.values():
        all_files.extend(files_list)

    # pool.map returns results in input order, so all_note_lists[k] belongs to all_files[k].
    with Pool() as pool:
        all_note_lists = pool.map(parse_notes_from_file_mido, all_files)

    # Every token from every file, concatenated.
    all_notes = []
    for sublist in all_note_lists:
        all_notes.extend(sublist)

    # Vocabulary: each distinct token gets an integer id by its rank in sorted order.
    note_counts = Counter(all_notes)
    n_vocab = len(note_counts)
    pitch_names = sorted(note_counts)
    note_to_int = dict(zip(pitch_names, range(n_vocab)))
    print(f"\nVocabulary Size (unique notes/chords): {n_vocab}")

    # ---- Step 2: sliding windows of token ids, each labelled with its composer id ----
    print("\nStep 2: Creating sequences and labeling with composer IDs...")
    sequence_length = 100
    network_input = []
    network_output = []

    # Reuse the tokens parsed in step 1 instead of re-reading the files.
    file_to_notes_map = dict(zip(all_files, all_note_lists))
    # Composer ids follow the alphabetical order of the (lower-cased) folder names.
    composer_map = {name: idx for idx, name in enumerate(sorted(songs_dict))}

    for composer_name, song_files in songs_dict.items():
        composer_id = composer_map[composer_name]
        for song_file in song_files:
            notes_in_song = file_to_notes_map[song_file]
            # Songs with at most `sequence_length` tokens contribute no windows.
            if len(notes_in_song) <= sequence_length:
                continue
            for i in range(len(notes_in_song) - sequence_length):
                sequence_in = notes_in_song[i : i + sequence_length]
                network_input.append([note_to_int[note] for note in sequence_in])
                network_output.append(composer_id)

    # ---- Step 3: shape to (n_patterns, sequence_length, 1) and scale ids by n_vocab ----
    print("\nStep 3: Reshaping and normalizing data...")
    n_patterns = len(network_input)
    network_input = np.asarray(network_input).reshape(n_patterns, sequence_length, 1)
    network_input = network_input / float(n_vocab)
    network_output = np.asarray(network_output)

Step 1: Building vocabulary in parallel using 8 cores...
  - Could not parse /content/midi_classic_music/midiclassics/Beethoven/Anhang 14-3.mid with mido: Could not decode key with 3 flats and mode 255

Vocabulary Size (unique notes/chords): 55053

Step 2: Creating sequences and labeling with composer IDs...

Step 3: Reshaping and normalizing data...

Step 4: Saving preprocessed data to file...


In [8]:
# Persist the preprocessed arrays to Drive so later sessions can skip the parsing step.

# Make sure Drive is mounted before writing to it.
drive.mount('/content/drive')

# Target folder on Drive; created on first use.
output_dir = '/content/drive/MyDrive/AAI511_ML'
os.makedirs(output_dir, exist_ok=True)

# The "_v3" suffix reflects that this format took several iterations to get right.
save_path = output_dir + '/preprocessed_composer_data_v3.npz'

print(f"\nSaving pre-processed data to {save_path}...")

# NOTE: the two dicts are not arrays, so NumPy stores them as pickled 0-d object
# arrays. Reading them back needs np.load(..., allow_pickle=True) and .item();
# only load archives you created yourself.
archive_contents = {
    'network_input': network_input,
    'network_output': network_output,
    'n_vocab': np.array([n_vocab]),   # scalar wrapped in a 1-element array
    'composer_map': composer_map,     # composer name -> class id
    'note_to_int': note_to_int,       # note/chord token -> integer id
}
np.savez_compressed(save_path, **archive_contents)

print("Data saved")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Saving pre-processed data to /content/drive/MyDrive/AAI511_ML/preprocessed_composer_data_v3.npz...
Data saved
