# Unzip the dataset

In [1]:
import zipfile
import os

def unzip_file(zip_file_path, extract_to_path):
    """
    Extract every member of a zip archive into a target directory.

    The outcome (success or the kind of failure) is reported on stdout;
    nothing is raised for ordinary errors and nothing is returned.

    Args:
        zip_file_path: Location of the zip archive to read.
        extract_to_path: Directory that receives the extracted contents.
    """
    try:
        with zipfile.ZipFile(zip_file_path, mode='r') as archive:
            archive.extractall(path=extract_to_path)
        message = f"Successfully unzipped '{zip_file_path}' to '{extract_to_path}'"
    except FileNotFoundError:
        message = f"Error: Zip file not found at '{zip_file_path}'"
    except zipfile.BadZipFile:
        message = f"Error: Invalid zip file: '{zip_file_path}'"
    except Exception as err:
        # Anything else (permissions, disk errors, ...) is reported verbatim
        message = f"An error occurred: {err}"

    print(message)

# Archive to extract and the directory that receives its contents
zip_file_path = 'lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6.zip'
extract_to_path = 'lmd-full_and_reddit_MIDI_dataset/'

# exist_ok=True makes this cell safe to re-run and avoids the race between
# checking for the directory and creating it
os.makedirs(extract_to_path, exist_ok=True)

unzip_file(zip_file_path, extract_to_path)

Successfully unzipped 'lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6.zip' to 'lmd-full_and_reddit_MIDI_dataset/'


# Explore an .npy file

In [2]:
import numpy as np

def inspect_npy_file(file_path):
    """
    Loads a .npy file and prints a summary of the array stored in it.

    Always printed: the file path, dtype and shape. Then, depending on dtype:

    * object arrays: the Python type of every element, plus its keys (for
      dicts) or its value (for anything else). Arrays of any number of
      dimensions are supported; 0-dimensional arrays are unwrapped with
      ``.item()``.
    * all other dtypes: up to the first 10 elements (a 10x10 corner for 2-D
      arrays) and, when the array has fewer than 20 elements, the full array.

    Errors are reported on stdout instead of being raised.

    Args:
        file_path: The path to the .npy file.

    Returns:
        None. Prints information about the array stored in the file.
    """
    try:
        # NOTE(security): allow_pickle=True unpickles arbitrary Python objects,
        # which can execute code. Only use this on files from a trusted source.
        data = np.load(file_path, allow_pickle=True)

        print(f"File: {file_path}")
        print(f"Data type: {data.dtype}")
        print(f"Shape: {data.shape}")

        if data.dtype == object:
            print("The array contains objects.")
            if data.ndim == 0:
                # Handle 0-dimensional object arrays (scalar objects)
                item = data.item()
                print(f"  Object Type: {type(item)}")
                if isinstance(item, dict):
                    print("  Keys:", item.keys())
                else:
                    print(f"  Value: {item}")
            else:
                # Walk every element regardless of the number of dimensions.
                # For 2-D arrays the label is "(i, j)", as before.
                for index in np.ndindex(data.shape):
                    item = data[index]
                    label = ", ".join(str(k) for k in index)
                    print(f"  Item at index ({label}):")
                    print(f"    Object Type: {type(item)}")
                    if isinstance(item, dict):
                        print("    Keys:", item.keys())
                    else:
                        print(f"    Value: {item}")

        else:
            # Handle numerical arrays
            print("First few elements (up to 10):")
            if data.ndim == 1:
                print(data[:10])
            elif data.ndim == 2:
                print(data[:min(data.shape[0], 10), :min(data.shape[1], 10)])
            else:
                print(data.flatten()[:10])

            if data.size < 20:
                print("Full array:")
                print(data)

        # Handle metadata (if present). Plain ndarrays do not define a
        # `metadata` attribute, so this is normally skipped.
        metadata = getattr(data, 'metadata', None)
        if metadata is not None:
            print("\nMetadata:")
            for key, value in metadata.items():
                print(f"  {key}: {value}")

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        # Best-effort inspection: report the problem rather than abort the notebook
        print(f"An error occurred: {e}")

# One sample song from the extracted dataset
file_path = (
    "lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6/"
    "TURNER.When the heartache is over K.npy"
)

inspect_npy_file(file_path)

File: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6/TURNER.When the heartache is over K.npy
Data type: object
Shape: (1, 4)
The array contains objects.
  Item at index (0, 0):
    Object Type: <class 'list'>
    Value: [[[[0.0, 0.25441833333333363, 146.8323839587038, 125.0]]], [[[0.2654799999999966, 0.25441833333333363, 174.61411571650194, 125.0]], [[0.2654800000000037, 0.25441833333333363, 329.6275569128699, 125.0]], [[0.2654799999999966, 0.25441833333333363, 329.6275569128699, 125.0]], [[0.2654800000000037, 0.25441833333332653, 164.81377845643496, 125.0]]], [[[0.5309600000000003, 0.25441833333333363, 329.6275569128699, 125.0]]], [[[0.2654799999999966, 0.25441833333333363, 293.6647679174076, 125.0]], [[0.796440000000004, 0.2544183333333194, 146.8323839587038, 125.0]], [[0.2654799999999966, 0.25441833333333363, 164.81377845643496, 125.0]]], [[[0.2654799999999966, 0.25441833333333363, 174.61411571650194, 125.0]], [[0.2654799999999966, 0.25441833333333363, 293.6647679174076, 125.

# Convert the .npy files to MIDI files

In [4]:
import numpy as np
import pretty_midi
import os

def npy_to_midi_discrete(npy_file, midi_file, tempo=120, velocity=100, program=0):
    """
    Converts a .npy file with encoded musical data (discrete attributes only)
    into a single-instrument MIDI file.

    The notes are read from ``data[0, 1]``, which is iterated as
    sentences -> words -> notes. Each note is indexed as
    ``[pitch, duration, rest]``; duration and rest are multiplied by
    ``60 / tempo`` to obtain seconds, and the rest is inserted *before* its
    note. Notes are laid out back to back on one timeline.

    Args:
        npy_file: Path to the input .npy file.
        midi_file: Path to the output MIDI file (.mid).
        tempo: Tempo of the MIDI file in beats per minute (BPM).
        velocity: MIDI velocity given to every note (default 100).
        program: MIDI program number of the instrument (default 0, piano).

    Returns:
        None. Writes ``midi_file`` and prints its path.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
    # only load files from a trusted source.
    data = np.load(npy_file, allow_pickle=True)
    midi = pretty_midi.PrettyMIDI(initial_tempo=tempo)
    instrument = pretty_midi.Instrument(program=program)

    # Assuming data[0, 1] is discrete
    discrete_data = data[0, 1]

    # Loop-invariant conversion factor from the stored duration unit to seconds
    seconds_per_unit = 60 / tempo

    # Keep track of the current time in the MIDI file
    current_time = 0

    for sentence in discrete_data:
        for word in sentence:
            for note in word:
                pitch = int(note[0])
                note_duration_seconds = seconds_per_unit * note[1]
                rest_duration_seconds = seconds_per_unit * note[2]

                # The rest precedes the note: skip over it first
                current_time += rest_duration_seconds
                note_end = current_time + note_duration_seconds

                instrument.notes.append(
                    pretty_midi.Note(
                        velocity=velocity,
                        pitch=pitch,
                        start=current_time,
                        end=note_end,
                    )
                )

                # The next note (or rest) starts where this note ends
                current_time = note_end

    # Add the instrument to the MIDI object
    midi.instruments.append(instrument)

    # Write the MIDI data to a file
    midi.write(midi_file)
    print(f"MIDI file saved to: {midi_file}")

def convert_folder_to_midi(npy_folder, midi_folder, tempo=120):
    """
    Batch-convert a folder of .npy files into MIDI files (discrete data).

    Each ``<name>.npy`` found directly inside ``npy_folder`` is converted to
    ``<name>.mid`` inside ``midi_folder``. A failure on one file is printed
    and does not stop the remaining conversions.

    Args:
        npy_folder: Folder that holds the input .npy files.
        midi_folder: Destination folder for the MIDI files; created when missing.
        tempo: Tempo, in BPM, forwarded to the per-file converter.
    """
    if not os.path.exists(midi_folder):
        os.makedirs(midi_folder)

    for entry in os.listdir(npy_folder):
        # Only .npy files are converted; everything else is ignored
        if not entry.endswith(".npy"):
            continue

        stem, _extension = os.path.splitext(entry)
        source_path = os.path.join(npy_folder, entry)
        target_path = os.path.join(midi_folder, stem + ".mid")

        try:
            npy_to_midi_discrete(source_path, target_path, tempo=tempo)
        except Exception as err:
            print(f"Error converting {entry}: {err}")

# Source folder of .npy files; the MIDI files go to a sibling "<name>_MIDI" folder
npy_folder = "lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6"
midi_folder = npy_folder + "_MIDI"
tempo = 120  # Tempo, in BPM, used for every generated file

convert_folder_to_midi(npy_folder, midi_folder, tempo=tempo)

MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/ce25a307a49ceec65cfd294eaae67002.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/7983ed3369b2ac4a0a7266c76a790699.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/nickelback-savin_me.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/METALLICA.I disappear K.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/068ede806242a1020eee2849b73b5b80.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/25b22bc3cd54f521afb289a211f6129f.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/bbe6e824002578dc24617bba92f85c58.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/df076a42ffe2e75da7051f178a64fc75.mid
MIDI file saved to: lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/RAINBOW L.mid
M

# Build a pandas DataFrame of the lyrics and the paths to the MIDI files

In [2]:
import numpy as np
import pandas as pd
import os

def extract_lyrics(npy_file):
    """
    Read the lyrics stored in a .npy file and return them as text.

    The lyrics are taken from ``data[0, 3]``. Every phrase becomes one line:
    the pieces of each word are concatenated, the words are separated by
    single spaces, and the line is stripped and terminated with a newline.

    Args:
        npy_file: Path to the input .npy file.

    Returns:
        A string with one newline-terminated line per phrase.
    """
    song = np.load(npy_file, allow_pickle=True)

    # Assuming lyrics data is in data[0, 3]
    phrases = song[0, 3]

    rendered_lines = []
    for phrase in phrases:
        words = ["".join(pieces) for pieces in phrase]
        rendered_lines.append(" ".join(words).strip() + "\n")
    return "".join(rendered_lines)

def create_lyrics_midi_dataframe(npy_folder, midi_folder):
    """
    Creates a Pandas DataFrame with lyrics and corresponding MIDI file paths.

    For every ``<name>.npy`` in ``npy_folder`` the matching ``<name>.mid`` is
    looked up in ``midi_folder``. Files without a MIDI counterpart, files whose
    lyrics cannot be extracted, and files whose lyrics are blank are left out
    (the first two cases are reported on stdout).

    Args:
        npy_folder: Path to the folder containing .npy files.
        midi_folder: Path to the folder containing the corresponding MIDI files.

    Returns:
        A Pandas DataFrame with 'lyrics' and 'midi_path' columns. The columns
        are present even when no file qualifies and the frame is empty.
    """
    records = []
    for filename in os.listdir(npy_folder):
        if not filename.endswith(".npy"):
            continue

        npy_path = os.path.join(npy_folder, filename)
        midi_filename = os.path.splitext(filename)[0] + ".mid"
        midi_path = os.path.join(midi_folder, midi_filename)

        # Only keep songs whose MIDI file was actually generated
        if not os.path.exists(midi_path):
            print(f"MIDI file not found for {filename}")
            continue

        try:
            lyrics = extract_lyrics(npy_path)
            if lyrics.strip():  # Skip songs whose lyrics are only whitespace
                records.append({'lyrics': lyrics, 'midi_path': midi_path})
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan
            print(f"Error processing {filename}: {e}")

    # Naming the columns keeps the schema stable when `records` is empty
    return pd.DataFrame(records, columns=['lyrics', 'midi_path'])

# Build the lyrics/MIDI table from the .npy folder and its sibling "<name>_MIDI" folder
npy_folder = "lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6"
midi_folder = npy_folder + "_MIDI"

df = create_lyrics_midi_dataframe(npy_folder=npy_folder, midi_folder=midi_folder)
df.head()

# Save to CSV (optional)
# df.to_csv("lyrics_midi_data.csv", index=False)

Unnamed: 0,lyrics,midi_path
0,In sleep he sang to me\nin dreams he came\ntha...,lmd-full_and_reddit_MIDI_dataset/sentenceWord_...
1,I have plans and schemes\nAnd I have hopes and...,lmd-full_and_reddit_MIDI_dataset/sentenceWord_...
2,I get up and nothing gets me You got\nit tough...,lmd-full_and_reddit_MIDI_dataset/sentenceWord_...
3,Man a hot like seven inches\nfrom the midday I...,lmd-full_and_reddit_MIDI_dataset/sentenceWord_...
4,We come from the land of the ice and snow\nfro...,lmd-full_and_reddit_MIDI_dataset/sentenceWord_...


# Tokenize the MIDI files

In [8]:
from pathlib import Path
from miditok import TSD, TokenizerConfig

# Define the configuration
# NOTE(review): the option descriptions below follow the option names and the
# printed output of this cell — confirm details against the installed MidiTok version.
config = TokenizerConfig(
    num_velocities=1,  # One velocity bin only; a Velocity token is still emitted after every Pitch (see 'Velocity_127' in the output), so this does NOT remove velocity tokens
    use_chords=False,  # Disable chord tokens (the generated MIDI holds a single melodic line)
    use_rests=False,  # Disable rest tokens; the output shows gaps between notes encoded as TimeShift tokens
    use_tempos=False,  # Disable tempo tokens (every generated file uses one fixed tempo)
    use_time_signatures=False,  # Disable time signature tokens (none are written by the conversion step)
)

# Create the tokenizer
tokenizer = TSD(config)

# Tokenize a single MIDI file as a sanity check and print the first ten entries of tokens[0]
tokens = tokenizer(Path("lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/0a1c541bc1005aea8440ad9f68511bd8.mid"))
print(tokens[0][:10])

TokSequence(tokens=['Pitch_46', 'Velocity_127', 'Duration_1.0.8', 'TimeShift_1.0.8', 'Pitch_50', 'Velocity_127', 'Duration_1.0.8', 'TimeShift_2.0.8', 'Pitch_53', 'Velocity_127'], ids=[29, 92, 100, 164, 33, 92, 100, 172, 36, 92], bytes='', events=[Event(type=Pitch, value=46, time=0, desc=8), Event(type=Velocity, value=127, time=0, desc=127), Event(type=Duration, value=1.0.8, time=0, desc=8 ticks), Event(type=TimeShift, value=1.0.8, time=0, desc=8 ticks), Event(type=Pitch, value=50, time=8, desc=16), Event(type=Velocity, value=127, time=8, desc=127), Event(type=Duration, value=1.0.8, time=8, desc=8 ticks), Event(type=TimeShift, value=2.0.8, time=8, desc=16 ticks), Event(type=Pitch, value=53, time=24, desc=32), Event(type=Velocity, value=127, time=24, desc=127)], are_ids_encoded=False, _ticks_bars=[0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 12

## Train tokenizer

In [9]:
# Collect every .mid file under the MIDI folder (the "**" pattern also searches subfolders)
midi_paths = list(Path("lmd-full_and_reddit_MIDI_dataset", "sentenceWord_level_6_MIDI").glob("**/*.mid"))

# Train the tokenizer (builds vocabulary) on those files with a target size of 30000
# NOTE(review): 30000 is an unexplained constant — confirm it suits this corpus
tokenizer.train(vocab_size=30000, files_paths=midi_paths)

# Save the tokenizer parameters (vocabulary and configuration)
# NOTE(review): presumably the "tokenizer" directory is created when missing — verify
tokenizer.save(Path("tokenizer", "tokenizer.json"))






## Load tokenizer

In [15]:
# Load the trained tokenizer through the TSD class rather than an existing
# `tokenizer` instance, so this cell does not depend on the config/training
# cells having been run in the same kernel session (only on the imports).
tokenizer = TSD.from_pretrained(Path("tokenizer", "tokenizer.json"))

# Re-tokenize the sample file: the token strings should match the untrained
# run, while the ids now come from the trained vocabulary
tokens = tokenizer(Path("lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI/0a1c541bc1005aea8440ad9f68511bd8.mid"))
print(tokens[0][:10])

TokSequence(tokens=['Pitch_46', 'Velocity_127', 'Duration_1.0.8', 'TimeShift_1.0.8', 'Pitch_50', 'Velocity_127', 'Duration_1.0.8', 'TimeShift_2.0.8', 'Pitch_53', 'Velocity_127'], ids=[495, 432, 3866, 152, 3019, 2242, 526, 26003, 15440, 10940], bytes='>}\x85ÅB}\x85ÍE}', events=[Event(type=Pitch, value=46, time=0, desc=8), Event(type=Velocity, value=127, time=0, desc=127), Event(type=Duration, value=1.0.8, time=0, desc=8 ticks), Event(type=TimeShift, value=1.0.8, time=0, desc=8 ticks), Event(type=Pitch, value=50, time=8, desc=16), Event(type=Velocity, value=127, time=8, desc=127), Event(type=Duration, value=1.0.8, time=8, desc=8 ticks), Event(type=TimeShift, value=2.0.8, time=8, desc=16 ticks), Event(type=Pitch, value=53, time=24, desc=32), Event(type=Velocity, value=127, time=24, desc=127)], are_ids_encoded=True, _ticks_bars=[0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 10