In [1]:
import numpy as np
import pandas as pd
import os

def extract_lyrics(npy_file):
    """
    Extracts lyrics from a .npy file.

    Args:
        npy_file: Path to the input .npy file.

    Returns:
        A string containing the extracted lyrics.
    """
    data = np.load(npy_file, allow_pickle=True)

    # Assuming lyrics data is in data[0, 3]
    word_list = data[0, 3]

    lyrics = ""
    for phrase in word_list:
        line = ""
        for part in phrase:
            line += "".join(part) + " "
        lyrics += line.strip() + "\n"
    return lyrics

def create_lyrics_midi_dataframe(npy_folder, midi_folder):
    """
    Creates a Pandas DataFrame with lyrics and corresponding MIDI file paths.

    Args:
        npy_folder: Path to the folder containing .npy files.
        midi_folder: Path to the folder containing the corresponding MIDI files.

    Returns:
        A Pandas DataFrame with 'lyrics' and 'midi_path' columns.
    """
    data = []
    for filename in os.listdir(npy_folder):
        if filename.endswith(".npy"):
            npy_path = os.path.join(npy_folder, filename)
            midi_filename = os.path.splitext(filename)[0] + ".mid"
            midi_path = os.path.join(midi_folder, midi_filename)

            # Check if the corresponding MIDI file exists
            if os.path.exists(midi_path):
                try:
                    lyrics = extract_lyrics(npy_path)
                    if lyrics.strip():  # Check if lyrics are not empty after removing whitespace
                        data.append({'lyrics': lyrics, 'midi_path': midi_path})
                except Exception as e:
                    print(f"Error processing {filename}: {e}")
            else:
                print(f"MIDI file not found for {filename}")

    return pd.DataFrame(data)

# Example usage:
npy_folder = "data/lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6"
midi_folder = "data/lmd-full_and_reddit_MIDI_dataset/sentenceWord_level_6_MIDI"

df = create_lyrics_midi_dataframe(npy_folder, midi_folder)
df.head()

Unnamed: 0,lyrics,midi_path
0,In sleep he sang to me\nin dreams he came\ntha...,data/lmd-full_and_reddit_MIDI_dataset/sentence...
1,I have plans and schemes\nAnd I have hopes and...,data/lmd-full_and_reddit_MIDI_dataset/sentence...
2,I get up and nothing gets me You got\nit tough...,data/lmd-full_and_reddit_MIDI_dataset/sentence...
3,Man a hot like seven inches\nfrom the midday I...,data/lmd-full_and_reddit_MIDI_dataset/sentence...
4,We come from the land of the ice and snow\nfro...,data/lmd-full_and_reddit_MIDI_dataset/sentence...


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import random
import numpy as np
# For reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import random
from copy import deepcopy
from symusic import Note, Score

def randomize_midi_pitch(midi_score, prob=0.2, max_change=4):
    """
    Randomizes the pitch of notes in a symusic.Score object with a given probability.

    Args:
        midi_score: A symusic.Score object.
        prob: The probability of applying pitch randomization to a note.
        max_change: The maximum number of semitones to add or subtract.

    Returns:
        A new symusic.Score object with the randomized pitches.
    """
    new_score = deepcopy(midi_score)

    for track in new_score.tracks:
        for note in track.notes:
            if random.random() < prob:
                # Generate a random change within the range [-max_change, max_change]
                change = random.randint(-max_change, max_change)

                # Apply the change to the note's pitch
                new_pitch = note.pitch + change

                # Ensure the new pitch is within a valid range (0-127 for MIDI)
                new_pitch = max(0, min(new_pitch, 127))

                # Update the note's pitch
                note.pitch = new_pitch

    return new_score


class LyricsMidiDataset(Dataset):
    def __init__(self, dataframe, lyrics_tokenizer, midi_tokenizer, max_length):
        self.dataframe = dataframe
        self.lyrics_tokenizer = lyrics_tokenizer
        self.midi_tokenizer = midi_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        lyrics = self.dataframe.iloc[idx]['lyrics']
        midi_path = self.dataframe.iloc[idx]['midi_path']

        # Tokenize lyrics
        lyrics = lyrics + self.lyrics_tokenizer.eos_token
        lyrics_tokens = self.lyrics_tokenizer(lyrics, 
                                             return_tensors="pt", 
                                             max_length=self.max_length, 
                                             padding="max_length", 
                                             truncation=True)
        
        midi_score = Score(midi_path)
        midi_score = randomize_midi_pitch(midi_score)
        
        # Tokenize MIDI (using your custom tokenizer)
        midi_tokens = self.midi_tokenizer(midi_score)[0].ids
        midi_tokens = midi_tokens[:self.max_length] # Truncate
        
        # Pad MIDI tokens 
        padding_length = self.max_length - len(midi_tokens)
        midi_tokens = midi_tokens + [0] * padding_length # Assuming 0 is your padding token for MIDI
        midi_tokens = torch.tensor(midi_tokens, dtype=torch.long)

        return {
            'lyrics_ids': lyrics_tokens['input_ids'].squeeze(),
            'lyrics_attention_mask': lyrics_tokens['attention_mask'].squeeze(),
            'midi_tokens': midi_tokens
        }

In [4]:
class LyricsGenerator(nn.Module):
    def __init__(self, lyrics_vocab_size, midi_vocab_size, d_model, max_length):
        super(LyricsGenerator, self).__init__()

        self.midi_embedding = nn.Embedding(midi_vocab_size, d_model)
        self.lyrics_embedding = nn.Embedding(lyrics_vocab_size, d_model)
        self.positional_embedding = nn.Embedding(max_length, d_model)

        self.linear = nn.Linear(d_model, d_model)  # Linear layer for MIDI
        
        self.gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', 
                                                   pad_token_id=lyrics_tokenizer.eos_token_id)

        # Resize embeddings to match lyrics vocab size (IMPORTANT)
        self.gpt2.resize_token_embeddings(lyrics_vocab_size)

        self.d_model = d_model

    def forward(self, lyrics_ids, lyrics_attention_mask, midi_tokens):
        # Create positional embeddings
        positions = torch.arange(0, lyrics_ids.size(1)).expand(lyrics_ids.size(0), -1).to(device)
        positional_embeds = self.positional_embedding(positions)

        # MIDI embeddings and linear transformation
        midi_embeds = self.midi_embedding(midi_tokens)
        midi_embeds = self.linear(midi_embeds)  # Shape: (batch_size, seq_len, d_model)

        # Lyrics embeddings
        lyrics_embeds = self.lyrics_embedding(lyrics_ids) + positional_embeds

        # Concatenate along the sequence length dimension
        combined_embeds = torch.cat((midi_embeds, lyrics_embeds), dim=1)
        # Adjust attention mask for combined sequence (filled with 1s for simplicity)
        combined_attention_mask = torch.ones_like(combined_embeds[..., 0]).to(device)

        # Pass through GPT-2
        outputs = self.gpt2(inputs_embeds=combined_embeds,
                            attention_mask=combined_attention_mask)
        
        # Return logits for lyrics part, taking into account the offset from the midi sequence length
        return outputs.logits[:, midi_embeds.size(1):, :]

In [11]:
def train(model, dataloader, optimizer, scheduler, epochs):
    model.train()
    loss_fct = nn.CrossEntropyLoss(ignore_index=lyrics_tokenizer.pad_token_id)

    for epoch in range(epochs):
        loop = tqdm(dataloader, leave=True)
        for batch in loop:
            optimizer.zero_grad()

            lyrics_ids = batch['lyrics_ids'].to(device)
            lyrics_attention_mask = batch['lyrics_attention_mask'].to(device)
            midi_tokens = batch['midi_tokens'].to(device)

            outputs = model(lyrics_ids=lyrics_ids,
                            lyrics_attention_mask=lyrics_attention_mask,
                            midi_tokens=midi_tokens)

            loss = loss_fct(outputs.transpose(1, 2), lyrics_ids)

            loss.backward()
            optimizer.step()
            scheduler.step()

            # --- Prediction and Printing ---
            # Get predictions (argmax over the vocabulary dimension)
            #predictions = torch.argmax(outputs, dim=-1)

            # Decode predictions and ground truth
            #predicted_lyrics = [lyrics_tokenizer.decode(pred, skip_special_tokens=False) for pred in predictions]
            #ground_truth_lyrics = [lyrics_tokenizer.decode(label, skip_special_tokens=False) for label in lyrics_ids]

            # Print for each item in the batch
            #for i in range(len(predicted_lyrics)):
            #    print(f"  Predicted: {predicted_lyrics[i]}")
            #    print(f"  Ground Truth: {ground_truth_lyrics[i]}")
            # ---------------------------------

            loop.set_description(f"Epoch {epoch}")
            loop.set_postfix(loss=loss.item())

    # Create a checkpoint dictionary
    checkpoint = {
        'epoch': epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),  # If you want to save the scheduler's state
        'loss': loss,  # Or any other metric you want to save (e.g., validation BLEU)
    }

    # Save the checkpoint
    torch.save(checkpoint, 'model_checkpoint.pth')  # Or any filename you prefer
    print("Checkpoint saved successfully!")

In [6]:
def generate_lyrics(model, midi_path, lyrics_tokenizer, midi_tokenizer, max_length, num_beams=5):
    model.eval()

    # Tokenize MIDI
    midi_tokens = midi_tokenizer(midi_path)[0].ids
    midi_tokens = midi_tokens[:max_length]
    padding_length = max_length - len(midi_tokens)
    midi_tokens = midi_tokens + [0] * padding_length
    midi_tokens = torch.tensor(midi_tokens, dtype=torch.long).unsqueeze(0).to(device) # Add batch dimension

    # Prepare initial input for GPT-2 (use a special start token, e.g., <|startoftext|>)
    input_ids = torch.tensor(lyrics_tokenizer.encode("<|endoftext|>")).unsqueeze(0).to(device)
    attention_mask = torch.ones_like(input_ids).to(device)
    
    # Generate with beam search
    beam_output = model.gpt2.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=False,
        num_return_sequences=1,
        pad_token_id=lyrics_tokenizer.pad_token_id,
        do_sample=True
    )

    # Decode the generated lyrics
    generated_lyrics = lyrics_tokenizer.decode(beam_output[0], skip_special_tokens=True)

    return generated_lyrics


In [7]:
from torch.utils.data import random_split
from pathlib import Path
from miditok import TSD, TokenizerConfig

# Load MIDI Tokenizer
config = TokenizerConfig(
    num_velocities=1,  # Remove velocity tokens (not relevant for your vocal data)
    use_chords=False,  # Disable chord tokens (unless your vocals have complex harmonies)
    use_rests=False,  # Disable rest tokens (unless silence is significant in your data)
    use_tempos=False,  # Disable tempo tokens (unless you have multiple pieces with varying tempos)
    use_time_signatures=False,  # Disable time signature tokens (unless relevant to your data)
)

midi_tokenizer = TSD(config)

midi_tokenizer = midi_tokenizer.from_pretrained(Path("tokenizer", "tokenizer.json"))

# Hyperparameters
max_length = 512
batch_size = 4
epochs = 6
d_model = 768 # GPT-2's dimension
midi_vocab_size = len(midi_tokenizer)
lyrics_vocab_size = None # Placeholder, will be set after initializing the tokenizer

# Load GPT-2 tokenizer
lyrics_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
lyrics_tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
lyrics_vocab_size = len(lyrics_tokenizer)

# Create dataset and dataloader
dataset = LyricsMidiDataset(df, lyrics_tokenizer, midi_tokenizer, max_length)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Instantiate model
model = LyricsGenerator(lyrics_vocab_size, midi_vocab_size, d_model, max_length).to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Train the model
train(model, train_dataloader, optimizer, scheduler, epochs)

In [21]:
# Example inference (replace with a real MIDI path)
midi_path = df["midi_path"][7]
generated_lyrics = generate_lyrics(model, midi_path, lyrics_tokenizer, midi_tokenizer, max_length)
print(f"Generated Lyrics:\n{generated_lyrics}")
print(f"Original Lyrics:\n{df["lyrics"][7]}")

Generated Lyrics:
We know that we can make money out of these kinds of things, but we can't really know what is going on in the world's mind. And we can only make money out of these kinds of things. And we can only make money out of these kinds of things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make money out of these things. And we can only make mon