<a href="https://colab.research.google.com/github/dyc-github/CSE153_Assignment2/blob/main/MIDI_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and load required libraries

In [32]:
# !pip install miditok
# !pip install mido
#!pip install symusic
# glob ships with the Python standard library, so it needs no pip install
#!pip install torch

In [28]:
# !unzip data.zip

In [3]:
import glob
import os
import random
from collections import defaultdict
from typing import List

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from google.colab import files
from mido import MidiFile
from symusic import Score
from miditok import REMI, TokenizerConfig

# Exploratory Data Analysis

In [4]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort the
# lists so that file indices (and the index-named output files derived from
# them later) are stable from one run to the next.
train_files = sorted(glob.glob("./data/train/*.mid"))
test_files = sorted(glob.glob("./data/test/*.mid"))

In [5]:
def get_midi_len(file):
  """Return the `length` reported by mido's MidiFile for `file`.

  Any exception raised while opening the file or reading its length is
  printed, and 0 is returned instead, so one unreadable file does not abort
  a scan over a whole directory.
  """
  try:
    return MidiFile(file).length
  except Exception as err:
    print(err)
  return 0

In [6]:
# Summary statistics for the training set.
# mido's MidiFile.length is the playback time in seconds (not ticks), so the
# messages are labelled in seconds. Files that failed to load contribute 0.
num_samples = len(train_files)
print(f"Number of training samples: {num_samples}")
total_length = sum(get_midi_len(file) for file in train_files)
print(f"Total length of training samples in seconds: {total_length}")
# Avoid a ZeroDivisionError when the glob above matched no files.
avg_length = total_length / num_samples if num_samples else 0.0
print(f"Avg length of training samples in seconds: {avg_length}")

Number of training samples: 638
Total length of training samples in ticks: 72914.93901779644
Avg length of training samples in ticks: 114.28673827240821


# Model: Second Order Markov Chain Model
This model serves as a baseline for comparison with our LSTM model.

## Preprocessing

Train Midi Tokenizer

In [7]:
# Build a REMI tokenizer from an explicit TokenizerConfig, train it on the
# training files with a target vocabulary size of 1000, and save it to
# tokenizer.json.
config = TokenizerConfig(num_velocities=1, use_chords=False, use_programs=True)
tokenizer = REMI(config)
tokenizer.train(vocab_size=1000, files_paths=train_files)
tokenizer.save("tokenizer.json")

  super().__init__(tokenizer_config, params)


Construct PyTorch Dataset and Dataloaders

In [8]:
class MIDIDataset(Dataset):
    """Dataset that loads and tokenizes one MIDI file per item, on demand."""

    def __init__(self, file_paths: List[str], tokenizer):
        # Only the paths and the tokenizer are stored; files are read lazily
        # in __getitem__.
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of MIDI files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load file `idx` as a Score, tokenize it, and return a NumPy array."""
        score = Score(self.file_paths[idx])
        return np.array(self.tokenizer(score))

In [9]:
# Wrap the train and test file lists in MIDIDataset, sharing one tokenizer.
train_dataset = MIDIDataset(train_files, tokenizer)
test_dataset = MIDIDataset(test_files, tokenizer)

# batch_size=1 with no collate_fn: each batch holds a single file's tokens
# (presumably because sequence lengths differ between files).
# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

## Model




In [10]:
class SecondOrderMarkovChain:
    """Second-order Markov chain over integer token ids.

    The next token is modelled as depending only on the two tokens before it.
    `transitions[(s1, s2)][s3]` counts how often `s3` followed the pair
    `(s1, s2)`; `probabilities` holds the same table normalised per pair.
    """

    def __init__(self):
        self.transitions = defaultdict(lambda: defaultdict(int))
        self.probabilities = defaultdict(lambda: defaultdict(float))

    def train(self, train_loader):
        """Count token triples from `train_loader` and normalise the counts.

        Args:
            train_loader: iterable of batches; element 0 of each batch must
                expose `.numpy()` and hold one token-id sequence.

        Returns:
            The `probabilities` table, mapping
            `(state1, state2) -> {next_state: probability}`.

        Counts accumulate across calls, so calling `train` again extends the
        existing model instead of resetting it.
        """
        for sequence in train_loader:
            # Plain Python ints keep dictionary keys free of NumPy scalars.
            tokens = sequence[0].numpy().astype(int).tolist()
            for state1, state2, next_state in zip(tokens, tokens[1:], tokens[2:]):
                self.transitions[(state1, state2)][next_state] += 1

        for state_pair, next_states in self.transitions.items():
            total = sum(next_states.values())
            for next_state, count in next_states.items():
                self.probabilities[state_pair][next_state] = count / total
        return self.probabilities

    def generate(self, test_sequence, num_predictions=1, max_length=100):
        """Continue the first two tokens of `test_sequence` by sampling.

        Args:
            test_sequence: batch whose element 0 exposes `.numpy()` and holds
                the seed token ids; only the first two tokens are used.
            num_predictions: number of candidates drawn per step; only the
                first candidate is appended to the result.
            max_length: maximum number of tokens to generate after the seed
                (default 100, the limit that used to be hard-coded).

        Returns:
            List of Python ints: the seed tokens followed by the generated
            tokens. A seed shorter than two tokens is returned as-is because
            no second-order context can be formed from it.
        """
        seed = test_sequence[0].numpy().astype(int).tolist()
        results = seed[:2]
        if len(results) < 2:
            return results

        for _ in range(max_length):
            context = (results[-2], results[-1])
            # Membership test first: indexing the defaultdict directly would
            # insert an empty entry for every unseen context.
            if context not in self.probabilities:
                break
            probs = self.probabilities[context]
            states = list(probs.keys())
            if not states:
                break
            try:
                predictions = np.random.choice(
                    states, size=num_predictions, p=list(probs.values())
                )
            except ValueError:
                # Raised by np.random.choice for an invalid distribution or
                # size; stop generating but let unrelated errors surface.
                break
            results.append(int(predictions[0]))
        return results

## Training

In [11]:
# Fit the Markov baseline on the training loader.
model = SecondOrderMarkovChain()
model.train(train_loader)

# Generate one continuation per test file, seeded by that file's first tokens.
predictions = []
for test_sequence in test_loader:
    predictions.append(model.generate(test_sequence))

# Make sure the output directory exists before any file is written into it.
os.makedirs("markov", exist_ok=True)
for i, prediction in enumerate(predictions):
    output_score = tokenizer.decode(torch.Tensor(prediction))
    output_score.dump_midi(f"markov/{i}.mid")

Download Output

In [29]:
# !zip -r markov.zip ./markov
# files.download("markov.zip")

# Model: LSTM Model
This is the primary model I will be exploring.

## Preprocessing

In [13]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

# A fresh REMI tokenizer with the library's default parameters (constants.py).
# This rebinds `tokenizer`, replacing the one used for the Markov baseline.
tokenizer = REMI()

# Settings shared by the train and test datasets.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
train_dataset = DatasetMIDI(files_paths=train_files, **dataset_kwargs)
test_dataset = DatasetMIDI(files_paths=test_files, **dataset_kwargs)

# The collator is constructed with the tokenizer's pad token id and is used
# as the collate_fn of both loaders.
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [14]:
# Number of batches in the train and test loaders.
len(train_loader), len(test_loader)

(160, 18)

## Model

In [15]:
class MusicRNN(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        # Token id -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are laid out (batch, seq, feature).
        lstm_options = dict(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.rnn = nn.LSTM(**lstm_options)
        # Hidden state -> one logit per vocabulary entry.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Return `(logits, hidden)` for a batch of token ids.

        Args:
            x: token ids of shape (batch_size, seq_length).
            hidden: optional LSTM state from a previous call; None lets the
                LSTM start from its default initial state.

        Returns:
            logits of shape (batch_size, seq_length, vocab_size) and the
            updated LSTM state.
        """
        embedded = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        features, hidden = self.rnn(embedded, hidden)  # (batch_size, seq_length, hidden_dim)
        return self.fc(features), hidden

## Training

In [17]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device='cuda', pad_token_id=None):
    """Train `model` for next-token prediction and print per-epoch losses.

    Each batch is a dict whose 'input_ids' entry has shape
    (batch_size, seq_length). The model sees tokens [0, n-1) and is scored
    against tokens [1, n) with cross-entropy.

    Args:
        model: module whose forward returns `(logits, hidden)` with logits of
            shape (batch_size, seq_length, vocab_size).
        train_loader: iterable of training batches; must support `len()`.
        val_loader: iterable of validation batches; must support `len()`.
        vocab_size: size of the last logits dimension.
        num_epochs: number of passes over `train_loader`.
        lr: Adam learning rate.
        device: device to run on. A CUDA device is replaced by 'cpu' when
            CUDA is not available instead of raising.
        pad_token_id: when given, targets equal to this id are excluded from
            the loss. The default (None) scores every position, padding
            included.

    A new Adam optimizer is created on every call, so optimizer state is not
    carried over between successive calls on the same model.
    """
    # Fall back to CPU rather than crash on machines without a GPU.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; falling back to CPU.")
        device = 'cpu'

    model = model.to(device)
    if pad_token_id is None:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def _batch_loss(batch):
        """Shift the batch by one token and return the cross-entropy loss."""
        token_ids = batch['input_ids'].to(device)  # (batch_size, seq_length)
        inputs = token_ids[:, :-1]
        targets = token_ids[:, 1:]
        outputs, _ = model(inputs)
        return criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))

    def _mean(total, loader):
        """Average `total` over the loader's batch count; NaN when it is empty."""
        return total / len(loader) if len(loader) else float('nan')

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _batch_loss(batch)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        avg_train_loss = _mean(total_train_loss, train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                total_val_loss += _batch_loss(batch).item()
        avg_val_loss = _mean(total_val_loss, val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    # Model hyperparameters; the vocabulary size comes from the tokenizer.
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2
    dropout = 0.3

    model = MusicRNN(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout,
    )
    # The test loader doubles as the validation set here.
    train(model, train_loader, val_loader=test_loader, vocab_size=vocab_size, num_epochs=15)

Epoch 1/15 | Train Loss: 2.7964 | Val Loss: 1.9718
Epoch 2/15 | Train Loss: 1.7968 | Val Loss: 1.6201
Epoch 3/15 | Train Loss: 1.5021 | Val Loss: 1.4396
Epoch 4/15 | Train Loss: 1.3268 | Val Loss: 1.3043
Epoch 5/15 | Train Loss: 1.2244 | Val Loss: 1.2479
Epoch 6/15 | Train Loss: 1.1488 | Val Loss: 1.2160
Epoch 7/15 | Train Loss: 1.0998 | Val Loss: 1.1965
Epoch 8/15 | Train Loss: 1.0568 | Val Loss: 1.1939
Epoch 9/15 | Train Loss: 1.0244 | Val Loss: 1.1704
Epoch 10/15 | Train Loss: 0.9711 | Val Loss: 1.1626
Epoch 11/15 | Train Loss: 0.9323 | Val Loss: 1.1570
Epoch 12/15 | Train Loss: 0.8970 | Val Loss: 1.1454
Epoch 13/15 | Train Loss: 0.8553 | Val Loss: 1.1402
Epoch 14/15 | Train Loss: 0.8196 | Val Loss: 1.1539
Epoch 15/15 | Train Loss: 0.7933 | Val Loss: 1.1445


### Train 15 Epoch Model

In [18]:
def sample(model, start_token, max_length=100, temperature=1.0, device='cuda', stop_tokens=(0, 2)):
    """Autoregressively sample a token sequence from `model`.

    Args:
        model: module whose forward takes `(tokens, hidden)` and returns
            `(logits, hidden)` with logits of shape (1, seq, vocab_size).
        start_token: id of the first token; it is included in the result.
        max_length: maximum number of tokens to sample after `start_token`.
        temperature: positive divisor applied to the logits before softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.
        device: device to run on. A CUDA device is replaced by 'cpu' when
            CUDA is not available instead of raising.
        stop_tokens: token ids that end generation once sampled. The default
            (0, 2) keeps the ids that used to be hard-coded — presumably PAD
            and EOS in the tokenizer's default vocabulary; confirm against
            the tokenizer in use.

    Returns:
        List of ints: `start_token` followed by the sampled tokens, including
        the stop token when one was sampled.

    Raises:
        ValueError: if `temperature` is not positive.
    """
    if temperature <= 0:
        # Dividing logits by zero would produce NaN probabilities.
        raise ValueError("temperature must be positive")

    # Fall back to CPU rather than crash on machines without a GPU.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; falling back to CPU.")
        device = 'cpu'

    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)
    hidden = None

    # Inference only: without no_grad an autograd graph would be retained
    # through the hidden state for every generated step.
    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
            logits = output[:, -1, :] / temperature  # last step, temperature-scaled

            probs = F.softmax(logits, dim=-1)  # (1, vocab_size)
            next_token = torch.multinomial(probs, num_samples=1).item()
            generated.append(next_token)
            if next_token in stop_tokens:  # reached end of sequence
                break

            input_token = torch.tensor([[next_token]], device=device)

    return generated

# Seed generation with the special token at index 1 of
# tokenizer.special_tokens_ids (presumably BOS — confirm against the
# tokenizer's special-token order).
start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

# Print the raw token ids as a quick sanity check.
print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 189, 44, 124, 132, 197, 49, 124, 128, 201, 47, 124, 128, 205, 51, 124, 128, 209, 49, 124, 132, 217, 44, 124, 126, 219, 44, 124, 126, 4, 189, 42, 124, 128, 193, 47, 124, 128, 197, 46, 124, 136, 209, 51, 124, 128, 213, 47, 124, 128, 217, 47, 124, 128, 4, 189, 42, 124, 132, 197, 40, 124, 132, 205, 40, 124, 132, 213, 37, 124, 132, 4, 189, 38, 124, 126, 191, 34, 124, 126, 193, 35, 124, 126, 195, 30, 124, 126, 197, 18, 124, 140, 213, 30, 124, 126, 217, 43, 124, 126, 219, 42, 124, 126, 4, 189, 40, 124, 140, 47, 124, 140, 205, 41, 124, 140, 42, 124, 140, 4, 189, 66, 124, 140, 38, 124, 138, 205, 40, 124, 140, 44, 124, 140, 4, 189, 53, 124, 156, 4, 189, 44, 124, 156, 4, 189, 44, 124, 126, 191, 42, 124, 126, 193, 41, 124, 126, 195, 37, 124, 126, 197, 35, 124, 126, 199, 37, 124, 126, 201, 38, 124, 126, 203, 38, 124, 126, 205, 30, 124, 126, 207, 42, 124, 126, 209, 46, 124, 126, 211, 46, 124, 126, 213, 25, 124, 126, 215, 46, 124, 126, 217, 42, 124, 126, 219, 42, 124,

In [19]:
# Generate ten pieces, each seeded with a randomly chosen vocabulary token,
# and write them out as MIDI files.
# Make sure the output directory exists before any file is written into it.
os.makedirs("rnn_15", exist_ok=True)
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  # Decode once; the result is only needed for the dump below.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_15/rnn_{i}.mid")


In [30]:
# !zip -r rnn_15.zip ./rnn_15
# files.download("rnn_15.zip")

### Train 25 Epoch Model

In [22]:
# Continue training the same model for 10 more epochs (25 in total after the
# initial 15). train() builds a fresh Adam optimizer on each call.
train(model, train_loader, test_loader, vocab_size, num_epochs=10)

Epoch 1/10 | Train Loss: 0.7644 | Val Loss: 1.1469
Epoch 2/10 | Train Loss: 0.7250 | Val Loss: 1.1617
Epoch 3/10 | Train Loss: 0.6925 | Val Loss: 1.1655
Epoch 4/10 | Train Loss: 0.6596 | Val Loss: 1.1808
Epoch 5/10 | Train Loss: 0.6267 | Val Loss: 1.1996
Epoch 6/10 | Train Loss: 0.6004 | Val Loss: 1.2031
Epoch 7/10 | Train Loss: 0.5703 | Val Loss: 1.2232
Epoch 8/10 | Train Loss: 0.5399 | Val Loss: 1.2488
Epoch 9/10 | Train Loss: 0.5151 | Val Loss: 1.2462
Epoch 10/10 | Train Loss: 0.4844 | Val Loss: 1.2806


In [23]:
# Generate ten pieces, each seeded with a randomly chosen vocabulary token,
# and write them out as MIDI files.
# Make sure the output directory exists before any file is written into it.
os.makedirs("rnn_25", exist_ok=True)
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  # Decode once; the result is only needed for the dump below.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_25/rnn_{i}.mid")


In [31]:
# !zip -r rnn_25.zip ./rnn_25
# files.download("rnn_25.zip")

### Train 50 Epoch Model


In [25]:
# Continue training the same model for 25 more epochs (50 in total).
# train() builds a fresh Adam optimizer on each call.
train(model, train_loader, test_loader, vocab_size, num_epochs=25)

Epoch 1/25 | Train Loss: 0.4660 | Val Loss: 1.3109
Epoch 2/25 | Train Loss: 0.4301 | Val Loss: 1.3256
Epoch 3/25 | Train Loss: 0.4093 | Val Loss: 1.3385
Epoch 4/25 | Train Loss: 0.3873 | Val Loss: 1.3756
Epoch 5/25 | Train Loss: 0.3738 | Val Loss: 1.4017
Epoch 6/25 | Train Loss: 0.3531 | Val Loss: 1.4187
Epoch 7/25 | Train Loss: 0.3376 | Val Loss: 1.4391
Epoch 8/25 | Train Loss: 0.3207 | Val Loss: 1.4662
Epoch 9/25 | Train Loss: 0.3036 | Val Loss: 1.4819
Epoch 10/25 | Train Loss: 0.2872 | Val Loss: 1.5067
Epoch 11/25 | Train Loss: 0.2718 | Val Loss: 1.5175
Epoch 12/25 | Train Loss: 0.2532 | Val Loss: 1.5516
Epoch 13/25 | Train Loss: 0.2442 | Val Loss: 1.5718
Epoch 14/25 | Train Loss: 0.2355 | Val Loss: 1.5953
Epoch 15/25 | Train Loss: 0.2303 | Val Loss: 1.6487
Epoch 16/25 | Train Loss: 0.2297 | Val Loss: 1.6254
Epoch 17/25 | Train Loss: 0.2108 | Val Loss: 1.6725
Epoch 18/25 | Train Loss: 0.1975 | Val Loss: 1.7077
Epoch 19/25 | Train Loss: 0.1855 | Val Loss: 1.7225
Epoch 20/25 | Train L

In [26]:
# Generate ten pieces, each seeded with a randomly chosen vocabulary token,
# and write them out as MIDI files.
# Make sure the output directory exists before any file is written into it.
os.makedirs("rnn_50", exist_ok=True)
for i, start_token in enumerate(random.sample(list(tokenizer.vocab.values()), 10)):
  generated_sequence = sample(model, start_token, max_length=1024)
  # Decode once; the result is only needed for the dump below.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
  output_score.dump_midi(f"rnn_50/rnn_{i}.mid")


In [27]:
# Zip the rnn_50 output directory and download the archive through the
# google.colab `files` helper.
!zip -r rnn_50.zip ./rnn_50
files.download("rnn_50.zip")

  adding: rnn_50/ (stored 0%)
  adding: rnn_50/rnn_5.mid (deflated 60%)
  adding: rnn_50/rnn_0.mid (deflated 67%)
  adding: rnn_50/rnn_1.mid (deflated 63%)
  adding: rnn_50/rnn_3.mid (deflated 59%)
  adding: rnn_50/rnn_2.mid (deflated 79%)
  adding: rnn_50/rnn_6.mid (deflated 66%)
  adding: rnn_50/rnn_4.mid (deflated 64%)
  adding: rnn_50/rnn_7.mid (deflated 62%)
  adding: rnn_50/rnn_8.mid (deflated 68%)
  adding: rnn_50/rnn_9.mid (deflated 70%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>