Next, we create a melody as a MIDI file containing 2000 notes. We'll extend this when we want more data, but at the moment this corresponds to 2 generated songs.

In [None]:
from midiutil import MIDIFile
import random

# === Helper Functions ===
def get_minor_scale_with_octaves(root_midi):
    """Return the natural-minor scale on ``root_midi`` spread over three octaves.

    The result lists the seven scale degrees starting at ``root_midi``,
    followed by the same degrees one octave lower and then one octave
    higher (so it is not sorted by pitch). Pitches outside the valid MIDI
    note range 0-127 are dropped so every returned value can be written to
    a MIDI file.

    Args:
        root_midi: MIDI note number of the scale root.

    Returns:
        A list of MIDI note numbers (at most 21 entries).
    """
    intervals = (0, 2, 3, 5, 7, 8, 10)  # semitone offsets of the natural minor scale
    base = [root_midi + step for step in intervals]
    pitches = base + [n - 12 for n in base] + [n + 12 for n in base]
    return [p for p in pitches if 0 <= p <= 127]

# === CONFIG ===
mf = MIDIFile(1)
track = 0
channel = 0
volume = 100
default_tempo = 120  # beats per minute

durations = [0.25, 0.5, 1, 2]  # possible note lengths, in beats
current_time = 0  # running write position, in beats
# Three minutes at the current tempo, expressed in beats (tempo is beats/minute).
three_minutes_beats = 3 * default_tempo

# === Initialize first key and tempo
current_tempo = default_tempo
current_root = 59  # B3
current_scale = get_minor_scale_with_octaves(current_root)
# The initial tempo is registered once here (the original registered the
# identical tempo event at time 0 twice).
mf.addTempo(track, current_time, current_tempo)

# === Melody Generation ===
last_pitch = random.choice(current_scale)  # start with any note
note_count = 0

while note_count < 2000:
    duration = random.choice(durations)

    # Style switch if 3 minutes of MIDI time (in beats) have passed:
    # pick a new tempo and a new minor key, and schedule the next switch.
    if current_time >= three_minutes_beats:
        current_time = round(current_time, 2)
        current_tempo = random.randint(20, 200)
        current_root = random.randint(48, 72)
        current_scale = get_minor_scale_with_octaves(current_root)
        mf.addTempo(track, current_time, current_tempo)
        three_minutes_beats = current_time + 3 * current_tempo

    # Filter notes within ±7 semitones of last pitch
    candidates = [p for p in current_scale if abs(p - last_pitch) <= 7]
    if not candidates:
        # Right after a key change the previous pitch can be more than 7
        # semitones away from every note of the new scale. Falling back to
        # last_pitch itself would repeat that out-of-key note until the next
        # key change, so jump to the closest note(s) of the new scale instead.
        nearest = min(abs(p - last_pitch) for p in current_scale)
        candidates = [p for p in current_scale if abs(p - last_pitch) == nearest]

    pitch = random.choice(candidates)

    # Optional tied note: 30% of notes are lengthened by a second random duration
    if random.random() < 0.3:
        tie = random.choice(durations)
        duration += tie

    mf.addNote(track, channel, pitch, current_time, duration, volume)
    current_time += duration
    last_pitch = pitch
    note_count += 1

# === Save MIDI
with open("melody.mid", "wb") as f:
    mf.writeFile(f)

print("🎵 Saved as melody.mid")

🎵 Saved as melody.mid


Now we have both lyrics and a melody. In this step we combine them to create the training data. For now we are only handling the rhythm; we are still working on adding pitch correctly. Apart from pitch, we can extend the data as much as we like, and in the step after next we actually train the AI.

In [None]:

import os
import numpy as np
import pretty_midi
import soundfile as sf
import subprocess
from IPython.display import clear_output
import librosa

# === CONFIG ===
sr = 22050                    # sample rate used for synthesis and the WAV file
midi_path = "melody.mid"      # MIDI file to render
output_path = "melody.mp3"    # final encoded audio
# Intermediate WAV shares the MP3's base name.
wav_path = os.path.splitext(output_path)[0] + ".wav"

# Prevent tick errors in long MIDI files
pretty_midi.pretty_midi.MAX_TICK = 1e10

# === Helpers ===
def midi_to_freq(midi_note):
    """Convert a MIDI note number to a frequency in Hz (note 69 maps to 440 Hz)."""
    semitones_from_a4 = midi_note - 69
    octaves_from_a4 = semitones_from_a4 / 12
    return 440.0 * (2 ** octaves_from_a4)

def generate_sine_with_vibrato(freq, duration_sec, sr=22050, vibrato_freq=5, vibrato_depth=1):
    """Synthesize a sine tone with sinusoidal phase vibrato and linear fades.

    Args:
        freq: Tone frequency in Hz.
        duration_sec: Tone length in seconds.
        sr: Sample rate in Hz.
        vibrato_freq: Vibrato rate in Hz.
        vibrato_depth: Peak phase deviation (radians) added by the vibrato.

    Returns:
        1-D float array of ``int(sr * duration_sec)`` samples. A 20 ms linear
        fade-in and fade-out is applied; for tones shorter than 40 ms each
        fade is shortened to half the tone so the two never overlap and very
        short (or empty) tones no longer raise a shape error.
    """
    n_samples = max(0, int(sr * duration_sec))
    t = np.linspace(0, duration_sec, n_samples, False)
    phase = 2 * np.pi * freq * t + np.sin(2 * np.pi * vibrato_freq * t) * vibrato_depth
    wave = np.sin(phase)

    # Clamp the fade so both ramps fit inside the tone.
    fade_samples = min(int(0.02 * sr), n_samples // 2)
    if fade_samples > 0:
        envelope = np.ones_like(wave)
        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
        wave = wave * envelope
    return wave

# === Load MIDI ===
print(f"🎵 Loading MIDI file: {midi_path}")
midi_data = pretty_midi.PrettyMIDI(midi_path)
duration_sec = midi_data.get_end_time() + 1
print(f"🎹 MIDI loaded. Total duration: {duration_sec / 3600:.2f} hours")

# === Create WAV file and stream audio note by note ===
# The file is opened once and written sequentially; the original reopened it
# for every note, which produced the same bytes at much higher cost.
print(f"📝 Preparing blank WAV file: {wav_path}")
note_count = 0
with sf.SoundFile(wav_path, mode='w', samplerate=sr, channels=1, subtype='PCM_16') as f:
    frames_written = 0
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            note_count += 1

            if note_count % 100 == 1:
                #clear_output(wait=True)
                print(f"🎶 Processing note {note_count}: Pitch {note.pitch}, Time {note.start:.2f}s → {note.end:.2f}s")

            freq = midi_to_freq(note.pitch)
            duration = note.end - note.start
            start_sample = int(note.start * sr)

            sine_wave = generate_sine_with_vibrato(freq, duration, sr=sr)

            # Pad with silence up to the note's start. NOTE: a note that
            # starts before the current end of the file is appended, not
            # mixed, so this renderer is only exact for monophonic input.
            if start_sample > frames_written:
                silence = np.zeros(start_sample - frames_written)
                f.write(silence)
                frames_written += len(silence)

            f.write(sine_wave)
            frames_written += len(sine_wave)

print(f"✅ Synthesis complete. {note_count} notes written to: {wav_path}")

# === Normalize ===
print("📏 Normalizing audio...")
y, _ = librosa.load(wav_path, sr=sr)
peak = np.max(np.abs(y)) if y.size else 0.0
if peak > 0:  # skip scaling for empty/silent audio instead of dividing by zero
    y = y / peak
sf.write(wav_path, y, sr)
print("✅ Normalization done.")

# === Convert to MP3 ===
print(f"🔄 Converting to MP3: {output_path}")
# check=True raises CalledProcessError if ffmpeg fails, so the WAV below is
# only deleted after a successful conversion.
subprocess.run(["ffmpeg", "-y", "-i", wav_path, output_path], check=True)

# === Clean up ===
print(f"🧹 Removing intermediate WAV file: {wav_path}")
os.remove(wav_path)

print(f"🎉 Done! MP3 saved to: {output_path}")

🎵 Loading MIDI file: melody.mid
🎹 MIDI loaded. Total duration: 0.30 hours
📝 Preparing blank WAV file: melody.wav
🎶 Processing note 1: Pitch 54, Time 0.00s → 1.25s
🎶 Processing note 101: Pitch 78, Time 70.62s → 71.62s
🎶 Processing note 201: Pitch 71, Time 138.00s → 138.62s
🎶 Processing note 301: Pitch 52, Time 202.86s → 203.35s
🎶 Processing note 401: Pitch 55, Time 260.69s → 261.65s
🎶 Processing note 501: Pitch 60, Time 320.56s → 320.81s
🎶 Processing note 601: Pitch 43, Time 369.71s → 370.42s
🎶 Processing note 701: Pitch 43, Time 406.05s → 406.13s
🎶 Processing note 801: Pitch 71, Time 448.99s → 449.71s
🎶 Processing note 901: Pitch 59, Time 500.15s → 500.24s
🎶 Processing note 1001: Pitch 67, Time 548.36s → 548.54s
🎶 Processing note 1101: Pitch 66, Time 594.24s → 594.42s
🎶 Processing note 1201: Pitch 81, Time 636.16s → 636.33s
🎶 Processing note 1301: Pitch 83, Time 680.10s → 680.36s
🎶 Processing note 1401: Pitch 77, Time 723.49s → 723.62s
🎶 Processing note 1501: Pitch 58, Time 787.97s → 7

In [None]:
midi = pretty_midi.PrettyMIDI("melody.mid")

# Gather the start time of every note on the non-drum instruments
# (start time in SECONDS, with tempo applied).
onset_times = [
    note.start
    for inst in midi.instruments
    if not inst.is_drum
    for note in inst.notes
]

# Save to file: one onset per line, ascending, six decimal places.
with open("onsets.txt", "w") as f:
    f.writelines(f"{t:.6f}\n" for t in sorted(onset_times))

print(f"\n✅ Done! Extracted {len(onset_times)} onsets from melody.mid.")
print("🎧 Audio file: melody.mp3")
print("📄 Onset labels saved to: onsets.txt (in seconds)")
print("🧠 You can now use this for training your onset detection model.")


✅ Done! Extracted 2000 onsets from melody.mid.
🎧 Audio file: melody.mp3
📄 Onset labels saved to: onsets.txt (in seconds)
🧠 You can now use this for training your onset detection model.


This is the part where we actually train the AI so that it eventually becomes very good at onset detection.

In [None]:

import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os

# === CONFIG ===
AUDIO_PATH = "melody.mp3"
LABEL_PATH = "onsets.txt"
SR = 22050
HOP_LENGTH = 512
N_MELS = 32
CONTEXT = 7
EPOCHS = 20000000
BATCH_SIZE = 64
LR = 1e-2
TOLERANCE = 0.04  # seconds

# === Load audio and extract features ===
y, sr = librosa.load(AUDIO_PATH, sr=SR)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
# Transposed so the first axis indexes frames (one row per spectrogram frame).
log_S = librosa.power_to_db(S, ref=np.max).T

# === Load onset labels ===
# One onset time (seconds) per non-blank line; the file is closed once read.
with open(LABEL_PATH) as label_file:
    onsets = np.array([float(line.strip()) for line in label_file if line.strip()])
onset_frames = librosa.time_to_frames(onsets, sr=sr, hop_length=HOP_LENGTH)

# Per-frame binary target: 1 within TOLERANCE (rounded down to whole frames)
# of any onset frame, 0 elsewhere.
labels = np.zeros(log_S.shape[0])
tolerance_frames = int(TOLERANCE * sr / HOP_LENGTH)
for onset_frame in onset_frames:
    start = max(0, onset_frame - tolerance_frames)
    end = min(len(labels), onset_frame + tolerance_frames + 1)
    labels[start:end] = 1

# === Dataset ===
class OnsetDataset(Dataset):
    """Sliding-window dataset over a frame-major feature matrix.

    Item ``idx`` is centred on frame ``idx + context`` and spans ``context``
    frames on each side, so only frames with a full window are exposed.

    Args:
        X: Array sliced along its first axis by frame
           (assumed [frame, feature] — confirm against caller).
        y: Per-frame targets, indexable by frame number.
        context: Number of frames on each side of the centre frame.
    """

    def __init__(self, X, y, context):
        self.X = X
        self.y = y
        self.context = context

    def __len__(self):
        # Clamp at zero: inputs shorter than one full window would otherwise
        # yield a negative length, which makes len() raise ValueError.
        return max(0, len(self.X) - 2 * self.context)

    def __getitem__(self, idx):
        """Return ``(window, target)`` tensors for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")
        i = idx + self.context
        window = self.X[i - self.context:i + self.context + 1].T
        window = np.expand_dims(window, 0)  # [1, mel, time]
        return torch.tensor(window, dtype=torch.float32), torch.tensor(self.y[i], dtype=torch.float32)

# === Smaller CNN ===
class SmallOnsetCNN(nn.Module):
    """Tiny CNN: one 3x3 convolution, global average pooling, one sigmoid output.

    The layers live in ``self.model`` (an ``nn.Sequential``) in a fixed
    order, which determines the parameter names stored in checkpoints.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # collapse both spatial axes to 1x1
            nn.Flatten(),
            nn.Linear(in_features=16, out_features=1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the stack on ``x`` and return one value in [0, 1] per sample."""
        probabilities = self.model(x)
        return probabilities

# === Train ===
# Windowed dataset over the spectrogram frames, served in shuffled mini-batches.
dataset = OnsetDataset(X=log_S, y=labels, context=CONTEXT)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = SmallOnsetCNN()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=LR)
loss_fn = nn.BCELoss()  # the model already ends in a sigmoid

from sklearn.metrics import precision_score, recall_score
import librosa
import numpy as np
import torch

# === Load correct onset times once (as floats in seconds) ===
# Blank lines are skipped; every remaining line is parsed as one float.
with open("evaluateonsets.txt") as f:
    stripped_lines = (line.strip() for line in f)
    correct_onsets = np.array([float(text) for text in stripped_lines if text != ''])

# === F1 Scoring Function with Time Tolerance ===
def tolerant_f1(model, log_S, correct_onsets, context, device, sr, hop_length, threshold=0.5, tolerance=0.05):
    """Compute a time-tolerant F1 score for per-frame onset predictions.

    The model is run on every frame that has ``context`` frames on both
    sides. Each frame whose output is ``>= threshold`` becomes a predicted
    onset at time ``frame * hop_length / sr``. Reference onsets are then
    matched greedily, in order, to the first still-unmatched prediction
    within ``tolerance`` seconds.

    NOTE(review): every above-threshold frame counts as its own predicted
    onset, so several adjacent frames firing around one true onset add
    false positives; consider peak-picking before scoring.

    Args:
        model: Module mapping a ``[N, 1, feature, time]`` batch to one score per sample.
        log_S: 2-D array whose first axis is the frame index
            (presumably [frame, mel bin] — confirm against caller).
        correct_onsets: Reference onset times in seconds.
        context: Frames on each side of the centre frame in every window.
        device: Torch device on which the model is evaluated.
        sr: Sample rate used to convert frame indices to seconds.
        hop_length: Hop length (samples) used to convert frame indices to seconds.
        threshold: Minimum model output for a frame to count as an onset.
        tolerance: Maximum |prediction - reference| in seconds for a match.

    Returns:
        The F1 score as a float; 0.0 when nothing matches, including when
        ``log_S`` is too short to form a single window.
    """
    model.eval()

    # Centre frames that have a full context window on both sides.
    centers = np.arange(context, len(log_S) - context)
    pred_times = []
    if centers.size:
        windows = np.stack([log_S[i - context:i + context + 1].T for i in centers])
        X = torch.tensor(windows[:, np.newaxis], dtype=torch.float32).to(device)
        with torch.no_grad():
            # reshape(-1) keeps a 1-D result even when there is a single window
            # (a bare squeeze() would collapse it to a 0-d array).
            preds = model(X).reshape(-1).cpu().numpy()
        # Same conversion as librosa.frames_to_time, vectorised over all frames.
        frame_times = centers * hop_length / float(sr)
        pred_times = [t for t, p in zip(frame_times, preds) if p >= threshold]

    # === Match predictions to ground-truth with tolerance ===
    matched_pred = set()
    matched_true = set()

    for i, true_onset in enumerate(correct_onsets):
        for j, pred_onset in enumerate(pred_times):
            if j in matched_pred:
                continue
            if abs(pred_onset - true_onset) <= tolerance:
                matched_true.add(i)
                matched_pred.add(j)
                break

    tp = len(matched_true)
    fp = len(pred_times) - tp
    fn = len(correct_onsets) - tp

    # The small epsilon keeps the ratios defined when a denominator is zero.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    return f1

# === Training Loop ===
print("🚀 Starting training...")
best_f1 = 0.0
loss_per_epoch = []

for epoch in range(EPOCHS):
    model.train()  # tolerant_f1() switches the model to eval mode each epoch
    total_loss = 0

    print(f"📘 Epoch {epoch+1}/{EPOCHS}")
    for batch_x, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        # squeeze(1) drops only the output-channel axis; a bare squeeze()
        # would also drop the batch axis when the final batch has one sample,
        # making the prediction shape disagree with the target.
        pred = model(batch_x).squeeze(1)
        loss = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size so avg_loss is a true per-sample mean.
        total_loss += loss.item() * batch_x.size(0)

    avg_loss = total_loss / len(dataset)
    loss_per_epoch.append(avg_loss)

    # === Compute F1 Score using time-based onset matching ===
    # TOLERANCE is passed explicitly so the score matches the tolerance shown
    # in the log line below (the function's own default is 0.05 s).
    # NOTE(review): log_S is the spectrogram of AUDIO_PATH while correct_onsets
    # is read from evaluateonsets.txt — confirm both describe the same recording.
    f1 = tolerant_f1(model, log_S, correct_onsets, CONTEXT, device, SR, HOP_LENGTH, tolerance=TOLERANCE)

    if f1 > best_f1:
        best_f1 = f1
        print(f"✅ Avg Loss: {avg_loss:.6f} | 🎯 New Best F1: {f1:.4f} 🏆")
        # Save checkpoint
        model_path = f"onset_model_epoch{epoch+1}.pth"
        torch.save(model.state_dict(), model_path)
        print(f"💾 Model saved: {model_path}")
    else:
        print(f"✅ Avg Loss: {avg_loss:.6f} | 🎯 F1 Score (±{TOLERANCE * 1000:.0f}ms): {f1:.4f}")

🚀 Starting training...
📘 Epoch 1/20000000
✅ Avg Loss: 0.631828 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 2/20000000
✅ Avg Loss: 0.382571 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 3/20000000
✅ Avg Loss: 0.380891 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 4/20000000
✅ Avg Loss: 0.376295 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 5/20000000
✅ Avg Loss: 0.375688 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 6/20000000


In [None]:

from pydub import AudioSegment

# === PREVIEW CONFIG ===
input_audio_path = "melody.mp3"          # full-length audio to cut from
preview_output_path = "preview.mp3"      # where the excerpt is written
preview_minutes = 10
preview_duration_ms = preview_minutes * 60 * 1000  # slice length in milliseconds

# === CREATE PREVIEW ===
# Load the full file, keep only the leading slice, and re-export it as MP3.
full_audio = AudioSegment.from_mp3(input_audio_path)
preview_audio = full_audio[:preview_duration_ms]
preview_audio.export(preview_output_path, format="mp3")

print(f"🎧 Preview saved to: {preview_output_path}")

KeyboardInterrupt: 

In [None]:
!pip install midiutil pretty_midi

Collecting midiutil
  Downloading MIDIUtil-1.2.1.tar.gz (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: midiutil, pretty_midi
  Building wheel for midiutil (setup.py) ... [?25l[?25hdone
  Created wheel for midiutil: filename=MIDIUtil-1.2.1-py3-none-any.whl size=54569 sha256=837a53ea3e5851e1650

In [None]:

import librosa
import numpy as np
import torch
import torch.nn as nn

# === CONFIG ===
# Settings for scoring a saved model against reference onsets.
AUDIO_PATH = "vocal.mp3"            # audio to evaluate, loaded with librosa
MODEL_PATH = "Onset.pth"            # saved state_dict loaded into SmallOnsetCNN
LABEL_PATH = "evaluateonsets.txt"   # reference onsets, one float (seconds) per line
SR = 22050                          # sample rate the audio is loaded at
HOP_LENGTH = 512                    # spectrogram hop; also converts frames to seconds
N_MELS = 32                         # number of mel bands in the spectrogram
CONTEXT = 7                         # frames on each side of the centre frame per window
THRESHOLD = 0.5                     # model output >= this counts as a predicted onset
TOLERANCE = 0.05  # seconds; max |prediction - reference| for a match

# === Model Definition ===
class SmallOnsetCNN(nn.Module):
    """Tiny CNN: one 3x3 convolution, global average pooling, one sigmoid output.

    The layers live in ``self.model`` (an ``nn.Sequential``) in a fixed
    order, which determines the parameter names expected in checkpoints.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # collapse both spatial axes to 1x1
            nn.Flatten(),
            nn.Linear(in_features=16, out_features=1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the stack on ``x`` and return one value in [0, 1] per sample."""
        probabilities = self.model(x)
        return probabilities

# === Load model ===
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = SmallOnsetCNN().to(device)
saved_weights = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(saved_weights)
model.eval()

# === Load audio and features ===
y, sr = librosa.load(AUDIO_PATH, sr=SR)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
# Transposed so the first axis indexes frames.
log_S = librosa.power_to_db(S, ref=np.max).T

# === Load ground-truth onsets (in seconds) ===
# Blank lines are skipped; every remaining line is parsed as one float.
with open(LABEL_PATH) as f:
    non_blank = [line.strip() for line in f if line.strip()]
    correct_onsets = np.array([float(text) for text in non_blank])

# === Evaluation Function ===
def tolerant_f1(model, log_S, correct_onsets, context, device, sr, hop_length, threshold=0.5, tolerance=0.05):
    """Compute and print a time-tolerant F1 score for per-frame onset predictions.

    The model is run on every frame that has ``context`` frames on both
    sides. Each frame whose output is ``>= threshold`` becomes a predicted
    onset at time ``frame * hop_length / sr``. Reference onsets are then
    matched greedily, in order, to the first still-unmatched prediction
    within ``tolerance`` seconds.

    NOTE(review): every above-threshold frame counts as its own predicted
    onset, so several adjacent frames firing around one true onset add
    false positives; consider peak-picking before scoring.

    Args:
        model: Module mapping a ``[N, 1, feature, time]`` batch to one score per sample.
        log_S: 2-D array whose first axis is the frame index
            (presumably [frame, mel bin] — confirm against caller).
        correct_onsets: Reference onset times in seconds.
        context: Frames on each side of the centre frame in every window.
        device: Torch device on which the model is evaluated.
        sr: Sample rate used to convert frame indices to seconds.
        hop_length: Hop length (samples) used to convert frame indices to seconds.
        threshold: Minimum model output for a frame to count as an onset.
        tolerance: Maximum |prediction - reference| in seconds for a match.

    Returns:
        The F1 score as a float; 0.0 when nothing matches, including when
        ``log_S`` is too short to form a single window. Precision, recall and
        the TP/FP/FN counts are printed as a side effect.
    """
    model.eval()

    # Centre frames that have a full context window on both sides.
    centers = np.arange(context, len(log_S) - context)
    pred_times = []
    if centers.size:
        windows = np.stack([log_S[i - context:i + context + 1].T for i in centers])
        X = torch.tensor(windows[:, np.newaxis], dtype=torch.float32).to(device)
        with torch.no_grad():
            # reshape(-1) keeps a 1-D result even when there is a single window
            # (a bare squeeze() would collapse it to a 0-d array).
            preds = model(X).reshape(-1).cpu().numpy()
        # Same conversion as librosa.frames_to_time, vectorised over all frames.
        frame_times = centers * hop_length / float(sr)
        pred_times = [t for t, p in zip(frame_times, preds) if p >= threshold]

    matched_pred = set()
    matched_true = set()

    for i, true_onset in enumerate(correct_onsets):
        for j, pred_onset in enumerate(pred_times):
            if j in matched_pred:
                continue
            if abs(pred_onset - true_onset) <= tolerance:
                matched_true.add(i)
                matched_pred.add(j)
                break

    tp = len(matched_true)
    fp = len(pred_times) - tp
    fn = len(correct_onsets) - tp

    # The small epsilon keeps the ratios defined when a denominator is zero.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    print(f"🎯 Evaluation F1 Score (±{tolerance*1000:.0f}ms): {f1:.4f}")
    print(f"   Precision: {precision:.4f} | Recall: {recall:.4f} | TP: {tp}, FP: {fp}, FN: {fn}")
    return f1

# === Run Evaluation ===
# Left as a bare expression so the notebook also displays the returned score.
tolerant_f1(
    model,
    log_S,
    correct_onsets,
    CONTEXT,
    device,
    SR,
    HOP_LENGTH,
    threshold=THRESHOLD,
    tolerance=TOLERANCE,
)

🎯 Evaluation F1 Score (±50ms): 0.1218
   Precision: 0.0794 | Recall: 0.2612 | TP: 117, FP: 1356, FN: 331


0.12181155290324952