In [None]:

import os
from gtts import gTTS

# === One-syllable words list ===
# Vocabulary for the synthetic "lyrics": each entry is synthesized to
# words/<word>.mp3 by the generation loop further down in this cell.
one_syllable_words = [
    "cat", "dog", "sun", "run", "walk", "jump", "red", "blue", "black", "green",
    "fish", "ship", "hat", "bat", "rat", "mat", "cap", "map", "trap", "snap",
    "clap", "trip", "grip", "drip", "flip", "clip", "kick", "pick", "lick", "stick",
    "brick", "click", "quick", "thick", "thin", "win", "sin", "bin", "pin", "tin",
    "fin", "rim", "dim", "jam", "ram", "dam", "bam", "mad", "sad", "bad", "glad",
    "pad", "lad", "dad", "bag", "tag", "lag", "nag", "zag", "tug", "hug", "bug",
    "jug", "mug", "rug", "dug", "cup", "pup", "up", "mud", "bud", "stud", "cut",
    "gut", "nut", "but", "shut", "put", "pot", "dot", "got", "hot", "not", "rot",
    "lot", "shot", "plot", "spot", "drop", "stop", "crop", "hop", "pop", "top"
]

# === Create folder ===
os.makedirs("words", exist_ok=True)

# === Generate audio files ===
# Clips that are already on disk (and non-empty) are reused, so re-running this
# cell does not synthesize all of the words again; delete a file to force it to
# be regenerated.
for word in one_syllable_words:
    file_path = os.path.join("words", f"{word}.mp3")
    if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
        print(f"⏭️ Already exists, skipping: {file_path}")
        continue
    tts = gTTS(text=word, lang='en')
    tts.save(file_path)
    print(f"✅ Saved: {file_path}")

✅ Saved: words/cat.mp3
✅ Saved: words/dog.mp3
✅ Saved: words/sun.mp3
✅ Saved: words/run.mp3
✅ Saved: words/walk.mp3
✅ Saved: words/jump.mp3
✅ Saved: words/red.mp3
✅ Saved: words/blue.mp3
✅ Saved: words/black.mp3
✅ Saved: words/green.mp3
✅ Saved: words/fish.mp3
✅ Saved: words/ship.mp3
✅ Saved: words/hat.mp3
✅ Saved: words/bat.mp3
✅ Saved: words/rat.mp3
✅ Saved: words/mat.mp3
✅ Saved: words/cap.mp3
✅ Saved: words/map.mp3
✅ Saved: words/trap.mp3
✅ Saved: words/snap.mp3
✅ Saved: words/clap.mp3
✅ Saved: words/trip.mp3
✅ Saved: words/grip.mp3
✅ Saved: words/drip.mp3
✅ Saved: words/flip.mp3
✅ Saved: words/clip.mp3
✅ Saved: words/kick.mp3
✅ Saved: words/pick.mp3
✅ Saved: words/lick.mp3
✅ Saved: words/stick.mp3
✅ Saved: words/brick.mp3
✅ Saved: words/click.mp3
✅ Saved: words/quick.mp3
✅ Saved: words/thick.mp3
✅ Saved: words/thin.mp3
✅ Saved: words/win.mp3
✅ Saved: words/sin.mp3
✅ Saved: words/bin.mp3
✅ Saved: words/pin.mp3
✅ Saved: words/tin.mp3
✅ Saved: words/fin.mp3
✅ Saved: words/rim.mp3
✅ S

Next, we create a melody as a MIDI file. The target is a melody with 1000 notes (note that the loop in the cell below currently stops at 200 notes). We'll extend this when we want more data, but at the moment this corresponds to 2 generated songs.

In [None]:
from midiutil import MIDIFile
import random

# === Helper Functions ===
def get_minor_scale_with_octaves(root_midi):
    """Return the natural-minor scale on `root_midi` across three octaves.

    The result lists the seven scale degrees starting at `root_midi`, then the
    same degrees one octave lower (-12 semitones), then one octave higher
    (+12 semitones): 21 MIDI note numbers in that order.
    """
    degree_offsets = (0, 2, 3, 5, 7, 8, 10)
    return [
        root_midi + offset + octave_shift
        for octave_shift in (0, -12, 12)
        for offset in degree_offsets
    ]

# === CONFIG ===
# MIDI container for the melody; every tempo change and note below is added on
# `track` 0 / `channel` 0 with a fixed note `volume`.
mf = MIDIFile(1)
track = 0
channel = 0
volume = 100
# Tempo (BPM) in effect from time 0 until the generation loop switches style.
default_tempo = 90
mf.addTempo(track, 0, default_tempo)

# Candidate note lengths in beats, and the relative weights used by
# choose_duration(): 1440 out of a total of 1600 is a 90% share for 16th notes.
durations = [0.25, 0.5, 1, 2]           # 16th, 8th, quarter, half
weights =   [1440, 80, 53, 27]          # scaled to preserve 90% 16th bias

def choose_duration():
    """Draw a single note length from `durations`, biased by `weights`."""
    (picked,) = random.choices(durations, weights=weights)
    return picked

current_time = 0
three_minutes_beats = 3 * default_tempo  # beats that fit into 3 minutes at `default_tempo` BPM

# === Initialize first key and tempo
current_tempo = default_tempo
current_root = 59  # B3
current_scale = get_minor_scale_with_octaves(current_root)
mf.addTempo(track, current_time, current_tempo)

# === Melody Generation ===
last_pitch = random.choice(current_scale)  # start with any note
note_count = 0

while note_count < 200:
    # Draw the base length through the weighted sampler so the 16th-note bias
    # configured in `weights` is actually applied; a plain
    # random.choice(durations) ignores the weights and picks all four lengths
    # equally often.
    duration = choose_duration()

    # Style switch if 3 minutes of MIDI time (in beats) have passed
    if current_time >= three_minutes_beats:
        current_time = round(current_time, 2)
        current_tempo = random.randint(20, 200)
        current_root = random.randint(48, 72)
        current_scale = get_minor_scale_with_octaves(current_root)
        mf.addTempo(track, current_time, current_tempo)
        # Next switch: 3 more minutes, measured in beats at the new tempo.
        three_minutes_beats = current_time + 3 * current_tempo

    # Filter notes within ±7 semitones of last pitch
    candidates = [p for p in current_scale if abs(p - last_pitch) <= 7]
    if not candidates:
        # Right after a key change the previous pitch can be more than 7
        # semitones away from every note of the new scale. Repeating
        # `last_pitch` would leave the melody stuck on that out-of-key note
        # until the next key change, so move to the closest note of the new
        # scale instead.
        candidates = [min(current_scale, key=lambda p: abs(p - last_pitch))]

    pitch = random.choice(candidates)

    # Optional tied note: 30% of the notes are lengthened by one extra duration
    if random.random() < 0.3:
        tie = random.choice(durations)
        duration += tie

    mf.addNote(track, channel, pitch, current_time, duration, volume)
    current_time += duration
    last_pitch = pitch
    note_count += 1

# === Save MIDI
with open("melody.mid", "wb") as f:
    mf.writeFile(f)

print("🎵 Saved as melody.mid")

🎵 Saved as melody.mid


Now we have both lyrics and a melody. In this step we combine them to create the training data. For now we only handle the rhythm; we are still working on adding pitch correctly. Apart from pitch, we can extend the data as much as we like. The step after the next one is where we actually train the AI.

In [None]:

import os
import random
import numpy as np
import pretty_midi
import soundfile as sf
import subprocess
import librosa

# === CONFIG ===
sr = 22050
words_folder = "words"
midi_path = "melody.mid"
output_path = "melody.mp3"
wav_path = output_path.replace(".mp3", ".wav")  # intermediate file, removed at the end

# Raise pretty_midi's module-level tick limit
# (presumably so that very long MIDI files still load; confirm against pretty_midi).
pretty_midi.pretty_midi.MAX_TICK = 1e10

# === Load MIDI ===
print(f"🎵 Loading MIDI file: {midi_path}")
midi_data = pretty_midi.PrettyMIDI(midi_path)
duration_sec = midi_data.get_end_time() + 1  # one extra second after the last note
print(f"🎹 MIDI loaded. Total duration: {duration_sec / 3600:.2f} hours")

# === Prepare blank WAV file ===
# Opening in write mode and closing immediately leaves an empty mono 16-bit
# file that the note loop later reopens and fills.
print(f"📝 Preparing blank WAV file: {wav_path}")
with sf.SoundFile(wav_path, mode='w', samplerate=sr, channels=1, subtype='PCM_16'):
    pass

# === Get word file paths ===
word_files = []
for entry in os.listdir(words_folder):
    if entry.endswith(".mp3"):
        word_files.append(os.path.join(words_folder, entry))
if not word_files:
    raise RuntimeError("No word audio files found in 'words' folder!")

# === Process notes ===
# For every MIDI note: pick a random word clip, time-stretch it to the note's
# length, apply a random gain and short fades, and write it into the WAV file
# at the note's start position.
note_count = 0
fade_samples = int(0.02 * sr)  # 20 ms fade length, identical for every note
word_cache = {}                # word path -> decoded audio, so each clip is decoded once
for instrument in midi_data.instruments:
    for note in instrument.notes:
        duration = note.end - note.start
        if duration <= 0:
            # A zero-length note has no room for a word and would make the
            # stretch rate below divide by zero.
            continue

        note_count += 1
        if note_count % 100 == 1:
            print(f"🎶 Processing note {note_count}: Pitch {note.pitch}, Time {note.start:.2f}s → {note.end:.2f}s")

        start_sample = int(note.start * sr)

        # Choose a random word audio file
        word_path = random.choice(word_files)
        if word_path not in word_cache:
            word_cache[word_path], _ = librosa.load(word_path, sr=sr)
        word_audio = word_cache[word_path]

        # Time-stretch to fit note duration.
        # librosa's `rate` is a speed factor: rate > 1 shortens the signal and
        # the output lasts (input length / rate). To end up with `duration`
        # seconds the rate must therefore be current_duration / duration;
        # passing the reciprocal stretches the clip the wrong way.
        current_duration = len(word_audio) / sr
        stretch_rate = current_duration / duration
        word_stretched = librosa.effects.time_stretch(word_audio, rate=stretch_rate)

        # Vary volume randomly between 40% and 100%
        gain = random.uniform(0.4, 1.0)
        word_stretched *= gain

        # Apply fade-in/out to avoid clicks. The fade is capped at half the
        # clip so that very short clips cannot overrun the envelope.
        fade = min(fade_samples, len(word_stretched) // 2)
        if fade > 0:
            envelope = np.ones_like(word_stretched)
            envelope[:fade] = np.linspace(0, 1, fade)
            envelope[-fade:] = np.linspace(1, 0, fade)
            word_stretched *= envelope

        # Write to WAV
        with sf.SoundFile(wav_path, mode='r+') as f:
            current_frames = f.frames
            # Pad with silence if needed
            if start_sample > current_frames:
                f.seek(0, sf.SEEK_END)
                f.write(np.zeros(start_sample - current_frames))
            # Write at correct time location
            f.seek(start_sample)
            f.write(word_stretched)

print(f"✅ Synthesis complete. {note_count} notes written to: {wav_path}")

# === Normalize ===
# Scale the track so its largest absolute sample is 1.0. A silent (or empty)
# track is left untouched, because dividing by a zero peak would fill the
# file with NaNs.
print("📏 Normalizing audio...")
y, _ = librosa.load(wav_path, sr=sr)
peak = np.max(np.abs(y)) if y.size else 0.0
if peak > 0:
    y = y / peak
sf.write(wav_path, y, sr)
print("✅ Normalization done.")

# === Convert to MP3 ===
# check=True raises CalledProcessError when ffmpeg exits with an error, so the
# clean-up below never deletes the WAV file unless the MP3 was produced.
print(f"🔄 Converting to MP3: {output_path}")
subprocess.run(["ffmpeg", "-y", "-i", wav_path, output_path], check=True)

# === Clean up ===
print(f"🧹 Removing intermediate WAV file: {wav_path}")
os.remove(wav_path)
print(f"🎉 Done! MP3 saved to: {output_path}")

🎵 Loading MIDI file: melody.mid
🎹 MIDI loaded. Total duration: 0.04 hours
📝 Preparing blank WAV file: melody.wav
🎶 Processing note 1: Pitch 55, Time 0.00s → 0.17s
🎶 Processing note 101: Pitch 57, Time 72.67s → 72.83s
✅ Synthesis complete. 200 notes written to: melody.wav
📏 Normalizing audio...
✅ Normalization done.
🔄 Converting to MP3: melody.mp3
🧹 Removing intermediate WAV file: melody.wav
🎉 Done! MP3 saved to: melody.mp3


In [None]:
# Collect the start time of every non-drum note and write the times, sorted,
# one per line, to onsets.txt.
midi = pretty_midi.PrettyMIDI("melody.mid")
onset_times = [
    note.start  # start time in SECONDS, with tempo applied
    for inst in midi.instruments
    if not inst.is_drum
    for note in inst.notes
]

# Save to file
with open("onsets.txt", "w") as f:
    f.writelines(f"{t:.6f}\n" for t in sorted(onset_times))

print(f"\n✅ Done! Extracted {len(onset_times)} onsets from melody.mid.")
print("🎧 Audio file: melody.mp3")
print("📄 Onset labels saved to: onsets.txt (in seconds)")
print("🧠 You can now use this for training your onset detection model.")


✅ Done! Extracted 200 onsets from melody.mid.
🎧 Audio file: melody.mp3
📄 Onset labels saved to: onsets.txt (in seconds)
🧠 You can now use this for training your onset detection model.


This is the part where we actually train the AI so that it eventually becomes very good at onset detection.

In [None]:

import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os

# === CONFIG ===
AUDIO_PATH = "trainingaudio.mp3"
LABEL_PATH = "trainingonsets.txt"
SR = 22050
HOP_LENGTH = 512
N_MELS = 32
CONTEXT = 7            # frames of context on each side of the classified frame
EPOCHS = 20000000      # effectively "until interrupted"; a checkpoint is saved on every new best F1
BATCH_SIZE = 64
LR = 1e-2
TOLERANCE = 0.04  # seconds

# === Load audio and extract features ===
# log_S has one row per frame and one column per mel band.
y, sr = librosa.load(AUDIO_PATH, sr=SR)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
log_S = librosa.power_to_db(S, ref=np.max).T

# === Load onset labels ===
# The label file holds one onset time (in seconds) per line; it is read inside
# a `with` block so the handle is closed again.
with open(LABEL_PATH) as label_file:
    onsets = np.array([float(line.strip()) for line in label_file if line.strip()])
onset_frames = librosa.time_to_frames(onsets, sr=sr, hop_length=HOP_LENGTH)

# Frame-level targets: every frame within TOLERANCE of an onset is positive.
labels = np.zeros(log_S.shape[0])
tolerance_frames = int(TOLERANCE * sr / HOP_LENGTH)
for frame in onset_frames:
    start = max(0, frame - tolerance_frames)
    end = min(len(labels), frame + tolerance_frames + 1)
    labels[start:end] = 1

# === Dataset ===
class OnsetDataset(Dataset):
    """Context windows over a frame-major feature matrix, with per-frame labels.

    Item `idx` is centred on frame `idx + context` and covers `context` frames
    on each side, so the first and last `context` frames of `X` are never used
    as centres.

    Args:
        X: array indexed as X[frame, feature].
        y: per-frame labels, indexed by frame.
        context: number of frames on each side of the centre frame.
    """

    def __init__(self, X, y, context):
        self.X = X
        self.y = y
        self.context = context

    def __len__(self):
        # Clamp at zero: inputs shorter than one full window would otherwise
        # produce a negative length, which makes len() raise ValueError.
        return max(0, len(self.X) - 2 * self.context)

    def __getitem__(self, idx):
        """Return (window, label): a float32 [1, feature, time] tensor and a float32 scalar.

        Raises:
            IndexError: if `idx` is outside [0, len(self)); without this check
                an out-of-range index would yield a truncated window.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")
        i = idx + self.context
        x = self.X[i - self.context:i + self.context + 1].T
        x = np.expand_dims(x, 0)  # [1, mel, time]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(self.y[i], dtype=torch.float32)

# === Smaller CNN ===
class SmallOnsetCNN(nn.Module):
    """Tiny CNN that maps a [batch, 1, H, W] input to one probability per item.

    One 3x3 convolution (16 channels) with ReLU, global average pooling, and a
    linear layer followed by a sigmoid. The layers live in `self.model` in a
    fixed order, which determines the parameter names stored in checkpoints.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # collapse both spatial axes
            nn.Flatten(),
            nn.Linear(16, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output, shape [batch, 1]."""
        probabilities = self.model(x)
        return probabilities

# === Train ===
# Windowed training set, served in shuffled mini-batches.
dataset = OnsetDataset(log_S, labels, CONTEXT)
loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SmallOnsetCNN()
model = model.to(device)

# Binary cross-entropy on the model's sigmoid output, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

from sklearn.metrics import precision_score, recall_score
import librosa
import numpy as np
import torch

# === Load correct onset times once (as floats in seconds) ===
# One onset time per line; blank lines are ignored.
with open("evaluateonsets.txt") as f:
    stripped_lines = (line.strip() for line in f)
    correct_onsets = np.array([float(text) for text in stripped_lines if text != ''])

# === Load evaluation audio features from vocal.mp3 ===
# Same mel settings as the training features, transposed to frame-major.
EVAL_AUDIO_PATH = "vocal.mp3"
y_eval, _ = librosa.load(EVAL_AUDIO_PATH, sr=SR)
S_eval = librosa.feature.melspectrogram(
    y=y_eval,
    sr=SR,
    n_mels=N_MELS,
    hop_length=HOP_LENGTH,
)
log_S_eval = librosa.power_to_db(S_eval, ref=np.max).T

# === F1 Scoring Function with Time Tolerance ===
def tolerant_f1(model, log_S, correct_onsets, context, device, sr, hop_length, threshold=0.5, tolerance=0.05):
    """Score the model's onset predictions against ground truth with a time tolerance.

    Every frame that has `context` frames on both sides is classified. Each
    contiguous run of frames whose probability is >= `threshold` is reduced to
    ONE predicted onset, placed at the most probable frame of the run. Counting
    every above-threshold frame separately would turn a single detected onset
    that spans several frames into one hit plus several false positives.

    Predictions and ground-truth onsets are then matched one-to-one in time
    order; a pair matches when the two times differ by at most `tolerance`.

    Args:
        model: callable module mapping a [N, 1, feature, time] float tensor to
            N probabilities.
        log_S: frame-major feature matrix, indexed as log_S[frame, feature].
        correct_onsets: ground-truth onset times in seconds.
        context: frames of context on each side of the classified frame.
        device: device the input batch is moved to.
        sr: sample rate used to convert frame indices to seconds.
        hop_length: hop size in samples between consecutive frames.
        threshold: minimum probability for a frame to count as active.
        tolerance: maximum |predicted - true| distance in seconds for a match.

    Returns:
        The F1 score as a float (0.0 when nothing matches).
    """
    model.eval()

    # --- Classify every frame that has a full context window ---
    centers = range(context, len(log_S) - context)
    probs = np.zeros(0, dtype=np.float32)
    if len(centers) > 0:
        windows = np.stack([
            np.expand_dims(log_S[i - context:i + context + 1].T, 0)
            for i in centers
        ])
        batch = torch.tensor(windows, dtype=torch.float32).to(device)
        with torch.no_grad():
            # reshape(-1) keeps a 1-D result even when there is a single frame.
            probs = model(batch).reshape(-1).cpu().numpy()

    # --- Peak picking: one onset per contiguous above-threshold run ---
    pred_times = []
    run_start = None
    last_index = len(probs) - 1
    for k in range(len(probs)):
        active = bool(probs[k] >= threshold)
        if active and run_start is None:
            run_start = k
        if run_start is not None and (not active or k == last_index):
            run_end = k + 1 if active else k
            peak = run_start + int(np.argmax(probs[run_start:run_end]))
            # probs[k] belongs to frame k + context; frame -> seconds.
            pred_times.append((peak + context) * hop_length / sr)
            run_start = None

    # --- One-to-one matching with a two-pointer sweep over sorted times ---
    true_times = sorted(float(t) for t in correct_onsets)
    tp = 0
    ti = pi = 0
    while ti < len(true_times) and pi < len(pred_times):
        diff = pred_times[pi] - true_times[ti]
        if abs(diff) <= tolerance:
            tp += 1
            ti += 1
            pi += 1
        elif diff < 0:
            pi += 1  # prediction too early to match this or any later onset
        else:
            ti += 1  # this onset was missed

    fp = len(pred_times) - tp
    fn = len(correct_onsets) - tp

    # The small epsilon avoids 0/0 when there are no predictions or no onsets.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    return f1

# === Training Loop ===
# One pass over `loader` per epoch, followed by an F1 evaluation on the
# evaluation features; the weights are checkpointed whenever F1 improves.
print("🚀 Starting training...")
best_f1 = 0.0
loss_per_epoch = []

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    print(f"📘 Epoch {epoch+1}/{EPOCHS}")
    # Batch variables are named batch_x / batch_y so they do not overwrite the
    # notebook-level `y` (the training waveform).
    for batch_idx, (batch_x, batch_y) in enumerate(loader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        # squeeze(1) drops only the output dimension: [B, 1] -> [B]. A bare
        # squeeze() would also drop the batch dimension when the last batch
        # holds a single sample, and the shapes would no longer match batch_y.
        pred = model(batch_x).squeeze(1)
        loss = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size so avg_loss is a per-sample mean.
        total_loss += loss.item() * batch_x.size(0)

    avg_loss = total_loss / len(dataset)
    loss_per_epoch.append(avg_loss)

    # === Compute F1 Score using vocal.mp3 ===
    # Pass TOLERANCE explicitly so the score really uses the tolerance that is
    # reported in the log line (the function's own default is 50 ms).
    f1 = tolerant_f1(model, log_S_eval, correct_onsets, CONTEXT, device, SR, HOP_LENGTH, tolerance=TOLERANCE)

    if f1 > best_f1:
        best_f1 = f1
        print(f"✅ Avg Loss: {avg_loss:.6f} | 🎯 New Best F1: {f1:.4f} 🏆")
        # Save checkpoint
        model_path = f"onset_model_epoch{epoch+1}.pth"
        torch.save(model.state_dict(), model_path)
        print(f"💾 Model saved: {model_path}")
    else:
        print(f"✅ Avg Loss: {avg_loss:.6f} | 🎯 F1 Score (±{TOLERANCE * 1000:.0f}ms): {f1:.4f}")

🚀 Starting training...
📘 Epoch 1/20000000
✅ Avg Loss: 0.509180 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 2/20000000
✅ Avg Loss: 0.487469 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 3/20000000
✅ Avg Loss: 0.481399 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 4/20000000
✅ Avg Loss: 0.475846 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 5/20000000
✅ Avg Loss: 0.472476 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 6/20000000
✅ Avg Loss: 0.470986 | 🎯 New Best F1: 0.0044 🏆
💾 Model saved: onset_model_epoch6.pth
📘 Epoch 7/20000000
✅ Avg Loss: 0.462866 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 8/20000000
✅ Avg Loss: 0.464269 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 9/20000000
✅ Avg Loss: 0.454953 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 10/20000000
✅ Avg Loss: 0.456438 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 11/20000000
✅ Avg Loss: 0.455833 | 🎯 New Best F1: 0.0428 🏆
💾 Model saved: onset_model_epoch11.pth
📘 Epoch 12/20000000
✅ Avg Loss: 0.454939 | 🎯 F1 Score (±40ms): 0.0000
📘 Epoch 13/20000000
✅ Avg Loss: 0.451311 | 🎯 New Best F1: 0.0625 🏆
💾 Model 

In [None]:

# === CREATE PREVIEW ===
# Cut the beginning of the full MP3 into a shorter preview file.
from pydub import AudioSegment

# === PREVIEW CONFIG ===
input_audio_path = "melody.mp3"          # 👈 Your full audio file
preview_output_path = "preview.mp3"
preview_minutes = 10
preview_duration_ms = preview_minutes * 60 * 1000     # slice bound in milliseconds

full_audio = AudioSegment.from_mp3(input_audio_path)
preview_audio = full_audio[:preview_duration_ms]
preview_audio.export(preview_output_path, format="mp3")

print(f"🎧 Preview saved to: {preview_output_path}")

KeyboardInterrupt: 

In [None]:
!pip install midiutil pretty_midi

Collecting midiutil
  Downloading MIDIUtil-1.2.1.tar.gz (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: midiutil, pretty_midi
  Building wheel for midiutil (setup.py) ... [?25l[?25hdone
  Created wheel for midiutil: filename=MIDIUtil-1.2.1-py3-none-any.whl size=54569 sha256=bcf05634758a62360fd

In [None]:

import librosa
import numpy as np
import torch
import torch.nn as nn

# === CONFIG ===
# Audio to evaluate on, the trained weights, and the ground-truth onset times.
AUDIO_PATH = "vocal.mp3"
# NOTE(review): the training cell saves checkpoints as onset_model_epoch<N>.pth;
# presumably one of them is copied or renamed to this path by hand (confirm).
MODEL_PATH = "Onset.pth"
LABEL_PATH = "evaluateonsets.txt"
# Feature settings; these repeat the values of the training cell's config.
SR = 22050
HOP_LENGTH = 512
N_MELS = 32
CONTEXT = 7
# Minimum predicted probability for a frame to count as an onset.
THRESHOLD = 0.5
# Maximum distance between a predicted and a true onset for a match.
TOLERANCE = 0.05  # seconds

# === Model Definition ===
class SmallOnsetCNN(nn.Module):
    """Tiny CNN that maps a [batch, 1, H, W] input to one probability per item.

    One 3x3 convolution (16 channels) with ReLU, global average pooling, and a
    linear layer followed by a sigmoid. The layers live in `self.model` in a
    fixed order, which determines the parameter names expected in checkpoints.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # collapse both spatial axes
            nn.Flatten(),
            nn.Linear(16, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output, shape [batch, 1]."""
        probabilities = self.model(x)
        return probabilities

# === Load model ===
# Build the network on the available device, then restore the saved weights
# and switch to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SmallOnsetCNN().to(device)
saved_weights = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(saved_weights)
model.eval()

# === Load audio and features ===
# Log-mel features, transposed to frame-major.
y, sr = librosa.load(AUDIO_PATH, sr=SR)
S = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_mels=N_MELS,
    hop_length=HOP_LENGTH,
)
log_S = librosa.power_to_db(S, ref=np.max).T

# === Load ground-truth onsets (in seconds) ===
# One onset time per line; blank lines are ignored.
with open(LABEL_PATH) as f:
    stripped_lines = (line.strip() for line in f)
    correct_onsets = np.array([float(text) for text in stripped_lines if text])

# === Evaluation Function ===
def tolerant_f1(model, log_S, correct_onsets, context, device, sr, hop_length, threshold=0.5, tolerance=0.05):
    """Score the model's onset predictions against ground truth and print a report.

    Every frame that has `context` frames on both sides is classified. Each
    contiguous run of frames whose probability is >= `threshold` is reduced to
    ONE predicted onset, placed at the most probable frame of the run. Counting
    every above-threshold frame separately would turn a single detected onset
    that spans several frames into one hit plus several false positives.

    Predictions and ground-truth onsets are then matched one-to-one in time
    order; a pair matches when the two times differ by at most `tolerance`.

    Args:
        model: callable module mapping a [N, 1, feature, time] float tensor to
            N probabilities.
        log_S: frame-major feature matrix, indexed as log_S[frame, feature].
        correct_onsets: ground-truth onset times in seconds.
        context: frames of context on each side of the classified frame.
        device: device the input batch is moved to.
        sr: sample rate used to convert frame indices to seconds.
        hop_length: hop size in samples between consecutive frames.
        threshold: minimum probability for a frame to count as active.
        tolerance: maximum |predicted - true| distance in seconds for a match.

    Returns:
        The F1 score as a float (0.0 when nothing matches). Precision, recall
        and the TP/FP/FN counts are printed as a side effect.
    """
    model.eval()

    # --- Classify every frame that has a full context window ---
    centers = range(context, len(log_S) - context)
    probs = np.zeros(0, dtype=np.float32)
    if len(centers) > 0:
        windows = np.stack([
            np.expand_dims(log_S[i - context:i + context + 1].T, 0)
            for i in centers
        ])
        batch = torch.tensor(windows, dtype=torch.float32).to(device)
        with torch.no_grad():
            # reshape(-1) keeps a 1-D result even when there is a single frame.
            probs = model(batch).reshape(-1).cpu().numpy()

    # --- Peak picking: one onset per contiguous above-threshold run ---
    pred_times = []
    run_start = None
    last_index = len(probs) - 1
    for k in range(len(probs)):
        active = bool(probs[k] >= threshold)
        if active and run_start is None:
            run_start = k
        if run_start is not None and (not active or k == last_index):
            run_end = k + 1 if active else k
            peak = run_start + int(np.argmax(probs[run_start:run_end]))
            # probs[k] belongs to frame k + context; frame -> seconds.
            pred_times.append((peak + context) * hop_length / sr)
            run_start = None

    # --- One-to-one matching with a two-pointer sweep over sorted times ---
    true_times = sorted(float(t) for t in correct_onsets)
    tp = 0
    ti = pi = 0
    while ti < len(true_times) and pi < len(pred_times):
        diff = pred_times[pi] - true_times[ti]
        if abs(diff) <= tolerance:
            tp += 1
            ti += 1
            pi += 1
        elif diff < 0:
            pi += 1  # prediction too early to match this or any later onset
        else:
            ti += 1  # this onset was missed

    fp = len(pred_times) - tp
    fn = len(correct_onsets) - tp

    # The small epsilon avoids 0/0 when there are no predictions or no onsets.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    print(f"🎯 Evaluation F1 Score (±{tolerance*1000:.0f}ms): {f1:.4f}")
    print(f"   Precision: {precision:.4f} | Recall: {recall:.4f} | TP: {tp}, FP: {fp}, FN: {fn}")
    return f1

# === Run Evaluation ===
# The call is the cell's last expression, so the returned F1 is also displayed.
tolerant_f1(
    model,
    log_S,
    correct_onsets,
    CONTEXT,
    device,
    SR,
    HOP_LENGTH,
    threshold=THRESHOLD,
    tolerance=TOLERANCE,
)

🎯 Evaluation F1 Score (±50ms): 0.1218
   Precision: 0.0794 | Recall: 0.2612 | TP: 117, FP: 1356, FN: 331


0.12181155290324952