# BPSD: Transcription of Audio Recordings


- Transcribe all audio files using the NoteEM transcription model from B. Maman and A. Bermano, "Unaligned supervision for automatic music transcription in the wild," International Conference on Machine Learning (ICML), 2022.
- Requirement: transcription code and checkpoint from https://github.com/benadar293/benadar293.github.io/

Ben Maman, Johannes Zeitler (johannes.zeitler@audiolabs-erlangen.de), 2024

Provide the path to the Onsets and Frames transcription code:

In [6]:
import os
# Placeholder: replace with the local checkout of the transcription code.
# This directory is appended to sys.path so that `onsets_and_frames` can be
# imported, and it is also the root under which the outputs are written.
onsets_frames_dir = "path_to_transcription_code"

In [7]:
import sys
# Must run before the `onsets_and_frames` imports below, which are resolved
# through this path entry.
sys.path.append(onsets_frames_dir)

import torch
# Inference only: switch off autograd globally for the whole notebook.
torch.set_grad_enabled(False)
# NOTE(review): later cells use names that are never imported explicitly
# (e.g. MelSpectrogram, SAMPLE_RATE, N_MELS, N_KEYS, DEFAULT_DEVICE, np,
# frames2midi, max_inst) — presumably they all come from these star imports.
from onsets_and_frames.mel import *
from onsets_and_frames.midi_utils import *
import soundfile
import librosa
from glob import glob

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm


In [11]:
# --- Mel-spectrogram settings (handed to MelSpectrogram in the transcription cell) ---
CURR_N_MELS = N_MELS
CURR_MEL_FMIN = 30
CURR_NORM = 'slaney'
CURR_N_FFT = 2048
CURR_WIN_LENGTH = 2048
CURR_HOP_LENGTH = 512

# --- Segmentation ---
# Longest stretch of audio, in samples, that is transcribed in one forward pass
# (ten minutes); longer recordings are processed segment by segment to prevent OOM.
MAX_TIME = 600 * SAMPLE_RATE

# --- Instrument handling ---
# True: use the general instrument group rather than the specific instrument.
USE_INSTRUMENT_GROUP = False
instrument_restriction = [0]

# --- Checkpoints ---
# Pitch model: fine-tuned by Ben Maman on Beethoven Piano Sonatas.
ckpt = "path_to_checkpoint"
# No separate instrument model is used; the instrument branches below stay inactive.
instrument_ckpt = None

In [20]:
# Output layout: <transcription code dir>/transcriptions/BPSD/{midi,raw_features}
out_pth = os.path.join(onsets_frames_dir, "transcriptions", "BPSD")
os.makedirs(out_pth, exist_ok=True)
for subdir in ("midi", "raw_features"):
    os.makedirs(os.path.join(out_pth, subdir), exist_ok=True)

In [13]:
# Decision thresholds applied to the onset and frame predictions when the
# MIDI files are written.
onset_th, frame_th = 0.5, 0.5

# Forwarded unchanged to max_inst(); only relevant when an instrument model is used.
inactive = None

In [14]:
# Load the complete pickled transcriber and move it to the GPU.
# NOTE: torch.load unpickles arbitrary Python objects — only load checkpoints
# from a trusted source.
model = torch.load(ckpt).cuda()
# Put dropout / batch-norm layers into inference mode; a checkpoint stored
# while training would otherwise keep its training-mode behaviour.
model.eval()
print('model type', type(model))

model type <class 'onsets_and_frames.transcriber.OnsetsAndFramesMultiV12'>


In [15]:
# Directory holding the audio recordings, relative to this notebook.
audio_dir = os.path.join("../", "1_Audio")

# Keep only files whose name ends in ".wav" (case-insensitive). A plain
# substring test would also pick up sidecar files such as "x.wav.reapeaks".
pieces = sorted(f for f in os.listdir(audio_dir) if f.lower().endswith(".wav"))

# Full path of every recording, in sorted filename order.
audio_pths = [os.path.join(audio_dir, piece) for piece in pieces]

In [18]:
verbose = False  # set to True to print per-file / per-segment debug information

In [None]:
# Transcribe every recording in `audio_pths`.
#
# For each file: load the audio at SAMPLE_RATE, compute mel spectrograms in
# segments of at most MAX_TIME samples, run the transcription model on each
# segment, concatenate the predictions along the time axis, and write
#   * <out_pth>/midi/<name>.mid          (thresholded transcription)
#   * <out_pth>/raw_features/<name>.npz  (raw onset / frame predictions)
#
# After the loop, `onset_pred` (numpy array) and `frame_pred` (torch tensor)
# still hold the predictions of the last processed recording.
#
# NOTE(review): `np` is not imported explicitly in the visible cells —
# presumably it is provided by the `onsets_and_frames` star imports.
melspectrogram = MelSpectrogram(CURR_N_MELS, SAMPLE_RATE, filter_length=CURR_N_FFT, win_length=CURR_WIN_LENGTH,
                                hop_length=CURR_HOP_LENGTH, mel_fmin=CURR_MEL_FMIN, mel_fmax=8000,
                                norm=CURR_NORM)
melspectrogram.to(DEFAULT_DEVICE)

for pth in tqdm(audio_pths):
    if verbose: print('pth', pth)
    # Resample to the rate the mel front end is configured for.
    audio, sr = librosa.load(pth, sr=SAMPLE_RATE)
    if verbose: print('sr', sr)
    if verbose: print('audio min max', audio.min(), audio.max())
    if audio.ndim > 1:
        # librosa returns multi-channel audio as (channels, samples): average the channels.
        audio = audio.mean(axis=0)
    audio = np.clip(audio, a_min=-1., a_max=1.)
    audio = torch.from_numpy(audio).float().cuda().unsqueeze(0)

    # Split into segments of at most MAX_TIME samples (to prevent OOM); a short
    # recording is simply one segment.
    audio_inp_len = audio.shape[1]
    seg_starts = range(0, max(audio_inp_len, 1), MAX_TIME)
    if verbose and len(seg_starts) > 1:
        print('long audio, splitting to {} segments'.format(len(seg_starts)))

    onsets_preds = []
    frame_preds = []
    inst_onsets_preds = []
    for i_s, seg_start in enumerate(seg_starts):
        if verbose: print('segment', i_s)
        curr = audio[:, seg_start: seg_start + MAX_TIME]
        curr_mel = melspectrogram(curr).permute((0, 2, 1))
        # The model returns (onset, offset, activation, frame) with an optional
        # fifth velocity output. Indexing handles both variants with a single
        # forward pass and lets genuine errors (e.g. out of memory) propagate.
        outputs = model(curr_mel)
        onsets_preds.append(outputs[0])
        frame_preds.append(outputs[3])
        if instrument_ckpt is not None:
            # NOTE(review): `instrument_model` is not defined in the visible cells;
            # it has to be loaded before `instrument_ckpt` is set.
            inst_onsets_preds.append(instrument_model(curr_mel)[0])

    # Stitch the per-segment predictions back together along the time axis.
    pitch_onset_pred = torch.cat(onsets_preds, dim=1)
    frame_pred = torch.cat(frame_preds, dim=1)
    if verbose: print('pitch onset shape', pitch_onset_pred.shape)

    if instrument_ckpt is not None:
        inst_onset_pred = torch.cat(inst_onsets_preds, dim=1)
        if verbose: print('inst onset pred shape', inst_onset_pred.shape)

        if instrument_ckpt == 'D:/onsets-and-frames-master-multi/ckpts/transcriber-220315-040403-model-13.pt':
            if USE_INSTRUMENT_GROUP:
                # in this model, the last 5 * 88 entries are for: percussion, pluck, strings, wind, and general pitch
                inst_onset_pred = inst_onset_pred[:, :, - 5 * N_KEYS:]
            else:
                # disregard the general instrument class but use the general pitch classes:
                inst_onset_pred = torch.cat((inst_onset_pred[:, :, : - 5 * N_KEYS], inst_onset_pred[:, :, -N_KEYS:]), dim=-1)

        # In the instrument models, the last N_KEYS=88 entries are pitch classes.
        # Take the maximum activation between the pitch model and those entries.
        # Alternative: pitch from the pitch model only, instruments from the instrument model:
        # onset_pred = torch.cat((inst_onset_pred[:, :, : -N_KEYS], pitch_onset_pred), dim=-1)
        onset_pred = inst_onset_pred
        onset_pred[:, :, -N_KEYS:] = torch.maximum(onset_pred[:, :, -N_KEYS:], pitch_onset_pred[:, :, :])

        onset_pred = onset_pred.squeeze().cpu().numpy()
        # choose maximum likelihood instrument for detected pitch classes:
        onset_pred = max_inst(onset_pred, thr=onset_th, inactive=inactive)[:, : -N_KEYS]
    else:
        onset_pred = pitch_onset_pred.squeeze().cpu().numpy()

    # File name without directory and extension; independent of the path
    # separator and safe for names that contain additional dots.
    piece_name = os.path.splitext(os.path.basename(pth))[0]

    # Every detected onset gets the constant velocity 64.
    # NOTE(review): `mappings` is not defined in the visible cells; it is only
    # needed when an instrument model is used.
    frames2midi(os.path.join(out_pth, "midi", piece_name + '.mid'), onset_pred, frame_pred.squeeze().cpu().numpy(), 64 * (onset_pred >= onset_th),
                inst_mapping=[0] if instrument_ckpt is None else mappings[instrument_ckpt],
                scaling=CURR_HOP_LENGTH / SAMPLE_RATE,
                onset_threshold=onset_th, frame_threshold=frame_th)

    # Keep the raw predictions together with the time-axis parameters.
    np.savez(os.path.join(out_pth, "raw_features", piece_name + '.npz'),
             onset_pred=onset_pred,
             frame_pred=frame_pred.squeeze().cpu().numpy(),
             sample_rate=SAMPLE_RATE,
             hop_length=CURR_HOP_LENGTH)

In [None]:
# Visual check of the predictions left over from the last transcribed
# recording: onset predictions on top, frame predictions below.
# x axis: time in seconds (frames * hop length / sample rate); y axis: 1..88.
duration_s = onset_pred.shape[0] / SAMPLE_RATE * CURR_HOP_LENGTH

fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(15, 10))

panels = (
    (ax0, onset_pred.T),
    (ax1, frame_pred.squeeze().cpu().numpy().T),
)
images = []
for ax, data in panels:
    im = ax.imshow(data, extent=[0, duration_s, 1, 88], origin='lower', aspect='auto', cmap='gray_r')
    plt.colorbar(im, ax=ax)
    ax.grid()
    images.append(im)
im0, im1 = images

# Show only the first 15 seconds.
lims = [0, 15]
for ax in (ax0, ax1):
    ax.set_xlim(lims)