# Install piano_transcription_inference

In [1]:
!apt install ffmpeg wget

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [2]:
pip install piano_transcription_inference



In [3]:
from IPython.display import Audio

# Transcribe using code

In [1]:
!pip uninstall -y librosa
!pip install librosa==0.11.0

from piano_transcription_inference import PianoTranscription, sample_rate
import librosa
import numpy as np
import audioread

# Redefine load_audio to completely bypass librosa.load for more robustness
def _my_custom_load_audio(path, sr=None, mono=True, offset=0.0, duration=None, dtype='float32', res_type='soxr_hq', backends=None):
    y = np.empty(0, dtype=dtype)
    current_sr = None

    with audioread.audio_open(path) as input_file:
        current_sr = input_file.samplerate

        # Handle offset
        if offset > 0:
            input_file.skip(offset)

        # Read frames and convert to float
        for frame in input_file:
            frame_float = librosa.util.buf_to_float(frame, dtype=dtype)
            y = np.append(y, frame_float)

            # Handle duration (approximate for audioread, more precise with librosa.load)
            if duration is not None and len(y) / current_sr > duration:
                y = y[:int(duration * current_sr)]
                break

    # Convert to mono if requested and multichannel
    if mono:
        if y.ndim > 1:
            y = np.mean(y, axis=1)

    # Resample if target sr is different from original sr
    if sr is not None and current_sr != sr:
        y = librosa.resample(y, orig_sr=current_sr, target_sr=sr, res_type=res_type)
        current_sr = sr # Update current_sr to target sr if resampled

    return y, current_sr

def transcribe(audio_path, output_midi_path):
    # Load audio using our redefined custom load_audio function
    audio, _ = _my_custom_load_audio(audio_path, sr=sample_rate, mono=True)

    # Transcriptor
    transcriptor = PianoTranscription(device='cuda', checkpoint_path=None)

    # Transcribe and write out to MIDI file
    transcriptor.transcribe(audio, output_midi_path)

Found existing installation: librosa 0.11.0
Uninstalling librosa-0.11.0:
  Successfully uninstalled librosa-0.11.0
Collecting librosa==0.11.0
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Using cached librosa-0.11.0-py3-none-any.whl (260 kB)
Installing collected packages: librosa
Successfully installed librosa-0.11.0


In [17]:
!wget https://github.com/qiuqiangkong/piano_transcription_inference/raw/master/resources/cut_liszt.mp3

--2025-11-03 08:32:16--  https://github.com/qiuqiangkong/piano_transcription_inference/raw/master/resources/cut_liszt.mp3
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/qiuqiangkong/piano_transcription_inference/master/resources/cut_liszt.mp3 [following]
--2025-11-03 08:32:16--  https://raw.githubusercontent.com/qiuqiangkong/piano_transcription_inference/master/resources/cut_liszt.mp3
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 240633 (235K) [audio/mpeg]
Saving to: ‘cut_liszt.mp3.3’


2025-11-03 08:32:17 (106 MB/s) - ‘cut_liszt.mp3.3’ saved [240633/240633]



In [18]:
Audio('/content/sample_data/evans.mp3')

In [2]:
# Removed problematic pip uninstall/install commands. Will use system's default librosa/numpy.

# The actual fix will be in the transcribe function definition, to correct the load_audio's internal call.
# Re-run transcribe with the fixed load_audio (defined in the previous cell now).
transcribe('/content/sample_data/evans.mp3', 'evans.mid')

Checkpoint path: /root/piano_transcription_inference_data/note_F1=0.9677_pedal_F1=0.9186.pth
Total size: ~165 MB
Using cuda for inference.
GPU number: 1
Segment 0 / 45
Segment 1 / 45
Segment 2 / 45
Segment 3 / 45
Segment 4 / 45
Segment 5 / 45
Segment 6 / 45
Segment 7 / 45
Segment 8 / 45
Segment 9 / 45
Segment 10 / 45
Segment 11 / 45
Segment 12 / 45
Segment 13 / 45
Segment 14 / 45
Segment 15 / 45
Segment 16 / 45
Segment 17 / 45
Segment 18 / 45
Segment 19 / 45
Segment 20 / 45
Segment 21 / 45
Segment 22 / 45
Segment 23 / 45
Segment 24 / 45
Segment 25 / 45
Segment 26 / 45
Segment 27 / 45
Segment 28 / 45
Segment 29 / 45
Segment 30 / 45
Segment 31 / 45
Segment 32 / 45
Segment 33 / 45
Segment 34 / 45
Segment 35 / 45
Segment 36 / 45
Segment 37 / 45
Segment 38 / 45
Segment 39 / 45
Segment 40 / 45
Segment 41 / 45
Segment 42 / 45
Segment 43 / 45
Segment 44 / 45
Segment 45 / 45
Write out to evans.mid


# Render midi plot

In [None]:
!apt install git

In [None]:
!git clone https://github.com/bytedance/piano_transcription.git

In [None]:
pip install -r piano_transcription/requirements.txt

In [None]:
!mkdir results

In [None]:
import sys
sys.path.append('piano_transcription/utils')
sys.path.append('piano_transcription/pytorch')
from collections import namedtuple

plot_args = namedtuple('PlotArgs', ['audio_path', 'midi_path'])

from plot_for_paper import plot_midi
plot_midi(plot_args('cut_liszt.mp3', 'cut_liszt.mid'))