**Load Dependencies**

In [0]:
!pip install pretty_midi
!pip install librosa
!pip install mir_eval
!pip install tables

In [0]:
!wget http://hog.ee.columbia.edu/craffel/lmd/match_scores.json

In [0]:
!wget http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz

In [0]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pretty_midi
import librosa
import mir_eval
import mir_eval.display
import tables
import IPython.display
import os
import json

# Local path constants
DATA_PATH = 'data'
RESULTS_PATH = ''
# Path to the file match_scores.json distributed with the LMD
SCORE_FILE = os.path.join(RESULTS_PATH, 'match_scores.json')

# Utility functions for retrieving paths
def msd_id_to_dirs(msd_id):
    """Given an MSD ID, generate the path prefix.
    E.g. TRABCD12345678 -> A/B/C/TRABCD12345678"""
    return os.path.join(msd_id[2], msd_id[3], msd_id[4], msd_id)

def msd_id_to_mp3(msd_id):
    """Given an MSD ID, return the path to the corresponding mp3"""
    return os.path.join(DATA_PATH, 'msd', 'mp3',
                        msd_id_to_dirs(msd_id) + '.mp3')

def msd_id_to_h5(h5):
    """Given an MSD ID, return the path to the corresponding h5"""
    return os.path.join(RESULTS_PATH, 'lmd_matched_h5',
                        msd_id_to_dirs(msd_id) + '.h5')

def get_midi_path(msd_id, midi_md5, kind):
    """Given an MSD ID and MIDI MD5, return path to a MIDI file.
    kind should be one of 'matched' or 'aligned'. """
    return os.path.join(RESULTS_PATH, 'lmd_{}'.format(kind),
                        msd_id_to_dirs(msd_id), midi_md5 + '.mid')

**Match Dataset**

In [48]:
with open(SCORE_FILE) as f:
    scores = json.load(f)
    keys = list(scores)
# Grab a Million Song Dataset ID from the scores dictionary
msd_id = list(scores)[20]
print("Million Song Dataset ID {} has {} MIDI file matches:".format(msd_id, len(scores[msd_id])))
          
for midi_md5, score in scores[msd_id].items():
    print('  {} with confidence score {}'.format(midi_md5, score))

Million Song Dataset ID TREGGML12903CC0740 has 2 MIDI file matches:
  33e3ed0356f8356f0a89a4dce5be88ad with confidence score 0.7378106284252202
  ffca0861fd3a0d536ebdabdf9853a3fe with confidence score 0.7378074024297993


In [12]:
while True:
    # Grab an MSD ID and its dictionary of matches
    msd_id, matches = scores.popitem()
    # Grab a MIDI from the matches
    midi_md5, score = matches.popitem()
    # Construct the path to the aligned MIDI
    matched_midi_path = 'lmd_aligned.mid' 
    # Load/parse the MIDI file with pretty_midi
    pm = pretty_midi.PrettyMIDI(matched_midi_path)
    # Look for a MIDI file which has lyric and key signature change events
    if len(pm.lyrics) > 5 and len(pm.key_signature_changes) > 0:
        break

OSError: ignored