# Measuring Statistics about Information Sources in MIDI Files

A MIDI file can provide a cornucopia of musical information about a given piece of music, including transcription, key, lyrics, and meter.
However, the presence and quantity of each of these sources of information can vary.
Through a large-scale web scrape, we obtained 178,561 unique (i.e. having different MD5 checksums) MIDI files.
This notebook measures the availability of each possible source of information in MIDI files in this collection of MIDIs found "in the wild".

In [None]:
import pretty_midi
import numpy as np
import joblib
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook', font_scale=1.5)
import matplotlib.gridspec
import cPickle as pickle
import collections
import os
# plotting.py contains utility functions for making nice histogram plots
import plotting

## Compute statistics

The `compute_statistics` function takes in a MIDI file and simply collects the number and values of different events (for example, key change values and tempo settings) in the file.
We will call this function for every entry in our 178,561 MIDI file collection and use these measurements to study the distributions of different event values and quantities of different event types.

In [None]:
def compute_statistics(midi_file):
    """
    Collect a dictionary of event statistics for a single MIDI file.

    Parameters
    ----------
    midi_file : str
        Path to a MIDI file.

    Returns
    -------
    statistics : dict or None
        Dictionary with the keys ``'n_instruments'``, ``'program_numbers'``
        (programs of the non-drum instruments), ``'key_numbers'``,
        ``'tempos'`` and ``'time_signature_changes'``.  ``None`` is returned
        when loading the file or reading any of these values raises.
    """
    # Invalid MIDI files raise on loading.  Everything stays inside one try
    # block so that any failure simply marks the file as unusable.
    try:
        midi_data = pretty_midi.PrettyMIDI(midi_file)
        melodic_programs = [instrument.program
                            for instrument in midi_data.instruments
                            if not instrument.is_drum]
        key_numbers = [change.key_number
                       for change in midi_data.key_signature_changes]
        # The tempo values are the second item returned by get_tempo_changes
        tempos = list(midi_data.get_tempo_changes()[1])
        return {
            'n_instruments': len(midi_data.instruments),
            'program_numbers': melodic_programs,
            'key_numbers': key_numbers,
            'tempos': tempos,
            'time_signature_changes': midi_data.time_signature_changes,
        }
    except Exception:
        # Deliberately silent: callers filter out the None entries.
        return None

In [None]:
# Gather the path of every MIDI file in the collection
midi_paths = glob.glob(os.path.join('data', 'unique_mid', '*', '*.mid'))
# The collection is large, so compute_statistics is run over the files in
# parallel with joblib instead of one file at a time
run_in_parallel = joblib.Parallel(n_jobs=10, verbose=10)
statistics = run_in_parallel(
    joblib.delayed(compute_statistics)(path) for path in midi_paths)
# compute_statistics returns None for files it could not process; drop those
statistics = [entry for entry in statistics if entry is not None]

In [None]:
# Cache the statistics on disk so that the expensive computation above
# only has to be run once
with open('statistics.pkl', 'wb') as pickle_file:
    pickle.dump(statistics, pickle_file)

In [None]:
# Load in pre-computed statistics.  The file is written in binary mode
# ('wb'), so it is opened in binary mode here too; text mode can corrupt
# pickled data on some platforms and is rejected outright by Python 3.
# NOTE: unpickling can execute arbitrary code -- only load a statistics.pkl
# that was produced by this notebook.
with open('statistics.pkl', 'rb') as f:
    statistics = pickle.load(f)

In [None]:
# Histogram of the number of instruments in each file
instrument_counts = [s['n_instruments'] for s in statistics]
plotting.plot_hist(instrument_counts, range(22))
# Tick every fifth position; the last tick is labelled '20+'
tick_positions = range(0, 22, 5)
tick_labels = list(range(0, 22 - 5, 5)) + ['20+']
plt.xticks(tick_positions, tick_labels);

In [None]:
# Histogram of program numbers, pooled over every file
all_program_numbers = [program
                       for s in statistics
                       for program in s['program_numbers']]
plotting.plot_hist(all_program_numbers, range(128))

In [None]:
# Histogram of the number of tempo values in each file; the second argument
# ends with the wider steps 30, 100 and 1000
tempo_counts = [len(s['tempos']) for s in statistics]
plotting.plot_hist(tempo_counts, list(range(1, 12)) + [30, 100, 1000])
# Ten single-value labels followed by three range labels
tick_labels = list(range(1, 11)) + ['11 - 30', '31 - 100', '101+']
plt.xticks(np.arange(13) + .3, tick_labels, rotation=45, ha='right');

In [None]:
# Histogram of all tempo values, pooled over every file, in steps of 10
tempo_steps = range(0, 260, 10)
all_tempos = [tempo for s in statistics for tempo in s['tempos']]
plotting.plot_hist(all_tempos, tempo_steps)
# Label every third step; the last label is written as '240+'
tick_positions = np.arange(0, len(tempo_steps), 3) + .5
tick_labels = list(range(0, 240, 30)) + ['240+']
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right');

In [None]:
# Histogram of the number of key signature changes in each file
key_change_counts = [len(s['key_numbers']) for s in statistics]
plotting.plot_hist(key_change_counts, range(12))
# Labels 0-9, with the last tick written as '10+'
tick_labels = list(range(10)) + ['10+']
plt.xticks(range(11), tick_labels);

In [None]:
# Histogram of all key numbers, pooled over every file
all_key_numbers = [key for s in statistics for key in s['key_numbers']]
plotting.plot_hist(all_key_numbers, range(25))
# (position, label) pairs: upper-case names on 0-11, lower-case names on
# 12-23 (presumably major and minor keys -- confirm against pretty_midi)
key_ticks = [(0, 'C'), (2, 'D'), (4, 'E'), (5, 'F'), (7, 'G'), (9, 'A'),
             (11, 'B'), (12, 'c'), (14, 'd'), (16, 'e'), (17, 'f'),
             (19, 'g'), (21, 'a'), (23, 'b')]
plt.xticks([position for position, label in key_ticks],
           [label for position, label in key_ticks]);

In [None]:
# Histogram of the number of time signature changes in each file
signature_change_counts = [len(s['time_signature_changes'])
                           for s in statistics]
plotting.plot_hist(signature_change_counts, range(12))
# Labels 0-9, with the last tick written as '10+'
tick_labels = list(range(10)) + ['10+']
plt.xticks(range(11), tick_labels);

In [None]:
# Format every time signature change in the collection as
# 'numerator/denominator'
time_signatures = []
for s in statistics:
    for change in s['time_signature_changes']:
        time_signatures.append(
            '{}/{}'.format(change.numerator, change.denominator))

# How many of the most frequent time signatures get a bar of their own
n_top = 15
# (signature string, count) pairs for the n_top most frequent signatures
top = collections.Counter(time_signatures).most_common(n_top)
# Bar index -> label, with one extra index collecting everything else
top_signatures = dict((index, label)
                      for index, (label, count) in enumerate(top))
top_signatures[n_top] = 'Other'
# Occurrences that are not covered by the n_top signatures
n_other = len(time_signatures) - sum(count for label, count in top)
# Repeat each bar index once per occurrence of its time signature so the
# list can be histogrammed directly
indexed_time_signatures = []
for index, (label, count) in enumerate(top):
    indexed_time_signatures.extend([index] * count)
indexed_time_signatures.extend([n_top] * n_other)

plotting.plot_hist(indexed_time_signatures, range(n_top + 2))
# keys() and values() are taken from the same unmodified dict, so positions
# and labels line up pairwise
tick_positions = np.array(list(top_signatures.keys())) + .3
tick_labels = list(top_signatures.values())
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right');

In [None]:
# Count how often each program number occurs across all files
program_counts = np.bincount(
    [program for s in statistics for program in s['program_numbers']])
# The four most frequent program numbers, ordered by increasing count
most_common_instruments = np.argsort(program_counts)[-4:]
print(most_common_instruments)
print([pretty_midi.program_to_instrument_name(program)
       for program in most_common_instruments])

In [None]:
# Number of files for which statistics are available
len(statistics)

[a](n_instruments.pdf)

[b](program_numbers.pdf)

[c](n_tempos.pdf)

[d](tempos.pdf)

[e](n_keys.pdf)

[f](keys.pdf)

[g](n_signatures.pdf)

[h](time_signatures.pdf)

### Text events?

In [None]:
import midi
import glob
import os
import joblib
import pretty_midi

In [None]:
def get_names(midi_file):
    """
    Unimplemented placeholder.

    The notebook cell defined this function without a body, which is a
    syntax error.  It is kept as an explicit stub so the cell can run.

    Parameters
    ----------
    midi_file : str
        Path to a MIDI file (unused).

    Raises
    ------
    NotImplementedError
        Always, because no implementation exists.
    """
    raise NotImplementedError('get_names has not been implemented')

In [None]:
def any_lyrics(midi_file):
    """
    Report whether a MIDI file contains at least one lyrics event.

    Parameters
    ----------
    midi_file : str
        Path to a MIDI file.

    Returns
    -------
    has_lyrics : int or None
        1 if any track contains a ``midi.LyricsEvent``, 0 if no track does,
        and None if reading the file raised an exception.
    """
    try:
        midi_data = midi.read_midifile(midi_file)
    # Catch Exception instead of using a bare except, so that
    # KeyboardInterrupt and SystemExit still propagate.
    except Exception:
        return None
    found = any(isinstance(event, midi.LyricsEvent)
                for track in midi_data
                for event in track)
    # Return an integer flag so the results can be summed directly
    return 1 if found else 0

# Run any_lyrics over every MIDI file in the collection, in parallel
midi_paths = glob.glob(os.path.join('data', 'unique_mid', '*', '*.mid'))
has_lyrics = joblib.Parallel(n_jobs=10, verbose=11)(
    joblib.delayed(any_lyrics)(path) for path in midi_paths)

In [None]:
# Number of files in which lyrics were found; None entries (files that
# could not be read) are skipped
n_with_lyrics = sum([flag for flag in has_lyrics if flag is not None])
print(n_with_lyrics)
# The same count as a proportion of all entries, including the None ones
print(n_with_lyrics/float(len(has_lyrics)))

In [None]:
# Count the MIDI files that pretty_midi fails to load
n_bad = 0
for f in glob.glob('data/mid/*/*.mid*'):
    try:
        pretty_midi.PrettyMIDI(f)
    # Catch Exception instead of using a bare except, so that interrupting
    # this long-running loop with Ctrl-C is not counted as a bad file.
    except Exception:
        n_bad += 1

In [None]:
def mid_glob_pattern(wav_path, wav_root=os.path.join('data', 'wav'),
                     mid_root=os.path.join('data', 'mid')):
    """
    Build the glob pattern for the MIDI files that belong to a wav file.

    Only the root directory and the file extension are swapped.  A plain
    ``wav_path.replace('wav', 'mid')`` would also rewrite any "wav" that
    occurs inside a directory or file name.

    Parameters
    ----------
    wav_path : str
        Path to a wav file located below ``wav_root``.
    wav_root : str
        Directory that contains the wav files.
    mid_root : str
        Directory that contains the MIDI files.

    Returns
    -------
    pattern : str
        ``mid_root`` joined with the path of ``wav_path`` relative to
        ``wav_root``, with the extension replaced by ``'.mid'`` and a
        trailing ``'*'`` appended.
    """
    relative_path = os.path.relpath(wav_path, wav_root)
    relative_stem = os.path.splitext(relative_path)[0]
    return os.path.join(mid_root, relative_stem + '.mid') + '*'


# For every wav file, count how many MIDI files match its pattern
beatles_wavs = glob.glob('data/wav/*/*.wav')
n_mids = np.array([len(glob.glob(mid_glob_pattern(wav_path)))
                   for wav_path in beatles_wavs])
# Number of wav files
print(len(n_mids))
# Number of wav files without any matching MIDI file
print(np.sum(n_mids == 0))
# The remaining summaries are undefined when no wav files were found
# (np.max and np.argmax raise on an empty array), so skip them then
if len(n_mids):
    print(np.median(n_mids))
    print(np.max(n_mids))
    # The wav file with the most matching MIDI files
    print(beatles_wavs[np.argmax(n_mids)])