In [None]:
import librosa
import mir_eval
import IPython
import sys
sys.path.append('/home/craffel/projects/midi-dataset/')
sys.path.append('/home/craffel/projects/midi-dataset/scripts')
import whoosh_search
import os
import shutil
import pretty_midi
import re
import numpy as np
import feature_extraction
import deepdish
import joblib
import align_text_matches
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
import matplotlib.gridspec
import matplotlib
import cPickle as pickle
import collections

### Compute statistics about MIDI files from the wild

In [None]:
def compute_statistics(midi_file):
    """
    Summarize the metadata events found in a single MIDI file.

    Parameters
    ----------
    midi_file : str
        Path to a MIDI file.

    Returns
    -------
    statistics : dict or None
        ``None`` when anything goes wrong while loading or inspecting
        the file; otherwise a dictionary with the entries

        - ``'n_instruments'``: number of instruments in the file
        - ``'program_numbers'``: program of every non-drum instrument
        - ``'key_numbers'``: key number of every key signature change
        - ``'tempos'``: list built from the second item returned by
          ``get_tempo_changes()``
        - ``'time_signature_changes'``: the file's time signature
          change objects, unmodified
    """
    try:
        pm = pretty_midi.PrettyMIDI(midi_file)
        # Gather each statistic separately, then bundle them up
        instruments = pm.instruments
        melodic_programs = [instrument.program for instrument in instruments
                            if not instrument.is_drum]
        key_numbers = [change.key_number
                       for change in pm.key_signature_changes]
        tempos = list(pm.get_tempo_changes()[1])
        return {'n_instruments': len(instruments),
                'program_numbers': melodic_programs,
                'key_numbers': key_numbers,
                'tempos': tempos,
                'time_signature_changes': pm.time_signature_changes}
    except Exception:
        # Invalid MIDI files raise all sorts of exceptions; we deliberately
        # skip every one of them and signal the failure with None.
        return None

In [None]:
# Compute statistics about every unique MIDI file we found on the internet
# Glob pattern matching every unique MIDI file we found on the internet
unique_midi_glob = os.path.join('data', 'unique_mid', '*', '*.mid')
# Run compute_statistics over all of them, 10 jobs at a time
statistics = joblib.Parallel(n_jobs=10, verbose=10)(
    joblib.delayed(compute_statistics)(midi_file)
    for midi_file in glob.glob(unique_midi_glob))
# compute_statistics returns None for files it could not process; drop those.
statistics = [s for s in statistics if s is not None]

In [None]:
# Cache the computed statistics on disk (binary pickle) so they can be
# reloaded by the next cell without re-running the extraction above.
with open('statistics.pkl', 'wb') as f:
    pickle.dump(statistics, f)

In [None]:
# Reload the cached statistics.  The file is written in binary mode ('wb'),
# so it must also be read in binary mode; text mode can corrupt the stream
# on platforms that translate newlines.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a statistics.pkl that this notebook produced itself.
with open('statistics.pkl', 'rb') as f:
    statistics = pickle.load(f)

In [None]:
# Keep only the first 10000 entries; everything below operates on this subset.
# NOTE(review): this looks like a debugging subsample - confirm it is intended
# before generating the final figures.
statistics = statistics[:10000]

In [None]:
# Use a 13pt font for all text in the figures below
matplotlib.rc('font', size=13)

In [None]:
# Bar face colors (hex RGB) for the histograms below; each color is used
# for one family of statistics.
# Instrument count and program number plots
BLUE = '#28ABE3'
# Tempo plots
GREEN = '#1FDA9A'
# Key plots
ORANGE = '#E8B71A'
# Time signature plots
GREY = '#AAAAAA'

def uniform_hist(data, bins, ax, **kwargs):
    """
    Plot a histogram where every bin is drawn with the same width,
    regardless of how wide the bin actually is.

    Parameters
    ----------
    data : list or np.ndarray
        Values to histogram.
    bins : list
        Bin edges, passed to ``np.histogram``.
    ax : matplotlib.axes.Axes
        Axes to draw the bars on.
    kwargs
        Additional keyword arguments forwarded to ``ax.bar``.

    Returns
    -------
    heights : np.ndarray
        Number of entries of ``data`` in each bin.
    """
    heights, _ = np.histogram(data, bins)
    # Bar n must span [n - .5, n + .5] so that it is centered on integer n.
    # The bar positions are passed positionally because the first parameter
    # of ax.bar was renamed from `left` to `x` in matplotlib 2.0, and the
    # alignment is pinned because its default changed from 'edge' to
    # 'center' in the same release.
    kwargs.setdefault('align', 'edge')
    ax.bar(np.arange(len(bins) - 1) - .5, heights,
           width=1, bottom=0, **kwargs)
    return heights

def pretty_hist(data, bins, fc, ax, title=None):
    """
    Utility method for plotting a nice histogram.

    Parameters
    ----------
    data : list or np.ndarray
        Values to histogram.  Values beyond the last bin edge are counted
        in the last bin.
    bins : list
        Bin edges.  Every bin is drawn with unit width.
    fc : str
        Face color of the bars.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    title : str or None
        If given, set as the title of the figure ``ax`` belongs to.

    Returns
    -------
    heights : np.ndarray
        Number of entries of ``data`` in each bin.
    """
    # Make it so that all points beyond the bin range get put in the last bin.
    # np.histogram's last bin is closed on the right, so clipping to the last
    # edge is enough; unlike assigning "edge - epsilon" into the array, this
    # does not rely on truncation when data holds integers.
    data = np.minimum(np.asarray(data), bins[-1])
    # Plot histogram, with specific coloring and axis-alignment
    heights = uniform_hist(data, bins, ax, fc=fc, alpha=.7)
    # Remove the top and right spines of the axes we were given (rather than
    # of whatever figure pyplot currently considers active)
    sns.despine(ax=ax)
    # Add grid to y axis.  Passing True explicitly keeps this idempotent;
    # calling grid() without arguments toggles the grid instead.
    ax.yaxis.grid(True)
    # Set the plotting range to fit the histogram exactly
    bin_spacing = 1.
    ax.set_xlim(-bin_spacing/2., len(bins) - 1 - bin_spacing/2.)
    if title is not None:
        # Title the figure that owns ax, not the pyplot "current" figure
        ax.figure.suptitle(title, verticalalignment='top', y=.95, size='large')
    return heights

def divide_yticklabels(ax, divisor=1000):
    """
    Relabel the y ticks of ``ax`` with their values divided by ``divisor``.

    Whole-number results are shown as ints (no trailing ".0"); everything
    else is shown as a float.
    """
    scaled_labels = []
    for tick in ax.get_yticks():
        scaled = float(tick)/divisor
        if scaled.is_integer():
            scaled_labels.append(int(scaled))
        else:
            scaled_labels.append(scaled)
    ax.set_yticklabels(scaled_labels)

def split_hist(data, bin_edges, high_bin_indices, fc, title):
    """
    Plot a histogram where one or more bins have very large values.

    The current pyplot figure is split into two vertically stacked axes
    that both show the same histogram: the short upper axes is cropped to
    the tops of the very tall bins, the lower axes to everything else,
    which gives the appearance of a broken y axis.  Y tick labels are
    shown in thousands.

    Parameters
    ----------
    data : list or np.ndarray
        Values to histogram.
    bin_edges : list
        Histogram bin edges.
    high_bin_indices : int or list of int
        Index (or indices) of the bins which are much taller than the rest.
    fc : str
        Face color of the bars.
    title : str
        Title for the figure.
    """
    # Make high_bin_indices a list if an int was passed
    if isinstance(high_bin_indices, int):
        high_bin_indices = [high_bin_indices]
    # Create 2-row, 1-col subplot where the upper subplot is 1/4 the height
    # The upper subplot will be the tops of the very large bins; lower will be the rest
    gs = matplotlib.gridspec.GridSpec(2, 1, width_ratios=[1,], height_ratios=[1, 4])
    # Set the spacing between subplots to .1
    gs.update(hspace=0.1)
    # Grab axes handles
    ax = plt.subplot(gs[0])
    ax2 = plt.subplot(gs[1])
    # Plot pretty histograms both for the "upper" and "lower" parts of the split
    heights = pretty_hist(data, bin_edges, fc, ax)
    pretty_hist(data, bin_edges, fc, ax2)
    low_min = 0
    # Compute the height of the largest bin _not_ in high_bin_indices
    low_max = 1.1*max(heights[n] for n in range(len(bin_edges) - 1)
                      if n not in high_bin_indices)
    # Compute the height of the smallest bin in high_bin_indices
    high_min = .9*min(heights[n] for n in high_bin_indices)
    # Compute the height of the highest bin in high_bin_indices
    high_max = 1.1*max(heights[n] for n in high_bin_indices)
    # Set the Y plotting range according to the above.  This will crop things.
    ax.set_ylim(high_min, high_max)
    ax2.set_ylim(low_min, low_max)
    # Hide the spines between ax and ax2
    ax.spines['bottom'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax.xaxis.tick_top()
    # Must be a real boolean: newer matplotlib versions no longer interpret
    # the string 'off' as False, which would leave the top labels visible.
    ax.tick_params(labeltop=False)
    ax2.xaxis.tick_bottom()

    # Compute the spacing between y-ticks on the lower plot
    lowtick_spacing = np.diff(ax2.get_yticks())[0]
    # Create a single tick on the upper plot, rounded down to a multiple of the
    # lower plot's tick spacing.  The spacing itself is used as the multiplier
    # (not its integer part) so non-integer spacings are handled correctly.
    ax.set_yticks([lowtick_spacing*int((high_min + high_max)/(2*lowtick_spacing))])

    # X-axis start of clip lines (relative to [0, 1])
    start = -.015
    # Compute proportion of x-axis covered up to the right-most high bin
    # (+ .015); max() is used so the indices need not be sorted.
    end = (max(high_bin_indices) + 1)/float(len(bin_edges) - 1) + .015
    # Plot the lines, allowing for it to expand outside of the axis
    ax.plot([start, end], [0., 0.], transform=ax.transAxes, color='k', clip_on=False)
    ax2.plot([start, end], [1., 1.], transform=ax2.transAxes, color='k', clip_on=False)

    # Convert count to thousands
    divide_yticklabels(ax)
    divide_yticklabels(ax2)

    plt.suptitle(title, verticalalignment='top', y=.95, size='large')

# (a) Histogram of the number of instruments per file; 20 or more share a bin
plt.figure()
pretty_hist([s['n_instruments'] for s in statistics],
            range(22), BLUE, plt.gca(), '(a) Number of instruments')
divide_yticklabels(plt.gca())
# Label the x axis *before* saving, so the '20+' label ends up in the PDF
plt.xticks(range(0, 22, 5), list(range(0, 22 - 5, 5)) + ['20+'])
plt.savefig('n_instruments.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# (e) Histogram of the program numbers of all non-drum instruments.
# Program numbers run from 0 to 127, so 129 bin edges are needed to give each
# of the 128 programs its own bin; with only 128 edges, programs 126 and 127
# would both be counted in the (closed) last bin.
plt.figure()
split_hist([i for s in statistics for i in s['program_numbers']],
           range(129), 0, BLUE, '(e) Program Numbers')
plt.savefig('program_numbers.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)


# (b) Histogram of the number of tempo changes per file.
# Bins are half-open on the right, so the coarse bins use the edges 31 and 101
# to cover exactly 11-30, 31-100 and 101+ as stated by the tick labels.
plt.figure()
split_hist([len(s['tempos']) for s in statistics],
           list(range(1, 12)) + [31, 101, 1000], 0, GREEN, '(b) Number of tempo changes')
plt.xticks(range(13), list(range(1, 11)) + ['11 - 30', '31 - 100', '101+'], rotation=45, ha='center')
plt.savefig('n_tempos.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# (f) Histogram of all tempo values, in bins of width 10; 240 and up share a bin
tempo_bin_edges = range(0, 260, 10)
plt.figure()
pretty_hist([i for s in statistics for i in s['tempos']],
            tempo_bin_edges, GREEN, plt.gca(), '(f) Tempos')
divide_yticklabels(plt.gca())
# Label every third bin with the tempo at which it starts
plt.xticks(range(0, len(tempo_bin_edges), 3),
           list(range(0, 240, 30)) + ['240+'],
           rotation=45)
plt.savefig('tempos.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# (d) Histogram of the number of key changes per file; 10 or more share a bin.
# The bins for 0 and 1 key changes are the very tall ones.
plt.figure()
split_hist([len(s['key_numbers']) for s in statistics],
           range(12), [0, 1], ORANGE, '(d) Number of key changes')
plt.xticks(range(11), list(range(10)) + ['10+'])
plt.savefig('n_keys.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# (h) Histogram of key numbers across all key signature changes
fig = plt.figure()
split_hist([i for s in statistics for i in s['key_numbers']],
           range(25), 0, ORANGE, '(h) Keys')
# Only label the natural keys: upper case for indices 0-11, lower case for 12-23
plt.xticks([0, 2, 4, 5, 7, 9, 11, 12, 14, 16, 17, 19, 21, 23],
           ['C', 'D', 'E', 'F', 'G', 'A', 'B', 'c', 'd', 'e', 'f', 'g', 'a', 'b'])
# Group captions below the axis, each flanked by a horizontal rule on both sides
plt.figtext(0.28, .03, 'Major')
plt.figtext(0.67, .03, 'Minor')
# Horizontal extents (in figure coordinates) of the four rules
rule_extents = [(.14, .26), (.37, .49), (.53, .65), (.76, .88)]
fig.lines.extend([
    matplotlib.lines.Line2D([x_start, x_end], [.045, .045], c='k',
                            transform=fig.transFigure, figure=fig)
    for x_start, x_end in rule_extents])
plt.savefig('keys.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# (c) Histogram of the number of time signature changes per file; 10 or more
# share a bin.  The bin for exactly 1 change is the very tall one.
plt.figure()
split_hist([len(s['time_signature_changes']) for s in statistics],
           range(12), 1, GREY, '(c) Number of time signature changes')
plt.xticks(range(11), list(range(10)) + ['10+'])
plt.savefig('n_signatures.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# Format every time signature change as a "numerator/denominator" string
time_signatures = ['{}/{}'.format(c.numerator, c.denominator)
                   for s in statistics for c in s['time_signature_changes']]

# Number of distinct time signatures which get their own bar
n_top = 15
# (signature, count) pairs for the n_top most frequent time signatures
top = collections.Counter(time_signatures).most_common(n_top)
# Map bar index -> time signature string, used for the x tick labels
top_signatures = {n: signature for n, (signature, count) in enumerate(top)}
# All remaining time signatures are lumped together in one final bar
top_signatures[n_top] = 'Other'
# Number of time signature changes which are not among the top ones
n_other = len(time_signatures) - sum(count for signature, count in top)
# Build the histogram input: each bar index repeated once per occurrence
# of the corresponding time signature
indexed_time_signatures = []
for n, (signature, count) in enumerate(top):
    indexed_time_signatures.extend([n]*count)
indexed_time_signatures.extend([n_top]*n_other)

plt.figure()
split_hist(indexed_time_signatures, range(n_top + 2), 0, GREY, '(g) Time signatures')
plt.xticks(list(top_signatures.keys()), list(top_signatures.values()),
           rotation=45, ha='center')
plt.savefig('time_signatures.pdf', transparent=True, bbox_inches='tight', pad_inches=0.1)

# End the cell on a statement so that no return value is echoed
pass

In [None]:
# Count how often each program number occurs over all files
program_counts = np.bincount([i for s in statistics for i in s['program_numbers']])
# The four most frequent program numbers, in increasing order of frequency
most_common_instruments = np.argsort(program_counts)[-4:]
print(most_common_instruments)
print([pretty_midi.program_to_instrument_name(i) for i in most_common_instruments])

In [None]:
# Number of MIDI files the statistics above are based on
len(statistics)

[a](n_instruments.pdf)

[b](program_numbers.pdf)

[c](n_tempos.pdf)

[d](tempos.pdf)

[e](n_keys.pdf)

[f](keys.pdf)

[g](n_signatures.pdf)

[h](time_signatures.pdf)

### Text events?

In [None]:
import midi
import glob
import os
import joblib
import pretty_midi

In [None]:
def get_names(midi_file):
    """
    Placeholder: this function was declared without a body, which made
    the cell a syntax error.

    Parameters
    ----------
    midi_file : str
        Presumably a path to a MIDI file, as for the other helpers in
        this notebook - TODO confirm once the function is implemented.

    Raises
    ------
    NotImplementedError
        Always, since no implementation exists yet.
    """
    raise NotImplementedError('get_names has not been implemented')

In [None]:
def any_lyrics(midi_file):
    """
    Check whether a MIDI file contains any lyrics events.

    Parameters
    ----------
    midi_file : str
        Path to a MIDI file.

    Returns
    -------
    has_lyrics : int or None
        1 if any track contains a ``midi.LyricsEvent``, 0 if none does,
        and ``None`` if the file could not be read.
    """
    try:
        midi_data = midi.read_midifile(midi_file)
    # Unreadable files are expected and skipped, but only catch Exception:
    # a bare except would also swallow KeyboardInterrupt and SystemExit.
    except Exception:
        return None
    for track in midi_data:
        for event in track:
            # Stop at the first lyrics event; one is enough
            if isinstance(event, midi.LyricsEvent):
                return 1
    return 0

# Glob pattern matching every unique MIDI file
lyrics_midi_glob = os.path.join('data', 'unique_mid', '*', '*.mid')
# One entry per file: 1 (has lyrics), 0 (no lyrics) or None (unreadable)
has_lyrics = joblib.Parallel(n_jobs=10, verbose=11)(
    joblib.delayed(any_lyrics)(f) for f in glob.glob(lyrics_midi_glob))

In [None]:
# Number of files with lyrics (unreadable files, marked None, are skipped)
n_with_lyrics = sum([l for l in has_lyrics if l is not None])
print(n_with_lyrics)
# Same count as a fraction of all files scanned, readable or not
print(n_with_lyrics/float(len(has_lyrics)))

In [None]:
# Count the files under data/mid which pretty_midi fails to load
n_bad = 0
for f in glob.glob('data/mid/*/*.mid*'):
    try:
        pm = pretty_midi.PrettyMIDI(f)
    # Any loading error marks the file as bad, but only catch Exception:
    # a bare except would also swallow KeyboardInterrupt, making this
    # long-running loop impossible to interrupt.
    except Exception:
        n_bad += 1

In [None]:
# For every wav file, count the MIDI files whose path matches it
beatles_wavs = glob.glob('data/wav/*/*.wav')
n_mids = []
for f in beatles_wavs:
    # Swapping every 'wav' for 'mid' maps both the directory and the extension
    # onto the MIDI tree; the trailing '*' also matches suffixed variants.
    # NOTE(review): this also rewrites any 'wav' inside a file name - confirm
    # that no file name contains it.
    mid_glob = f.replace('wav', 'mid') + '*'
    n_mids.append(len(glob.glob(mid_glob)))
n_mids = np.array(n_mids)
# Number of wav files
print(len(n_mids))
# Number of wav files without any matching MIDI file
print(np.sum(n_mids == 0))
# Median and maximum number of matching MIDI files per wav file
print(np.median(n_mids))
print(np.max(n_mids))
# The wav file with the most matching MIDI files
print(beatles_wavs[np.argmax(n_mids)])