In [None]:
# Loose inspiration from: https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5


# From a corpus of MIDI files, generate tokens for a sequence model


from music21 import converter, instrument, note, chord, stream
from music21.midi import MidiException
from random import shuffle
import time
import os
import signal
import sys
import warnings
import glob


# Instrument classes to look for
instr = (
    instrument.Piano,
    instrument.StringInstrument,
    instrument.Harpsichord,
)

# Silence all warnings, unless warning options were given to the interpreter
if not sys.warnoptions:
    warnings.simplefilter('ignore')

# Exception type used to signal a time-out
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a time-out alarm goes off."""

# Signal handler that turns a delivered signal into an exception
def timeout_handler(signum, frame):
    """Raise TimeoutException in the code that was interrupted by the signal.

    Both arguments are ignored; they are only accepted because Python calls
    signal handlers with (signal number, current stack frame).
    """
    raise TimeoutException()

# Change the behavior of SIGALRM: install timeout_handler, so that an alarm armed
# with signal.alarm() raises TimeoutException in the running code.
# NOTE(review): SIGALRM is only available on Unix, and signal.signal() may only be
# called from the main thread - confirm this matches where the notebook is run.
signal.signal(signal.SIGALRM, timeout_handler)


# Pick the stream of elements to tokenize from a parsed score
def _select_notes(s1, monophonic, instr):
    """Return the music21 stream whose elements should be tokenized.

    s1         -- score returned by converter.parse
    monophonic -- if False, the selected stream is passed through chordify()
    instr      -- tuple of instrument classes; when at least one part matches,
                  every non-matching part is removed from s1 (s1 is modified)

    Raises whatever music21 raises; in particular indexing parts[0] fails when
    the score has no parts.
    """
    # Single-part file: use the flattened notes of the whole score
    # NOTE(review): `.notes` presumably filters out rests, so `rests=True` may
    # have no effect on this path - confirm against music21's Stream.notes.
    if len(s1.parts) == 1:
        if monophonic:
            return s1.flat.notes
        return s1.flat.notes.chordify()

    # File has parts matching the desired instruments: drop every other part
    if any(isinstance(part.getInstrument(), instr) for part in s1.parts):
        # Iterate over a snapshot so that removing parts cannot disturb the iteration
        for part in list(s1.parts):
            if not isinstance(part.getInstrument(), instr):
                s1.remove(part)
        # NOTE(review): in monophonic mode the part itself is iterated, not its
        # flattened notes - if parts contain measures this may yield no tokens; confirm.
        if monophonic:
            return s1.parts[0]
        return s1.parts.chordify()

    # No matching parts: fall back to the first part
    if monophonic:
        return s1.parts[0]
    return s1.parts[0].chordify()


# Build the token for a single stream element
def _element_token(element, monophonic, rests, durations):
    """Return the token string for one element, or None if it produces no token.

    Notes become their pitch string. Chords become the root (monophonic) or all
    pitches joined with '.' (polyphonic). Rests become 'rest' when `rests` is
    set. With `durations`, '.<duration type>' is appended to note and chord
    tokens (never to rests). Any other element type yields None.
    """
    if isinstance(element, note.Note):
        token = str(element.pitch)
    elif isinstance(element, chord.Chord):
        if monophonic:
            root = element.root()
            token = root.name + str(root.octave)
        else:
            token = '.'.join((pitch.name + str(pitch.octave)) for pitch in element.pitches)
    elif isinstance(element, note.Rest):
        return 'rest' if rests else None
    else:
        return None

    if durations:
        token += '.' + str(element.duration.type)
    return token


# Tokenize midis in source_dir for training sequence model
def tokenize_midis(source_dir, dest_file, timeout=30, monophonic=False, rests=False, durations=False,
                   instr = (instrument.Piano, instrument.StringInstrument, instrument.Harpsichord)):
    """Tokenize every '*.mid' file under source_dir and write the tokens to dest_file.

    The files are found recursively, processed in shuffled order, and each
    successfully tokenized file contributes one line of space-separated tokens.
    Files that time out, or that raise MidiException, IndexError or TypeError,
    are reported on stdout and skipped. Progress and the final vocabulary size
    are printed.

    source_dir -- directory searched (recursively) for '*.mid' files
    dest_file  -- path of the output text file; it is overwritten
    timeout    -- seconds handed to signal.alarm() for each file
    monophonic -- if True, chords are reduced to their root and no chordify is done
    rests      -- if True, rest elements are emitted as the token 'rest'
    durations  -- if True, '.<duration type>' is appended to note and chord tokens
    instr      -- tuple of instrument classes whose parts are preferred

    Relies on the module-level SIGALRM handler raising TimeoutException.
    """
    # Collect the midis in the directory, in random order
    midi_list = glob.glob(source_dir + '/**/*.mid', recursive=True)
    shuffle(midi_list)
    total = len(midi_list)

    # The context manager closes the output file even if an unexpected error propagates
    with open(dest_file, 'w') as outfile:
        for i, file in enumerate(midi_list, start=1):
            # Set time-out alarm (seconds) in case parsing/tokenizing takes too long
            signal.alarm(timeout)

            try:
                s1 = converter.parse(file)
                notes_to_parse = _select_notes(s1, monophonic, instr)

                tokens = []
                for element in notes_to_parse:
                    token = _element_token(element, monophonic, rests, durations)
                    if token is not None:
                        tokens.append(token)

            # Tokenizing took too long
            except TimeoutException:
                print("Time-out tokenizing file", i, "out of", total, "(", file, ")")
                continue

            # Tokenizing encountered an error
            except (MidiException, IndexError, TypeError):
                print("Exception tokenizing file", i, "out of", total, "(", file, ")")
                continue

            finally:
                # Always cancel the alarm: otherwise an alarm left over from a failed
                # file could fire later, after this function has already returned
                signal.alarm(0)

            print("Tokenized file", i, "out of", total)
            outfile.write(" ".join(tokens) + "\n")

    print('Tokens written to %s' % dest_file)

    # Re-read the output to count distinct tokens, closing the file afterwards
    with open(dest_file) as infile:
        vocabulary = set(infile.read().split())
    print('Vocabulary size is %i' % len(vocabulary))

In [None]:
# Tokenize the test corpus twice: monophonic pitches only, then polyphonic with durations
tokenize_midis(
    source_dir='./testmids/',
    dest_file='./testmids/testtokens.txt',
    monophonic=True,
)
tokenize_midis(
    source_dir='./testmids/',
    dest_file='./testmids/testtokens2.txt',
    durations=True,
)
print('Tokenize original complete!')