In [1]:
# Loosely inspired by: https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5


# Generate tokens for a sequence model from a corpus of MIDI files


from music21 import converter, instrument, note, chord, stream
from music21.midi import MidiException
from random import shuffle
import time
import os
import signal
import sys
import warnings
import glob


# Instrument classes to look for in each score
instr = (
    instrument.Piano,
    instrument.StringInstrument,
    instrument.Harpsichord,
)

# Silence all warnings unless warning options were given to the interpreter
if len(sys.warnoptions) == 0:
    warnings.simplefilter('ignore')

# Exception type used to report that a time limit was exceeded
class TimeoutException(Exception):
    """Raised when a time-limited operation runs out of time."""

# Signal handler that turns a delivered signal into a Python exception
def timeout_handler(signum, frame):
    """Raise ``TimeoutException`` in the code that was interrupted by the signal.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was executing when the signal arrived (unused).

    Raises:
        TimeoutException: Always.
    """
    raise TimeoutException()

# Install timeout_handler as the SIGALRM handler, so that an alarm scheduled
# with signal.alarm() raises TimeoutException when it expires
signal.signal(signal.SIGALRM, timeout_handler)


# Choose which part(s) of a parsed score will be tokenized
def _select_notes(score, monophonic, instr):
    """Return the music21 stream whose elements should be tokenized.

    Args:
        score: Object returned by ``converter.parse``.
        monophonic: If False the selected material is passed through ``chordify()``.
        instr: Tuple of instrument classes used to filter the parts.

    Note:
        When at least one part matches ``instr``, the non-matching parts are
        removed from ``score`` in place.
    """
    # Single-part ("flat") file
    if len(score.parts) == 1:
        if monophonic:
            return score.flat.notes
        return score.flat.notes.chordify()

    # File has parts matching the desired instruments: drop all the others
    if any(isinstance(part.getInstrument(), instr) for part in score.parts):
        # Snapshot the parts first so that removing from the score cannot
        # disturb the iteration
        for part in list(score.parts):
            if not isinstance(part.getInstrument(), instr):
                score.remove(part)
        if monophonic:
            return score.parts[0]
        return score.parts.chordify()

    # No matching parts: fall back to the first one
    if monophonic:
        return score.parts[0]
    return score.parts[0].chordify()


# Convert one stream element into its token
def _element_token(element, monophonic, rests, durations):
    """Return the token string for ``element``, or None if it produces no token."""
    if isinstance(element, note.Note):
        token = str(element.pitch)
    elif isinstance(element, chord.Chord):
        if monophonic:
            # A chord is reduced to its root in monophonic mode
            root = element.root()
            token = root.name + str(root.octave)
        else:
            token = '.'.join((pitch.name + str(pitch.octave)) for pitch in element.pitches)
    elif isinstance(element, note.Rest):
        # Rests never carry a duration suffix
        return 'rest' if rests else None
    else:
        return None

    if durations:
        token += '.' + str(element.duration.type)
    return token


# Tokenize midis in source_dir for training sequence model
def tokenize_midis(source_dir, dest_file, timeout=30, monophonic=False, rests=False, durations=False,
                   instr = (instrument.Piano, instrument.StringInstrument, instrument.Harpsichord)):
    """Tokenize every ``.mid`` file under ``source_dir`` into ``dest_file``.

    The files are found recursively, shuffled, and their order is written to
    ``<dest_file without extension>_log.txt``. Each successfully tokenized
    file becomes one line of space-separated tokens in ``dest_file``. Files
    that time out or raise ``MidiException``, ``IndexError`` or ``TypeError``
    are reported and skipped; any other exception propagates.

    Token format: a note is ``str(pitch)``; a chord is its root (name + octave)
    in monophonic mode, otherwise all of its pitches joined by ``'.'``; with
    ``durations`` a ``'.' + duration type`` suffix is appended; rests become
    ``'rest'`` only when ``rests`` is True.

    Args:
        source_dir: Directory searched recursively for ``*.mid`` files.
        dest_file: Path of the token file to write (its directory is created
            if missing).
        timeout: Seconds allowed per file, passed to ``signal.alarm``.
        monophonic: Tokenize a single line instead of chordified material.
        rests: Emit ``'rest'`` tokens.
        durations: Append the duration type to note/chord tokens.
        instr: Instrument classes whose parts are preferred.

    Note:
        Relies on ``signal.alarm`` and on the SIGALRM handler installed at
        module level raising ``TimeoutException``.
    """
    # Iterate over midis in directory
    midi_list = glob.glob(source_dir + '/**/*.mid', recursive=True)
    shuffle(midi_list)

    # Make sure the destination directory exists before writing anything
    dest_dir = os.path.dirname(dest_file)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    # Create a log of the file ordering. splitext only strips a real
    # extension, so dots elsewhere in the path do not mangle the log name.
    with open(os.path.splitext(dest_file)[0] + '_log.txt', 'w') as outlog:
        for file in midi_list:
            outlog.write(file + "\n")

    total = len(midi_list)
    vocabulary = set()

    with open(dest_file, 'w') as outfile:
        for i, file in enumerate(midi_list, start=1):

            # Set time-out alarm (seconds) in case parsing is taking too long
            signal.alarm(timeout)

            try:
                try:
                    score = converter.parse(file)
                    notes_to_parse = _select_notes(score, monophonic, instr)

                    # Perform tokenization
                    tokens = []
                    for element in notes_to_parse:
                        token = _element_token(element, monophonic, rests, durations)
                        if token is not None:
                            tokens.append(token)
                finally:
                    # Always cancel the alarm, including on failure, so that a
                    # stale SIGALRM cannot fire after this file is finished
                    signal.alarm(0)

            # Tokenizing took too long
            except TimeoutException:
                print("Time-out tokenizing file", i, "out of", total, "(", file, ")")
                continue

            # Tokenizing encountered an error
            except (MidiException, IndexError, TypeError):
                print("Exception tokenizing file", i, "out of", total, "(", file, ")")
                continue

            print("Tokenized file", i, "out of", total)
            outfile.write(" ".join(tokens)+"\n")
            vocabulary.update(tokens)

    # An empty token would vanish when the file is split on whitespace
    vocabulary.discard('')

    print('Tokens written to %s' % dest_file)
    print('Vocabulary size is %i' % len(vocabulary))

In [3]:
# Tokenize the Mozart corpus twice: monophonic, then polyphonic with durations
mozart_dir = './classical_midis/mozart/'
mozart_runs = (
    ('./tokenized/mozart_mono.txt', {'monophonic': True}),
    ('./tokenized/mozart_poly_dur.txt', {'durations': True}),
)
for mozart_dest, mozart_options in mozart_runs:
    tokenize_midis(mozart_dir, mozart_dest, **mozart_options)
print('Tokenizations complete!')

Tokenized file 1 out of 711
Tokenized file 2 out of 711
Time-out tokenizing file 3 out of 711 ( ./classical_midis/mozart/chamber/mozart_clarinet_quintet_581a_1_(c)bakels.mid )
Tokenized file 4 out of 711
Tokenized file 5 out of 711
Tokenized file 6 out of 711
Tokenized file 7 out of 711
Time-out tokenizing file 8 out of 711 ( ./classical_midis/mozart/piano/!live!/mozart_piano_rondo_485_(c)oguri.mid )
Tokenized file 9 out of 711
Tokenized file 10 out of 711
Tokenized file 11 out of 711
Tokenized file 12 out of 711
Time-out tokenizing file 13 out of 711 ( ./classical_midis/mozart/chamber/mozart_piano_quartet_478_score_(c)unknown.mid )
Tokenized file 14 out of 711
Tokenized file 15 out of 711
Tokenized file 16 out of 711
Time-out tokenizing file 17 out of 711 ( ./classical_midis/mozart/concertos/mozart_piano_concerto_18_456_1_(nc)fisher.mid )
Tokenized file 18 out of 711
Tokenized file 19 out of 711
Tokenized file 20 out of 711
Time-out tokenizing file 21 out of 711 ( ./classical_midis/mo

In [5]:
import os

def train_test_split(source_file, dest_dir, holdout_pct=0.03, split_pct=0.04):
    """Split a token file (one song per line) into train, holdout and split-test files.

    Blank lines are ignored. The first ``round(n * holdout_pct)`` songs go to
    the holdout file. The next ``round(n * split_pct)`` songs are cut in half
    by token count: the first half goes to the split-test file and the second
    half is appended to the training data. All remaining songs are training
    data.

    Output files are written to ``dest_dir`` as ``<name>_train.txt``,
    ``<name>_holdout.txt`` and ``<name>_split.txt``, where ``<name>`` is the
    base name of ``source_file`` without its extension.

    Args:
        source_file: Path of the token file to split.
        dest_dir: Directory for the output files (created if missing).
        holdout_pct: Fraction of songs placed in the holdout file.
        split_pct: Fraction of songs that are halved between split-test and train.
    """
    # exist_ok replaces the separate exists() check and its race window
    os.makedirs(dest_dir, exist_ok=True)

    # Strip out empty lines. Every kept song is forced to end with a newline:
    # a final line without one would otherwise be glued to the next song
    # appended to the training data.
    songs = []
    with open(source_file) as infile:
        for line in infile:
            if not line.strip():
                continue
            songs.append(line if line.endswith('\n') else line + '\n')

    # Figure out number of songs for holdout and split
    holdout_i = round(len(songs) * holdout_pct)
    split_i = holdout_i + round(len(songs) * split_pct)

    # Perform split
    holdout_tokens = songs[:holdout_i]
    train_tokens = songs[split_i:]
    split_tokens = []

    # Divide 'split' songs between split test (first half) and train (second half)
    for song in songs[holdout_i:split_i]:
        tokens = song.split()
        middle = len(tokens) // 2
        split_tokens.append(" ".join(tokens[:middle]) + '\n')
        train_tokens.append(" ".join(tokens[middle:]) + '\n')

    # Write files
    prefix = os.path.join(dest_dir, os.path.splitext(os.path.basename(source_file))[0])
    outputs = (
        ('_train.txt', train_tokens),
        ('_holdout.txt', holdout_tokens),
        ('_split.txt', split_tokens),
    )
    for suffix, lines in outputs:
        with open(prefix + suffix, 'w') as outfile:
            outfile.writelines(lines)

    print(source_file + ' done!')
    

In [7]:
# Split every tokenized corpus: each composer, first monophonic, then
# polyphonic with durations
composers = ('bach', 'brahms', 'handel', 'haydn', 'mozart', 'schubert', 'vivaldi')
variants = ('mono', 'poly_dur')
for composer in composers:
    for variant in variants:
        train_test_split('./tokenized/%s_%s.txt' % (composer, variant), 'train-test')

./tokenized/bach_mono.txt done!
./tokenized/bach_poly_dur.txt done!
./tokenized/brahms_mono.txt done!
./tokenized/brahms_poly_dur.txt done!
./tokenized/handel_mono.txt done!
./tokenized/handel_poly_dur.txt done!
./tokenized/haydn_mono.txt done!
./tokenized/haydn_poly_dur.txt done!
./tokenized/mozart_mono.txt done!
./tokenized/mozart_poly_dur.txt done!
./tokenized/schubert_mono.txt done!
./tokenized/schubert_poly_dur.txt done!
./tokenized/vivaldi_mono.txt done!
./tokenized/vivaldi_poly_dur.txt done!
