# **Preprocessing**

* Tune all guitars to 7-string B-standard
* Lowest note in this tuning is 35 (B2)
* Highest note in this tuning is 88 (E7, assuming 24-fret guitar)
* This gives a dictionary size of 55 playable notes
    * the 54 pitches in [35, 88] inclusive + a rest note

*To support variable note lengths, we must multiply this 
dictionary size by the number of distinct note lengths…*

Dictionary of note lengths:  
    [32nd, 16th, 8th, quarter, half, whole,  
     dotted {16th, 8th, quarter, half, whole},  
     triplet {16th, 8th, quarter}, two whole]

This brings our total dictionary length up to…  

**15 note lengths x 55 playable notes = 825 total**

### Standard
32nd = 120  
16th = 240  
8th = 480  
Quarter = 960  
Half = 1920  
Whole = 3840  
Two whole = 7680  

### Dotted
16th = 360  
8th = 720  
Quarter = 1440  
Half = 2880  
Whole = 5760  

### Triplet
16th = 160  
8th = 320  
Quarter = 640

#### Other notes:
* MIDI track 0 appears to only contain the tempo + time signature information
* Rests are identifiable by a nonzero "time=x" in a "note_on" MIDI event
* Pitches will be tokenized with 0 as the lowest pitch by subtracting MIN_PITCH
* We'll assign rests the maximum "pitch" value of 54 whereas all other notes will be [0, 53] == [35-35, 88-35]
* Each note duration is assigned an index on the interval [0, 14] and is retrieved via index_dict
    * The encoded pitch/duration combination is given by: X = (Pitch - MIN_PITCH) + index * N_NOTES
    * Then we can see min(X) = 0 for (Pitch, index) = (MIN_PITCH, 0) and max(X) = 824 for (Pitch, index) = (89 [rest note], 14)

In [46]:
import os
import re
import time
import numpy as np
import pandas as pd

from mido import MidiFile
from multiprocessing import pool
from matplotlib import pyplot as plt

In [None]:
# Pitch range of the tokenizer, as MIDI note numbers (both ends inclusive).
MIN_PITCH = 35
MAX_PITCH = 88

# When True, one extra token past the highest pitch is reserved for rests.
INCLUDE_REST = True

# Number of pitch tokens: every pitch in the range, plus the rest slot if enabled.
N_NOTES = (MAX_PITCH - MIN_PITCH + 1) + (1 if INCLUDE_REST else 0)

# The rest occupies the last pitch slot; None when rests are disabled.
REST_NOTE = (N_NOTES - 1) if INCLUDE_REST else None

In [None]:
# Supported note durations in MIDI ticks, in ascending order.
note_lengths = sorted(
    [120, 240, 480, 960, 1920, 3840, 7680]  # 32nd ... whole, two whole
    + [360, 720, 1440, 2880, 5760]          # dotted 16th ... dotted whole
    + [160, 320, 640]                       # triplet 16th, 8th, quarter
)

In [None]:
# Map each note length (in ticks) to its position in note_lengths.
index_dict = {length: position for position, length in enumerate(note_lengths)}

In [None]:
# Display the note-length -> index mapping for inspection.
index_dict

In [None]:
# Directory that is scanned for input .mid files.
midi_dir = './midi'
# Directory the encoded note arrays are saved into.
save_dir = './data'

In [None]:
# Create the output directory if needed. exist_ok=True replaces the separate
# isdir() check, which could race with another process creating the directory.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Paths of every .mid file in the input directory. os.listdir() returns names
# in arbitrary order, so sort them to keep positional slices of `files`
# reproducible between runs and machines.
files = sorted(
    os.path.join(midi_dir, name)
    for name in os.listdir(midi_dir)
    if name.endswith('.mid')
)

In [None]:
def get_index(key):
    """Return the duration index for a tick count.

    Args:
        key: A duration in MIDI ticks.

    Returns:
        The value stored in ``index_dict`` for ``key`` when it is an exact
        match; otherwise the value stored for the numerically closest key
        (the earliest key wins when two are equally close).
    """
    # Dict membership is O(1); there is no need to build a key list first.
    if key in index_dict:
        return index_dict[key]

    # No exact match: snap to the closest known note length.
    lengths = list(index_dict)
    distances = np.absolute(np.array(lengths) - key)
    return index_dict[lengths[np.argmin(distances)]]

In [None]:
# Encode every track of every MIDI file as a flat sequence of tokens
# X = pitch_index + N_NOTES * duration_index and save one array per track.
for file in files:
    mid = MidiFile(file)

    # Track 0 is skipped: it appears to hold only tempo / time-signature data.
    for i, track in enumerate(mid.tracks[1:]):

        # Lowest pitch seen since the last emitted note; np.inf means "none yet".
        last_pitch = np.inf
        notes = []

        for msg in track:
            # Read the parsed message fields directly instead of regex-parsing
            # str(msg); this also avoids rebinding the imported `time` module.
            if msg.type == 'note_on':
                # NOTE(review): a note_on with velocity 0 is treated as a note
                # start here; some files use it as a note_off - confirm inputs.

                # A nonzero delta before a note_on is silence: emit a rest token.
                # Skipped when rests are disabled (REST_NOTE is None).
                if msg.time > 0 and REST_NOTE is not None:
                    notes.append(REST_NOTE + N_NOTES * get_index(msg.time))

                # Only the lowest of the currently sounding pitches is kept.
                pitch = msg.note - MIN_PITCH
                if pitch < last_pitch:
                    last_pitch = pitch

            elif msg.type == 'note_off':
                # The first note_off with a nonzero delta carries the duration;
                # emit the pending pitch once and reset.
                if msg.time > 0 and last_pitch != np.inf:
                    notes.append(last_pitch + N_NOTES * get_index(msg.time))
                    last_pitch = np.inf

        # One output per track: "<midi basename> - <track number>" (np.save adds .npy).
        basename = os.path.splitext(os.path.basename(file))[0]
        filename = basename + " - {}".format(i)
        np.save(os.path.join(save_dir, filename), notes)

In [55]:
# Exploratory pass: list every note_on / note_off event of a track as a row
# [step, note_on, pitch, time] to inspect how events group into steps.
for file in files:
    mid = MidiFile(file)

    # Track 0 is skipped: it appears to hold only tempo / time-signature data.
    for track in mid.tracks[1:]:

        step = 0
        note_on = True  # kind of the previous event; starts as "note_on"
        notes = []

        for msg in track:
            # Read the parsed message fields directly instead of regex-parsing
            # str(msg); this also avoids rebinding the imported `time` module.
            if msg.type == 'note_on':
                # A note_on right after a note_off opens a new step.
                if not note_on:
                    step += 1

                notes.append([step, True, msg.note - MIN_PITCH, get_index(msg.time)])
                note_on = True

            elif msg.type == 'note_off':
                # A note_off right after a note_on opens a new step.
                if note_on:
                    step += 1

                # note_off rows carry no pitch; 'time' holds the duration index.
                notes.append([step, False, np.nan, get_index(msg.time)])
                note_on = False

        # Rebuilt for every track, so only the last track processed remains in df.
        df = pd.DataFrame(notes, columns=['step', 'note_on', 'pitch', 'time'])

In [61]:
# Show the last 100 rows of the most recently built event frame.
df.tail(100)

Unnamed: 0,step,note_on,pitch,time
5894,3809,False,,0
5895,3809,False,,0
5896,3809,False,,0
5897,3809,False,,0
5898,3810,True,30.0,0
5899,3810,True,25.0,0
5900,3810,True,21.0,0
5901,3810,True,18.0,0
5902,3810,True,13.0,0
5903,3810,True,6.0,0


In [41]:
# Dump the raw MIDI messages for manual inspection.
# NOTE(review): files[2:] skips the first two files - presumably already
# inspected; confirm the slice is still wanted.
for file in files[2:]:
    mid = MidiFile(file)

    # Track 0 is skipped: it appears to hold only tempo / time-signature data.
    for track in mid.tracks[1:]:

        for msg in track:
            print(msg)

pitchwheel channel=4 pitch=0 time=0
control_change channel=4 control=7 value=127 time=0
control_change channel=5 control=7 value=127 time=0
control_change channel=4 control=10 value=63 time=0
control_change channel=5 control=10 value=63 time=0
control_change channel=4 control=93 value=0 time=0
control_change channel=5 control=93 value=0 time=0
control_change channel=4 control=91 value=0 time=0
control_change channel=5 control=91 value=0 time=0
control_change channel=4 control=95 value=0 time=0
control_change channel=5 control=95 value=0 time=0
control_change channel=4 control=92 value=0 time=0
control_change channel=5 control=92 value=0 time=0
control_change channel=4 control=11 value=127 time=0
control_change channel=5 control=11 value=127 time=0
program_change channel=4 program=30 time=0
program_change channel=5 program=30 time=0
note_on channel=4 note=40 velocity=79 time=0
note_on channel=4 note=35 velocity=79 time=0
note_off channel=4 note=40 velocity=79 time=240
note_off channel=4

note_off channel=4 note=41 velocity=79 time=240
note_on channel=4 note=47 velocity=79 time=240
note_off channel=4 note=47 velocity=79 time=480
note_on channel=4 note=40 velocity=79 time=0
note_off channel=4 note=40 velocity=79 time=240
note_on channel=4 note=40 velocity=79 time=240
note_off channel=4 note=40 velocity=79 time=240
note_on channel=4 note=55 velocity=79 time=240
note_off channel=4 note=55 velocity=79 time=480
note_on channel=4 note=48 velocity=79 time=0
note_off channel=4 note=48 velocity=79 time=240
note_on channel=4 note=57 velocity=79 time=240
note_off channel=4 note=57 velocity=79 time=480
note_on channel=4 note=55 velocity=79 time=0
note_off channel=4 note=55 velocity=79 time=480
note_on channel=4 note=48 velocity=79 time=0
note_off channel=4 note=48 velocity=79 time=240
note_on channel=4 note=55 velocity=79 time=240
note_off channel=4 note=55 velocity=79 time=480
note_on channel=4 note=64 velocity=79 time=0
note_on channel=4 note=59 velocity=79 time=0
note_on channel

note_on channel=4 note=41 velocity=79 time=400
note_off channel=4 note=41 velocity=79 time=240
note_on channel=4 note=42 velocity=79 time=400
note_on channel=4 note=35 velocity=79 time=0
note_off channel=4 note=42 velocity=79 time=240
note_off channel=4 note=35 velocity=79 time=0
note_on channel=4 note=42 velocity=79 time=3600
note_on channel=4 note=35 velocity=79 time=0
note_off channel=4 note=42 velocity=79 time=11520
note_off channel=4 note=35 velocity=79 time=0
note_on channel=4 note=40 velocity=79 time=0
note_off channel=4 note=40 velocity=79 time=240
note_on channel=4 note=45 velocity=79 time=400
note_off channel=4 note=45 velocity=79 time=240
note_on channel=4 note=59 velocity=79 time=400
note_off channel=4 note=59 velocity=79 time=640
note_on channel=5 note=45 velocity=79 time=0
pitchwheel channel=5 pitch=-512 time=128
pitchwheel channel=5 pitch=-1024 time=128
pitchwheel channel=5 pitch=-1664 time=128
pitchwheel channel=5 pitch=-2176 time=128
pitchwheel channel=5 pitch=-2816 ti