In [1]:
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.base import clone
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
from datetime import datetime

from preprocessing import BagOfNotes, MidiPathToDataFrame, PreprocessMidiDataFrame, NfIsf, BagOfChords2, Downsampler
import evaluation

In [2]:
# Root of the MAESTRO v3.0.0 download; the MIDI loader further down is built on it.
data_dir = Path('../maestro/maestro-v3.0.0/')

# Per-title lookup tables produced in the EDA folder.
keys = pd.read_csv('../eda/key_review.csv')
categories = pd.read_csv('../eda/categories.csv')

# The stock metadata file (data_dir / 'maestro-v3.0.0.csv') is not used here;
# the EDA export no_dups.csv is read instead and enriched with both lookup
# tables via left joins on the canonical title, so titles missing from a
# lookup table keep their row and get NaN in the added columns.
df = (
    pd.read_csv('../eda/no_dups.csv')
    .merge(keys, how='left', on='canonical_title')
    .merge(categories, how='left', on='canonical_title')
)
df.head()

Unnamed: 0.1,Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration,draft_key,category
0,0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116,,sonata
1,3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,,prelude
2,4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,,etude
3,5,Alexander Scriabin,"5 Preludes, Op.15",validation,2009,2009/MIDI-Unprocessed_07_R1_2009_04-05_ORIG_MI...,2009/MIDI-Unprocessed_07_R1_2009_04-05_ORIG_MI...,400.557826,,prelude
4,6,Alexander Scriabin,"Entragete, Op.63",test,2009,2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MI...,2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MI...,163.74583,,


In [3]:
# Partition the metadata by the value of its `split` column.
train = df.loc[df['split'] == 'train']
validate = df.loc[df['split'] == 'validation']

In [4]:
# Two-stage loading pipeline: read each MIDI path relative to data_dir,
# then run the cleaning step on the result.
load_steps = [
    ('loader', MidiPathToDataFrame(data_dir)),
    ('cleaner', PreprocessMidiDataFrame()),
]
do_once = Pipeline(load_steps)

# NOTE(review): transform() is called without a preceding fit(); this only
# works if both steps are stateless, and newer scikit-learn releases may
# warn about an unfitted Pipeline -- confirm.
train_loaded = do_once.transform(train['midi_filename'])




In [5]:
# Preview the first rows of the first element returned by the loading
# pipeline (columns in the recorded output: time_from_start, note,
# velocity, duration).
train_loaded[0].head()

Unnamed: 0,time_from_start,note,velocity,duration
0,756,67.0,52.0,635.0
1,1371,72.0,67.0,94.0
2,1650,78.0,65.0,1116.0
3,1655,71.0,45.0,1667.0
4,1674,61.0,39.0,818.0


In [6]:
# Reduce the first frame's note values modulo 12
# (presumably MIDI note numbers -> pitch classes; confirm).
train_loaded[0]['note'].mod(12)

0        7.0
1        0.0
2        6.0
3       11.0
4        1.0
        ... 
4192     6.0
4193     2.0
4194     6.0
4195    11.0
4196    11.0
Name: note, Length: 4197, dtype: float64

In [7]:
# Build an independent copy of every loaded frame whose `note` column is
# reduced modulo 12; the frames in train_loaded are left untouched.
distinct = [t.assign(note=t['note'] % 12) for t in train_loaded]

In [8]:
# Fit the chord vocabulary on the mod-12 note frames.
# NOTE(review): vocab_size=5000 presumably caps the vocabulary size; the
# recorded output further down shows 2419 fitted entries, so that limit
# would not be reached on this data -- confirm against BagOfChords2.
boc = BagOfChords2(vocab_size=5000)
boc.fit(distinct)

In [9]:
# Number of entries in the fitted vocabulary.
boc.vocab_.shape

(2419,)

In [10]:
# Inspect the fitted vocabulary. In the recorded output it is a Series of
# counts indexed by comma-joined note values, and it appears to be ordered
# from most to least frequent.
boc.vocab_

7                 87437
2                 85752
9                 84986
0                 83197
4                 81415
                  ...  
0,1,8,9,10            1
0,1,2,5,6,9           1
1,2,4,9,10,11         1
0,1,2,3,7,8,11        1
1,2,4,5,6,7           1
Name: count, Length: 2419, dtype: int64

In [11]:
# Summary statistics of the vocabulary counts. The recorded output is
# heavily right-skewed (median 5, max 87437).
boc.vocab_.describe()

count     2419.000000
mean       629.577098
std       5601.085863
min          1.000000
25%          1.000000
50%          5.000000
75%         37.500000
max      87437.000000
Name: count, dtype: float64