In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

In [9]:
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans, SpectralClustering
import matplotlib.pyplot as plt
from pathlib import Path

from preprocessing import BagOfNotes, MidiPathToDataFrame, PreprocessMidiDataFrame, NfIsf
import evaluation

In [5]:
# Root of the dataset release; also handed to the MIDI loader further down.
data_dir = Path('../maestro/maestro-v3.0.0/')

# Per-title key annotations (supplies the `draft_key` column).
keys = pd.read_csv('../eda/key_review.csv')

# Dataset metadata, left-joined with the key annotations on the piece title so
# that rows without an annotation are kept (their `draft_key` is NaN).
df = pd.read_csv(data_dir / 'maestro-v3.0.0.csv').merge(
    keys,
    on='canonical_title',
    how='left',
)
df.head()

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration,draft_key
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116,
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471,
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,


In [6]:
# Partition the metadata using the `split` column of the merged table.
train = df.loc[df['split'] == 'train']
validate = df.loc[df['split'] == 'validation']

In [None]:
# Loading/cleaning stage, run a single time per split and kept outside the
# model grid below so its output can be reused by every model combination.
do_once = Pipeline([
    ('loader', MidiPathToDataFrame(data_dir)),
    ('cleaner', PreprocessMidiDataFrame()),
])
# Fit on the training split before transforming: calling `transform` on a
# Pipeline that was never fitted is deprecated in recent scikit-learn releases
# (warning today, error later). `fit_transform` fits and transforms in one
# pass, so the training files are not processed twice.
# NOTE(review): assumes both custom steps implement `fit` (they must for any
# fitted Pipeline) -- confirm in ../src/preprocessing.py.
train_loaded = do_once.fit_transform(train.midi_filename)
# Validation data only goes through the already-fitted steps.
val_loaded = do_once.transform(validate.midi_filename)


In [18]:
# Candidate steps for each pipeline stage, as (name, estimator) pairs.
vectorizers = [
    ('BoN', BagOfNotes(normalize=False)),
    ('NfIsf', NfIsf()),
]
reducers = [
    ('PCA2', PCA(2)),
    ('PCA4', PCA(4)),
    ('Iso2', Isomap(n_neighbors=15, n_components=2)),
    ('Iso4', Isomap(n_neighbors=15, n_components=4)),
]
clusterers = [
    ('kmeans', KMeans(n_init=10, random_state=6740*42)),
    # Disabled candidate, kept for reference:
    # ('spectral', SpectralClustering())
]

# Metadata columns the cluster assignments are scored against.
label_columns = ['canonical_composer', 'draft_key']

# Every vectorizer/reducer/clusterer combination, in the same order a triple
# nested loop would visit them (clusterer varies fastest).
combinations = (
    (vec, red, clu)
    for vec in vectorizers
    for red in reducers
    for clu in clusterers
)

for v, r, c in combinations:
    # Fit on the training split, then assign validation pieces to clusters.
    p = Pipeline(steps=[v, r, c])
    p.fit(train_loaded)
    clusters = p.predict(val_loaded)
    scores = evaluation.evaluate_clusters(clusters, validate[label_columns])

    # Report: step names on one line, the score dict on the next.
    print(v[0], r[0], c[0])
    print(scores)
    print('\n')

BoN PCA2 kmeans
{'canonical_composer': 0.2865450431739002, 'draft_key': 0.23957696592987113}


BoN PCA4 kmeans
{'canonical_composer': 0.24927306993119677, 'draft_key': 0.24222184935009602}


BoN Iso2 kmeans
{'canonical_composer': 0.27676045400321114, 'draft_key': 0.4180431558947288}


BoN Iso4 kmeans
{'canonical_composer': 0.23388759948866417, 'draft_key': 0.35677831621806794}


NfIsf PCA2 kmeans
{'canonical_composer': 0.26989441602815833, 'draft_key': 0.3060379723456053}


NfIsf PCA4 kmeans
{'canonical_composer': 0.22021758364412636, 'draft_key': 0.22249668042140433}


NfIsf Iso2 kmeans
{'canonical_composer': 0.21137489839309526, 'draft_key': 0.2848045045733567}


NfIsf Iso4 kmeans
{'canonical_composer': 0.22532112381001382, 'draft_key': 0.2272434779047489}


