In [None]:
# Enable the IPython autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.base import clone
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
from datetime import datetime

from preprocessing import BagOfNotes, MidiPathToDataFrame, PreprocessMidiDataFrame, NfIsf, BagOfChords2, Downsampler
import evaluation

In [None]:
# Root directory of the MAESTRO v3.0.0 dataset; passed to the MIDI loader later on.
data_dir = Path('../maestro/maestro-v3.0.0/')
# Alternative source (unused): the original metadata file shipped with the dataset.
# df = pd.read_csv(data_dir / 'maestro-v3.0.0.csv')

# Per-title annotation tables produced in ../eda; both are joined on
# 'canonical_title'.
keys = pd.read_csv('../eda/key_review.csv')
categories = pd.read_csv('../eda/categories.csv')

# Left joins keep every row of the base table even when a title has no
# annotation (the annotation columns are NaN in that case).
# NOTE(review): presumably each annotation table has one row per
# canonical_title; duplicated titles there would duplicate rows here — confirm.
df = (
    pd.read_csv('../eda/no_dups.csv', index_col=0)
    .merge(keys, how='left', on='canonical_title')
    .merge(categories, how='left', on='canonical_title')
)
df.head()

In [None]:
# Partition the metadata using the 'split' column of the merged table.
train = df[df['split'].eq('train')]
validate = df[df['split'].eq('validation')]

In [None]:
# Sanity check: (rows, columns) of the training subset.
print(train.shape)

In [None]:
# Loading and cleaning are shared by every experiment below, so they run once
# here instead of inside each experiment pipeline.
# NOTE(review): transform() is called without a prior fit(); this presumably
# relies on both steps being stateless, and recent scikit-learn releases may
# warn about transforming with an unfitted Pipeline — TODO confirm.
do_once = Pipeline(steps=[
    ('loader', MidiPathToDataFrame(data_dir)),
    ('cleaner', PreprocessMidiDataFrame()),
])
train_loaded, val_loaded = (
    do_once.transform(subset['midi_filename']) for subset in (train, validate)
)


In [None]:
# Experiment grid: every vectorizer is combined with every reducer and every
# clusterer. Each entry is a (label, estimator) pair that is used directly as
# a Pipeline step; the labels also build the keys of `models` and `results`,
# so they must be unique within each list.
vectorizers = [
    ('BoN', BagOfNotes(normalize=False)),
    ('nBoN', BagOfNotes(normalize=True)),
    ('VW-nBoN', BagOfNotes(normalize=True, weight_by_velocity=True)),
    ('DW-nBoN', BagOfNotes(normalize=True, weight_by_duration=True)),
    ('VDW-nBoN', BagOfNotes(normalize=True, weight_by_velocity=True, weight_by_duration=True)),
    ('nBoDN', BagOfNotes(normalize=True, reduce_to_distinct=True)),
    ('BoC', BagOfChords2(time_threshold=30, vocab_size=500)),
    # This entry used to be labelled 'BoC' as well, so its model and scores
    # silently overwrote those of the entry above. 'BoDC' follows the 'nBoDN'
    # convention for the reduce_to_distinct variant.
    ('BoDC', BagOfChords2(time_threshold=30, vocab_size=1000, reduce_to_distinct=True)),
    # ('BoC', Pipeline([('Downsample', Downsampler(n_samples=500)), ('BoC', BagOfChords2(time_threshold=30, vocab_size=500))])),
    ('NfIsf', NfIsf()),
]
reducers = [
    ('PCA2', PCA(2)),
    # ('PCA4', PCA(4)),
    ('ISO2', Isomap(n_neighbors=15, n_components=2)),
    # ('Iso4', Isomap(n_neighbors=15, n_components=4)),
]
clusterers = [
    ('kmeans', KMeans(n_clusters=20, n_init=10, random_state=6740*42)),
    # ('spectral', SpectralClustering())
]

# Fail fast on label collisions: a duplicated label would make one experiment
# overwrite another in the result dictionaries without any error.
for step_group in (vectorizers, reducers, clusterers):
    step_labels = [label for label, _ in step_group]
    assert len(step_labels) == len(set(step_labels)), f'duplicate step labels: {step_labels}'

# Metadata columns of `validate` that the clusters are compared against.
label_columns = ['canonical_composer', 'draft_key', 'category']

models = {}   # experiment name -> fitted Pipeline
results = {}  # experiment name -> dict of scores
for v in vectorizers:
    for r in reducers:
        for c in clusterers:
            # clone() yields fresh, unfitted copies so that no fitted state
            # is shared between experiments reusing the same estimator object.
            p = clone(Pipeline([v, r, c]))
            p.fit(train_loaded)

            # Evaluate on the validation set: embed with the first two steps
            # (vectorizer + reducer), then assign clusters with the last one.
            embeddings = p[:-1].transform(val_loaded)
            clusters = p[-1].predict(embeddings)
            # `scores` is iterated and indexed by label column below, so it is
            # presumably a dict keyed by the columns passed in — verify
            # against evaluation.evaluate_clusters.
            scores = evaluation.evaluate_clusters(clusters, validate[label_columns])

            # Keys stay '<vectorizer>-<reducer>' while a single clusterer is
            # configured; the clusterer label is appended only when needed to
            # keep keys unique.
            name = f'{v[0]}-{r[0]}'
            if len(clusterers) > 1:
                name = f'{name}-{c[0]}'

            # One plot per label column. This must run before 'silhouette' is
            # added to `scores`, because every key is used as a column of
            # `validate`.
            for s in scores:
                evaluation.plot_with_label(embeddings, validate[s], f'{name} (homogeneity={scores[s]:.4f})')

            silhouette = silhouette_score(embeddings, clusters)
            scores['silhouette'] = silhouette
            models[name] = p
            results[name] = scores

In [None]:
# Persist every fitted pipeline in a date-stamped pickle (models_YYYYMMDD.pkl).
# The date is formatted outside the f-string: reusing the outer quote
# character inside a replacement field is a SyntaxError before Python 3.12.
models_stamp = datetime.today().strftime('%Y%m%d')
with open(f'models_{models_stamp}.pkl', 'wb') as f:
    pickle.dump(models, f)

In [None]:
# Tabulate the scores: one row per experiment name, one column per score key.
result_df = pd.DataFrame.from_dict(data=results, orient='index')
result_df

In [None]:
# Write the score table to a date-stamped CSV (results_YYYYMMDD.csv).
# The date is formatted outside the f-string: reusing the outer quote
# character inside a replacement field is a SyntaxError before Python 3.12.
results_stamp = datetime.today().strftime('%Y%m%d')
result_df.to_csv(f'results_{results_stamp}.csv')