In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
import pickle
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline

# Project modules live in ../src; it must be on sys.path before they are imported.
sys.path.insert(0, str(Path('../src').resolve()))
from preprocessing import BagOfNotes, BagOfChords2, NfIsf, PreprocessMidiDataFrame, MidiPathToDataFrame, MidiPathToPrettyMidi,InstrumentAwareBoN
import evaluation

ModuleNotFoundError: No module named 'preprocessing'

In [None]:
# Dataset root and the metadata table that lists every MIDI file with its split.
maestro_path = Path('../maestro/maestro-v3.0.0/')
df = pd.read_csv('../eda/no_dups.csv')

# Partition the metadata rows by their `split` label.
is_train = df['split'] == 'train'
is_validation = df['split'] == 'validation'
train = df.loc[is_train]
validate = df.loc[is_validation]

# Two-step pipeline: MidiPathToDataFrame rooted at the dataset folder, followed
# by PreprocessMidiDataFrame. It is applied to the file names of each split.
loader_step = ('loader', MidiPathToDataFrame(maestro_path))
cleaner_step = ('cleaner', PreprocessMidiDataFrame())
pipe = Pipeline([loader_step, cleaner_step])
train_loaded = pipe.transform(train['midi_filename'])
val_loaded = pipe.transform(validate['midi_filename'])

# Second representation of the same files, produced by MidiPathToPrettyMidi
# (a fresh transformer instance per split, train first, then validation).
pretty_train, pretty_val = (
    MidiPathToPrettyMidi(maestro_path).transform(split_frame['midi_filename'])
    for split_frame in (train, validate)
)

In [None]:
# Folder holding the BitMidi files that are projected into the MAESTRO embedding.
bitmidi_folder = Path('../data/')

# os.listdir returns entries in arbitrary order. Sorting pins the file order so
# that row i of every BitMidi output (frames, PrettyMIDI objects, embeddings,
# cluster labels) refers to the same file on every run and machine.
bitmidi_files = sorted(f for f in os.listdir(bitmidi_folder) if f.endswith(".mid"))

# Same loader -> cleaner steps as used for MAESTRO, rooted at the BitMidi folder.
bitmidi_raw = Pipeline([
    ('loader', MidiPathToDataFrame(bitmidi_folder)),
    ('cleaner', PreprocessMidiDataFrame())
]).transform(bitmidi_files)

# PrettyMIDI representation of the same files, in the same (sorted) order.
bitmidi_pretty = MidiPathToPrettyMidi(bitmidi_folder).transform(bitmidi_files)

In [None]:
# Candidate feature extractors, keyed by the short name used in result keys.
# Materialised as a list of (name, estimator) pairs in insertion order.
vectorizers = list({
    'BoN': BagOfNotes(normalize=False),
    'BoNn': BagOfNotes(normalize=True),
    'NfIsf': NfIsf(),
    'BoC2': BagOfChords2(time_threshold=30),
    'InstBoN': InstrumentAwareBoN(),
}.items())

In [None]:
# Sweep Isomap's n_neighbors and keep the value whose 2-D embedding gives the
# best silhouette score under a fixed 20-cluster KMeans.
#
# The vectorizer is fitted on `train_loaded` (the preprocessed MAESTRO training
# frames). The previous input, `midi_dfs`, is never defined in this notebook, so
# the cell failed with NameError on a fresh kernel.
X_vec_test = BagOfNotes(normalize=True).fit_transform(train_loaded)

scores = {}
for n in range(3, 36, 3):
    iso = Isomap(n_neighbors=n, n_components=2)
    X_iso = iso.fit_transform(X_vec_test)
    # Fixed seed so the sweep is comparable across n and reproducible.
    kmeans = KMeans(n_clusters=20, random_state=42)
    labels = kmeans.fit_predict(X_iso)
    scores[n] = silhouette_score(X_iso, labels)
    print(f"n_neighbors={n}, silhouette={scores[n]:.4f}")

# First n_neighbors with the highest silhouette (ties resolve to the smallest n).
best_n = max(scores, key=scores.get)

In [None]:
# Re-embed with the selected n_neighbors, then sweep the KMeans cluster count.
best_isomap = Isomap(n_neighbors=best_n, n_components=2)
X_iso = best_isomap.fit_transform(X_vec_test)

k_scores = {}
for k in range(2, 31):
    k_clusterer = KMeans(n_clusters=k, random_state=42)
    labels = k_clusterer.fit_predict(X_iso)
    k_scores[k] = silhouette_score(X_iso, labels)
    print(f"k={k}, silhouette={k_scores[k]:.4f}")

# Pick the k with the highest silhouette; the first maximum wins on ties.
best_k = max(k_scores.items(), key=lambda item: item[1])[0]

In [None]:
# Candidate 2-D dimensionality reducers as (name, estimator) pairs; the Isomap
# entry uses the n_neighbors value chosen by the sweep above.
pca_2d = PCA(n_components=2)
isomap_2d = Isomap(n_neighbors=best_n, n_components=2)
reducers = [('PCA2', pca_2d), ('Iso2', isomap_2d)]

In [None]:
# Candidate clustering steps as (name, estimator) pairs; KMeans uses the
# cluster count chosen by the k sweep.
kmeans_best_k = KMeans(n_clusters=best_k, random_state=33)
clusterers = [('kmeans', kmeans_best_k)]

In [None]:
# Fit every vectorizer x reducer x clusterer combination on the MAESTRO training
# split, score it on the validation split, and project the BitMidi files into
# the same embedding.
#
# results:                 key -> value returned by evaluation.evaluate_clusters
# bitmidi_cluster_results: key -> predicted cluster label per BitMidi file
results = {}
bitmidi_cluster_results = {}

for vec_name, vectorizer in vectorizers:
    # The 'InstBoN' vectorizer is fed the PrettyMIDI representation; every other
    # vectorizer is fed the preprocessed frames.
    uses_pretty_midi = vec_name == 'InstBoN'
    maestro_data = pretty_train if uses_pretty_midi else train_loaded
    val_data = pretty_val if uses_pretty_midi else val_loaded
    bitmidi_data = bitmidi_pretty if uses_pretty_midi else bitmidi_raw

    for red_name, reducer in reducers:
        for clust_name, clusterer in clusterers:
            # Build the key before the try block so the error handler always
            # reports the combination that actually failed (previously it was
            # assigned after fit(), leaving it stale or undefined on failure).
            key = f'{vec_name}-{red_name}-{clust_name}'
            print(f'Running: {key}')

            # clone() gives each run fresh, unfitted copies of the shared
            # estimator objects.
            model = clone(Pipeline([
                (vec_name, vectorizer),
                (red_name, reducer),
                (clust_name, clusterer)
            ]))

            try:
                model.fit(maestro_data)

                # Everything except the final clustering step yields the 2-D
                # embedding; the final step assigns cluster labels.
                embedder = model[:-1]
                fitted_clusterer = model[-1]

                val_embeddings = embedder.transform(val_data)
                val_preds = fitted_clusterer.predict(val_embeddings)
                score = evaluation.evaluate_clusters(val_preds, validate[['canonical_composer']])
                results[key] = score

                bitmidi_embeds = embedder.transform(bitmidi_data)
                bitmidi_clusters = fitted_clusterer.predict(bitmidi_embeds)
                bitmidi_cluster_results[key] = bitmidi_clusters

                fig, ax = plt.subplots(figsize=(8, 6))
                ax.scatter(val_embeddings[:, 0], val_embeddings[:, 1], alpha=0.5, label='MAESTRO Val')
                ax.scatter(bitmidi_embeds[:, 0], bitmidi_embeds[:, 1], c='red', marker='x', label='BitMidi')
                ax.set_title(f"{key} Embedding")
                ax.legend()
                fig.tight_layout()
                plt.show()
                # One figure per combination: release it so figures do not
                # accumulate in memory across the loop.
                plt.close(fig)

            except Exception as e:
                # Best effort: report the failing combination and carry on with
                # the remaining ones.
                print(f"Error in {key}: {e}")

In [None]:
# Refit the best-scoring vectorizer + reducer on the MAESTRO training split and
# plot the resulting 2-D embedding coloured by composer.
if not results:
    raise RuntimeError("No pipeline finished successfully; `results` is empty.")

# NOTE(review): assumes evaluation.evaluate_clusters returns a mapping with a
# 'silhouette' entry — confirm against the evaluation module.
best_model_key = max(results, key=lambda k: results[k]['silhouette'])
print(f"Best model: {best_model_key}")

# Keys have the form '<vectorizer>-<reducer>-<clusterer>'.
best_vec_name, best_red_name = best_model_key.split('-')[:2]

# Pipeline steps must be (name, estimator) tuples; clone() keeps the shared
# estimator objects in `vectorizers` / `reducers` unfitted.
best_model = Pipeline([
    (best_vec_name, clone(dict(vectorizers)[best_vec_name])),
    (best_red_name, clone(dict(reducers)[best_red_name])),
])

# Feed the same representation the grid search used for this vectorizer.
best_train_data = pretty_train if best_vec_name == 'InstBoN' else train_loaded
X_embed = best_model.fit_transform(best_train_data)

# Colour each point by composer.
# NOTE(review): assumes the embedding rows follow the row order of `train` — confirm.
composer_codes = train.canonical_composer.astype('category').cat.codes
plt.scatter(X_embed[:, 0], X_embed[:, 1], c=composer_codes)
plt.title('MAESTRO Cluster Embedding')
plt.show()