### Compare Extractors

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import torch
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
from scipy.stats import wasserstein_distance

from pathlib import Path
import os
from datetime import datetime
import uuid
import json
import shutil

from kcm.koopman_category_model import KoopmanCategoryModel
from kcm.basic_feature_extract import BasicFeatureExtractor
from kcm.discovery import (
    CategoryDiscoveryTrainer,
    train_test_split_indices,
    prep_data_for_discovery,
    check_histograms,
    sup_con_loss,
    BaselineModel,
    HASHHead,
    cluster_acc,
    split_cluster_acc_v1,
    split_cluster_acc_v2,
    create_hash_ids
)

# import warnings
# warnings.filterwarnings('ignore')


# Apply matplotlib's 'dark_background' style sheet to figures created after this point.
plt.style.use('dark_background')

In [2]:
# Reproducibility Code
# `seed` is passed to the KoopmanCategoryModel constructor below; `rng` is handed to
# train_test_split_indices when the train/test split is drawn.
seed=42
rng = np.random.default_rng(seed)

In [3]:
############### Shared Inputs ###############
# Settings read by both feature extractors. Trailing "# <value>" comments record alternative
# values used in other runs.
num_cats = 4 # 10
num_samples = 100 # 500
system_dimension = 2
test_size = 0.2
category_discovery=True
train_classes = [1,2,3] # range(3) # range(7)
noisy_data=False
noise_std=0.01
# The dataset file name encodes the class count, sample count and (optionally) the noise level.
samples_name = f'noisy_{noise_std}_samples' if noisy_data else 'samples'
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot be re-run on another
# machine without editing this line. Consider a configurable data root.
data_path = (rf"C:\Users\peterdb1\Documents\Masters in ACM\(i-j) 625.801-802 - ACM Master's Research\Technical Work\koopman-category-discovery\data",
                f"{system_dimension}-dimensional-systems",
                f"dataset_{num_cats}_class_{num_samples}_{samples_name}.pkl"
            )
# Collapse the tuple of path components into a single path string.
data_path = os.path.join(*data_path)
#############################################


############ kcm-specific inputs ############
# Passed to KoopmanCategoryModel / create_codebook below.
delay_embeddings = 3
num_segments = 8 # 30
svd_rank = None
dmd_rank = None
q = 1
num_clusters = 5 # 20
codebook_training_size = 120 # 490 # divides <num training classes>
normalize_kcm_inputs=True
soft_clustering=True
tau = 1.0
#############################################


###### basic extractor-specific inputs ######
# Passed to BasicFeatureExtractor.batch_extract_features below.
drop_na=True
normalize_basic_inputs=False
#############################################

# Draw the train/test split once so both extractors can be filtered with the same
# `count` values. The recorded output for this configuration reports 240 training
# and 160 testing samples.
train_counts, test_counts = train_test_split_indices(num_cats,num_samples,test_size,train_classes,category_discovery,rng)

Category Discovery Split...
    Training Size: 240
    Testing Size: 160


### Prepare Data

In [4]:
# Build the Koopman category model from the shared and kcm-specific settings defined above.
KCM = KoopmanCategoryModel(num_cats=num_cats,
                           num_samples=num_samples,
                           system_dimension=system_dimension,
                           delay_embeddings=delay_embeddings,
                           num_segments=num_segments,
                           svd_rank=svd_rank,
                           dmd_rank=dmd_rank,
                           q=q,
                           data_path=data_path,
                           cluster_method='kmeans',
                           num_clusters=num_clusters,
                           noisy_data=noisy_data,
                           noise_std=noise_std,
                           normalize_inputs=normalize_kcm_inputs,
                           train_classes=train_classes,
                           soft_clustering=soft_clustering,
                           tau=tau,
                           seed=seed)

# Attach the precomputed split to the model before generating its data.
# Presumably later KCM methods read these attributes -- TODO confirm in kcm.koopman_category_model.
KCM.train_counts = train_counts
KCM.test_counts = test_counts
KCM.generate_data()

# Partition the generated frame by its `count` column using the shared split.
kcm = KCM.df
kcm_train_data = kcm.loc[kcm['count'].isin(train_counts)].reset_index(drop=True)
kcm_test_data = kcm.loc[kcm['count'].isin(test_counts)].reset_index(drop=True)

KCM.df_train = kcm_train_data
KCM.df_test = kcm_test_data

# Sanity check: rows / num_segments should equal the number of samples in each split.
# NOTE(review): int() truncates, so a row count that is not an exact multiple of
# num_segments can still pass; comparing rows == num_segments * len(...) would be stricter.
assert int(kcm_train_data.shape[0]/num_segments) == len(train_counts), 'training samples not correct shape'
assert int(kcm_test_data.shape[0]/num_segments) == len(test_counts), 'testing samples not correct shape'

Loading data in at C:\Users\peterdb1\Documents\Masters in ACM\(i-j) 625.801-802 - ACM Master's Research\Technical Work\koopman-category-discovery\data\2-dimensional-systems\dataset_4_class_100_samples.pkl...
Generating 3200 DMD eigs/modes each with dimensionality 8


harmonic_oscillator system: 100%|███████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 204.65it/s]
spring_mass_with_forcing system: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 219.28it/s]
duffing_oscillator system: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 237.55it/s]
van_der_pol_oscillator system: 100%|████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 234.21it/s]
Formatting DMD data: 100%|███████████████████████████████████████████████████████████████████████████████| 3200/3200 [00:00<00:00, 4000.26it/s]


In [5]:
# Build the codebook from the configured number of training samples.
# The recorded progress bars show this step computing Wasserstein distance metrics
# and taking tens of seconds, so it is the expensive cell of the notebook.
KCM.create_codebook(codebook_training_size=codebook_training_size,
                    category_discovery=category_discovery,
                    include_plots=True)

# Derive the model's feature outputs (presumably the kcm_train / kcm_test attributes
# read further down -- TODO confirm).
KCM.create_feature_outputs()

# Save the model, then shut down its logger.
KCM.save()
KCM.shutdown_logger()

Creating df_sample for codebook


(1/2) * 120^2 = 7200 Wasserstein distance metrics:  99%|█████████████████████████████████████████████████▌| 7140/7200 [00:17<00:00, 409.70it/s]
5 * 1920 = 9600 Wasserstein distance metrics: 100%|███████████████████████████████████████████████████████| 9600/9600 [00:20<00:00, 462.61it/s]
5 * 1280 = 6400 Wasserstein distance metrics: 100%|███████████████████████████████████████████████████████| 6400/6400 [00:13<00:00, 463.34it/s]


In [10]:
# Print the final path component of the model's run directory.
print(KCM.run_dir.stem)

KCM_20250811_225835_b382058c


In [7]:
def load_koopman_model(run_dir):
    """Load a saved Koopman category model and its parameter file from a run directory.

    Parameters
    ----------
    run_dir : str or os.PathLike
        Directory expected to contain ``koopman_model.pkl`` and ``params.json``.

    Returns
    -------
    tuple
        ``(model, params)``: ``model`` is whatever ``KoopmanCategoryModel.load``
        returns for the model file, ``params`` is the decoded content of
        ``params.json``.

    Raises
    ------
    FileNotFoundError
        If ``params.json`` is missing from ``run_dir`` (raised by ``open``).
    """
    run_dir = Path(run_dir)

    # NOTE(review): the ``.pkl`` suffix suggests the loader unpickles the file; if so,
    # only point this at run directories you trust (unpickling can execute code).
    model = KoopmanCategoryModel.load(run_dir / "koopman_model.pkl")

    # Read the JSON with an explicit encoding so the result does not depend on the
    # platform's locale default.
    with open(run_dir / "params.json", "r", encoding="utf-8") as f:
        params = json.load(f)

    return model, params

In [35]:
# Reload a previously saved model instead of the one built earlier in this session.
# NOTE(review): hard-coded absolute path to one specific run; `KCM` is rebound here, so every
# cell below works on the loaded model rather than the one constructed above.
run_dir = r"C:\Users\peterdb1\Documents\Masters in ACM\(i-j) 625.801-802 - ACM Master's Research\Technical Work\koopman-category-discovery\experiments\KCM_20250811_192934_79d7f879"
KCM, params = load_koopman_model(run_dir)
# Iterating the loaded params yields its keys when it is a dict, so only the names are printed.
for par in params:
    print(par)

In [None]:
# train test split
# Prepare the KCM feature outputs for the discovery trainer: no extra normalisation and
# no PCA reduction are requested for this extractor.
kcm_X_train, kcm_X_test, kcm_y_train, kcm_y_test, kcm_stacked = prep_data_for_discovery(train=KCM.kcm_train,
                                                                                        test=KCM.kcm_test,
                                                                                        normalize_final_data=False, # already normalized within kcm.create_codebook()
                                                                                        pca_reduction=False,
                                                                                        n_components=None,
                                                                                        feat_extractor='kcm')

In [None]:
# Build the baseline extractor from the KCM model's own settings so both extractors
# are pointed at the same dataset file, noise settings and seed.
Extractor = BasicFeatureExtractor(num_cats=KCM.num_cats,
                                  num_samples=KCM.num_samples,
                                  system_dimension=KCM.system_dimension,
                                  data_path=KCM.data_path,
                                  noisy_data=KCM.noisy_data,
                                  noise_std=KCM.noise_std,
                                  seed=KCM.seed)

Extractor.batch_extract_features(normalize_inputs=normalize_basic_inputs, drop_na=drop_na)

# Filter by the same `count` split used for the KCM frame. Unlike the KCM split,
# the index is not reset here.
# NOTE(review): train_counts/test_counts come from this session's split cell, while KCM may
# have been reloaded from a saved run -- confirm the two describe the same split.
basic = Extractor.df
basic_train = basic.loc[basic['count'].isin(train_counts)]
basic_test = basic.loc[basic['count'].isin(test_counts)]

# Prepare the baseline features: normalised and PCA-reduced to KCM.num_clusters components.
basic_X_train, basic_X_test, basic_y_train, basic_y_test, basic_stacked = prep_data_for_discovery(train=basic_train,
                                                                                                  test=basic_test,
                                                                                                  normalize_final_data=True,
                                                                                                  pca_reduction=True,
                                                                                                  n_components=KCM.num_clusters, # make kcm and basic extract have same feature dimension
                                                                                                  feat_extractor='basic')

In [None]:
# Run the histogram check on each extractor's stacked data, with a blank line
# printed between the two reports.
check_histograms(basic_stacked)
print('')
check_histograms(kcm_stacked)

In [None]:
# Comparing system-target mappings for basic extractor and kcm
mapping = basic[['system_name','target']].drop_duplicates().values
basic_system_dict = {row[1] : row[0] for row in mapping}
kcm_system_dict = {tgt : cat for cat,tgt in zip(KCM.cats,KCM.df['target'].drop_duplicates())}
assert basic_system_dict == kcm_system_dict, 'Dictionaries do not match between extractors'

assert (basic.target.values == KCM.df[['target','sample']].drop_duplicates()['target'].values).all()
assert (basic_train.target.values == KCM.df_train[['target','sample']].drop_duplicates()['target'].values).all()
assert (basic_test.target.values == KCM.df_test[['target','sample']].drop_duplicates()['target'].values).all()

In [None]:
def create_discovery_run_dir(base_dir="experiments", tag="discovery_run"):
    """Create a new, uniquely named run directory containing plots/results/logs folders.

    The directory name is ``<tag>_<YYYYmmdd_HHMMSS>_<8 hex chars>`` and is created
    under ``base_dir``. The resulting path is printed and returned as a ``Path``.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    suffix = uuid.uuid4().hex[:8]
    run_dir = Path(base_dir).joinpath(f"{tag}_{stamp}_{suffix}")

    # Only the first subfolder is created with parents=True; that call also brings
    # base_dir and run_dir into existence, so the remaining folders need no parents.
    for subfolder, with_parents in (("plots", True), ("results", False), ("logs", False)):
        run_dir.joinpath(subfolder).mkdir(parents=with_parents)

    print(f"Created discovery run directory: {run_dir}")
    return run_dir

In [None]:
def save_discovery_params(params, run_dir):
    """Write discovery parameters to ``<run_dir>/discovery_params.json``.

    Parameters
    ----------
    params : dict
        JSON-serialisable parameters to record.
    run_dir : str or os.PathLike
        Target run directory. It is created (including parents) when it does not
        exist yet, so the function also works before the run directory is set up.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``params`` contains a value ``json.dump`` cannot serialise.
    """
    run_dir = Path(run_dir)
    # Create the directory on demand instead of failing with FileNotFoundError.
    run_dir.mkdir(parents=True, exist_ok=True)

    path = run_dir / "discovery_params.json"
    # Explicit encoding keeps the file independent of the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(params, f, indent=2)
    print(f"Saved discovery parameters to {path}")

In [None]:
# Parameters for the discovery run below.
# NOTE(review): this rebinds `params`, replacing the dict returned by load_koopman_model earlier.
# NOTE(review): these values differ from the configuration cell (codebook size 120, 5 clusters,
# tau 1.0, train classes [1, 2, 3]) and "koopman_model_dir" looks like a placeholder path --
# confirm before running.
params = {
    "codebook_training_size": 280,
    "num_clusters": 20,
    "use_soft_clustering": True,
    "tau": 0.5,
    "train_classes": [0, 1, 2, 3, 4, 5, 6],
    "koopman_model_dir": "experiments/koopman_run_20240810_1a2b3c"
}
# NOTE(review): when cells are run top to bottom, `run_dir` is still the hard-coded KCM run
# directory from the reload cell, so this writes discovery_params.json into that directory.
save_discovery_params(params, run_dir)

In [None]:
# 1. Set up discovery run
discovery_run_dir = create_discovery_run_dir()
save_discovery_params(params, discovery_run_dir)

# 2. Load Koopman model
# NOTE(review): rebinds `KCM` again; the kcm_* arrays prepared earlier were built from the
# previously loaded model and are not refreshed here.
KCM, KCM_params = load_koopman_model(params["koopman_model_dir"])

# 3. Run category discovery
# NOTE(review): the earlier create_codebook call in this notebook does not pass `train_classes`;
# confirm the method accepts this keyword.
KCM.create_codebook(
    codebook_training_size=params["codebook_training_size"],
    category_discovery=True,
    train_classes=params["train_classes"],
    include_plots=True  # save plots later
)

# 4. Save results
# NOTE(review): `save_artifact` is neither defined nor imported anywhere in the visible notebook;
# on a fresh kernel these two lines raise NameError unless it is provided elsewhere.
save_artifact(KCM.codebook, discovery_run_dir, "codebook")
save_artifact(KCM.df_test, discovery_run_dir, "df_test")


In [None]:
############## Training Parameters ##############
# NOTE(review): input_dim follows the config-cell `num_clusters`, whereas the baseline features
# are PCA-reduced to `KCM.num_clusters`; confirm the two agree when KCM was loaded from disk.
input_dim = num_clusters
output_dim = 6
hidden_dims = [200,200] # [200, 200], [1024, 512, 256]
dropout = 0.3
classes = num_cats
epochs = 500
model_type = 'SMILE' # baseline, SMILE
temperature = 0.2
#################################################

# Both extractors are trained with exactly the same hyper-parameters, so the keyword
# arguments are assembled once and reused for each trainer.
trainer_kwargs = dict(
    input_dim=input_dim,
    output_dim=output_dim,
    hidden_dims=hidden_dims,
    dropout=dropout,
    classes=classes,
    epochs=epochs,
    model_type=model_type,
    temperature=temperature,
)

kcm_trainer = CategoryDiscoveryTrainer(**trainer_kwargs)
basic_trainer = CategoryDiscoveryTrainer(**trainer_kwargs)

In [None]:
# Train the discovery model on the KCM features.
kcm_trainer.train_model(kcm_X_train, kcm_X_test, kcm_y_train, kcm_y_test)

In [None]:
# Train the discovery model on the baseline-extractor features.
basic_trainer.train_model(basic_X_train, basic_X_test, basic_y_train, basic_y_test)

In [None]:
# Loss plots for both trainers (log=True presumably selects a log-scaled axis -- TODO confirm).
kcm_trainer.plot_loss(log=True)
basic_trainer.plot_loss(log=True)

In [None]:
# Unique-hash-count plots for both trainers, KCM first and baseline second.
kcm_trainer.plot_unique_hash_count()
basic_trainer.plot_unique_hash_count()

In [None]:
# Score plots for both trainers, KCM first and baseline second.
kcm_trainer.plot_scores()
basic_trainer.plot_scores()

In [None]:
# Hash plots for both trainers with the same arguments.
# index=-1 presumably selects the last recorded entry -- TODO confirm in CategoryDiscoveryTrainer.
kcm_trainer.plot_hashes(index=-1,split_testing=False)
basic_trainer.plot_hashes(index=-1,split_testing=False)