### Create Koopman Object

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import torch
from tqdm import tqdm

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
from scipy.stats import wasserstein_distance

from pathlib import Path
import os
from datetime import datetime
import uuid
import json
import shutil

from kcm.koopman_category_model import KoopmanCategoryModel
from kcm.basic_feature_extract import BasicFeatureExtractor
from kcm.discovery import (
    CategoryDiscoveryTrainer,
    train_test_split_indices,
    prep_data_for_discovery,
    check_histograms,
    sup_con_loss,
    BaselineModel,
    HASHHead,
    cluster_acc,
    split_cluster_acc_v1,
    split_cluster_acc_v2,
    create_hash_ids
)

# import warnings
# warnings.filterwarnings('ignore')


# Switch matplotlib to its built-in 'dark_background' style sheet so that
# figures drawn later in the notebook use light-on-dark colours.
plt.style.use('dark_background')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Reproducibility Code
# A single seed drives every random number generator configured in this cell.
seed = 42

# torch keeps its own global RNG, independent of NumPy's generator, so it has
# to be seeded separately. Done before the `rng` line so the Generator object
# returned by manual_seed is not echoed as the cell's output.
torch.manual_seed(seed)

# NumPy generator shared by the rest of the notebook.
rng = np.random.default_rng(seed)

In [7]:
############### Shared Inputs ###############
# Trailing "# a, b" comments are kept from the original cell; they look like
# alternative values used in other runs (NOTE(review): confirm with the author).
num_cats = 4 # 10, 4
num_samples = 100 # 500, 100
system_dimension = 3 # 2
test_size = 0.2
category_discovery = True
train_classes = range(3) # range(3) # range(7)
noisy_data = False
noise_std = 0.01
samples_name = f'noisy_{noise_std}_samples' if noisy_data else 'samples'

# Root folder that contains the "<n>-dimensional-systems" dataset folders.
# Set the KCM_DATA_DIR environment variable to point the notebook at a
# different location; when it is unset the original machine-specific path is
# used, so existing behaviour is unchanged.
default_data_root = r"C:\Users\peterdb1\Documents\Masters in ACM\(i-j) 625.801-802 - ACM Master's Research\Technical Work\koopman-category-discovery\data"
data_root = os.environ.get('KCM_DATA_DIR', default_data_root)

# Full path to the pickled dataset selected by the inputs above (a plain str).
data_path = os.path.join(
    data_root,
    f"{system_dimension}-dimensional-systems",
    f"dataset_{num_cats}_class_{num_samples}_{samples_name}.pkl",
)
use_gpu = False
#############################################


############ kcm-specific inputs ############
delay_embeddings = 5
num_segments = 20 # 30, 8
svd_rank = None
dmd_rank = None
q = 1
num_clusters = 8 # 15, 5
codebook_training_size = 50 # 490 # divides <num training classes>
normalize_kcm_inputs = True
soft_clustering = True
tau = 0.1
#############################################

# Draw the train/test sample identifiers once, using the seeded generator, so
# that every later cell works from the same split.
train_counts, test_counts = train_test_split_indices(num_cats,num_samples,test_size,train_classes,category_discovery,rng)

Category Discovery Split...
    Training Size: 240
    Testing Size: 160


### Prepare Data

In [8]:
# Build the Koopman category model from the shared and kcm-specific inputs
# defined in the configuration cell.
KCM = KoopmanCategoryModel(
    num_cats=num_cats,
    num_samples=num_samples,
    system_dimension=system_dimension,
    delay_embeddings=delay_embeddings,
    num_segments=num_segments,
    svd_rank=svd_rank,
    dmd_rank=dmd_rank,
    q=q,
    data_path=data_path,
    cluster_method='kmeans',
    num_clusters=num_clusters,
    noisy_data=noisy_data,
    noise_std=noise_std,
    normalize_inputs=normalize_kcm_inputs,
    train_classes=train_classes,
    soft_clustering=soft_clustering,
    tau=tau,
    seed=seed,
    use_gpu=use_gpu,
)

# Attach the precomputed split to the model, then build its data frame.
KCM.train_counts = train_counts
KCM.test_counts = test_counts
KCM.generate_data()

# Split the rows of KCM.df by whether their 'count' value belongs to the
# training or the testing identifiers; each part gets a fresh 0..n-1 index.
kcm = KCM.df
kcm_train_data = kcm.loc[kcm['count'].isin(train_counts)].reset_index(drop=True)
kcm_test_data = kcm.loc[kcm['count'].isin(test_counts)].reset_index(drop=True)

KCM.df_train = kcm_train_data
KCM.df_test = kcm_test_data

# Each selected sample must contribute exactly num_segments rows. Compare the
# integer products directly: the previous int(rows / num_segments) form
# truncated, so a row count that was not a multiple of num_segments could
# still pass the check.
assert kcm_train_data.shape[0] == len(train_counts) * num_segments, 'training samples not correct shape'
assert kcm_test_data.shape[0] == len(test_counts) * num_segments, 'testing samples not correct shape'

Loading data in at C:\Users\peterdb1\Documents\Masters in ACM\(i-j) 625.801-802 - ACM Master's Research\Technical Work\koopman-category-discovery\data\3-dimensional-systems\dataset_4_class_100_samples.pkl...
Generating 8000 DMD eigs/modes each with dimensionality 18


lorenz system: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 78.19it/s]
rossler system: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 73.99it/s]
chen system: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 75.59it/s]
halvorsen system: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 70.89it/s]
Formatting DMD data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [19]:
# Show the name of the model's run directory (the `stem` of KCM.run_dir).
run_dir_name = KCM.run_dir.stem
print(run_dir_name)

KCM_20250814_195212_1da3e4b9


In [10]:
# Step 1: build the codebook, with plots enabled, from the configured number
# of training samples.
KCM.create_codebook(
    include_plots=True,
    category_discovery=category_discovery,
    codebook_training_size=codebook_training_size,
)

# Step 2: produce the model's feature outputs.
KCM.create_feature_outputs()

# Step 3: save the model, then shut down its logger. The order matters:
# shutting the logger down is the last call in this cell.
KCM.save()
KCM.shutdown_logger()

Creating df_sample for codebook


(1/2) * 50^2 = 1250 Wasserstein distance metrics:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 1128/1250 [00:03<00:00, 374.50it/s]
8 * 4800 = 38400 Wasserstein distance metrics: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38400/38400 [01:31<00:00, 419.81it/s]
8 * 3200 = 25600 Wasserstein distance metrics: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25600/25600 [01:05<00:00, 392.26it/s]
