In [2]:
import os
import random
import numpy as np
import torch
def set_seed(seed: int = 42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print('Setting seed to', seed)
    

In [1]:
import numpy as np
import pywt
import scipy
from collections import Counter
import tqdm
from scipy.optimize import linear_sum_assignment

# Helper function to calculate entropy for a batch of signals
def calculate_entropy(batch_values):
    """
    Calculate entropy for a batch of signals.
    Each row in `batch_values` is a separate signal.
    """
    entropies = []
    for list_values in batch_values:
        counter_values = Counter(list_values.flatten()).most_common()
        probabilities = [elem[1] / len(list_values) for elem in counter_values]
        entropy = scipy.stats.entropy(probabilities)
        entropies.append(entropy)
    return np.array(entropies)[:, np.newaxis]

# Helper function to calculate statistics for a batch of signals
def calculate_statistics(batch_values):
    """
    Calculate statistics for a batch of signals.
    Each row in `batch_values` is a separate signal.
    """
    # n5 = np.nanpercentile(batch_values, 5, axis=1)
    # n25 = np.nanpercentile(batch_values, 25, axis=1)
    # n75 = np.nanpercentile(batch_values, 75, axis=1)
    # n95 = np.nanpercentile(batch_values, 95, axis=1)
    medians = np.nanpercentile(batch_values, 50, axis=1)
    means = np.nanmean(batch_values, axis=1)
    stds = np.nanstd(batch_values, axis=1)
    vars = np.nanvar(batch_values, axis=1)
    rms = np.nanmean(np.sqrt(batch_values**2), axis=1)
    return np.column_stack([medians, means, stds, vars, rms])

# Helper function to calculate crossings for a batch of signals
def calculate_crossings(batch_values):
    """
    Calculate zero and mean crossings for a batch of signals.
    `batch_values` is (batch_size x wavdec coefficients).
    """
    # Zero crossings: Check where the sign changes
    zero_crossings = np.sum(np.diff(np.sign(batch_values), axis=1) != 0, axis=1)

    # Mean crossings: Subtract the mean and check sign changes
    mean_values = np.nanmean(batch_values, axis=1, keepdims=True)
    mean_crossings = np.sum(np.diff(np.sign(batch_values - mean_values), axis=1) != 0, axis=1)
    
    return zero_crossings, mean_crossings

# Function to extract features for a batch of signals
def get_features(batch_values):
    """
    Extract features (entropy, crossings, and statistics) for a batch of signals.
    Each row in `batch_values` is a separate signal.
    """
    # entropy value is the same for each signals coefficients
    entropies = calculate_entropy(batch_values)
    # print(entropies.shape)
    zero_crossings, mean_crossings = calculate_crossings(batch_values)
    # print(zero_crossings.shape, mean_crossings.shape)
    statistics = calculate_statistics(batch_values)
    # print(statistics.shape)
    # Combine all features in a batch-wise manner
    return np.column_stack([entropies, zero_crossings, mean_crossings, statistics])

# Feature Extraction Function (for batched signals)
def batched_feature_extraction(batch_signals, wavelet='sym5'):
    """
    Extract wavelet features for a batch of signals.
    Each row in `batch_signals` is a separate signal. (n_samples, window_length)
    Out: feature matrix (n_samples, n_features)
    """
    features = []
    # Perform wavelet decomposition on batch and get dict,
    # with 'a' (approximation, low-freq) and 'd' (detail, high-freq) coefficients for each level
    coeffs_batch = pywt.wavedecn(batch_signals, wavelet, mode='symmetric', level=4, axes=[1])
    # NOTE: possibly padd the signals symmetricly for level 5
    # print(len(coeffs_batch), coeffs_batch[0].shape, coeffs_batch[1]['d'].shape)
    # batch_size 32 x 24 components for level 4, 40 for level 5
    # I want 32 x 24 x 8
  
    for item in coeffs_batch:
        if isinstance(item, dict):
            signal_coeffs = list(item.values())[0]
        else:
            signal_coeffs = item
            
        # print('signal_coeffs', signal_coeffs.shape)
        features.append(get_features(signal_coeffs))
    stacked_features = np.concatenate(features, axis=1)
    # print(stacked_features.shape)
    return stacked_features

# Prepare annotated data for batched signals during validation and test
def prepare_ann_data(loader):
    """
    Prepare features and labels for a batched signal loader.
    Input `loader` should provide batches as 2D arrays (num_signals x window_length_samples).
    """
    all_features = []
    all_labels = []

    for i, (signals, batch_labels) in enumerate(tqdm.tqdm(loader)):
        batch_features = batched_feature_extraction(signals)
        all_features.append(batch_features)
        all_labels.append(batch_labels.numpy())
        # Debug: break validation after x batches
        if i == 100:
            pass
    feature_stack = np.vstack(all_features)
    label_array = np.hstack(all_labels)
    
    # print(feature_stack.shape)
    # print(label_array.shape)
    return feature_stack, label_array


# Supervised KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import DataLoader
import numpy as np
import joblib
from datetime import datetime
import os

def train_knn_classifier(
    train_dataset,
    val_dataset,
    test_dataset,
    model_filename=None,
    batch_size=32,
    n_neighbors=5,
    load_model=False
):
    """
    Train or load a KNN classifier on labeled data.

    Parameters:
        train_dataset (Dataset): Annotated training dataset.
        val_dataset (Dataset): Annotated validation dataset.
        test_dataset (Dataset): Annotated test dataset.
        model_filename (str): Path to save or load the model.
        batch_size (int): Batch size for DataLoader.
        n_neighbors (int): Number of neighbors for KNN.
        load_model (bool): Whether to load a saved model.

    Returns:
        np.ndarray: Predicted labels for the validation dataset.
    """
    if load_model and model_filename and os.path.exists(model_filename):
        print(f"Loading model from {model_filename}")
        knn = joblib.load(model_filename)
    else:
        print("Extracting training features...")
        X_train, Y_train = prepare_ann_data(DataLoader(train_dataset, batch_size=batch_size, shuffle=True))

        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(X_train, Y_train)

        if not model_filename:
            current_time = datetime.now().strftime("%d_%m_%Y_%H_%M")
            model_filename = f"knn_model_k{n_neighbors}_{current_time}.pkl"
        joblib.dump(knn, model_filename)
        print(f"Model saved to {model_filename}")

    # Validation phase
    print("Validation...")
    X_val, Y_val = prepare_ann_data(DataLoader(val_dataset, batch_size=batch_size, shuffle=False))
    val_predictions = knn.predict(X_val)

    print("Confusion Matrix:\n", confusion_matrix(Y_val, val_predictions))
    print("Classification Report:\n", classification_report(Y_val, val_predictions, digits=4))

    # Test phase
    print("Testing...")
    X_test, Y_test = prepare_ann_data(DataLoader(test_dataset, batch_size=batch_size, shuffle=False))
    test_predictions = knn.predict(X_test)

    print("Confusion Matrix:\n", confusion_matrix(Y_test, test_predictions))
    print("Classification Report:\n", classification_report(Y_test, test_predictions, digits=4))

    return val_predictions, Y_val, test_predictions, Y_test, knn

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import DataLoader
import numpy as np
import joblib
from datetime import datetime
import os

def train_knn_classifier(
    train_dataset,
    val_dataset,
    test_dataset,
    model_filename=None,
    batch_size=32,
    n_neighbors=5,
    load_model=False
):
    """
    Train or load a KNN classifier on labeled data.

    Parameters:
        train_dataset (Dataset): Annotated training dataset.
        val_dataset (Dataset): Annotated validation dataset.
        test_dataset (Dataset): Annotated test dataset.
        model_filename (str): Path to save or load the model.
        batch_size (int): Batch size for DataLoader.
        n_neighbors (int): Number of neighbors for KNN.
        load_model (bool): Whether to load a saved model.

    Returns:
        np.ndarray: Predicted labels for the validation dataset.
    """
    if load_model and model_filename and os.path.exists(model_filename):
        print(f"Loading model from {model_filename}")
        knn = joblib.load(model_filename)
    else:
        print("Extracting training features...")
        global X_train, Y_train
        if 'X_train' in globals() and 'Y_train' in globals():
            print("Reusing cached training data")
        else:
            X_train, Y_train = prepare_ann_data(DataLoader(train_dataset, batch_size=batch_size, shuffle=True))

        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(X_train, Y_train)

        if not model_filename:
            current_time = datetime.now().strftime("%d_%m_%Y_%H_%M")
            model_filename = f"knn_model_k{n_neighbors}_{current_time}.pkl"
        joblib.dump(knn, model_filename)
        print(f"Model saved to {model_filename}")

    # Validation phase
    print("Validation...")
    global X_val, Y_val
    if 'X_val' in globals() and 'Y_val' in globals():
        print("Reusing cached validation data")
    else:
        X_val, Y_val = prepare_ann_data(DataLoader(val_dataset, batch_size=batch_size, shuffle=False))
    val_predictions = knn.predict(X_val)

    print("Confusion Matrix:\n", confusion_matrix(Y_val, val_predictions))
    print("Classification Report:\n", classification_report(Y_val, val_predictions, digits=4))

    # Test phase
    print("Testing...")
    global X_test, Y_test
    if 'X_test' in globals() and 'Y_test' in globals():
        print("Reusing cached test data")
    else:
        X_test, Y_test = prepare_ann_data(DataLoader(test_dataset, batch_size=batch_size, shuffle=False))
    test_predictions = knn.predict(X_test)


    print("Confusion Matrix:\n", confusion_matrix(Y_test, test_predictions))
    print("Classification Report:\n", classification_report(Y_test, test_predictions, digits=4))

    return val_predictions, Y_val, test_predictions, Y_test, knn

In [10]:
from butqdb_dataloaders import AnnotatedDataset, TrainDataset
import yaml

model_filename = "finished_models/baselines/knn_model.pkl"

config_file = "bolts_config.yaml"
with open(config_file, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

data_path = config["dataset"]["data_path"]
train_records = config["dataset"]["train_records"]
validation_records = config["dataset"]["val_records"]
test_records = config["dataset"]["test_records"]
sampling_frequency = config["dataset"]["signal_fs"]
window_size = 2.5
stride = None

window_size = int(window_size * sampling_frequency)

train_dataset = AnnotatedDataset(data_path, train_records, window_size, sampling_frequency, stride=stride)
train_dataset_nolabels= TrainDataset(data_path, train_records, window_size, sampling_frequency, stride=stride)
val_dataset = AnnotatedDataset(data_path, validation_records, window_size, sampling_frequency, stride=stride)
test_dataset = AnnotatedDataset(data_path, test_records, window_size, sampling_frequency, stride=stride)

print("Train dataset size:", len(train_dataset))
print("Train dataset size (no labels):", len(train_dataset_nolabels))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))

Train dataset size: 60434
Train dataset size (no labels): 269346
Validation dataset size: 39760
Test dataset size: 39380


In [5]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=5,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...


100%|██████████| 119/119 [02:04<00:00,  1.05s/it]


Model saved to finished_models/baselines/knn_model.pkl
Validation...


100%|██████████| 78/78 [01:20<00:00,  1.03s/it]


Confusion Matrix:
 [[15231  3423     7]
 [11650  7940    16]
 [  551   936     6]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5552    0.8162    0.6609     18661
           2     0.6456    0.4050    0.4977     19606
           3     0.2069    0.0040    0.0079      1493

    accuracy                         0.5829     39760
   macro avg     0.4692    0.4084    0.3888     39760
weighted avg     0.5867    0.5829    0.5559     39760

Testing...


100%|██████████| 77/77 [01:19<00:00,  1.03s/it]


Confusion Matrix:
 [[22354  3646     0]
 [ 9038  3212     0]
 [   35   302   793]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7113    0.8598    0.7785     26000
           2     0.4486    0.2622    0.3310     12250
           3     1.0000    0.7018    0.8248      1130

    accuracy                         0.6693     39380
   macro avg     0.7200    0.6079    0.6447     39380
weighted avg     0.6379    0.6693    0.6406     39380



In [6]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=4,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...


100%|██████████| 119/119 [02:02<00:00,  1.03s/it]


Model saved to finished_models/baselines/knn_model.pkl
Validation...


100%|██████████| 78/78 [01:19<00:00,  1.01s/it]


Confusion Matrix:
 [[16182  2472     7]
 [12986  6605    15]
 [  665   819     9]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5424    0.8672    0.6674     18661
           2     0.6674    0.3369    0.4478     19606
           3     0.2903    0.0060    0.0118      1493

    accuracy                         0.5733     39760
   macro avg     0.5001    0.4034    0.3757     39760
weighted avg     0.5946    0.5733    0.5345     39760

Testing...


100%|██████████| 77/77 [01:18<00:00,  1.02s/it]


Confusion Matrix:
 [[23803  2197     0]
 [10151  2099     0]
 [   53   284   793]]
Classification Report:
               precision    recall  f1-score   support

           1     0.6999    0.9155    0.7933     26000
           2     0.4583    0.1713    0.2494     12250
           3     1.0000    0.7018    0.8248      1130

    accuracy                         0.6779     39380
   macro avg     0.7194    0.5962    0.6225     39380
weighted avg     0.6334    0.6779    0.6250     39380



In [7]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=3,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...


100%|██████████| 119/119 [02:00<00:00,  1.01s/it]


Model saved to finished_models/baselines/knn_model.pkl
Validation...


100%|██████████| 78/78 [01:18<00:00,  1.00s/it]


Confusion Matrix:
 [[14922  3730     9]
 [11411  8178    17]
 [  566   917    10]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5547    0.7996    0.6550     18661
           2     0.6377    0.4171    0.5043     19606
           3     0.2778    0.0067    0.0131      1493

    accuracy                         0.5812     39760
   macro avg     0.4901    0.4078    0.3908     39760
weighted avg     0.5852    0.5812    0.5566     39760

Testing...


100%|██████████| 77/77 [01:17<00:00,  1.00s/it]


Confusion Matrix:
 [[21721  4279     0]
 [ 8697  3553     0]
 [   42   295   793]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7131    0.8354    0.7694     26000
           2     0.4372    0.2900    0.3487     12250
           3     1.0000    0.7018    0.8248      1130

    accuracy                         0.6619     39380
   macro avg     0.7168    0.6091    0.6476     39380
weighted avg     0.6355    0.6619    0.6401     39380



In [11]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=6,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...


100%|██████████| 119/119 [02:02<00:00,  1.03s/it]


Model saved to finished_models/baselines/knn_model.pkl
Validation...


100%|██████████| 78/78 [01:21<00:00,  1.04s/it]


Confusion Matrix:
 [[16096  2557     8]
 [12642  6946    18]
 [  627   860     6]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5481    0.8625    0.6703     18661
           2     0.6703    0.3543    0.4635     19606
           3     0.1875    0.0040    0.0079      1493

    accuracy                         0.5797     39760
   macro avg     0.4686    0.4069    0.3806     39760
weighted avg     0.5948    0.5797    0.5435     39760

Testing...


100%|██████████| 77/77 [01:20<00:00,  1.04s/it]


Confusion Matrix:
 [[23764  2236     0]
 [10083  2167     0]
 [   46   291   793]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7011    0.9140    0.7935     26000
           2     0.4617    0.1769    0.2558     12250
           3     1.0000    0.7018    0.8248      1130

    accuracy                         0.6786     39380
   macro avg     0.7209    0.5976    0.6247     39380
weighted avg     0.6352    0.6786    0.6272     39380



In [12]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=7,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...
Reusing cached training data
Model saved to finished_models/baselines/knn_model.pkl
Validation...
Reusing cached validation data
Confusion Matrix:
 [[15428  3225     8]
 [11801  7790    15]
 [  569   922     2]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5550    0.8268    0.6642     18661
           2     0.6526    0.3973    0.4939     19606
           3     0.0800    0.0013    0.0026      1493

    accuracy                         0.5840     39760
   macro avg     0.4292    0.4085    0.3869     39760
weighted avg     0.5853    0.5840    0.5554     39760

Testing...
Reusing cached test data
Confusion Matrix:
 [[22711  3289     0]
 [ 9216  3034     0]
 [   38   301   791]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7105    0.8735    0.7836     26000
           2     0.4580    0.2477    0.3215     12250
           3     1.0000    0.7000    0.8235  

In [13]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=8,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...
Reusing cached training data
Model saved to finished_models/baselines/knn_model.pkl
Validation...
Reusing cached validation data
Confusion Matrix:
 [[16117  2536     8]
 [12566  7023    17]
 [  614   873     6]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5501    0.8637    0.6721     18661
           2     0.6732    0.3582    0.4676     19606
           3     0.1935    0.0040    0.0079      1493

    accuracy                         0.5821     39760
   macro avg     0.4723    0.4086    0.3825     39760
weighted avg     0.5974    0.5821    0.5463     39760

Testing...
Reusing cached test data
Confusion Matrix:
 [[23852  2148     0]
 [10043  2207     0]
 [   41   296   793]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7029    0.9174    0.7959     26000
           2     0.4745    0.1802    0.2612     12250
           3     1.0000    0.7018    0.8248  

In [14]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=9,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...
Reusing cached training data
Model saved to finished_models/baselines/knn_model.pkl
Validation...
Reusing cached validation data
Confusion Matrix:
 [[15607  3046     8]
 [11927  7664    15]
 [  560   931     2]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5555    0.8363    0.6676     18661
           2     0.6584    0.3909    0.4905     19606
           3     0.0800    0.0013    0.0026      1493

    accuracy                         0.5853     39760
   macro avg     0.4313    0.4095    0.3869     39760
weighted avg     0.5884    0.5853    0.5553     39760

Testing...
Reusing cached test data
Confusion Matrix:
 [[23040  2960     0]
 [ 9363  2887     0]
 [   35   304   791]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7103    0.8862    0.7885     26000
           2     0.4694    0.2357    0.3138     12250
           3     1.0000    0.7000    0.8235  

In [15]:
val_predictions, Y_val, test_predictions, Y_test, knn = train_knn_classifier(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    n_neighbors=10,
    load_model=False,
    model_filename=model_filename
)


Extracting training features...
Reusing cached training data
Model saved to finished_models/baselines/knn_model.pkl
Validation...
Reusing cached validation data
Confusion Matrix:
 [[16136  2516     9]
 [12512  7078    16]
 [  602   889     2]]
Classification Report:
               precision    recall  f1-score   support

           1     0.5517    0.8647    0.6736     18661
           2     0.6752    0.3610    0.4705     19606
           3     0.0741    0.0013    0.0026      1493

    accuracy                         0.5839     39760
   macro avg     0.4336    0.4090    0.3822     39760
weighted avg     0.5946    0.5839    0.5482     39760

Testing...
Reusing cached test data
Confusion Matrix:
 [[23907  2093     0]
 [10057  2193     0]
 [   41   297   792]]
Classification Report:
               precision    recall  f1-score   support

           1     0.7030    0.9195    0.7968     26000
           2     0.4785    0.1790    0.2606     12250
           3     1.0000    0.7009    0.8241  

# KMEANS

In [53]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
def evaluate_performance(Y_true, Y_pred):
    """
    Evaluates the performance of the model by calculating accuracy, confusion matrix, and classification report.

    Parameters:
        Y_true (np.ndarray): True labels.
        Y_pred (np.ndarray): Predicted labels.

    Returns:
        dict: A dictionary containing evaluation metrics (accuracy, confusion matrix, class-wise accuracy, and classification report).
    """
    # Calculate balanced accuracy score
    total_accuracy = balanced_accuracy_score(Y_true, Y_pred)
    # Generate confusion matrix
    conf_matrix = confusion_matrix(Y_true, Y_pred)
    # Calculate per-class accuracy
    class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)
    # Get the classification report
    class_report = classification_report(Y_true, Y_pred, digits=4)
    # Return all metrics in a dictionary
    return {
        'total_accuracy': total_accuracy,
        'confusion_matrix': conf_matrix,
        'class_accuracy': class_accuracy,
        'classification_report': class_report
    }
    
def mapping_JV_prop(clusters, true_labels):
    """
    Map clusters to true labels using the modified Jonker-Volgenant algorithm, 
    considering the proportional presence of each class in each cluster.
    This approach dynamically handles the many-to-one relationship 
    between clusters and true labels.

    Args:
    - clusters: Array of cluster assignments for each sample.
    - true_labels: Array of true labels for each sample.

    Returns:
    - A dictionary where keys are clusters, and values are true labels (each cluster maps to one class).
    """
    
    # Get unique clusters and labels
    all_clusters = np.unique(clusters)
    all_labels = np.unique(true_labels)
    
    # Compute global counts for each label (assuming labels start at 1)
    global_counts = dict(enumerate(np.bincount(true_labels)[1:], start=1))
    
    k = len(all_clusters)
    c = len(all_labels)
    max_clusters_per_class = k - c + 1
    # print(max_clusters_per_class)
    
    # Build the initial cost matrix: rows for clusters, columns for labels
    cost_matrix = np.zeros((len(all_clusters), len(all_labels)*max_clusters_per_class), dtype=float)
    
    for i, cluster in enumerate(all_clusters):
        # Get indices for samples in this cluster
        indices = np.where(clusters == cluster)[0]
        
        for j, label in enumerate(all_labels):
            # Count the occurrences of this label in the cluster
            count = np.sum(true_labels[indices] == label)
            
            # Calculate proportion of this label in the cluster
            proportion = count / global_counts[label] if global_counts[label] else 0
            cost_matrix[i, j] = proportion
            # duplicate the columns to allow for multiple clusters to 1 class
            for dupe_idx in range(1, max_clusters_per_class):
                # print(j, dupe_idx, j + len(all_labels)*dupe_idx)
                cost_matrix[i, j + (len(all_labels)*dupe_idx)] = proportion  # Duplicate each label
    
    # print(cost_matrix.shape)
    # Apply the modified Jonker-Volgenant algorithm (linear sum assignment)
    row_ind, col_ind = linear_sum_assignment(cost_matrix, maximize=True)
    
    # print(row_ind, col_ind)
    
    # Initialize a dictionary for mapping clusters to labels
    # cluster_to_label_mapping = {cluster: [] for cluster in all_clusters}
    cluster_to_label_mapping = {}
    
    # Perform the dynamic assignment
    for r, c in zip(row_ind, col_ind):
        cluster = all_clusters[r]
        label = all_labels[c % len(all_labels)]
        cluster_to_label_mapping[cluster] = label
    
    # Now `cluster_to_label_mapping` contains the many-to-one mapping of clusters to classes
    return cluster_to_label_mapping


In [None]:
# new kmeans train with k selection
from sklearn.metrics import silhouette_score
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.cluster import MiniBatchKMeans
import os

def plot_scores(scores):
    """
    Plot the provided scores (either WSS or Silhouette Scores) to help determine the optimal number of clusters.
    
    Args:
        scores (list or array): The list or array of score values to plot (WSS or Silhouette).
    """
    # Plot the scores
    plt.figure(figsize=(8, 6))
    plt.plot(range(3, 10+1), scores, marker='o')
    plt.title('Score vs. Number of Clusters')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Score')
    plt.show()

    # Find the optimal k based on the highest score
    # optimal_k = np.argmax(scores) + 2  # +2 because range starts at 2
    # print(f"Optimal k based on Score: {optimal_k}")

def select_best_k(train_dataset, val_dataset, batch_size, min_k, max_k, random_state=42):
    """
    Determines the optimal number of clusters using the silhouette score.

    Parameters:
        train_dataset (Dataset): Training dataset (no annotations needed).
        val_dataset (Dataset): Validation dataset (with annotations).
        batch_size (int): Batch size for training.
        min_k (int): Minimum number of clusters to test.
        max_k (int): Maximum number of clusters to test.
        random_state (int): Random seed.

    Returns:
        int: The best number of clusters based on silhouette score.
    """

    # Prepare validation data
    X_val, _ = prepare_ann_data(DataLoader(val_dataset, batch_size=batch_size, shuffle=False))
    
    best_k = min_k
    best_score = -1
    
    wss = []
    silhouette_scores = []


    for k in range(min_k, max_k + 1):
        print(f"Evaluating KMeans with k={k} clusters...")
        
        # Initialize MiniBatchKMeans
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=random_state)
        
        # Train the model on the training dataset
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        for i, (signals, batch_labels) in enumerate(tqdm.tqdm(train_loader)):
            batch_features = batched_feature_extraction(signals)
            kmeans.partial_fit(batch_features)

        # Predict validation clusters
        val_predictions = kmeans.predict(X_val)

        wss.append(kmeans.inertia_)
        
        # Compute silhouette score (higher is better)
        score = silhouette_score(X_val, val_predictions)
        silhouette_scores.append(score)
        print(f"Silhouette Score for k={k}: {score}")

        # Update best k if this score is higher
        if score > best_score:
            best_score = score
            best_k = k

    print(f"Best k found: {best_k} with silhouette score: {best_score}")
    print('sil ', silhouette_scores)
    print('eblow ', wss)
    return best_k, silhouette_scores, wss


def train_kmeans(
    train_dataset,
    val_dataset,
    test_dataset,
    model_filename=None,
    min_k=None,
    max_k=10,
    batch_size=32,
    random_state=42,
    load_model=False,
    num_clusters=None
):
    """
    Train or load a MiniBatchKMeans model, selecting the best k using silhouette score.

    Parameters:
        train_dataset (Dataset): Training dataset.
        val_dataset (Dataset): Validation dataset.
        test_dataset (Dataset): Test dataset.
        model_filename (str): Path to save or load the model.
        min_k (int): Minimum number of clusters (defaults to num_classes).
        max_k (int): Maximum number of clusters.
        batch_size (int): Batch size for training.
        random_state (int): Random seed.
        load_model (bool): Whether to load the model instead of training.

    Returns:
        np.ndarray: Predicted labels for the validation dataset.
    """
    # Determine number of classes (min_k)
    print('Preparing features...')
    _, Y_val = prepare_ann_data(DataLoader(val_dataset, batch_size=batch_size, shuffle=False))
    num_classes = len(np.unique(Y_val))
    min_k = min_k if min_k else num_classes
    if load_model and os.path.exists(model_filename):
        print(f"Loading model from {model_filename}")
        kmeans = joblib.load(model_filename)
    else:
        if not num_clusters:
            print("Selecting best number of clusters...")
            best_k, sil, wss = select_best_k(train_dataset, val_dataset, batch_size, min_k, max_k, random_state)
            plot_scores(sil)
            plot_scores(wss)
        else:
            best_k = num_clusters
                
        print(f"Training final model with k={best_k} clusters...")
        kmeans = MiniBatchKMeans(n_clusters=best_k, batch_size=batch_size, random_state=random_state)
        
        # train_dataset[np.random.choice(range(len(train_dataset)), 96*100)]
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        for i, (signals, batch_labels) in enumerate(tqdm.tqdm(train_loader)):
            batch_features = batched_feature_extraction(signals)
            kmeans.partial_fit(batch_features)
            
        current_time = datetime.now().strftime("%d_%m_%Y_%H_%M")
        model_filename = f"kmeans_baseline_model_k{best_k}_{current_time}.pkl"
        joblib.dump(kmeans, model_filename)
        print(f"Model saved to {model_filename}")

    # Validation phase
    print('Validation...')
    X_val, Y_val = prepare_ann_data(DataLoader(val_dataset, batch_size=batch_size, shuffle=False))
    val_predictions = kmeans.predict(X_val)
    
    cluster_to_label = mapping_JV_prop(val_predictions, Y_val)
    print(cluster_to_label)
    val_labels_pred = np.array([cluster_to_label[cluster] for cluster in val_predictions])

    val_metrics = evaluate_performance(Y_val, val_labels_pred)
    print("Confusion Matrix:", val_metrics['confusion_matrix'])
    print("Per-class accuracy:", val_metrics['class_accuracy'])
    print(val_metrics['classification_report'])

    # Testing phase
    print('Testing...')
    X_test, Y_test = prepare_ann_data(DataLoader(test_dataset, batch_size=batch_size, shuffle=False))
    test_predictions = kmeans.predict(X_test)

    test_labels_pred = np.array([cluster_to_label[cluster] for cluster in test_predictions])
    test_metrics = evaluate_performance(Y_test, test_labels_pred)
    
    print("Confusion Matrix:", test_metrics['confusion_matrix'])
    print("Per-class accuracy:", test_metrics['class_accuracy'])
    print(test_metrics['classification_report'])

    return val_labels_pred


# model storage name
window_size = 2.5  # in seconds
sampling_frequency = 100  # Hz
stride = None

window_size = window_size * sampling_frequency

# Use TrainDataset for training data (no annotations)
mode = None # not 'random' since kmeans is not optimization method
train_dataset = TrainDataset(data_path, train_records, window_size, sampling_frequency, stride, mode=mode)
# Use AnnotatedDataset for validation data (with annotations)
val_dataset = AnnotatedDataset(data_path, validation_records, window_size, sampling_frequency, stride, balanced_classes=False)

test_dataset = AnnotatedDataset(data_path, test_records, window_size, sampling_frequency, stride, balanced_classes=False)

predictions = train_kmeans(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=512,
    load_model=True,
    model_filename="finished_models/baselines//kmeans_model.pkl",
    num_clusters=3
)


Preparing features...


100%|██████████| 78/78 [01:20<00:00,  1.04s/it]


Training final model with k=3 clusters...


100%|██████████| 527/527 [09:24<00:00,  1.07s/it]


Model saved to kmeans_baseline_model_k3_29_04_2025_14_18.pkl
Validation...


100%|██████████| 78/78 [01:22<00:00,  1.05s/it]


{0: 2, 1: 1, 2: 3}
Confusion Matrix: [[12928  4982   751]
 [ 7123  9237  3246]
 [  175   420   898]]
Per-class accuracy: [0.69278174 0.47113129 0.60147354]
              precision    recall  f1-score   support

           1     0.6392    0.6928    0.6649     18661
           2     0.6310    0.4711    0.5395     19606
           3     0.1835    0.6015    0.2812      1493

    accuracy                         0.5801     39760
   macro avg     0.4845    0.5885    0.4952     39760
weighted avg     0.6180    0.5801    0.5886     39760

Testing...


100%|██████████| 77/77 [01:20<00:00,  1.04s/it]

Confusion Matrix: [[21549  3877   574]
 [ 8199  3289   762]
 [  800   133   197]]
Per-class accuracy: [0.82880769 0.2684898  0.17433628]
              precision    recall  f1-score   support

           1     0.7054    0.8288    0.7621     26000
           2     0.4506    0.2685    0.3365     12250
           3     0.1285    0.1743    0.1480      1130

    accuracy                         0.6357     39380
   macro avg     0.4282    0.4239    0.4155     39380
weighted avg     0.6096    0.6357    0.6121     39380




