In [1]:
import os
import scipy.io
import pandas as pd
from collections import Counter

In [2]:

# Root folder of the data set; every sequence is expected in its own sub-folder.
dataset_path = 'Hopkins'

# Keep directories only (an entry named README.txt is excluded explicitly as well).
data_dirs = []
for entry in os.listdir(dataset_path):
    if entry == "README.txt":
        continue
    if os.path.isdir(os.path.join(dataset_path, entry)):
        data_dirs.append(entry)

# For each sequence, count the distinct values found in the first column of the
# `s` array stored in <sequence>/<sequence>_truth.mat.
num_subspace = {
    seq_name: len(Counter(
        scipy.io.loadmat(
            os.path.join(dataset_path, seq_name, f"{seq_name}_truth.mat")
        )['s'][:, 0]
    ))
    for seq_name in data_dirs
}

# One row per sequence: its name and the count computed above.
df_meta = pd.DataFrame(num_subspace.items(), columns=['data', 'num_space'])

In [3]:

def read_log(fname):
    """
    Parse an experiment log into one row of error values per dataset.

    A line is used when it has the form ``<name> - <text>Error: <value>%``
    (the trailing ``%`` is optional). Every other line (progress bars,
    summary lines, blank lines) is skipped.

    Usable lines are paired by position: the 1st, 3rd, 5th, ... are taken as
    the initialisation error and the 2nd, 4th, 6th, ... as the model error.
    The two halves are then inner-joined on the dataset name.

    Parameters
    ----------
    fname : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``data`` (dataset name), ``kss`` (error from the odd-numbered
        usable lines) and ``ppca`` (error from the even-numbered ones).
        Empty, but with the same three columns, when no line is usable.
    """
    records = []
    with open(fname) as f:
        for line in f:
            fields = line.strip().split(" - ")
            try:
                ds_name = fields[0]
                # rstrip only removes a '%' that is actually there; slicing off
                # the last character would eat a digit on lines without one.
                error = float(fields[1].split("Error: ")[1].rstrip("%"))
            except (IndexError, ValueError):
                # Not a result line (missing separator / marker, or the value
                # is not a number). Anything else is a real error and propagates.
                continue
            records.append([ds_name, error])

    # Passing `columns=` up front keeps this working when `records` is empty.
    columns = ["data", "error"]
    init_error = pd.DataFrame(records[0::2], columns=columns)
    init_error["data"] = init_error["data"].astype(str)
    model_error = pd.DataFrame(records[1::2], columns=columns)
    model_error["data"] = model_error["data"].astype(str)

    merged_df = init_error.merge(model_error, on="data", how="inner")
    merged_df.columns = ["data", "kss", "ppca"]

    return merged_df

In [4]:
# Parse both experiment logs into per-dataset error tables
# (first the MPPCA run, then the S-MPPCA run).
mppca, smppca = (
    read_log(log_path)
    for log_path in (
        "logs/mppca_t1_n0.025_paper",
        "logs/smppca_test_t1_n0.025_a2.0_paper",
    )
)

In [5]:
# Join the two runs on the dataset name. The merge suffixes the clashing
# columns with _x (MPPCA run) and _y (S-MPPCA run); keep the initialisation
# error of the MPPCA run only, then attach the per-dataset subspace count.
merged_df = (
    mppca.merge(smppca, on="data", how="inner")
    .rename(columns={"kss_x": "kss", "ppca_x": "mppca", "ppca_y": "smppca"})
    .drop(columns=["kss_y"])
    .merge(df_meta, on="data", how="inner")
)

In [6]:
# Show the column labels of the merged comparison table.
merged_df.columns

Index(['data', 'kss', 'mppca', 'smppca', 'num_space'], dtype='object')

In [8]:
# Number of datasets on which the MPPCA and S-MPPCA errors differ.
int((merged_df["mppca"] != merged_df["smppca"]).sum())

62

In [9]:
# Remove the limit on how many columns pandas prints, so wide tables are shown in full.
pd.set_option('display.max_columns', None)  

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the error
# columns, computed separately for each value of `num_space`.
merged_df.groupby("num_space").describe()

Unnamed: 0_level_0,kss,kss,kss,kss,kss,kss,kss,kss,mppca,mppca,mppca,mppca,mppca,mppca,mppca,mppca,smppca,smppca,smppca,smppca,smppca,smppca,smppca,smppca
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
num_space,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
2,120.0,16.28325,16.663949,0.0,0.0,11.015,30.3475,49.54,120.0,13.251,14.809892,0.0,0.0,8.945,21.89,49.3,120.0,13.61725,15.026648,0.0,0.0,9.475,22.29,49.8
3,35.0,28.798571,17.241542,0.0,12.41,28.85,45.18,55.04,35.0,25.416,16.742252,0.0,11.105,20.61,38.775,54.91,35.0,26.148,16.627731,0.0,11.24,22.77,39.94,53.4
5,1.0,48.19,,48.19,48.19,48.19,48.19,48.19,1.0,47.15,,47.15,47.15,47.15,47.15,47.15,1.0,47.15,,47.15,47.15,47.15,47.15,47.15


In [16]:
# Rows on which the MPPCA and S-MPPCA errors differ.
merged_df.loc[merged_df["mppca"].ne(merged_df["smppca"])]

Unnamed: 0,data,kss,mppca,smppca,num_space
2,cars6,27.54,10.48,10.78,2
3,2R3RTC,51.53,49.30,49.86,3
11,1R2RCT_B,25.23,21.85,22.77,3
12,cars3_g23,30.43,5.98,5.71,2
21,2R3RTC_g12,37.84,37.30,37.84,2
...,...,...,...,...,...
148,1RT2RTCRT_B_g12,10.74,9.09,9.92,2
149,1R2RCT_B_g13,14.04,11.06,11.49,2
150,truck2,35.15,15.90,19.67,2
151,2T3RCR,55.04,49.87,49.35,3


In [None]:
import matplotlib.pyplot as plt
# This import was commented out although sns.barplot is called below,
# which made the cell fail with a NameError on a fresh kernel.
import seaborn as sns

# Keep only the datasets on which the two PPCA variants disagree.
merged_df_2 = merged_df[merged_df.mppca != merged_df.smppca]

# Long format: one row per (dataset, method) pair, error stored in `value`.
df_melted = merged_df_2.melt(id_vars=['data', 'num_space'], value_vars=['kss', 'mppca', 'smppca'],
                    var_name='method', value_name='value')

# Leave the 5-subspace group out of the figure.
df_melted = df_melted[df_melted['num_space'] != 5]

# Grouped bar plot: one group per subspace count, one bar per method.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='num_space', y='value', hue='method', data=df_melted, ax=ax)

# Add titles and labels
ax.set_title('Comparison of KSS, MPPCA, and S-MPPCA by Subspace')
ax.set_xlabel('Subspace')
ax.set_ylabel('Values')
ax.legend(title='Method')

# Show the plot
fig.tight_layout()
plt.show()

# Train check

In [1]:
from __future__ import print_function
import numpy as np
from scipy.linalg import inv, pinv
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix
import itertools  # for permutations
from kss import mppca_init_KSS
from data import load_hopkins155
from tqdm import tqdm
np.random.seed(42)


In [2]:


def test_assignments(F_hat, mu_hat, Y_test, prop_hat, var_hat, epsilon=1e-6):
    """
    Compute the predicted assignments for test data using the trained model.
    """
    d, k = F_hat[0].shape
    n = Y_test.shape[1]
    J = len(F_hat)

    M_inv = []
    C_inv = []

    for j in range(J):
        M_j = var_hat[j] * np.eye(k) + F_hat[j].T @ F_hat[j]
        if not np.all(np.isfinite(M_j)):
            print(f"M_j contains NaNs or Infs at component {j+1}, adding regularization.")
            M_j += epsilon * np.eye(k)
        try:
            M_inv_j = inv(M_j)
        except np.linalg.LinAlgError:
            M_inv_j = pinv(M_j)
        M_inv.append(M_inv_j)

        C_inv_j = (1.0 / var_hat[j]) * (np.eye(d) - F_hat[j] @ M_inv_j @ F_hat[j].T)
        if not np.all(np.isfinite(C_inv_j)):
            print(f"C_inv_j contains NaNs or Infs at component {j+1}, adding regularization.")
            C_inv_j += epsilon * np.eye(d)
        C_inv.append(C_inv_j)

    log_likelihood = []
    for j in range(J):
        # Avoid zero or negative determinants
        sign, logdet = np.linalg.slogdet(C_inv[j])
        if sign <= 0 or not np.isfinite(logdet):
            print(f"log determinant not positive definite at component {j+1}, using epsilon.")
            logdet = np.log(epsilon)
        tmp = -0.5 * np.sum((Y_test - mu_hat[:, [j]]) * (C_inv[j] @ (Y_test - mu_hat[:, [j]])), axis=0)
        prop_j = max(prop_hat[j], epsilon)
        log_l = np.log(prop_j) + (-d / 2) * np.log(2 * np.pi) + 0.5 * logdet + tmp
        log_likelihood.append(log_l)

    log_likelihood = np.vstack(log_likelihood)
    assgn_hat_indices = np.argmax(log_likelihood, axis=0)
    # Map indices back to labels starting from 1
    assgn_hat = np.array([j + 1 for j in assgn_hat_indices])
    return assgn_hat

def get_test_error(Y_test, assgn_test, F0, mu0, prop0, var0, classes):
    """
    Clustering error of a mixture model on labelled data, measured under the
    best one-to-one matching between predicted components and true classes.

    The samples are labelled with ``test_assignments``; the confusion matrix
    between true and predicted labels is then fed to the Hungarian algorithm
    (``linear_sum_assignment``) to find the component-to-class matching with
    the most agreements. The returned value is the fraction of samples that
    still disagree under that matching.

    No loop over component permutations is needed: permuting the components
    only renames the predicted clusters, and the matching step already
    optimises over every such renaming.

    Parameters
    ----------
    Y_test : array
        Samples to evaluate, passed straight to ``test_assignments``.
    assgn_test : iterable
        True label of every sample; each must occur in ``classes``
        (a ``KeyError`` is raised otherwise).
    F0, mu0, prop0, var0
        Model parameters, passed straight to ``test_assignments``.
        The number of components is assumed to equal ``len(classes)``.
    classes : sequence
        The distinct true labels; their order defines the class indices.

    Returns
    -------
    float
        Fraction of misclassified samples under the best matching.
    """
    num_classes = len(classes)

    # True labels -> 0-based class indices.
    label_to_index = {label: idx for idx, label in enumerate(classes)}
    assgn_test_indices = np.array([label_to_index[label] for label in assgn_test])

    # test_assignments numbers the components 1..J regardless of what the
    # values in `classes` are, so shift to 0-based indices directly instead of
    # looking the numbers up in `label_to_index`.
    assgn_pred = test_assignments(F0, mu0, Y_test, prop0, var0, epsilon=1e-6)
    assgn_pred_indices = np.asarray(assgn_pred) - 1

    # Rows: true class, columns: predicted component.
    cm = confusion_matrix(assgn_test_indices, assgn_pred_indices, labels=range(num_classes))

    # Negate so that the cost minimisation maximises the number of agreements.
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {col_ind[i]: row_ind[i] for i in range(len(row_ind))}
    assgn_pred_mapped = np.array([mapping[idx] for idx in assgn_pred_indices])

    return np.mean(assgn_pred_mapped != assgn_test_indices)


In [6]:
# Experiment settings read by the cells below.
params = {
    'func': 'mppca',
    'noise': 0.025,            # forwarded to the data loader as noise_value
    'temperature': 1,          # forwarded to the trainer as T
    'alpha_ent': 2,            # forwarded to the trainer as alpha
    'type_noise': 'default',   # forwarded to the data loader as noise_type
}

# Initialisation scheme, the N argument of the data loader, and the iteration
# budget shared by the initialisation and the training call.
init, N, max_iter = 'KSS', 156, 500

In [5]:

dataset_path = 'Hopkins'  # Update this path accordingly
train_split = 0.8  # Fraction of data to use for training

# Load the dataset (restricted to the sequences named in `data_names`).
# NOTE(review): the targets below list dev_assign_list BEFORE dev_data_list,
# while the training loop zips dev_data_list first. Confirm this matches the
# order in which load_hopkins155 returns its values.
(
    train_data_list,
    train_assign_list,
    dev_assign_list,
    dev_data_list,
    test_data_list,
    test_assign_list,
    metadata_list,
) = load_hopkins155(
    dataset_path,
    train_split=train_split,
    N=N,
    noise_value=params['noise'],
    noise_type=params['type_noise'],
    data_names=['cars6'],
)

cars6


In [16]:
from mppca import homppca_tipping

# Initialize accumulators for aggregation
total_min_error_init = 0.0  # Accumulator for KSS Initialization errors
total_min_error = 0.0       # Accumulator for PCCA errors
num_datasets = 0            # Counter for the number of datasets processed

# The per-dataset lists are walked in lockstep; zip stops at the shortest one,
# so that length is the number of iterations the progress bar should expect.
dataset_lists = (train_data_list, train_assign_list, dev_data_list, dev_assign_list,
                 test_data_list, test_assign_list, metadata_list)
num_expected = min(len(one_list) for one_list in dataset_lists)

# Iterate over each loaded dataset. tqdm wraps the zip (not the enumerate) and
# is told the total explicitly: a bare generator has no length, so the bar
# could otherwise show neither a percentage nor an ETA.
for idx, (Y_train, assgn_train, Y_dev, assign_dev, _, _, metadata) in enumerate(
        tqdm(zip(*dataset_lists), total=num_expected)):

    # Normalize data with statistics of the training split only, and apply the
    # same shift/scale to the dev split.
    Y_mean = np.mean(Y_train, axis=1, keepdims=True)
    Y_std = np.std(Y_train, axis=1, keepdims=True) + 1e-8  # avoid division by zero
    Y_train = (Y_train - Y_mean) / Y_std
    Y_dev = (Y_dev - Y_mean) / Y_std

    classes = np.unique(assgn_train)  # Labels starting from 1
    J = len(classes)
    k = 4  # metadata['k']  # Assuming k is set to 4; adjust as needed

    # Initialize model
    if init == "KSS":
        F0, _, mu0, prop0, var0 = mppca_init_KSS(Y_train, J, k, max_iter=max_iter)
    else:
        # Random initialisation
        F0 = [np.random.randn(Y_train.shape[0], k) for _ in range(J)]
        mu0 = Y_train[:, np.random.choice(Y_train.shape[1], J, replace=False)]  # Random data points as means
        prop0 = np.random.dirichlet(np.ones(J), size=1).flatten()
        var0 = np.random.uniform(0.1, 1, size=J)

    # Error of the initial parameters on the dev split
    min_error_init = get_test_error(Y_dev, assign_dev, F0, mu0, prop0, var0, classes)

    # Train the model
    F_hat, mu_hat, var_hat, prop_hat = homppca_tipping(Y_train, F0, mu0, prop0, var0, niter=max_iter, epsilon=1e-6,
                                                            T=params['temperature'], alpha=params['alpha_ent'], anneal=False)

    # Error of the trained parameters on the dev split. The trainer returns
    # (var, prop) in that order, while get_test_error expects (prop, var).
    min_error = get_test_error(Y_dev, assign_dev, F_hat, mu_hat, prop_hat, var_hat, classes)

    # Accumulate the errors
    total_min_error_init += min_error_init
    total_min_error += min_error
    num_datasets += 1  # Increment the dataset counter

    # Print per-dataset results
    print(f"{metadata['name']} - {init} Init Classification Error: {min_error_init * 100:.2f}%")
    print(f"{metadata['name']} - PCCA Classification Error: {min_error * 100:.2f}%")

# After processing all datasets, compute and print average errors
if num_datasets > 0:
    average_min_error_init = total_min_error_init / num_datasets
    average_min_error = total_min_error / num_datasets

    print("=== Average Classification Errors Over All Datasets ===")
    print(f"{init} Initialization Average Error: {average_min_error_init * 100:.2f}%")
    print(f"Average Error: {average_min_error * 100:.2f}%")
else:
    print("No datasets were processed.")


1it [00:49, 49.31s/it]

cars6 - KSS Init Classification Error: 40.12%
cars6 - PCCA Classification Error: 9.58%
=== Average Classification Errors Over All Datasets ===
KSS Initialization Average Error: 40.12%
Average Error: 9.58%



