In [1]:
# ---- Imports ----
import time
import random
import numpy as np
import scipy.io as sio
from scipy.optimize import linear_sum_assignment

from sklearn.preprocessing import normalize
import scipy.sparse as ss
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import SpectralClustering, KMeans

from clusteringPerformance import clusteringMetrics
from datapre_canntlink import process_views

import torch


In [2]:

def setup_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for repeatable execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can pick different kernels from run to run, so it is
    # switched off explicitly instead of relying on whatever state an earlier
    # cell (or library) left behind.
    torch.backends.cudnn.benchmark = False

In [3]:
# ---- Data Loading ----
def load_data(path, dataset):
    """Load a multi-view dataset from ``<path><dataset>.mat``.

    The file must contain ``X`` (view ``i`` is read from ``X[0][i]``) and ``Y``
    (labels). Every view is passed through ``sklearn.preprocessing.normalize``
    with its default settings, and labels are shifted so the smallest is 0.

    Args:
        path: Directory prefix. It is concatenated verbatim with the file name,
            so it must end with a path separator (e.g. ``'./data/'``).
        dataset: File name without the ``.mat`` extension.

    Returns:
        tuple: ``(feature_list, labels)`` where ``feature_list`` holds one
        dense 2-D ``numpy.ndarray`` per view and ``labels`` is a flat array.
    """
    data = sio.loadmat(path + dataset + '.mat')
    views = data['X']

    labels = data['Y'].flatten()
    labels = labels - labels.min()

    feature_list = []
    for i in range(views.shape[1]):
        feature = normalize(views[0][i])
        if ss.issparse(feature):
            # toarray() yields a plain ndarray. todense() would return an
            # np.matrix, which recent scikit-learn estimators refuse and whose
            # operator semantics differ from ndarray.
            feature = feature.toarray()
            print("Converted CSR sparse to dense for view", i)
        feature_list.append(feature)
    return feature_list, labels

In [4]:
# ---- Graph Construction & Evidence Fusion ----
def construct_knn_adjacency(features, k=5):
    """Build a symmetric binary KNN adjacency matrix for each view.

    Args:
        features: Iterable of 2-D arrays, one per view, with one sample per row.
        k: ``n_neighbors`` handed to ``NearestNeighbors``.

    Returns:
        list: One ``torch.float32`` tensor of shape ``(n_samples, n_samples)``
        per view. Entry ``(i, j)`` is 1.0 when ``j`` is in the neighbour list
        of ``i`` or vice versa, and 0.0 otherwise.

    NOTE(review): the query set is the fitted set, so each sample presumably
    appears in its own neighbour list (setting the diagonal) — confirm this is
    intended.
    """
    adj_matrices = []
    for feature in features:
        n_samples = feature.shape[0]
        nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(feature)
        _, indices = nbrs.kneighbors(feature)

        # Vectorised equivalent of "for i: for j in indices[i]: mark (i, j)".
        rows = np.repeat(np.arange(indices.shape[0]), indices.shape[1])
        cols = indices.ravel()

        adj = np.zeros((n_samples, n_samples), dtype=float)
        adj[rows, cols] = 1.0
        adj[cols, rows] = 1.0  # mirror the edges so the graph is symmetric
        adj_matrices.append(torch.tensor(adj, dtype=torch.float32))
    return adj_matrices


def dempster_combination(bpa1, bpa2):
    """Fuse two BPAs by element-wise product followed by renormalisation.

    The element-wise product is divided by ``1 - sum(product)``. When that sum
    equals exactly 1 the divisor would be zero and ``None`` is returned.

    NOTE(review): the summed product is used here as the "conflict" term; in
    the textbook Dempster rule the conflict is the mass of disagreeing
    hypotheses — confirm this formulation is intended.
    """
    joint = bpa1 * bpa2
    joint_mass = joint.sum()
    if joint_mass == 1:
        return None
    return joint / (1 - joint_mass)


def calculate_threshold_factors(bpas):
    """Return one confidence factor per BPA, derived from its entropy.

    For each tensor ``b`` the factor is ``1 / (1 + H)`` with
    ``H = -sum(b * log(b + 1e-10))``; the small constant keeps ``log`` finite
    at zero entries.

    Args:
        bpas: Iterable of torch tensors.

    Returns:
        list: One 0-dim tensor per input, in the same order.
    """
    def _information(bpa):
        entropy = -torch.sum(bpa * torch.log(bpa + 1e-10))
        return 1.0 / (1.0 + entropy)

    return [_information(bpa) for bpa in bpas]


def dempster_combination_thre(bpas, threshold_factors):
    """Fuse several BPAs, weighting every pair by its views' threshold factors.

    For each ordered pair of distinct views ``(i, j)`` the element-wise product
    ``bpas[i] * bpas[j]`` is scaled by ``threshold_factors[i]`` and
    ``threshold_factors[j]`` and accumulated. The accumulated matrix is then
    divided by its total sum, unless that sum is not positive.

    Args:
        bpas: Sequence of BPAs given as adjacency-matrix tensors of equal shape.
        threshold_factors: Sequence of per-view factors, aligned with ``bpas``.

    Returns:
        Tensor shaped like ``bpas[0]`` (all zeros when fewer than two views
        are supplied).
    """
    fused = torch.zeros_like(bpas[0])
    n_views = len(bpas)

    for i in range(n_views):
        for j in range(n_views):
            if i == j:
                continue  # a view is never combined with itself
            fused += bpas[i] * bpas[j] * threshold_factors[i] * threshold_factors[j]

    total_mass = fused.sum()
    # Skip the division when nothing was accumulated, to avoid dividing by zero.
    if total_mass > 0:
        fused /= total_mass

    return fused


def apply_cannot_link_constraints(adj_matrix, cannot_link_matrix):
    """Zero every adjacency entry whose cannot-link flag equals 1.

    Args:
        adj_matrix: Affinity values.
        cannot_link_matrix: Flags compared against 1 to select entries.

    Returns:
        New array with flagged entries set to 0 and all others copied from
        ``adj_matrix``; the input is not modified.
    """
    forbidden = cannot_link_matrix == 1
    return np.where(forbidden, 0, adj_matrix)

In [5]:
# ---- Clustering Utilities ----
def spectral_cluster(adj_matrix, n_clusters=3, random_state=42):
    """Cluster samples with spectral clustering on a precomputed affinity.

    Args:
        adj_matrix: Square affinity matrix, used as-is (``affinity='precomputed'``).
        n_clusters: Number of clusters to produce.
        random_state: Seed forwarded to ``SpectralClustering``.

    Returns:
        Cluster label per sample, as returned by ``fit_predict``.
    """
    model = SpectralClustering(
        assign_labels='kmeans',
        affinity='precomputed',
        n_clusters=n_clusters,
        random_state=random_state,
    )
    return model.fit_predict(adj_matrix)



def cluster_acc(y_true, y_pred):
    """Clustering accuracy under the best one-to-one label matching.

    Builds the contingency table between predicted and true labels and solves
    the assignment problem (``linear_sum_assignment``) to find the mapping
    that maximises the number of agreeing samples.

    Args:
        y_true: Array-like of non-negative ground-truth labels.
        y_pred: Array-like of non-negative predicted labels, same length.

    Returns:
        Fraction of samples that agree under the optimal mapping.

    Raises:
        AssertionError: If the two inputs differ in size.
    """
    # Both label vectors are used as indices below, so both must be integral;
    # previously only y_true was cast and float-typed predictions crashed.
    y_true = np.asarray(y_true).astype(np.int64).reshape(-1)
    y_pred = np.asarray(y_pred).astype(np.int64).reshape(-1)
    assert y_pred.size == y_true.size

    n_labels = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((n_labels, n_labels), dtype=np.int64)
    # Unbuffered add so repeated (pred, true) pairs are all counted.
    np.add.at(w, (y_pred, y_true), 1)

    # Turn the count-maximisation into the cost-minimisation the solver expects.
    rows, cols = linear_sum_assignment(w.max() - w)
    return w[rows, cols].sum() * 1.0 / y_pred.size


In [6]:
# ---- Linear Feature Filtering with Loss Tracking ----
def initialize_Uv(Xv, c, random_state=42):
    """Initialise the factor ``Uv`` from KMeans centroids.

    KMeans with ``c`` clusters is fitted on ``Xv.T``; the transposed centroid
    matrix is returned, so ``Uv`` has one column per cluster.

    Args:
        Xv: 2-D data matrix of one view.
        c: Number of clusters / columns of ``Uv``.
        random_state: Seed forwarded to ``KMeans``.
    """
    samples = Xv.T
    model = KMeans(n_clusters=c, random_state=random_state)
    model.fit(samples)
    return model.cluster_centers_.T

def initialize_Vv(Xv, Uv):
    """Initialise ``Vv`` as ``pinv(Uv^T Uv) @ (Uv^T Xv)``.

    Args:
        Xv: 2-D data matrix of one view.
        Uv: Factor with the same number of rows as ``Xv``.

    Returns:
        Matrix with ``Uv.shape[1]`` rows and ``Xv.shape[1]`` columns.
    """
    Ut = Uv.T
    gram_pinv = np.linalg.pinv(Ut @ Uv)
    return np.dot(gram_pinv, Ut @ Xv)

def frobenius_loss(Xv_list, Uv_list, Vv_list):
    """Sum of squared Frobenius reconstruction errors over all views.

    Computes ``sum_v ||Xv - Uv @ Vv||_F^2`` over the zipped lists.

    Returns:
        The accumulated loss; ``0.0`` when the lists are empty.
    """
    accumulated = 0.0
    for data, basis, coeff in zip(Xv_list, Uv_list, Vv_list):
        residual = data - np.dot(basis, coeff)
        accumulated = accumulated + np.linalg.norm(residual, 'fro') ** 2
    return accumulated

def update_Uv_gd(Xv, Uv, Vv, learning_rate=0.01):
    """One projected gradient step on ``Uv`` for ``||Xv - Uv @ Vv||_F^2``.

    Args:
        Xv: 2-D data matrix of one view.
        Uv: Current left factor; it is not modified.
        Vv: Current right factor.
        learning_rate: Step size.

    Returns:
        New array holding the updated ``Uv``, with negative entries clipped to 0.
    """
    gradient = -2 * np.dot(Xv - np.dot(Uv, Vv), Vv.T)
    # Out-of-place update: an in-place `Uv -= ...` would leave the caller's
    # array in the intermediate, unclipped state as a hidden side effect.
    return np.maximum(Uv - learning_rate * gradient, 0)

def update_Vv_gd(Xv, Uv, Vv, learning_rate=0.01):
    """One gradient step on ``Vv`` followed by an SVD-based re-orthogonalisation.

    Steps: clip ``Vv`` to be non-negative, take a gradient step on
    ``||Xv - Uv @ Vv||_F^2``, then replace the result by ``U @ Vt`` from its
    thin SVD (the singular values are discarded).

    Args:
        Xv: 2-D data matrix of one view.
        Uv: Current left factor.
        Vv: Current right factor; it is not modified.
        learning_rate: Step size.

    Returns:
        The updated ``Vv`` as a new array.
    """
    projected = np.maximum(Vv, 0)  # fresh array, so the in-place step below is local
    residual = Xv - np.dot(Uv, projected)
    step = learning_rate * (-2 * np.dot(Uv.T, residual))
    projected -= step
    left, _, right_t = np.linalg.svd(projected, full_matrices=False)
    return np.dot(left, right_t)

def optimize_multiview(Xv_list, c, iterations=200, learning_rate=0.0001, tol=1e-6, patience=50, log_every=10):
    """Alternately update ``Uv`` and ``Vv`` for every view, tracking the loss.

    Each view is initialised with ``initialize_Uv`` / ``initialize_Vv`` and then
    refined with ``update_Uv_gd`` / ``update_Vv_gd``. The total Frobenius loss
    is recorded after every iteration.

    Args:
        Xv_list: List of 2-D data matrices, one per view.
        c: Number of clusters (columns of each ``Uv``).
        iterations: Maximum number of iterations.
        learning_rate: Step size for both updates.
        tol: Minimum loss decrease that counts as progress.
        patience: Stop after this many consecutive iterations without
            progress; ``None`` or a non-positive value disables early stopping.
        log_every: Print the loss every this many iterations; a falsy value
            disables logging.

    Returns:
        tuple: ``(Uv_list, Vv_list, losses)`` where ``losses[0]`` is the loss
        at initialisation, followed by one entry per completed iteration.
    """
    Uv_list = [initialize_Uv(Xv, c) for Xv in Xv_list]
    Vv_list = [initialize_Vv(Xv, Uv) for Xv, Uv in zip(Xv_list, Uv_list)]

    prev_loss = frobenius_loss(Xv_list, Uv_list, Vv_list)
    losses = [prev_loss]
    no_improve = 0
    use_patience = patience is not None and patience > 0

    for it in range(iterations):
        for i, Xv in enumerate(Xv_list):
            Uv_list[i] = update_Uv_gd(Xv, Uv_list[i], Vv_list[i], learning_rate)
            Vv_list[i] = update_Vv_gd(Xv, Uv_list[i], Vv_list[i], learning_rate)

        cur_loss = frobenius_loss(Xv_list, Uv_list, Vv_list)
        losses.append(cur_loss)
        if log_every and (it + 1) % log_every == 0:
            print(f"[Iter {it + 1:4d}] Frobenius loss = {cur_loss:.6f}")

        # Only a decrease of at least `tol` counts as progress. A stagnating,
        # increasing or NaN loss must advance the counter; the earlier logic
        # reset it in those cases, so plateaus and divergence never stopped.
        if prev_loss - cur_loss >= tol:
            no_improve = 0
        else:
            no_improve += 1
        prev_loss = cur_loss

        if use_patience and no_improve >= patience:
            print(f"Early stopping at iter {it+1} (no_improve={no_improve}).")
            break

    return Uv_list, Vv_list, losses

In [7]:
# ---- Main Pipeline  ----
def run_pcmvc_pipeline(path, dataset_name, max_iter, tol, learn, k, p, seed=42, log_every=10):
    """Run the full clustering pipeline on one dataset and report the metrics.

    Stages: seed the RNGs, load the views, factorise each view
    (``optimize_multiview``), build KNN graphs on the learned representations,
    fuse them (``dempster_combination_thre``), remove cannot-link edges, run
    spectral clustering and evaluate.

    Args:
        path: Directory prefix passed to ``load_data``.
        dataset_name: Dataset file name without extension.
        max_iter: Maximum number of factorisation iterations.
        tol: Progress tolerance for early stopping.
        learn: Learning rate for the factor updates.
        k: Neighbour count for ``construct_knn_adjacency``.
        p: Forwarded unchanged to ``process_views``, which defines its meaning.
        seed: Seed for the RNGs and for spectral clustering.
        log_every: Loss logging interval.

    Returns:
        dict: Keys ``dataset``, ``ACC``, ``NMI``, ``Purity``, ``ARI``,
        ``Fscore``, ``Precision`` and ``Recall``.
    """
    seed = int(seed)
    setup_seed(seed)

    features, labels = load_data(path, dataset_name)
    n_class = len(np.unique(labels))

    # Transpose every view before factorisation. load_data normalises each
    # view along rows, so the result is presumably (d_v, n) — verify against
    # the dataset layout.
    new_features = [feature.T for feature in features]
    num_views = len(new_features)

    # Only the V factors are used downstream.
    _, V_list, _ = optimize_multiview(
        new_features,
        n_class,
        iterations=max_iter,
        learning_rate=learn,
        tol=tol,
        patience=50,
        log_every=log_every,
    )

    # Transpose back so that rows are the items fed to the KNN search.
    for v in range(num_views):
        V_list[v] = V_list[v].T
    new_adj_matrices = construct_knn_adjacency(V_list, k)
    threshold_factors = calculate_threshold_factors(new_adj_matrices)
    new_combined_adj = dempster_combination_thre(new_adj_matrices, threshold_factors)

    cannot_link_matrix = process_views(V_list, p)
    # Convert the fused torch tensor to NumPy explicitly instead of relying on
    # np.where doing it implicitly.
    updated_adj_matrix = apply_cannot_link_constraints(np.asarray(new_combined_adj), cannot_link_matrix)

    # Forward the seed so the whole run is governed by a single value.
    pred_labels = spectral_cluster(updated_adj_matrix, n_class, random_state=seed)
    # print("True labels   (first 20):", labels[:20])
    print("Pred. labels  (first 20):", pred_labels[:20])
    acc = cluster_acc(labels, pred_labels)
    NMI, Purity, ARI, Fscore, Precision, Recall = clusteringMetrics(labels, pred_labels)

    print(f"""\nDataset: {dataset_name}
    ACC: {acc*100:.2f} | NMI: {NMI*100:.2f} | ARI: {ARI*100:.2f} | F-score: {Fscore*100:.2f}
    """.strip())

    return {
        "dataset": dataset_name,
        "ACC": acc, "NMI": NMI, "Purity": Purity, "ARI": ARI, "Fscore": Fscore, "Precision": Precision, "Recall": Recall,
    }

In [8]:
# ---- Dataset & Hyperparameter Config ----
# Numeric dataset id -> base name of the .mat file located under DATA_ROOT.
dataset_dict = {
    1: 'BBCSports',
    2: 'esp_game',
    3: 'NottingHill',
}

# Per-dataset hyperparameters, keyed by the ids used in dataset_dict:
#   k             -> neighbour count for the KNN graphs
#   p             -> forwarded to process_views (meaning defined there)
#   learning_rate -> step size of the factor updates
#   iterations    -> maximum number of factorisation iterations
dataset_params = {
    1: {'k': 108, 'p': 50, 'learning_rate': 0.006068879513679375, 'iterations': 200},
    2: {'k': 1576, 'p': 60, 'learning_rate': 0.02904993490601889, 'iterations': 200},
    3: {'k': 932, 'p': 50, 'learning_rate': 0.07897219752341908, 'iterations': 200},
}

DATA_ROOT = './data/'  # joined to the file name by plain concatenation: keep the trailing slash
SELECTED = [1, 2]      # dataset ids to run
TOL = 1e-6             # early-stopping tolerance on the loss decrease
SEED = 42
LOG_EVERY = 10         # loss logging interval, in iterations

In [9]:
# ---- Run & Log Results ----
save_path = 'results.txt'
results = []

# Run the pipeline once per selected dataset id and collect the metric dicts.
for idx in SELECTED:
    name, params = dataset_dict[idx], dataset_params[idx]
    out = run_pcmvc_pipeline(
        path=DATA_ROOT,
        dataset_name=name,
        max_iter=params['iterations'],
        tol=TOL,
        learn=params['learning_rate'],
        k=params['k'],
        p=params['p'],
        seed=SEED,
        log_every=LOG_EVERY,
    )
    results.append(out)

# Write the headline metrics as percentages, one record per dataset.
with open(save_path, "w") as f:
    f.writelines(
        f"""dataset: {r['dataset']}
ACC: {r['ACC']*100:.2f}
NMI: {r['NMI']*100:.2f}
ARI: {r['ARI']*100:.2f}
Fscore: {r['Fscore']*100:.2f}
---------------------------------------
"""
        for r in results
    )

print(f"Saved summary to {save_path}")

[Iter   10] Frobenius loss = 1060.153887
[Iter   20] Frobenius loss = 1048.831826
[Iter   30] Frobenius loss = 1039.881772
[Iter   40] Frobenius loss = 1032.815295
[Iter   50] Frobenius loss = 1027.245266
[Iter   60] Frobenius loss = 1022.857978
[Iter   70] Frobenius loss = 1019.402359
[Iter   80] Frobenius loss = 1016.679962
[Iter   90] Frobenius loss = 1014.534491
[Iter  100] Frobenius loss = 1012.842864
[Iter  110] Frobenius loss = 1011.508360
[Iter  120] Frobenius loss = 1010.454809
[Iter  130] Frobenius loss = 1009.622456
[Iter  140] Frobenius loss = 1008.964351
[Iter  150] Frobenius loss = 1008.443417
[Iter  160] Frobenius loss = 1008.030618
[Iter  170] Frobenius loss = 1007.703084
[Iter  180] Frobenius loss = 1007.442726
[Iter  190] Frobenius loss = 1007.235443
[Iter  200] Frobenius loss = 1007.070027
Pred. labels  (first 20): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Dataset: BBCSports
    ACC: 96.14 | NMI: 88.07 | ARI: 90.40 | F-score: 92.60
[Iter   10] Frobenius loss = 29871.