
# Correlation-aware k-Means (Signed-Logit Margin → Column z-score → Row-center & L2 → k-Means)

**디렉토리 구조**
```
./raw/src_{i:03d}/probs/tgt_{j:03d}.csv   # columns: p_A, label_num
```
프로세스: 확률→로짓(클립)→서명 마진 → **열 z-score** → **행 중심화+L2** → **k-means(U)**  
유사도/거리: \(S=\frac{UU^\top+I}{2}\), \(D=1-S\).


In [1]:

# 0) Imports & Config
import os, math, random
import numpy as np
import pandas as pd
from typing import Dict, Tuple, List
from collections import Counter

np.set_printoptions(suppress=True, linewidth=140)

M=53; J=53; T=40; S=J*T
DIR_PATH="./raw"
PROB_EPS=1e-6; LOGIT_CLIP=7.0; STD_EPS=1e-12; SEED=0
rng=np.random.default_rng(SEED); random.seed(SEED)


In [2]:

# 1) Strict loader
def load_probs_labels_strict(dir_path: str, M: int, J: int):
    probs={i:{} for i in range(1,M+1)}
    labels={j: None for j in range(1,J+1)}
    any_found=False
    for i in range(1,M+1):
        for j in range(1,J+1):
            f=os.path.join(dir_path, f"src_{i:03d}", "probs", f"tgt_{j:03d}.csv")
            if not os.path.exists(f):
                continue
            df=pd.read_csv(f)
            if "p_A" not in df.columns or "label_num" not in df.columns:
                raise ValueError(f"{f}: columns p_A,label_num required")
            p=df["p_A"].to_numpy(float); q=df["label_num"].to_numpy(int)
            if labels[j] is None: labels[j]=q
            else:
                if labels[j].shape!=q.shape or not np.array_equal(labels[j],q):
                    raise ValueError(f"Label mismatch for client {j}: {f}")
            probs[i][j]=p; any_found=True
    if not any_found:
        raise RuntimeError("No files found under ./raw/...")
    labels={j:v for j,v in labels.items() if v is not None}
    return probs, labels

probs, labels = load_probs_labels_strict(DIR_PATH, M, J)
print("Loaded label clients:", len(labels))


Loaded label clients: 53


In [3]:

# 2) Build signed-logit margin matrix and column z-score
import math
def build_margin_matrix(probs, labels, prob_eps=PROB_EPS, logit_clip=LOGIT_CLIP, std_eps=STD_EPS):
    cols=[]
    for j in range(1,J+1):
        if j not in labels: continue
        for t in range(len(labels[j])):
            cols.append((j,t))
    S_eff=len(cols); 
    X=np.full((M,S_eff), np.nan, float)
    for s,(j,t) in enumerate(cols):
        q=labels[j][t]; y=2*q-1
        for i in range(1,M+1):
            arr=probs.get(i,{}).get(j,None)
            if arr is None or t>=len(arr): continue
            p=float(np.clip(arr[t], prob_eps, 1-prob_eps))
            lv=math.log(p/(1-p))
            lv=max(-logit_clip, min(logit_clip, lv))
            X[i-1,s]=y*lv
    good=~np.isnan(X).any(axis=0)
    X=X[:,good]
    mu=X.mean(axis=0, keepdims=True); Xc=X-mu
    sd=Xc.std(axis=0, keepdims=True); sd=np.where(sd<std_eps,1.0,sd)
    Xz=Xc/sd
    return Xz

Xz=build_margin_matrix(probs, labels)
print("Xz:", Xz.shape)


Xz: (53, 2120)


In [8]:

# 3) Row-center + L2 -> U; S and D
def to_U_row_center_l2(Xz, std_eps=STD_EPS):
    row_mean=Xz.mean(axis=1, keepdims=True)
    Xr=Xz-row_mean
    n=np.linalg.norm(Xr, axis=1, keepdims=True)
    n=np.where(n<std_eps,1.0,n)
    return Xr/n

def corr_similarity_from_U(U):
    C=U@U.T
    C=np.clip(C,-1.0,1.0)
    S=0.5*(C+1.0)
    np.fill_diagonal(S,1.0)
    return S

U=to_U_row_center_l2(Xz)
S_matrix=corr_similarity_from_U(U)
print("S_matrix shape:", S_matrix.shape, "range:", float(S_matrix.min()), "to", float(S_matrix.max()))
D=1.0-S_matrix; np.fill_diagonal(D,0.0)
print("U:", U.shape, "S range:", S_matrix.min(), "to", S_matrix.max())


S_matrix shape: (53, 53) range: 0.00826237447558692 to 1.0
U: (53, 2120) S range: 0.00826237447558692 to 1.0


In [5]:

# 4) k-means on U
from sklearn.cluster import KMeans
def run_kmeans_on_U(U, K, n_init=50, max_iter=1000, random_state=SEED):
    km=KMeans(n_clusters=K, n_init=n_init, max_iter=max_iter, random_state=random_state, tol=1e-4, algorithm="lloyd")
    labels=km.fit_predict(U)
    centers=km.cluster_centers_
    # spherical refinement
    cn=np.linalg.norm(centers, axis=1, keepdims=True)+1e-12
    centers=centers/cn
    labels=(U@centers.T).argmax(axis=1)
    centers=np.vstack([U[labels==k].mean(axis=0) if np.any(labels==k) else km.cluster_centers_[k] for k in range(K)])
    centers=centers/(np.linalg.norm(centers, axis=1, keepdims=True)+1e-12)
    sse=float(np.sum((U-centers[labels])**2))
    return labels, centers, sse


In [6]:

# 5) Evaluation helpers
import numpy as np
from collections import Counter

def sizes_dict(labels):
    c=Counter(labels.tolist())
    return {int(k):int(v) for k,v in sorted(c.items())}

def silhouette_from_D(D, labels):
    N=D.shape[0]; labs=np.array(labels); uniq=np.unique(labs)
    masks={k:(labs==k) for k in uniq}; sizes={k:int(m.sum()) for k,m in masks.items()}
    a=np.zeros(N); b=np.zeros(N); s=np.zeros(N)
    for i in range(N):
        ki=labs[i]; m=masks[ki].copy(); m[i]=False
        n_same=sizes[ki]-1
        a[i]=0.0 if n_same<=0 else float(D[i,m].mean())
        bmin=np.inf
        for k in uniq:
            if k==ki: continue
            mm=masks[k]; 
            if sizes[k]==0: continue
            bmin=min(bmin, float(D[i,mm].mean()))
        b[i]=0.0 if not np.isfinite(bmin) else bmin
        denom=max(a[i],b[i])
        s[i]=0.0 if denom<=0 else (b[i]-a[i])/denom
    s_macro={k:(0.0 if sizes[k]<=1 else float(s[masks[k]].mean())) for k in uniq}
    return s, float(s.mean()), float(a.mean()), s_macro

def intra_inter_S(S, labels):
    labs=np.array(labels); uniq=np.unique(labs)
    masks={k:(labs==k) for k in uniq}; sizes={k:int(m.sum()) for k,m in masks.items()}
    # intra macro/micro
    intra_vals=[]; pair_weights=[]
    for k in uniq:
        idx=np.where(masks[k])[0]; nk=len(idx)
        if nk<=1: intra_vals.append(1.0); pair_weights.append(0)
        else:
            sub=S[np.ix_(idx,idx)].copy(); np.fill_diagonal(sub, np.nan)
            intra_vals.append(float(np.nanmean(sub))); pair_weights.append(nk*(nk-1)//2)
    intra_macro=float(np.mean(intra_vals))
    tot_pairs=sum(pair_weights)
    intra_micro=float(np.sum([v*w for v,w in zip(intra_vals, pair_weights)])/(tot_pairs if tot_pairs>0 else 1))
    # inter macro/micro
    inter_vals=[]; inter_w=[]
    keys=list(uniq)
    for a in range(len(keys)):
        for b in range(a+1,len(keys)):
            ia=np.where(masks[keys[a]])[0]; ib=np.where(masks[keys[b]])[0]
            sub=S[np.ix_(ia,ib)]; v=float(sub.mean())
            inter_vals.append(v); inter_w.append(len(ia)*len(ib))
    inter_macro=float(np.mean(inter_vals)) if inter_vals else 0.0
    inter_micro=float(np.sum(np.array(inter_vals)*np.array(inter_w))/np.sum(inter_w)) if inter_w else 0.0
    # nearest-inter (size-weighted)
    nearest=[]
    for k in uniq:
        best=0.0
        for l in uniq:
            if l==k: continue
            ia=np.where(masks[k])[0]; ib=np.where(masks[l])[0]
            v=float(S[np.ix_(ia,ib)].mean())
            best=max(best,v)
        nearest.append((k,best,sizes[k]))
    Ntot=float(sum(sizes.values())); nearest_sw=sum(w*v for _,v,w in nearest)/(Ntot if Ntot>0 else 1.0)
    NI_D_sw=1.0-nearest_sw
    NI_D_min=min(1.0-v for _,v,_ in nearest) if nearest else 0.0
    return intra_macro, intra_micro, inter_macro, inter_micro, NI_D_sw, NI_D_min

def neg_sil_metrics(s):
    r_neg=float(np.mean(s<0.0))
    burden=float(np.mean(np.maximum(0.0,-s)))
    return r_neg, burden

def distance_margin(NI_D_sw, mean_a):
    return float(NI_D_sw - mean_a)

def prototypes_nearest(U, centers, labels):
    K=centers.shape[0]; protos={}
    for k in range(K):
        idx=np.where(labels==k)[0]
        if len(idx)==0: protos[k]=None; continue
        sims=(U[idx]@centers[k][:,None]).ravel()
        protos[k]=int(idx[int(np.argmax(sims))])
    return protos


In [7]:

# 6) Runner: try multiple K and print reports
def print_report(name, U, S, D, labels, centers, sse, null_R=0, rng=np.random.default_rng(0)):
    from collections import Counter
    N=U.shape[0]
    s_vals, s_all, mean_a, s_macro = silhouette_from_D(D, labels)
    s_macro_avg=float(np.mean(list(s_macro.values()))) if s_macro else s_all
    intra_macro, intra_micro, inter_macro, inter_micro, NI_D_sw, NI_D_min = intra_inter_S(S, labels)
    r_neg, burden = neg_sil_metrics(s_vals)
    M_D = distance_margin(NI_D_sw, mean_a)
    sizes = {int(k):int(v) for k,v in sorted(Counter(labels).items())}
    # prototypes
    protos = prototypes_nearest(U, centers, labels)
    protos_1b = {k:(v+1 if v is not None else None) for k,v in protos.items()}
    # size-weighted nearest-inter similarity
    nearest_inter_sw = 1.0 - NI_D_sw
    # print
    print(f"=== {name} (K={len(set(labels))}) ===")
    print(f"Silhouette (all points avg):       {s_all: .6f}")
    print(f"Silhouette (macro by cluster):     {s_macro_avg: .6f}")
    print(f"  s̄_k by cluster: { {k: s_macro[k] for k in sorted(s_macro)} }")
    print(f"Intra macro:   {intra_macro: .6f}")
    print(f"Intra micro:   {intra_micro: .6f}")
    print(f"Inter macro:   {inter_macro: .6f}")
    print(f"Inter micro:   {inter_micro: .6f}")
    print(f"Nearest-Inter (size-weighted):      {nearest_inter_sw: .6f}  <-- 값↑ ⇒ 분리도 나쁨/경계 혼선")
    print(f"NI_D (size-weighted):            {NI_D_sw: .6f}")
    print(f"NI_D_min (worst cluster):        {NI_D_min: .6f}")
    print(f"Distance Margin M_D:             {M_D: .6f}")
    print(f"Prototypes (nearest model 1-based): {protos_1b}")
    print(f"SSE Cost (U-space): {sse: .6f}")
    print(f"Cluster sizes: {sizes}")
    # Optional: simple null for M_D
    if null_R>0:
        sizes_list=[v for _,v in sorted(sizes.items())]
        sims=[]
        for r in range(null_R):
            perm=rng.permutation(N)
            lbl=np.empty(N,dtype=int); start=0
            for kk,nk in enumerate(sizes_list):
                lbl[perm[start:start+nk]]=kk; start+=nk
            _, mean_a_n, _, _, NI_D_sw_n, NI_D_min_n = intra_inter_S(S, lbl)
            sims.append(NI_D_sw_n - mean_a_n)
        mu=float(np.mean(sims)); sd=float(np.std(sims)+1e-12)
        print(f"[Null]  M_D Z = {(M_D-mu)/sd: .3f}")
    print()

K_list=[2,3,4,5,6,7,8,9,10]
for K in K_list:
    labels, centers, sse = run_kmeans_on_U(U, K)
    print_report(f'kmeans_K{K}', U, S_matrix, D, labels, centers, sse, null_R=100)


ValueError: Algorithm must be 'auto', 'full' or 'elkan', got lloyd instead.