# ws≡1 OFF — User Medoids + Spherical k-Means

In [None]:

# 1) Imports & Config
import os, math
import numpy as np
import pandas as pd
from typing import Dict, Optional, Tuple, List
import matplotlib.pyplot as plt

# Seed and shared NumPy generator for reproducibility.
# NOTE(review): `rng` is not referenced by any other code visible in this
# notebook (kmedoids_pam builds its own RandomState) — confirm it is needed.
RANDOM_STATE = 0
rng = np.random.default_rng(RANDOM_STATE)

# M: number of source models (1-based index i); J: number of target
# clients/datasets (1-based index j). Both bound the loops that read the CSVs.
M = 53
J = 53
# Root folder; files are looked up as src_XXX/probs/tgt_YYY.csv beneath it.
DIR_PATH = "./raw"

PROB_EPS = 1e-6      # probabilities are clipped to [PROB_EPS, 1 - PROB_EPS] before the logit
LOGIT_CLIP = 7.0     # logits are clipped to [-LOGIT_CLIP, +LOGIT_CLIP]
STD_EPS = 1e-12      # column std below this is replaced by 1.0 in the z-score


In [None]:

# 2) Data Loading

# Containers filled by the loop below:
#   probs[i][j] -> array of p(A) produced by model i on client j
#   labels[j]   -> label array of client j (stays an empty list if no file is read)
probs = {model_id: {} for model_id in range(1, M+1)}
labels = {client_id: [] for client_id in range(1, J+1)}

# Visit every (model, client) pair and read its CSV when it exists.
for i in range(1, M+1):
    for j in range(1, J+1):
        file_path = os.path.join(DIR_PATH, f"src_{str(i).zfill(3)}/probs/tgt_{str(j).zfill(3)}.csv")

        # Missing files are reported and skipped; the pair simply stays unset.
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found!")
            continue

        df = pd.read_csv(file_path)

        # Column 'p_A' is the predicted probability, 'label_num' the label.
        # labels[j] is overwritten by each model that has a file for client j.
        probs[i][j] = df['p_A'].values
        labels[j] = df['label_num'].values

# Sanity check: first five probabilities of model 1 on client 1,
# and first five labels of client 1.
print(probs[1][1][:5])
print(labels[1][:5])


In [None]:

# 3) Signed-Logit Margin -> Column Z-score -> Correlation Similarity

def build_margin_matrix(probs: Dict[int, Dict[int, np.ndarray]],
                        labels: Dict[int, np.ndarray],
                        prob_eps: float = PROB_EPS,
                        logit_clip: float = LOGIT_CLIP,
                        std_eps: float = STD_EPS) -> Tuple[np.ndarray, List[int]]:
    """Build the column-standardized signed-logit margin matrix.

    For every client ``j`` in ``1..J`` and every sample ``t`` of that client,
    one column is created. Entry ``(i-1, column)`` is ``y * logit(p)`` where
    ``p`` is model ``i``'s probability for that sample (clipped to
    ``[prob_eps, 1 - prob_eps]``), the logit is clipped to
    ``[-logit_clip, logit_clip]`` and ``y = 2 * label - 1``.

    Columns for which any model has no value (missing ``probs[i][j]`` or a
    probability array shorter than the label array) are dropped. The
    remaining columns are centered and divided by their standard deviation;
    a deviation below ``std_eps`` is replaced by 1.0.

    Args:
        probs: ``probs[i][j]`` is the probability array of model ``i`` on
            client ``j``.
        labels: ``labels[j]`` is the label sequence of client ``j``.
        prob_eps: probability clipping margin.
        logit_clip: absolute bound applied to the logit.
        std_eps: floor below which a column's standard deviation is treated
            as degenerate.

    Returns:
        ``(Xz, kept)`` where ``Xz`` has shape ``(M, number_of_kept_columns)``
        and ``kept`` lists the positions (in the full, unfiltered column
        order) of the columns that were kept.

    Raises:
        ValueError: if no client has any label, i.e. there are no columns.
    """
    # One (M, n_j) block per client; rows stay NaN where a model has no value.
    blocks: List[np.ndarray] = []
    for j in range(1, J+1):
        if j not in labels:
            continue
        y = 2.0 * np.asarray(labels[j], dtype=float) - 1.0   # {-1,+1} for 0/1 labels
        n = y.shape[0]
        if n == 0:
            continue
        block = np.full((M, n), np.nan, dtype=float)
        for i in range(1, M+1):
            arr = probs.get(i, {}).get(j)
            if arr is None:
                continue
            # Only the first n entries can pair with a label; if arr is
            # shorter, the uncovered tail keeps its NaN marker.
            p = np.clip(np.asarray(arr, dtype=float)[:n], prob_eps, 1 - prob_eps)
            logit = np.clip(np.log(p / (1 - p)), -logit_clip, logit_clip)
            m = p.shape[0]
            block[i-1, :m] = y[:m] * logit
        blocks.append(block)
    if not blocks:
        raise ValueError("labels 데이터가 비어 있습니다.")
    X = np.concatenate(blocks, axis=1)
    # drop columns with NaN
    good = ~np.isnan(X).any(axis=0)
    X = X[:, good]
    # column standardization
    mu = X.mean(axis=0, keepdims=True)
    Xc = X - mu
    sd = Xc.std(axis=0, keepdims=True)
    sd = np.where(sd < std_eps, 1.0, sd)   # avoid dividing by ~0 for constant columns
    Xz = Xc / sd
    return Xz, good.nonzero()[0].tolist()

# Build the standardized signed-margin matrix from the loaded data and inspect it.
# kept_cols holds the indices of the sample columns that survived NaN filtering.
Xz, kept_cols = build_margin_matrix(probs, labels)
print("Xz shape:", Xz.shape)
print(Xz)

def compute_S_matrix(Xz: np.ndarray) -> np.ndarray:
    """Turn row-wise Pearson correlations of ``Xz`` into similarities in [0, 1].

    Each row of ``Xz`` is treated as one variable. The correlation ``c``
    between two rows is mapped to ``(c + 1) / 2`` and the diagonal is forced
    to exactly 1.0.

    Args:
        Xz: 2-D array whose rows are compared with each other.

    Returns:
        Square similarity matrix with one row/column per row of ``Xz``.
    """
    # Clip guards against correlations drifting slightly outside [-1, 1].
    corr = np.clip(np.corrcoef(Xz), -1.0, 1.0)
    similarity = (corr + 1.0) * 0.5
    np.fill_diagonal(similarity, 1.0)
    return similarity

# Convert the standardized margins into a row-by-row similarity matrix
# and report its shape and value range.
S_matrix = compute_S_matrix(Xz)
print("S_matrix shape:", S_matrix.shape, "range:", float(S_matrix.min()), "to", float(S_matrix.max()))


In [None]:
from typing import Dict

def nll_i_to_j(p: np.ndarray, q: np.ndarray, smooth_eps: float = 1e-9) -> float:
    """Mean binary negative log-likelihood of probabilities ``p`` under labels ``q``.

    Args:
        p: predicted probabilities of the positive class (any array-like).
        q: binary labels aligned with ``p`` (any array-like).
        smooth_eps: used twice — as the label-smoothing amount
            (``q`` becomes ``(1 - eps) * q + eps * (1 - q)``) and as the
            clipping margin that keeps ``p`` inside ``[eps, 1 - eps]`` so the
            logarithms stay finite.

    Returns:
        The mean NLL as a float, or NaN when the input is empty.
    """
    # Coerce to float arrays so plain lists / pandas Series work as well as
    # ndarrays (a float multiplied by a Python list would raise TypeError).
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # Label smoothing (optional; effectively off at the default epsilon).
    q_ = (1 - smooth_eps) * q + smooth_eps * (1 - q)

    # Probability clipping.
    p = np.clip(p, smooth_eps, 1 - smooth_eps)

    per_sample = q_ * np.log(p) + (1 - q_) * np.log(1 - p)
    if per_sample.size == 0:
        # Mean of nothing is undefined; return NaN without a NumPy warning.
        return float("nan")
    return float(-np.mean(per_sample))

def compute_T_matrix(probs: Dict[int, Dict[int, np.ndarray]], labels: Dict[int, np.ndarray], smooth_eps: float = 1e-9) -> np.ndarray:
    """Compute the transferability proxy T.

    ``T[i-1, j-1]`` is the binary NLL of model ``i``'s probabilities on
    client ``j``'s labels; each column is then min-max normalized.

    Args:
        probs: ``probs[i][j]`` is the probability array of model ``i`` on
            client ``j`` for ``i`` in ``1..M`` and ``j`` in ``1..J``.
        labels: ``labels[j]`` is the label array of client ``j``.
        smooth_eps: forwarded to ``nll_i_to_j``.

    Returns:
        Array of shape ``(M, J)`` with column-wise min-max normalized values.
        A column whose entries are all equal is mapped to zeros rather than
        NaN.

    Raises:
        KeyError: if ``probs`` has no entry for some (model, client) pair.
    """
    T = np.zeros((M, J))  # models x clients

    for i in range(1, M+1):
        for j in range(1, J+1):
            try:
                p_ij = probs[i][j]
            except KeyError as exc:
                # Same exception type as before, but say which pair is missing.
                raise KeyError(f"no probabilities for model {i} on client {j}") from exc
            T[i-1, j-1] = nll_i_to_j(p_ij, labels[j], smooth_eps)

    # Column-wise min-max normalization; a zero range (constant column) is
    # replaced by 1.0 so the column becomes all zeros instead of 0/0 = NaN.
    col_min = T.min(axis=0)
    col_range = T.max(axis=0) - col_min
    col_range = np.where(col_range > 0, col_range, 1.0)
    T = (T - col_min) / col_range
    return T

# Transferability proxy for every (model, client) pair, computed from the
# probabilities and labels loaded earlier in the notebook.
T_matrix = compute_T_matrix(probs, labels)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

def plot_heatmap_T(
    T_matrix,
    title="T heatmap",
    cmap="RdBu_r",        # diverging colormap by default
    center=0.5            # value placed at the colormap midpoint
):
    """Show ``T_matrix`` as a heatmap with 1-based axis ticks.

    Colors are scaled with a two-slope normalization over [0, 1] whose
    midpoint is ``center``.

    Args:
        T_matrix: 2-D array-like; rows go on the y axis, columns on the x axis.
        title: figure title.
        cmap: matplotlib colormap name.
        center: value mapped to the middle of the colormap.
    """
    def _ticks_for(count):
        # At most 12 evenly spaced positions (0-based) with 1-based labels.
        how_many = min(count, 12)
        positions = np.linspace(0, count-1, how_many, dtype=int)
        names = [str(v) for v in np.linspace(1, count, how_many, dtype=int)]
        return positions, names

    grid = np.asarray(T_matrix)
    color_scale = TwoSlopeNorm(vmin=0.0, vcenter=center, vmax=1.0)

    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(grid, aspect='auto', cmap=cmap, norm=color_scale)
    ax.set_title(title)
    ax.set_xlabel('j (datasets D_j)')
    ax.set_ylabel('i (models)')
    colorbar = fig.colorbar(image, ax=ax)
    colorbar.set_label('T (column-wise min–max)', rotation=90)

    n_rows, n_cols = grid.shape
    x_pos, x_names = _ticks_for(n_cols)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_names)
    y_pos, y_names = _ticks_for(n_rows)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(y_names)

    fig.tight_layout()
    plt.show()

# Examples:
# plot_heatmap_T(T_matrix, cmap="RdBu_r")       # clean, moderate contrast
# plot_heatmap_T(T_matrix, cmap="RdYlBu_r")     # strong contrast (complementary colors + yellow)
# plot_heatmap_T(T_matrix, cmap="coolwarm")     # soft contrast
plot_heatmap_T(T_matrix, cmap="viridis")      # sequential map, for when no neutral midpoint is needed


In [None]:

import numpy as np, pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the raw matrix: prefer x_raw.npy in the working directory and fall
# back to /mnt/data/x_raw.npy when it is not there.
p = Path('x_raw.npy')
if not p.exists():
    p = Path('/mnt/data/x_raw.npy')
X = np.load(p)
# X must be 2-D for this unpacking to work; n (row count) is reused further
# down to number client_id 1..n when the cluster assignments are saved.
# NOTE(review): presumably rows are clients and columns are features — confirm.
n,d = X.shape
print('Loaded', p, 'shape=', X.shape)

# ws≡1 preprocessing
def preprocess_ws_off(X, delta=1e-6):
    """Standardize ``X`` and derive unit-norm rows plus a correlation distance.

    Steps:
      1. z-score every column (population std, floored at ``delta``);
      2. subtract each row's mean and scale the row to unit Euclidean length
         (rows of zero length are left as zeros);
      3. ``rho = U @ U.T`` clipped to [-1, 1], ``D = (1 - rho) / 2`` with a
         zero diagonal.

    Args:
        X: 2-D array.
        delta: lower bound applied to the column standard deviations.

    Returns:
        ``(U, D)`` — ``U`` has the shape of ``X``; ``D`` is square with one
        row/column per row of ``X``.
    """
    # 1) column-wise z-score
    col_mean = X.mean(axis=0, keepdims=True)
    col_std = X.std(axis=0, ddof=0, keepdims=True)
    standardized = (X - col_mean) / np.maximum(col_std, delta)

    # 2) row-centering followed by unit-length scaling
    centered = standardized - standardized.mean(axis=1, keepdims=True)
    lengths = np.linalg.norm(centered, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0   # keep all-zero rows as zeros instead of 0/0
    U = centered / lengths

    # 3) correlation -> distance in [0, 1]
    rho = np.clip(U @ U.T, -1.0, 1.0)
    D = (1.0 - rho) * 0.5
    np.fill_diagonal(D, 0.0)
    return U, D

# U: unit-norm rows later passed to KMeans; D: pairwise distance matrix later
# passed to the medoid search and the silhouette score.
U, D = preprocess_ws_off(X)
print('U:', U.shape, 'D:', D.shape)


In [None]:

# PAM (k-medoids++) minimal implementation
def kmedoids_pam(D, k, n_init=16, max_swaps=200, random_state=42):
    """Cluster with k-medoids: distance-weighted seeding plus first-improvement swaps.

    Each of the ``n_init`` restarts seeds ``k`` medoids (the first uniformly,
    the rest with probability proportional to the distance to the nearest
    medoid chosen so far) and then repeatedly applies the first
    medoid/non-medoid swap that lowers the total assignment cost, until no
    swap helps or ``max_swaps`` swaps were made. The cheapest restart wins.

    Args:
        D: square ``(n, n)`` ndarray of non-negative pairwise distances.
        k: number of medoids, ``1 <= k <= n``.
        n_init: number of random restarts.
        max_swaps: maximum number of accepted swaps per restart.
        random_state: seed for ``np.random.RandomState``.

    Returns:
        ``(labels, medoids, cost)`` — ``labels[p]`` is the position within
        ``medoids`` of the medoid nearest to point ``p``; ``medoids`` holds
        point indices; ``cost`` is the sum of distances to the assigned
        medoids.

    Raises:
        ValueError: if ``k`` is not in ``1..n``.
    """
    n = D.shape[0]
    if not 1 <= k <= n:
        raise ValueError(f"k must satisfy 1 <= k <= n (n={n}), got k={k}")
    rng = np.random.RandomState(random_state)
    all_idx = np.arange(n)
    best_labels, best_meds, best_cost = None, None, np.inf
    for _ in range(n_init):
        # --- seeding ---
        meds = [rng.choice(n)]
        dist_min = D[meds[0]].copy()
        for _ in range(1, k):
            total = dist_min.sum()
            if not total > 0:
                # Every point coincides with a chosen medoid: distance-weighted
                # sampling is undefined, so stop here and let the uniform
                # fill-up below complete the seed set.
                break
            # Normalize exactly; RandomState.choice rejects p that does not
            # sum to 1 within its tolerance.
            nx = rng.choice(n, p=dist_min / total)
            meds.append(nx)
            dist_min = np.minimum(dist_min, D[nx])
        meds = np.array(sorted(set(meds)))
        if meds.size < k:
            # Top up with distinct, uniformly drawn non-medoids.
            rest = np.setdiff1d(all_idx, meds)
            meds = np.concatenate([meds, rng.choice(rest, k - meds.size, replace=False)])
        assign = np.argmin(D[:, meds], axis=1)

        # --- swap phase (first improvement) ---
        improved, swaps = True, 0
        while improved and swaps < max_swaps:
            improved = False
            non_meds = np.setdiff1d(all_idx, meds, assume_unique=True)
            current_cost = np.sum(D[all_idx, meds[assign]])
            for m in meds.copy():
                for h in non_meds:
                    cand = meds.copy()
                    cand[np.where(cand == m)[0][0]] = h
                    cand = np.sort(cand)
                    cand_assign = np.argmin(D[:, cand], axis=1)
                    cand_cost = np.sum(D[all_idx, cand[cand_assign]])
                    # Small tolerance so float noise cannot cause endless swapping.
                    if cand_cost + 1e-9 < current_cost:
                        meds = cand
                        assign = cand_assign
                        current_cost = cand_cost
                        improved = True; swaps += 1
                        break
                # Restart the scan from the new medoid set after any accepted swap.
                if improved or swaps >= max_swaps: break
        cost = np.sum(D[all_idx, meds[assign]])
        if cost < best_cost:
            best_cost, best_labels, best_meds = cost, assign, meds
    return best_labels, best_meds, best_cost


In [None]:

# Run K=2: cluster with k-medoids on the distance matrix D and with KMeans on
# the unit-norm rows U, then compare both partitions.
K=2
lab_pm, meds, cost = kmedoids_pam(D, K, n_init=16, max_swaps=200, random_state=42)
lab_km = KMeans(n_clusters=K, n_init=32, max_iter=300, random_state=42).fit_predict(U)

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# The loop variable is deliberately NOT called `labels`: that name is the
# module-level dict of per-client labels and must not be rebound here.
for name, cluster_labels in [('medoids', lab_pm), ('kmeans', lab_km)]:
    sil = silhouette_score(D, cluster_labels, metric='precomputed')  # uses the precomputed distances
    ch  = calinski_harabasz_score(U, cluster_labels)
    db  = davies_bouldin_score(U, cluster_labels)
    sizes = [int(s) for s in np.bincount(cluster_labels)]
    print(f'{name}: sil={sil:.3f} CH={ch:.1f} DB={db:.3f} sizes={sizes}')

# Save CSVs: one row per input row, numbered 1..n, with its cluster index.
pd.DataFrame({'client_id': np.arange(1, n+1), 'cluster_k2_medoids': lab_pm}).to_csv('ws_off_medoids_k2.csv', index=False)
pd.DataFrame({'client_id': np.arange(1, n+1), 'cluster_k2_kmeans': lab_km}).to_csv('ws_off_kmeans_k2.csv', index=False)
print('Saved ws_off_medoids_k2.csv, ws_off_kmeans_k2.csv')
