# 预先进行聚类


In [2]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from cuml.cluster import KMeans
import cupy as cp

# from sklearn.cluster import KMeans

# Cluster the feature rows of every WSI independently with GPU K-Means and
# store the resulting label sequence per WSI in a single CSV file.

# Input locations and clustering configuration.
input_path = '/data/wsi/TCTGC50k-features/gigapath-coarse/pt'
input_label = '/data/wsi/TCTGC50k-labels/6_labels/TCTGC50k-v15-train.csv'
labels = pd.read_csv(input_label)
n_clusters = 5
output_path = './cluster'

# Create the output directory up front: without it, df.to_csv() at the end
# would raise only after the entire (long-running) loop has finished.
os.makedirs(output_path, exist_ok=True)

# One row per WSI. 'cluster_label' will hold the space-separated cluster
# assignments (one per feature row) and stays None for WSIs that fail.
df = labels[['wsi_name']].copy()
df['cluster_label'] = None

# Process each WSI
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing WSIs"):
    wsi_name = row['wsi_name']
    pt_path = os.path.join(input_path, f"{wsi_name}.pt")

    try:
        # NOTE(review): torch.load unpickles the file; if the .pt files hold
        # plain tensors, passing weights_only=True would be safer - confirm.
        features = torch.load(pt_path)

        # Hand the features to CuPy so cuML runs the clustering on the GPU.
        # presumably features is (n_patches, feature_dim) - TODO confirm
        patch_features_cp = cp.asarray(features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(patch_features_cp)

        # Copy the labels to host memory once; iterating the device array
        # directly would trigger a separate device-to-host copy per element.
        df.at[idx, 'cluster_label'] = " ".join(
            map(str, cp.asnumpy(cluster_labels).tolist())
        )

    except Exception as e:
        # Deliberate best-effort: one unreadable or unclusterable WSI must
        # not abort the whole batch, so report it and leave the cell empty.
        print(f"Error processing {wsi_name}: {e}")
        df.at[idx, 'cluster_label'] = None  # Mark as failed

# Save the per-WSI cluster labels.
output_df_path = os.path.join(output_path, f"kmeans_{n_clusters}.csv")
df.to_csv(output_df_path, index=False)
print(f"Saved pooled features to {output_df_path}")

  features = torch.load(pt_path)
Processing WSIs: 100%|██████████| 38339/38339 [37:12<00:00, 17.17it/s] 


Saved pooled features to ./cluster/kmeans_5.csv
