In [None]:
from cplearn_v3.corespect.ranking import FlowRank
from cplearn_v3.utils.gen_utils import get_kNN
import data_call
from joblib import Parallel, delayed

In [None]:
from sklearn.cluster import KMeans
def get_kmeans(X,true_k,core_nodes,choose_min_obj=False,random_state=None):
    """Run K-Means on the rows of ``X`` selected by ``core_nodes``.

    Parameters
    ----------
    X : array-like
        Data matrix; it is indexed as ``X[core_nodes]`` to pick the rows
        that get clustered.
    true_k : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.
    core_nodes : array-like of int
        Indices of the rows of ``X`` to cluster.
    choose_min_obj : bool, default False
        If True, run 7 single-initialisation restarts (``max_iter=1000``)
        and keep the one with the lowest inertia. If False, fit a single
        ``KMeans`` with the library defaults.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans`` so a run can be made reproducible. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Cluster label for each selected row, in the order of ``core_nodes``.
    """
    X_core = X[core_nodes]

    if choose_min_obj:
        # KMeans already repeats the fit ``n_init`` times with different
        # centroid seeds and keeps the run with the lowest inertia, so the
        # manual best-of-7 loop is expressed through that parameter. This
        # also lets one ``random_state`` seed all restarts without making
        # them identical.
        n_restarts = 7
        kmeans = KMeans(
            n_clusters=true_k,
            n_init=n_restarts,
            max_iter=1000,
            random_state=random_state,
        )
    else:
        kmeans = KMeans(n_clusters=true_k, random_state=random_state)

    kmeans.fit(X_core)
    return kmeans.labels_

In [None]:
from sklearn.metrics import normalized_mutual_info_score as NMI
import numpy as np

def KM_layers(X,label,sorted_nodes,cf):
    """Score K-Means on the ``cf`` top-ranked points against the true labels.

    Parameters
    ----------
    X : array-like
        Data matrix, forwarded to ``get_kmeans`` which indexes it by row.
    label : array-like
        Ground-truth label per row of ``X``.
    sorted_nodes : sequence of int
        Row indices in ranking order; the first ``cf`` entries are used.
    cf : int
        Number of leading entries of ``sorted_nodes`` to keep (a count,
        not a fraction; it is truncated with ``int``).

    Returns
    -------
    float
        Normalized mutual information between the true labels of the
        selected rows and the K-Means labels computed on those rows.
    """
    # Coerce to ndarray so labels are always selected by position, exactly
    # like the rows of X, even if a list or a pandas Series is passed in.
    label = np.asarray(label)
    core_nodes = np.array(sorted_nodes[:int(cf)]).astype(int)

    # The number of clusters comes from the full label set, not only from
    # the labels that happen to occur among the selected rows.
    n_clusters = len(np.unique(label))
    labels_km_ = get_kmeans(X, n_clusters, core_nodes, choose_min_obj=True)

    return NMI(label[core_nodes], labels_km_)



In [None]:
final_nmi_list=[]
from joblib import Parallel, delayed

data_names = ['mRNA','miRNA','MNIST','Fashion-MNIST','Muraro','ALM','AMB','VISP', 'Baron_Human', 'Baron_Mouse', 'Segerstolpe', 'Tcell-medicine', 'Zhengmix8eq','Xin', 'TM']

# KM_layers takes (X, label, sorted_nodes, cf); calling it with only the
# dataset name raised a TypeError. Build its inputs the same way the later
# cells of this notebook do (kNN graph -> FlowRank -> nodes sorted by score).
# NOTE(review): assumes data_call.get_dataset accepts every name above - confirm.
q=20
r=20

for name in data_names:
    X,label=data_call.get_dataset(name)
    knn_list,knn_dist=get_kNN(X, q)
    final_score=FlowRank(knn_list,r)

    sorted_nodes=sorted(final_score, key=lambda k: final_score[k], reverse=True)

    n=X.shape[0]
    # Ten nested layers holding the top 10%, 20%, ..., 100% of the ranking;
    # integer arithmetic avoids float truncation of the layer size.
    nmi_list=Parallel(n_jobs=10)(
        delayed(KM_layers)(X,label,sorted_nodes,(layer+1)*n//10)
        for layer in range(10)
    )

    print(name)
    print([f"{x:.3f}" for x in nmi_list])
    print("\n")
    final_nmi_list.append(nmi_list)

In [None]:
# Dataset keys passed to data_call.get_dataset, one NMI curve per entry.
next_data_names=['MNIST','Fashion-MNIST','cifar10_clip_large','cifar10_clip_small','cifar100_clip_large','cifar20_clip_large']

q=20  # passed to get_kNN
r=20  # passed to FlowRank

final_nmi_list=[]
for name in next_data_names:

    X,label=data_call.get_dataset(name)
    knn_list,knn_dist=get_kNN(X, q)
    final_score=FlowRank(knn_list,r)

    # Nodes ordered from highest to lowest FlowRank score.
    sorted_nodes= sorted(final_score, key=lambda k: final_score[k], reverse=True)

    n=X.shape[0]
    # Layer sizes are the top 10%, 20%, ..., 100% of the ranking. They are
    # computed with integer arithmetic: the float form int((cf+1)/10*n) can
    # come out one short (e.g. int(7/10*90) == 62 instead of 63).
    nmi_list = Parallel(n_jobs=10)(
        delayed(KM_layers)(X,label,sorted_nodes,(layer+1)*n//10)
        for layer in range(10)
    )

    print(name)
    print([f"{x:.3f}" for x in nmi_list])
    print("\n")
    final_nmi_list.append(nmi_list)

In [None]:
# Sanity check: how many per-dataset NMI curves final_nmi_list holds.
print(len(final_nmi_list))

In [None]:
# NOTE(review): replaces the computed NMI of the third curve at its tenth
# (last) point with a hard-coded 0.824. The K-Means runs are unseeded, so
# this value cannot be regenerated from the notebook - record where the
# number comes from or remove the override before the figure is reused.
final_nmi_list[2][9]=0.824

In [None]:
# Line plot of K-Means NMI against the fraction of top-ranked points,
# one trace per dataset, exported to PDF at the end of the cell.
import plotly.graph_objects as go

# Legend labels, paired position-by-position with the rows of final_nmi_list.
next_data_names=['MNIST','Fashion-MNIST','CIFAR-10 ViT Large','CIFAR-10 ViT Base','CIFAR-100 ViT Large','CIFAR-20 ViT Large']

fig = go.Figure()

# x positions 0.1, 0.2, ..., 1.0 - one per entry of each NMI curve.
fractions = np.linspace(0.1, 1.0, 10)

for values, name in zip(final_nmi_list, next_data_names):
    fig.add_trace(go.Scatter(
        x=fractions,
        y=values,
        mode='lines+markers',
        name=name
    ))

# Layout. The x-axis title is set once, inside xaxis=dict(...); the earlier
# xaxis_title="Fractions" named the same property a second time and has
# been removed so only the intended title remains.
fig.update_layout(
    title=dict(
        text="Degradation of K-Means performance on <br> lower-density regions",
        x=0.5,  # centers title horizontally
        xanchor="center",
        yanchor="top"
    ),
    yaxis_title="NMI",
    legend_title="Datasets",
    template="plotly_white",
    width=2000,       # on-screen layout size; the PDF export below passes its own width/height
    height=500,
    font=dict(
        family="Arial, sans-serif",
        size=18   # base font size
    ),
    xaxis=dict(
        title="Fraction of points sorted from high to low density",
        tickmode="linear",
        dtick=0.2,              # one tick every 0.2
        title_font=dict(size=20),
        tickfont=dict(size=16)
    ),
)


#fig.show()
#
import plotly.io as pio
pio.write_image(fig, "kmeans_evidence_1.pdf", width=900, height=500, scale=1)

In [None]:
pip install --upgrade kaleido

In [None]:
# Second export of the same figure, this time without explicit width/height.
# NOTE(review): the earlier export writes "kmeans_evidence_1.pdf" at 900x500;
# confirm which of the two PDFs is the one actually used.
fig.write_image("kmeans_evidence.pdf")

In [None]:
# Dataset keys for the four-panel CS-K-Means vs K-Means comparison.
next_data_names=['MNIST','cifar10_clip_small','miRNA','Zhengmix8eq']

q=20  # passed to get_kNN
r=20  # passed to FlowRank

final_nmi_list_n=[]
for name in next_data_names:

    X,label=data_call.get_dataset(name)
    knn_list,knn_dist=get_kNN(X, q)
    final_score=FlowRank(knn_list,r)

    # Nodes ordered from highest to lowest FlowRank score.
    sorted_nodes= sorted(final_score, key=lambda k: final_score[k], reverse=True)

    n=X.shape[0]
    # Top 10%, 20%, ..., 100% of the ranking, sized with integer arithmetic
    # so that float truncation (e.g. int(7/10*90) == 62) cannot drop a point.
    nmi_list = Parallel(n_jobs=10)(
        delayed(KM_layers)(X,label,sorted_nodes,(layer+1)*n//10)
        for layer in range(10)
    )

    print(name)
    print([f"{x:.3f}" for x in nmi_list])
    print("\n")
    final_nmi_list_n.append(nmi_list)

In [None]:
# Hard-coded NMI curves, four rows of ten values each. Later cells pair them
# row-by-row with final_nmi_list_n and plot them under the "CS-K-Means" label.
# NOTE(review): the values are typed in by hand; their source is not shown here.
cskm_list = [
    [0.823, 0.807, 0.793, 0.786, 0.775, 0.77, 0.76, 0.744, 0.727, 0.69],
    [0.932, 0.929, 0.917, 0.905, 0.89, 0.876, 0.856, 0.834, 0.809, 0.787],
    [0.895, 0.886, 0.874, 0.862, 0.853, 0.845, 0.838, 0.824, 0.813, 0.801],
    [0.805, 0.792, 0.784, 0.767, 0.745, 0.725, 0.708, 0.69, 0.669, 0.641],
]

In [44]:
# NOTE(review): replaces the computed K-Means NMI of the second curve at its
# fourth point with a hard-coded 0.88. Document the source of this value or
# drop the override; it cannot be reproduced from the cells above.
final_nmi_list_n[1][3]=0.88

In [48]:
# Overwrite the first point of every K-Means curve with the first point of
# the matching hard-coded cskm_list curve, so both start at the same value.
# NOTE(review): this discards the measured K-Means NMI for the first layer -
# confirm this is intended and state it in the figure caption.
for i in range(4):
    final_nmi_list_n[i][0]=cskm_list[i][0]

In [49]:
# One [cskm_list curve, final_nmi_list_n curve] pair for each of the four datasets.
global_list = [[cskm_list[i], final_nmi_list_n[i]] for i in range(4)]

In [56]:
# Subplot titles for the comparison figure, one per dataset.
dataset_names = [
    'MNIST (image)',
    'CIFAR-10-ViT-Base (image)',
    'miRNA (bulk-RNA)',
    'Zhengmix8eq (scRNA)',
]

In [88]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Shared x-axis: 0.1, 0.2, ..., 1.0.
fractions = np.linspace(0.1, 1.0, 10)

# One row of four panels, titled with dataset_names; each entry of
# global_list supplies the two curves drawn in the matching panel.
fig = make_subplots(rows=1, cols=4, subplot_titles=dataset_names)

for i, local_list in enumerate(global_list, start=1):
    listA, listB = local_list

    # First curve is drawn green as "CS-K-Means", second red as "K-Means".
    # Legend entries are emitted for the first panel only, so each method
    # shows up once in the legend.
    for curve, method, colour in (
        (listA, "CS-K-Means", "green"),
        (listB, "K-Means", "red"),
    ):
        fig.add_trace(
            go.Scatter(
                x=fractions,
                y=curve,
                mode="lines+markers",
                name=method,
                line=dict(color=colour),
                showlegend=(i == 1),
            ),
            row=1,
            col=i,
        )

fig.update_layout(
    title=dict(
        text="CS-K-Means vs K-Means performance across Datasets from core-to-outer-layers",
        x=0.5,
        xanchor="center",
    ),
    template="plotly_white",
    width=1200,
    height=300,
    font=dict(size=14),
    # Horizontal legend centred underneath the panels.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.5,
        xanchor="center",
        x=0.5,
        font=dict(size=14),
    ),
    # Only `yaxis` is styled, so the "NMI" title is set on that axis alone.
    yaxis=dict(
        title=dict(
            text="NMI",
            font=dict(size=20),
            standoff=25,
        ),
        tickfont=dict(size=16),
    ),
)
fig.show()


In [89]:
# Export through the figure's own write_image method so this cell no longer
# depends on `pio` having been imported in an earlier plotting cell; the
# output file and the width/height/scale arguments are unchanged.
fig.write_image("cskmeans_vs_kmeans.pdf", width=1200, height=300, scale=1)