Original implementation of Contrastive-sc method
(https://github.com/ciortanmadalina/contrastive-sc)

In [3]:
import sys
sys.path.append("..")
import argparse
from sklearn.metrics import (adjusted_rand_score, normalized_mutual_info_score, 
                             silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score)
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
from tqdm.notebook import tqdm
import models
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering, MeanShift
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn import mixture
import st_loss

import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import random
import utils
import loop
import pickle
import seaborn as sns
import train
import time
import os
import glob2
plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [4]:
path = "../"

category = "real_data"
all_data = pd.read_pickle(
    f"../output/pickle_results/{category}/{category}_combined.pkl")

clrs = [
    '#C0392B', "#F1948A", "#D7BDE2", "#8E44AD", "#7FB3D5", "#2874A6",
    "#76D7C4", "#117A65", '#00ff00', "#F1C40F", "#EB984E", "#839192", "#566573"
]

ordered_methods = [
    'constrastive+KM',
    'scziDesk',
    'scDeepCluster',
    'scrna',
    'cidr',
    'soup',
    'pca_kmeans',
    'constrastive+LD',
    'desc',
    'scanpy-seurat',
    'scedar',
    'scvi',
    'raceid',
]
dataset_names = {
    '10X_PBMC': '10X PBMC',
    '10X_PBMC_select_2100': '10X PBMC (2100)',
    'mouse_ES_cell': 'Mouse ES\nCell',
    'mouse_ES_cell_select_2100': 'Mouse ES\nCell (2100)',
    'worm_neuron_cell_select_2100': 'Worm Neuron\nCell (2100)',
    'worm_neuron_cell': 'Worm\nNeuron\nCell',
    'mouse_bladder_cell': 'Mouse\nBladder\nCell',
    'mouse_bladder_cell_select_2100': 'Mouse Bladder\n Cell (2100)',
    'Quake_Smart-seq2_Trachea': 'QS\nTrachea',
    'Quake_Smart-seq2_Diaphragm': 'QS\nDiaphragm',
    'Quake_10x_Spleen': 'Q Spleen',
    'Quake_10x_Bladder': 'Q Bladder',
    'Quake_Smart-seq2_Lung': 'QS Lung',
    'Quake_10x_Limb_Muscle': 'Q Limb\nMuscle',
    'Quake_Smart-seq2_Limb_Muscle': 'QS Limb\nMuscle',
}

all_data["dataset_label"] = all_data["dataset"].apply(
    lambda x: dataset_names.get(x, x))

datasets = [
    'Quake_Smart-seq2_Trachea',
    'Quake_10x_Bladder',
    'Quake_10x_Spleen',
    'Quake_Smart-seq2_Diaphragm',
    'Quake_10x_Limb_Muscle',
    'Quake_Smart-seq2_Limb_Muscle',
    'Romanov',
    'Adam',
    'Muraro',
    'Young',
    'Quake_Smart-seq2_Lung',
    '10X_PBMC',
    'mouse_ES_cell',
    'worm_neuron_cell',
    'mouse_bladder_cell',
]

ordered_datasets = dict(zip(datasets, np.arange(len(datasets))))

all_data["ordered_dataset"] = all_data["dataset"].apply(lambda x: ordered_datasets[x])

In [5]:
all_data[all_data["method"]=='constrastive+KM'].groupby("dataset").mean()

Unnamed: 0_level_0,dropout,ARI,NMI,Silhouette,Calinski,time,order,ordered_dataset
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10X_PBMC,0.9,0.701568,0.730663,0.561358,15187.703619,7.249985,0,11
Adam,0.9,0.82862,0.847876,0.506503,2343.246414,5.285699,0,7
Muraro,0.9,0.911457,0.861285,0.650189,3118.359485,3.68946,0,8
Quake_10x_Bladder,0.9,0.754433,0.803781,0.805675,7433.609912,3.74888,0,1
Quake_10x_Limb_Muscle,0.9,0.986853,0.972461,0.616785,3360.114574,5.132316,0,4
Quake_10x_Spleen,0.9,0.905808,0.794537,0.699799,12509.945131,12.081078,0,2
Quake_Smart-seq2_Diaphragm,0.9,0.981169,0.958502,0.835933,5075.863808,2.530369,0,3
Quake_Smart-seq2_Limb_Muscle,0.9,0.975165,0.957377,0.799499,4059.126962,2.560246,0,5
Quake_Smart-seq2_Lung,0.9,0.594065,0.767208,0.601102,1454.101577,3.251033,0,10
Quake_Smart-seq2_Trachea,0.9,0.859771,0.839941,0.611279,1118.053611,3.458294,0,0


# Run other clustering algorithms

In [6]:
df = pd.read_pickle(f"../output/pickle_results/{category}/{category}_baseline.pkl")

In [9]:
aris = pd.DataFrame(columns= ["Dataset", "Method", "run", "score", "time"])
nmis = pd.DataFrame(columns= ["Dataset", "Method", "run", "score", "time"])
sils = pd.DataFrame(columns= ["Dataset", "Method", "run", "score", "time"])
cals = pd.DataFrame(columns= ["Dataset", "Method", "run", "score", "time"])

def add_scores(Y, pred, embedding, i, t, method="KMeans"):
    score = adjusted_rand_score(Y, pred)
    aris.loc[aris.shape[0]] = [dataset, method, i, score, t]
    score = normalized_mutual_info_score(Y, pred)
    nmis.loc[nmis.shape[0]] = [dataset, method, i, score, t]
    score = silhouette_score(embedding, pred)
    sils.loc[sils.shape[0]] = [dataset, method, i, score, t]
    score = calinski_harabasz_score(embedding, pred)
    cals.loc[cals.shape[0]] = [dataset, method, i, score, t]

In [10]:
for dataset in datasets:

    data_mat = h5py.File(f"{path}/real_data/{dataset}.h5","r")
    Y = np.array(data_mat['Y'])

    cluster_number = np.unique(Y).shape[0]

    features = df[df["dataset"] == dataset]["features"].values

    for i in range(len(features)):


        embedding = features[i]
        t0 = time.time()
        kmeans = KMeans(n_clusters=cluster_number,
                        init="k-means++",
                        random_state=0)
        pred = kmeans.fit_predict(embedding)
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="KMeans")
        

        t0 = time.time()
        pred = utils.run_leiden(embedding, leiden_n_neighbors=300)
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="Leiden")

        t0 = time.time()
        sc = SpectralClustering(cluster_number, n_init=100, assign_labels='discretize')
        pred = sc.fit_predict(embedding)
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="Spectral Clust")

        t0 = time.time()
        ward = AgglomerativeClustering(n_clusters=cluster_number, linkage='ward')
        ward.fit(embedding)
        pred = ward.labels_
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="Ward Hierarchical Clust")

        t0 = time.time()
        gmm = mixture.GaussianMixture(n_components=cluster_number)
        gmm.fit(embedding)
        pred = gmm.predict(embedding)
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="GMM")

        t0 = time.time()
        model = Birch(n_clusters=cluster_number)
        pred = gmm.fit_predict(embedding)
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="Birch")

        t0 = time.time()
        ms = MeanShift(bandwidth=0.2)
        ms.fit(embedding)
        pred = ms.labels_
        t = time.time() -t0
        add_scores(Y, pred, embedding, i, t, method="Mean Shift")

0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2


In [None]:
aris.to_pickle(f"../output/pickle_results/{category}/{category}_compare_aris.pkl")

nmis.to_pickle(f"../output/pickle_results/{category}/{category}_compare_nmis.pkl")

sils.to_pickle(f"../output/pickle_results/{category}/{category}_compare_sils.pkl")

cals.to_pickle(f"../output/pickle_results/{category}/{category}_compare_cals.pkl")

In [14]:
aris.groupby("Method").mean()/0.14

Unnamed: 0_level_0,score,time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1
Birch,5.035656,3.173842
GMM,5.006634,3.291824
KMeans,5.51278,1.032212
Leiden,3.668062,56.716855
Mean Shift,4.65745,100.094442
Spectral Clust,5.341015,5.605539
Ward Hierarchical Clust,5.478617,3.478143


In [None]:
aris = aris.groupby(["Dataset", "Method"])["score"].mean().unstack("Method")[[
    'KMeans', 'Leiden', 'Birch', 'GMM', 'Mean Shift', 'Spectral Clust',
    'Ward Hierarchical Clust'
]].round(2).reset_index()
aris["ordered_dataset"] = aris["Dataset"].apply(lambda x: ordered_datasets[x])
aris.sort_values(by="ordered_dataset").to_excel(
    f"../output/pickle_results/{category}/{category}_compare_aris.xlsx")

In [None]:
nmis = nmis.groupby(["Dataset", "Method"])["score"].mean().unstack("Method")[[
    'KMeans', 'Leiden', 'Birch', 'GMM', 'Mean Shift', 'Spectral Clust',
    'Ward Hierarchical Clust'
]].round(2).reset_index()
nmis["ordered_dataset"] = nmis["Dataset"].apply(lambda x: ordered_datasets[x])
nmis.sort_values(by="ordered_dataset").to_excel(
    f"../output/pickle_results/{category}/{category}_compare_nmis.xlsx")

In [None]:
sils = sils.groupby(["Dataset", "Method"])["score"].mean().unstack("Method")[[
    'KMeans', 'Leiden', 'Birch', 'GMM', 'Mean Shift', 'Spectral Clust',
    'Ward Hierarchical Clust'
]].round(2).reset_index()
sils["ordered_dataset"] = sils["Dataset"].apply(lambda x: ordered_datasets[x])
sils.sort_values(by="ordered_dataset").to_excel(
    f"../output/pickle_results/{category}/{category}_compare_sils.xlsx")

In [None]:
cals = cals.groupby(["Dataset", "Method"])["score"].mean().unstack("Method")[[
    'KMeans', 'Leiden', 'Birch', 'GMM', 'Mean Shift', 'Spectral Clust',
    'Ward Hierarchical Clust'
]].round(2).reset_index()
cals["ordered_dataset"] = cals["Dataset"].apply(lambda x: ordered_datasets[x])
cals.sort_values(by="ordered_dataset").to_excel(
    f"../output/pickle_results/{category}/{category}_compare_cals.xlsx")

In [None]:
aris.mean().round(2)

In [None]:
nmis.mean().round(2)

In [None]:
sils.mean().round(2)

In [None]:
cals.mean().round(2)