Original implementation of the Contrastive-sc method
(https://github.com/ciortanmadalina/contrastive-sc)

In [1]:
import sys
sys.path.append("..")
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
from tqdm.notebook import tqdm
import models
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import st_loss
import time
import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import random
import utils

import pickle

import train
import os
import glob2
plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [3]:
# Only the computational cost is of interest in this analysis, so a single
# simulated dataset is used because it is easy to reshape.
category = "balanced_data"
dataset = 'data_-1c4'
path = "../"

In [5]:
# Scalability with respect to the number of cells.
# The preprocessed matrix is replicated `factor` times along the cell axis and
# the full training + clustering pipeline is timed for 5 seeded runs per size.
dropout = 0.9
lr = 0.4
layers = [200, 40, 60]
temperature = 0.07
nb_genes = 500

# Loading + preprocessing is timed once and added to every run's totals below.
t0 = time.time()
# The context manager closes the HDF5 handle as soon as both arrays have been
# copied into memory (np.array materialises the datasets).
with h5py.File(f"{path}R/simulated_data/{category}/{dataset}.h5", "r") as data_mat:
    X_o = np.array(data_mat['X'])
    Y_o = np.array(data_mat['Y'])
cluster_number = np.unique(Y_o).shape[0]

X_o = train.preprocess(X_o, nb_genes=nb_genes)
preprocess_time = time.time() - t0

# Rows are collected in a list and turned into a DataFrame once per checkpoint:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas >= 2.0.
records = []
for factor in [50, 30, 20, 10, 5, 1]:
    # Stack `factor` copies of the data along the first (cell) axis.
    X = np.tile(X_o.T, factor).T
    Y = np.tile(Y_o.T, factor).T
    print(">> ",X_o.shape, X.shape)
    for run in range(5):
        # Seed every RNG in use so that run `run` is repeatable.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # `start` must come from time.time(): it is subtracted from values
        # returned by train.run (presumably time.time() stamps - TODO confirm).
        start = time.time()
        dresults = train.run(X,
                             cluster_number,
                             dataset,
                             Y=Y,
                             nb_epochs=30,
                             lr=lr,
                             temperature=temperature,
                             dropout=dropout,
                             layers=layers,
                             save_to=f"{path}output/{category}/{run}/",
                             save_pred = False,
                             leiden_n_neighbors = 40)

        # Convert the stamps returned by train.run into durations that include
        # the shared preprocessing cost.
        # NOTE(review): "t_clust" itself is stored unchanged, i.e. not as a duration.
        time_k = (dresults["t_k"] - start) + preprocess_time
        time_l = ((dresults["t_clust"] - start)
                  + (dresults["t_l"] - dresults["t_k"])
                  + preprocess_time)
        dresults["t_training"] = dresults["t_k"] - start
        dresults["t_k"] = time_k
        dresults["t_l"] = time_l
        dresults["nb_cells"] = X.shape[0]
        print(".", end = "")
        records.append(dresults)

    # Checkpoint after every size so partial results survive an interruption.
    df = pd.DataFrame(records)
    df.to_pickle(f"{path}output/pickle_results/{category}/{category}_scalability_cells.pkl")

(1000, 2500) (1000, 2500) keeping 500 genes
>>  (1000, 500) (50000, 500)
.....>>  (1000, 500) (30000, 500)
.....>>  (1000, 500) (20000, 500)
.....>>  (1000, 500) (10000, 500)
.....>>  (1000, 500) (5000, 500)
.....>>  (1000, 500) (1000, 500)
.....

In [6]:
# Scalability with respect to the number of input features (genes).
# The preprocessed matrix is replicated `factor` times along the gene axis while
# the cells and their labels stay unchanged; each size is timed for 5 seeded runs.
dropout = 0.9
lr = 0.4
layers = [200, 40, 60]
temperature = 0.07
nb_genes = 500

# Loading + preprocessing is timed once and added to every run's totals below.
t0 = time.time()
# The context manager closes the HDF5 handle as soon as both arrays have been
# copied into memory (np.array materialises the datasets).
with h5py.File(f"{path}R/simulated_data/{category}/{dataset}.h5", "r") as data_mat:
    X_o = np.array(data_mat['X'])
    Y_o = np.array(data_mat['Y'])
cluster_number = np.unique(Y_o).shape[0]

X_o = train.preprocess(X_o, nb_genes=nb_genes)
preprocess_time = time.time() - t0

# Rows are collected in a list and turned into a DataFrame once per checkpoint:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas >= 2.0.
records = []
for factor in [500, 200, 100, 50, 10, 5, 1]:
    # Repeat the columns `factor` times: same cells, `factor` x more features.
    X = np.tile(X_o, factor)
    Y = Y_o
    print(">> ",X_o.shape, X.shape)
    for run in range(5):
        # Seed every RNG in use so that run `run` is repeatable.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # `start` must come from time.time(): it is subtracted from values
        # returned by train.run (presumably time.time() stamps - TODO confirm).
        start = time.time()
        dresults = train.run(X,
                             cluster_number,
                             dataset,
                             Y=Y,
                             nb_epochs=30,
                             lr=lr,
                             temperature=temperature,
                             dropout=dropout,
                             layers=layers,
                             save_to=f"{path}output/{category}/{run}/",
                             save_pred = False)

        # Convert the stamps returned by train.run into durations that include
        # the shared preprocessing cost.
        # NOTE(review): "t_clust" itself is stored unchanged, i.e. not as a duration.
        time_k = (dresults["t_k"] - start) + preprocess_time
        time_l = ((dresults["t_clust"] - start)
                  + (dresults["t_l"] - dresults["t_k"])
                  + preprocess_time)
        dresults["t_training"] = dresults["t_k"] - start
        dresults["t_k"] = time_k
        dresults["t_l"] = time_l
        dresults["nb_cells"] = X.shape[0]
        dresults["nb_genes"] = X.shape[1]
        print(".", end = "")
        records.append(dresults)

    # Checkpoint after every size so partial results survive an interruption.
    df = pd.DataFrame(records)
    df.to_pickle(f"{path}output/pickle_results/{category}/{category}_scalability_genes.pkl")

(1000, 2500) (1000, 2500) keeping 500 genes
>>  (1000, 500) (1000, 250000)
.....>>  (1000, 500) (1000, 100000)
.....>>  (1000, 500) (1000, 50000)
.....>>  (1000, 500) (1000, 25000)
.....>>  (1000, 500) (1000, 5000)
.....>>  (1000, 500) (1000, 2500)
.....>>  (1000, 500) (1000, 500)
.....

In [7]:
# Average every numeric metric/timing column over the runs of each feature count.
# numeric_only is passed explicitly: recent pandas versions no longer drop
# non-numeric columns silently and would raise if the results contain any.
# NOTE(review): the t_clust mean (~1.6e9) looks like a raw epoch timestamp, not
# a duration - confirm before using that column.
df.groupby("nb_genes").mean(numeric_only=True)

Unnamed: 0_level_0,kmeans_ari,kmeans_cal,kmeans_nmi,kmeans_sil,leiden_ari,leiden_cal,leiden_nmi,leiden_sil,nb_cells,t_clust,t_k,t_l,t_training,time
nb_genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
500.0,0.700331,630.182312,0.699512,0.278299,0.716286,621.746168,0.718366,0.281126,1000.0,1613316000.0,1.089951,2.942203,0.798354,0.690486
2500.0,0.940639,664.807478,0.915589,0.410332,0.940678,664.009766,0.91762,0.41001,1000.0,1613316000.0,1.149828,2.814722,0.858231,0.791008
5000.0,0.96194,340.011301,0.939396,0.299179,0.955391,339.750202,0.930035,0.299087,1000.0,1613316000.0,1.296803,2.940781,1.005205,0.934957
25000.0,0.924367,92.995454,0.888234,0.112081,0.909126,92.377628,0.867734,0.111366,1000.0,1613316000.0,2.692016,4.41179,2.400418,2.308481
50000.0,0.907294,74.99689,0.865673,0.09261,0.890466,74.334633,0.84716,0.091704,1000.0,1613316000.0,4.82678,6.651657,4.535182,4.431003
100000.0,0.875842,63.67233,0.830061,0.078651,0.862454,62.956683,0.813616,0.077615,1000.0,1613316000.0,9.241756,10.961998,8.950158,8.828255
250000.0,0.77285,56.158375,0.715863,0.067725,0.723683,52.499865,0.66238,0.062086,1000.0,1613316000.0,19.036252,20.853013,18.744654,18.617941
