Original implementation of the Contrastive-sc method
(https://github.com/ciortanmadalina/contrastive-sc)

In [1]:
import sys
sys.path.append("..")
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
from tqdm.notebook import tqdm
import models
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import st_loss
import time
import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import random
import utils
import loop
import pickle

import train
import os
import glob2
plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [2]:
# Discover the benchmark datasets: every .h5 file directly under {path}{category}/.
path = "../"
category = "real_data"
files = glob2.glob(f'{path}{category}/*.h5')
# Reduce each matched path to its bare dataset name (directory and ".h5"
# suffix removed). Using basename/splitext avoids depending on the exact
# character count of the directory prefix.
files = [os.path.splitext(os.path.basename(f))[0] for f in files]
files

['Quake_Smart-seq2_Trachea',
 'Quake_Smart-seq2_Diaphragm',
 'Quake_10x_Spleen',
 'Young',
 'mouse_ES_cell',
 'Adam',
 'Quake_10x_Bladder',
 'Quake_Smart-seq2_Lung',
 'Quake_10x_Limb_Muscle',
 'worm_neuron_cell',
 'mouse_bladder_cell',
 'Romanov',
 'Quake_Smart-seq2_Limb_Muscle',
 'Muraro',
 '10X_PBMC']

In [3]:
# Reference results used for comparison in the training cells; the location
# follows the same {path}output/pickle_results/{category}/ layout that the
# training cells write their own results to.
sczi = pd.read_pickle(f"{path}output/pickle_results/{category}/{category}_sczi.pkl")

In [4]:
# Baseline experiment: for every dataset, preprocess once and call train.run
# three times (seeds 0, 1, 2), recording the returned metrics plus the
# hyper-parameters and wall-clock time of each run.
df = pd.DataFrame()  # stays an empty frame if `files` is empty
dropout = 0.9
# NOTE(review): the `df.mean()` output saved in this notebook reports
# lr = 1e-05, not 0.4 -- the recorded outputs look stale relative to this
# cell; confirm which value produced the reported scores.
lr = 0.4
layers = [200, 40, 60]
temperature = 0.07
nb_genes = 500
results_file = f"{path}output/pickle_results/{category}/{category}_baseline.pkl"

# One dict per (dataset, run); the frame is rebuilt from this list instead of
# using DataFrame.append, which was removed in pandas 2.0.
records = []
for dataset in files:

    print(f">>>>> Data {dataset}")
    print("SCZI ", sczi[sczi["dataset"] == dataset]["ARI"].mean())

    # Read the file once and close the handle as soon as the arrays are
    # materialised in memory.
    with h5py.File(f"{path}{category}/{dataset}.h5", "r") as data_mat:
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])

    cluster_number = np.unique(Y).shape[0]
    X = train.preprocess(X, nb_genes=nb_genes)

    for run in range(3):
        # Seed every RNG in play so that run `run` is repeatable.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        start = time.time()
        dresults = train.run(X,
                             cluster_number,
                             dataset,
                             Y=Y,
                             nb_epochs=30,
                             lr=lr,
                             temperature=temperature,
                             dropout=dropout,
                             evaluate=False,
                             n_ensemble=1,
                             layers=layers,
                             save_to=f"{path}output/{category}/{run}/",
                             save_pred = True)
        elapsed = time.time()-start

        dresults["temperature"] = temperature
        dresults["dropout"] = dropout
        dresults["lr"] = lr
        dresults["layers"] = str(layers)
        dresults["run"] = run
        dresults["time"] = elapsed
        records.append(dict(dresults))

        # Persist after every run so partial results survive an interruption.
        df = pd.DataFrame(records)
        df.to_pickle(results_file)

>>>>> Data Quake_Smart-seq2_Trachea
SCZI  0.8291128219663276
(1350, 23341) (1350, 23341) keeping 500 genes
>>>>> Data Quake_Smart-seq2_Diaphragm
SCZI  0.9596701189611787
(870, 23341) (870, 23341) keeping 500 genes
>>>>> Data Quake_10x_Spleen
SCZI  0.909534824400645
(9552, 23341) (9552, 23341) keeping 500 genes
>>>>> Data Young
SCZI  0.6629811886776039
(5685, 33658) (5685, 33658) keeping 500 genes
>>>>> Data mouse_ES_cell
SCZI  0.800376096099337
(2717, 24175) (2717, 24175) keeping 500 genes
>>>>> Data Adam
SCZI  0.8634561030635544
(3660, 23797) (3660, 23797) keeping 500 genes
>>>>> Data Quake_10x_Bladder
SCZI  0.983095549108186
(2500, 23341) (2500, 23341) keeping 500 genes
>>>>> Data Quake_Smart-seq2_Lung
SCZI  0.7134055445020913
(1676, 23341) (1676, 23341) keeping 500 genes
>>>>> Data Quake_10x_Limb_Muscle
SCZI  0.9610559204789085
(3909, 23341) (3909, 23341) keeping 500 genes
>>>>> Data worm_neuron_cell
SCZI  0.05077670288621814
(4186, 13488) (4186, 13488) keeping 500 genes
>>>>> Data 

In [5]:
# Average every numeric column over all datasets and runs. `numeric_only=True`
# skips the string columns (e.g. "layers"), which would otherwise raise a
# TypeError on pandas >= 2.0.
df.mean(numeric_only=True)

dropout         0.900000
kmeans_ari_0    0.736921
kmeans_nmi_0    0.765052
leiden_ari_0    0.314922
leiden_nmi_0    0.598043
lr              0.000010
run             1.000000
temperature     0.070000
time            3.072331
dtype: float64

# Train on CPU

In [None]:
# CPU variant of the baseline experiment: same loop as the baseline cell, but
# train.run is called with use_cpu=True and results go to a separate
# "*_baseline_cpu.pkl" file.
df = pd.DataFrame()  # stays an empty frame if `files` is empty
dropout = 0.9
lr = 0.4
# NOTE(review): the earlier baseline cell (the one without use_cpu) uses
# layers = [200, 40, 60]; here the last two sizes are swapped. Confirm this
# is intentional -- otherwise the two sets of results are not comparable.
layers = [200, 60, 40]
temperature = 0.07
nb_genes = 500
results_file = f"{path}output/pickle_results/{category}/{category}_baseline_cpu.pkl"

# One dict per (dataset, run); the frame is rebuilt from this list instead of
# using DataFrame.append, which was removed in pandas 2.0.
records = []
for dataset in files:

    print(f">>>>> Data {dataset}")
    print("SCZI ", sczi[sczi["dataset"] == dataset]["ARI"].mean())

    # Read the file once and close the handle as soon as the arrays are
    # materialised in memory.
    with h5py.File(f"{path}{category}/{dataset}.h5", "r") as data_mat:
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])

    cluster_number = np.unique(Y).shape[0]
    X = train.preprocess(X, nb_genes=nb_genes)

    for run in range(3):
        # Seed every RNG in play so that run `run` is repeatable.
        torch.manual_seed(run)
        torch.cuda.manual_seed_all(run)
        np.random.seed(run)
        random.seed(run)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        start = time.time()
        dresults = train.run(X,
                             cluster_number,
                             dataset,
                             Y=Y,
                             nb_epochs=30,
                             lr=lr,
                             temperature=temperature,
                             dropout=dropout,
                             evaluate=False,
                             n_ensemble=1,
                             layers=layers,
                             save_to=f"{path}output/{category}/{run}/",
                             save_pred = True,
                             use_cpu = True
                             )
        elapsed = time.time()-start

        dresults["temperature"] = temperature
        dresults["dropout"] = dropout
        dresults["lr"] = lr
        dresults["layers"] = str(layers)
        dresults["run"] = run
        dresults["time"] = elapsed
        records.append(dict(dresults))

        # Persist after every run so partial results survive an interruption.
        df = pd.DataFrame(records)
        df.to_pickle(results_file)