Original implementation of the contrastive-sc method
(https://github.com/ciortanmadalina/contrastive-sc)

In [1]:
import sys
sys.path.append("..")
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
from tqdm.notebook import tqdm
import models
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import st_loss

import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import random
import utils

import pickle

import train
import os
import glob2
# Switch matplotlib to interactive mode so figures render without blocking the kernel.
plt.ion()
plt.show()
# Reload imported modules before each cell runs, so edits to the local
# project modules imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Hide FutureWarning messages to keep the cell outputs readable.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [5]:
# Root of the repository relative to this notebook.
path = "../"
# Dataset names are the base names of the .h5 files, without directory or
# extension. os.path is used instead of slicing by a hard-coded prefix length,
# which only worked because a stray quote character in the prefix string
# happened to stand in for the path separator.
files = [
    os.path.splitext(os.path.basename(h5_path))[0]
    for h5_path in glob2.glob(f'{path}real_data/*.h5')
]


In [6]:
# Reference results used for comparison in the sweep below; the loaded frame
# is expected to have "dataset" and "ARI" columns (both are read there).
# The location is built from `path` so it follows the same root as every other
# input/output in this notebook.
# NOTE: unpickling can execute arbitrary code - only load files you produced
# yourself or otherwise trust.
sczi = pd.read_pickle(f"{path}output/pickle_results/real_data/real_data_sczi.pkl")

In [7]:
# Sweep over every dataset, three gene-selection sizes and three seeded runs,
# training with a fixed set of hyper-parameters and recording the scores
# returned by train.run for each combination.
lr = 0.4
layers = [200, 40, 60]
dropout = 0.9
temperature = 0.07
results_file = f"{path}output/pickle_results/real_data/real_data_dataset_tuning.pkl"

# One record per (dataset, nb_genes, run). Rows are gathered in a list and the
# frame is rebuilt from it, because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
records = []
df = pd.DataFrame()
for dataset in files:

    print(f">>>>> Data {dataset}")
    print("SCZI ", sczi[sczi["dataset"] == dataset]["ARI"].mean())

    # Read the raw matrices once per dataset; the context manager guarantees
    # the HDF5 handle is closed (np.array copies the data into memory first).
    with h5py.File(f"{path}real_data/{dataset}.h5", "r") as data_mat:
        X_raw = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])
    cluster_number = np.unique(Y).shape[0]
    # Fraction of exactly-zero entries in the raw matrix.
    sparsity = np.count_nonzero(X_raw == 0) / X_raw.size

    for nb_genes in [500, 1500, 5000]:
        # A copy is passed so every gene setting starts from the untouched raw
        # matrix even if preprocess modifies its argument (not visible here).
        X = train.preprocess(X_raw.copy(), nb_genes=nb_genes)

        for run in range(3):
            # Seed every random number generator involved so that a given
            # run index is repeatable.
            torch.manual_seed(run)
            torch.cuda.manual_seed_all(run)
            np.random.seed(run)
            random.seed(run)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            dresults = train.run(X,
                                 cluster_number,
                                 dataset,
                                 Y=Y,
                                 nb_epochs=30,
                                 lr=lr,
                                 temperature=temperature,
                                 dropout=dropout,
                                 layers=layers,
                                 save_to=f"{path}output/real_data/{run}/",
                                 save_pred=False)
            # Attach the sweep coordinates to the scores returned by train.run.
            dresults["nb_genes"] = nb_genes
            dresults["sparsity"] = sparsity
            dresults["layers"] = str(layers)
            dresults["run"] = run
            print(".", end="")
            print(f"# {nb_genes},",
                  dresults.get('COMBINED_kmeans_ari', ""),
                  dresults.get('COMBINED_leiden_ari', ""), dresults.get('kmeans_ari_0', ""),
                  dresults.get('leiden_ari_0', ""))
            records.append(dresults)

            # Checkpoint after every run so an interrupted sweep keeps the
            # results gathered so far, both in `df` and on disk.
            df = pd.DataFrame(records)
            df.to_pickle(results_file)

>>>>> Data Quake_Smart-seq2_Trachea
SCZI  0.8291128219663276
(1350, 23341) (1350, 23341) keeping 500 genes
.# 500,   0.8757419746828033 0.18813728466297122
.# 500,   0.8231397433913237 0.1741258162593457
.# 500,   0.8804301360194373 0.17054375955346077
(1350, 23341) (1350, 23341) keeping 1500 genes
.# 1500,   0.5275471490316784 0.16946955471249087
.# 1500,   0.4529149945565235 0.1546898734564224
.# 1500,   0.4918390743487336 0.16857120215330257
(1350, 23341) (1350, 23341) keeping 5000 genes
.# 5000,   0.4986300556084495 0.14890611196743486
.# 5000,   0.5024482696221724 0.19068850739353999
.# 5000,   0.5544586785197373 0.16692622097713075
>>>>> Data Quake_Smart-seq2_Diaphragm
SCZI  0.9596701189611787
(870, 23341) (870, 23341) keeping 500 genes
.# 500,   0.9697968135271604 0.3499212283088658
.# 500,   0.9660890110636096 0.38953113201131523
.# 500,   0.966116503913852 0.28111962100728
(870, 23341) (870, 23341) keeping 1500 genes
.# 1500,   0.9835434095473783 0.3229441585967147
.# 1500,   

.# 5000,   0.6724514014378952 0.2669861350909371
>>>>> Data Muraro
SCZI  0.7246271034758398
(2122, 19046) (2122, 19046) keeping 500 genes
.# 500,   0.8522487093977664 0.3774794154209196
.# 500,   0.8610519908167902 0.37833295995145977
.# 500,   0.878946946231079 0.372726984708388
(2122, 19046) (2122, 19046) keeping 1500 genes
.# 1500,   0.7154749950615965 0.3635358407415918
.# 1500,   0.7066556659921578 0.3468633332252709
.# 1500,   0.6948823714718341 0.35786562488227935
(2122, 19046) (2122, 19046) keeping 5000 genes
.# 5000,   0.6171174161604415 0.34779316419314044
.# 5000,   0.6411539666004095 0.36854845584366897
.# 5000,   0.6449578107860243 0.31559097586518464
>>>>> Data 10X_PBMC
SCZI  0.5845649506030623
(4271, 16653) (4271, 16653) keeping 500 genes
.# 500,   0.7000465730202853 0.38195140087809054
.# 500,   0.7012140678789418 0.39608507554978317
.# 500,   0.703442341359809 0.44127981801223143
(4271, 16653) (4271, 16653) keeping 1500 genes
.# 1500,   0.7712082388250349 0.42995238583

In [15]:
# Average the k-means ARI and the sparsity over the runs of each
# (dataset, nb_genes) pair, then pivot nb_genes into columns.
# Note: this rebinds `df`, so the cell cannot be re-run without first
# re-creating the raw results frame.
per_setting = df.groupby(["dataset", "nb_genes"])
run_means = per_setting[["kmeans_ari_0", "sparsity"]].mean()
df = run_means.unstack("nb_genes").reset_index()
df

Unnamed: 0_level_0,dataset,kmeans_ari_0,kmeans_ari_0,kmeans_ari_0,sparsity,sparsity,sparsity
nb_genes,Unnamed: 1_level_1,500.0,1500.0,5000.0,500.0,1500.0,5000.0
0,10X_PBMC,0.701568,0.766675,0.801425,0.922369,0.922369,0.922369
1,Adam,0.82862,0.781529,0.71497,0.923276,0.923276,0.923276
2,Muraro,0.864083,0.705671,0.63441,0.730166,0.730166,0.730166
3,Quake_10x_Bladder,0.754433,0.753236,0.724299,0.869397,0.869397,0.869397
4,Quake_10x_Limb_Muscle,0.986853,0.979232,0.780618,0.935695,0.935695,0.935695
5,Quake_10x_Spleen,0.905808,0.322492,0.320945,0.943383,0.943383,0.943383
6,Quake_Smart-seq2_Diaphragm,0.967334,0.982227,0.974243,0.913524,0.913524,0.913524
7,Quake_Smart-seq2_Limb_Muscle,0.97321,0.976348,0.671116,0.894683,0.894683,0.894683
8,Quake_Smart-seq2_Lung,0.706464,0.532702,0.424443,0.890813,0.890813,0.890813
9,Quake_Smart-seq2_Trachea,0.859771,0.490767,0.518512,0.854845,0.854845,0.854845


In [17]:
# Replace the two-level column index with flat labels (assigned by position),
# discard the two extra sparsity columns and index the table by dataset.
flat_labels = ["dataset", "500", "1500", "5000", "sparsity", "s1", "s2"]
df.columns = flat_labels

df = df.drop(columns=["s1", "s2"]).set_index("dataset")

df