In [3]:
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics

import torch
import torch.nn as nn
import copy
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import pickle
import os
import glob2
plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [9]:

files = glob2.glob('../real_data/*.h5')
files = [f[len("../real_data/"):-3] for f in files]
files

['10X_PBMC_select_2100',
 'mouse_ES_cell',
 'worm_neuron_cell_select_2100',
 'worm_neuron_cell',
 'mouse_bladder_cell',
 'mouse_ES_cell_select_2100',
 'mouse_bladder_cell_select_2100',
 '10X_PBMC']

In [13]:
df = pd.DataFrame(columns = ["dataset", "scanpy", "run"])
for run in range(1):
    print(df.shape)
    for dataset in files:
        print(f">>>>> Data {dataset}")

        data_mat = h5py.File(f"{path}real_data/{dataset}.h5", "r")
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])

        
        print(np.where(X ==0)[0].shape[0]/(X.shape[0]*X.shape[1]))
        X = np.ceil(X).astype(np.int)
        adata = sc.AnnData(X)
        adata.obs['Group'] = Y
        adata.var_names_make_unique()

        sc.pp.filter_genes(adata, min_cells=3)

        sc.pp.normalize_total(adata, target_sum=1e4)

        sc.pp.log1p(adata)

        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

        adata = adata[:, adata.var.highly_variable]

        sc.pp.scale(adata, max_value=10)

        sc.tl.pca(adata, svd_solver='arpack')

        sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

        sc.tl.umap(adata)

        sc.tl.leiden(adata)

        pred = adata.obs['leiden'].to_list()
        pred = [int(x) for x in pred]
        ari = adjusted_rand_score(Y, pred)
        df.loc[df.shape[0]] = [dataset, ari, run]
        print(f"ARI {ari}")
        df.to_pickle(f"{path}output/pickle_results/real_data_scanpy.pkl")

(0, 3)
>>>>> Data 10X_PBMC_select_2100
0.922900120956327


  view_to_actual(adata)


ARI 0.5758411955706193
>>>>> Data mouse_ES_cell
0.6576444836391497


  view_to_actual(adata)


ARI 0.7762598459016067
>>>>> Data worm_neuron_cell_select_2100
0.9861758600237248


  view_to_actual(adata)


ARI 0.42085409683332264
>>>>> Data worm_neuron_cell
0.9861813427688408


  view_to_actual(adata)


ARI 0.4051744668033945
>>>>> Data mouse_bladder_cell
0.9486727054455071


  view_to_actual(adata)


ARI 0.6390342857136783
>>>>> Data mouse_ES_cell_select_2100
0.6597349879351948


  view_to_actual(adata)


ARI 0.7732152655085611
>>>>> Data mouse_bladder_cell_select_2100
0.9489742437855645


  view_to_actual(adata)


ARI 0.6162157167348896
>>>>> Data 10X_PBMC
0.9223690984556294


  view_to_actual(adata)


ARI 0.5581793348370953


In [11]:
df

Unnamed: 0,dataset,scanpy,run
0,10X_PBMC_select_2100,0.575841,0
1,mouse_ES_cell,0.77626,0
2,worm_neuron_cell_select_2100,0.420854,0
3,worm_neuron_cell,0.405174,0
4,mouse_bladder_cell,0.639034,0
5,mouse_ES_cell_select_2100,0.773215,0
6,mouse_bladder_cell_select_2100,0.616216,0
7,10X_PBMC,0.558179,0


In [12]:
df.mean()

scanpy    0.595597
run       0.000000
dtype: float64