In [1]:
import argparse
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn import metrics
import torch
import torch.nn as nn
import copy
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import h5py
import scipy as sp
import scanpy.api as sc
from collections import Counter
import pickle
import os
from numpy.random import seed
import random
plt.ion()
plt.show()
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [16]:
import glob2
category = "imbalanced_data"#"balanced_data"
files = glob2.glob(f'../R/simulated_data/{category}/*.h5')
files = [f[len(f"{path}R/simulated_data/{category}/"):-3] for f in files]
files

['data_1c8',
 'data_-1c4',
 'data_-1c8',
 'data_0c4',
 'data_0c8',
 'data_0c16',
 'data_1.5c4',
 'data_1c4',
 'data_1.5c8',
 'data_1.5c16',
 'data_-1c16',
 'data_1c16']

In [17]:
df = pd.DataFrame(columns = ["dataset", "scanpy", "run"])
for run in range(3):
    seed(run)
    random.seed(run)
    print(df.shape)
    for dataset in files:
        print(f">>>>> Data {dataset}")

        data_mat = h5py.File(f"{path}R/simulated_data/{category}/{dataset}.h5", "r")
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])

        
        print(np.where(X ==0)[0].shape[0]/(X.shape[0]*X.shape[1]))
        X = np.ceil(X).astype(np.int)
        adata = sc.AnnData(X)
        adata.obs['Group'] = Y
        adata.var_names_make_unique()

        sc.pp.filter_genes(adata, min_cells=3)

        sc.pp.normalize_total(adata, target_sum=1e4)

        sc.pp.log1p(adata)

        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

        adata = adata[:, adata.var.highly_variable]

        sc.pp.scale(adata, max_value=10)

        sc.tl.pca(adata, svd_solver='arpack', random_state=run)

        sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40, random_state=run)

        sc.tl.umap(adata, random_state=run)

        sc.tl.leiden(adata, random_state=run)

        pred = adata.obs['leiden'].to_list()
        pred = [int(x) for x in pred]
        ari = adjusted_rand_score(Y, pred)
        df.loc[df.shape[0]] = [dataset, ari, run]
        print(f"ARI {ari}")

        df.to_pickle(f"{path}output/pickle_results/{category}/{category}_scanpy.pkl")

(0, 3)
>>>>> Data data_1c8
0.4561709333333333


  view_to_actual(adata)


ARI 0.167231751761025
>>>>> Data data_-1c4
0.2867932


  view_to_actual(adata)


ARI 0.6601036759577463
>>>>> Data data_-1c8
0.28635706666666666


  view_to_actual(adata)


ARI 0.7089265073365163
>>>>> Data data_0c4
0.3523788


  view_to_actual(adata)


ARI 0.5214156579275147
>>>>> Data data_0c8
0.3519428


  view_to_actual(adata)


ARI 0.5675564237385652
>>>>> Data data_0c16
0.3519462666666667


  view_to_actual(adata)


ARI 0.09398551895002619
>>>>> Data data_1.5c4
0.5219972


  view_to_actual(adata)


ARI 0.09437655215850463
>>>>> Data data_1c4
0.4567536


  view_to_actual(adata)


ARI 0.17521370578986556
>>>>> Data data_1.5c8
0.5210738666666667


  view_to_actual(adata)


ARI 0.05167537527142556
>>>>> Data data_1.5c16
0.5213713333333333


  view_to_actual(adata)


ARI 0.00792048465339417
>>>>> Data data_-1c16
0.28630053333333333


  view_to_actual(adata)


ARI 0.3618256604472812
>>>>> Data data_1c16
0.4562661333333333


  view_to_actual(adata)


ARI 0.010224492589485448
(12, 3)
>>>>> Data data_1c8
0.4561709333333333


  view_to_actual(adata)


ARI 0.1587538730873152
>>>>> Data data_-1c4
0.2867932


  view_to_actual(adata)


ARI 0.6699729221490228
>>>>> Data data_-1c8
0.28635706666666666


  view_to_actual(adata)


ARI 0.7250825392764871
>>>>> Data data_0c4
0.3523788


  view_to_actual(adata)


ARI 0.4751837056741168
>>>>> Data data_0c8
0.3519428


  view_to_actual(adata)


ARI 0.565972585638398
>>>>> Data data_0c16
0.3519462666666667


  view_to_actual(adata)


ARI 0.10635282906018595
>>>>> Data data_1.5c4
0.5219972


  view_to_actual(adata)


ARI 0.06377298940337543
>>>>> Data data_1c4
0.4567536


  view_to_actual(adata)


ARI 0.16369189535813286
>>>>> Data data_1.5c8
0.5210738666666667


  view_to_actual(adata)


ARI 0.04286761096011328
>>>>> Data data_1.5c16
0.5213713333333333


  view_to_actual(adata)


ARI 0.009091597589725938
>>>>> Data data_-1c16
0.28630053333333333


  view_to_actual(adata)


ARI 0.3697184579579826
>>>>> Data data_1c16
0.4562661333333333


  view_to_actual(adata)


ARI 0.008667993401556766
(24, 3)
>>>>> Data data_1c8
0.4561709333333333


  view_to_actual(adata)


ARI 0.14762171414997735
>>>>> Data data_-1c4
0.2867932


  view_to_actual(adata)


ARI 0.6531517656475629
>>>>> Data data_-1c8
0.28635706666666666


  view_to_actual(adata)


ARI 0.7285312986051056
>>>>> Data data_0c4
0.3523788


  view_to_actual(adata)


ARI 0.5102880279609546
>>>>> Data data_0c8
0.3519428


  view_to_actual(adata)


ARI 0.5559914158414436
>>>>> Data data_0c16
0.3519462666666667


  view_to_actual(adata)


ARI 0.10061332489445321
>>>>> Data data_1.5c4
0.5219972


  view_to_actual(adata)


ARI 0.08187154333074262
>>>>> Data data_1c4
0.4567536


  view_to_actual(adata)


ARI 0.16961929219111457
>>>>> Data data_1.5c8
0.5210738666666667


  view_to_actual(adata)


ARI 0.03754636795550212
>>>>> Data data_1.5c16
0.5213713333333333


  view_to_actual(adata)


ARI 0.007195560834480956
>>>>> Data data_-1c16
0.28630053333333333


  view_to_actual(adata)


ARI 0.36145085656687065
>>>>> Data data_1c16
0.4562661333333333


  view_to_actual(adata)


ARI 0.01194581210972369


In [18]:
df.sort_values(by ="dataset")

Unnamed: 0,dataset,scanpy,run
22,data_-1c16,0.369718,1
34,data_-1c16,0.361451,2
10,data_-1c16,0.361826,0
13,data_-1c4,0.669973,1
1,data_-1c4,0.660104,0
25,data_-1c4,0.653152,2
14,data_-1c8,0.725083,1
26,data_-1c8,0.728531,2
2,data_-1c8,0.708927,0
17,data_0c16,0.106353,1


In [15]:
df.mean()

scanpy    0.284071
run       1.000000
dtype: float64