In [1]:
import sys
import time
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix, issparse
from scipy.stats import iqr
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from torch.utils.data import DataLoader
from functools import wraps

import subprocess
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch


In [3]:
def get_disk_size(directory):
    result = subprocess.run(['du', '-sb', directory], stdout=subprocess.PIPE, text=True)
    size_in_bytes = int(result.stdout.split()[0])
    return size_in_bytes
def timeit(method):
    @wraps(method)
    def timed(*args, **kwargs):
        start_time = time.time()
        result = method(*args, **kwargs)
        end_time = time.time()
        print(f"Method {method.__name__} took {end_time - start_time:.4f} seconds")
        return result
    return timed

def time_all_methods(cls):
    for attr_name, attr_value in cls.__dict__.items():
        if callable(attr_value):  # Check if the attribute is a method
            setattr(cls, attr_name, timeit(attr_value))
    return cls


In [8]:
@time_all_methods
class AnnDataMetrics:
    def __init__(self, adatapath):
        self.adatapath = adatapath
    def load_backed(self):
        self.adata_backed = ad.read_h5ad(self.adatapath, backed=True)

    def load_whole(self):
        self.adata = ad.read_h5ad(self.adatapath, backed=False)

    def max(self):
        return self.adata.X.max()

    def min(self):
        return self.adata.X.min()

    def mean(self):
        return self.adata.X.mean()

    def median(self):
        return np.median(self.adata.X)

    def interQuartRange(self):
        return iqr(self.adata.X)

    def num_values(self):
        return self.adata.X.shape[0] * self.adata.X.shape[1]

    def sparsity_stats(self):
        num_non_zero = 0
        # Get the number of non-zero values
        if issparse(self.adata.X):
            num_non_zero = self.adata.X.nnz
        else:
            num_non_zero = (self.adata.X.round() != 0).sum()
            
        num_vals = self.num_values(self.adata)
        num_zeros = num_vals - num_non_zero
        sparsity = num_zeros / num_vals
        
        return num_zeros, num_non_zero, sparsity

    def size_disk_bytes(self):
        return get_disk_size(self.adatapath)
    def size_adata_bytes(self): 
        return sys.getsizeof(self.adata)
    def size_adata_backed_bytes(self): 
        return sys.getsizeof(self.adata_backed)

    def random_rows_whole(self, random_samples = 10_000):
        L = self.adata.X.shape[0]     
        rIdx = np.sort(np.random.choice(L, size=(random_samples,), replace=True))
        x = self.adata[rIdx, :].X
        return x
    def random_rows_backed(self, random_samples = 10_000):
        L = self.adata_backed.X.shape[0]     
        rIdx = np.sort(np.random.choice(L, size=(random_samples,), replace=True))
        x = self.adata[rIdx, :].X
        return x
    def random_values_whole(self, random_samples = 10_000):
        rows = self.adata.X.shape[0]
        cols = self.adata.X.shape[1]
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.adata.X[r,c] for r,c in zip(rIdx, cIdx)]
    
    def random_values_backed(self, random_samples = 10_000):
        rows = self.adata.X.shape[0]
        cols = self.adata.X.shape[0]
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.adata_backed.X[r,c] for r,c in zip(rIdx, cIdx)]


In [13]:
@time_all_methods
class SCDLMetrics:
    def __init__(self, adatapath, memmap_dir):
        self.adatapath = adatapath
        self.memmap_dir = memmap_dir
    
    def create_from_adata(self):
        self.first_ds = SingleCellMemMapDataset(self.memmap_dir, self.adatapath)
    def save(self):
        self.first_ds.save()
    def load_backed(self):
        self.ds = SingleCellMemMapDataset(self.memmap_dir)
    def max(self):
        return np.max(self.ds.data)

    def min(self):
        return np.min(self.ds.data)

    def mean(self):
        return np.mean(self.ds.data)

    def median(self):
        return np.median(self.ds.data)

    def interQuartRange(self):
        return iqr(self.ds.data)

    def num_values(self):
        return self.ds.number_of_values()

    def sparsity_stats(self):
        return self.ds.sparsity()
    
    
    def size_disk_bytes(self):
        return get_disk_size(self.memmap_dir)
    def size_mem_dataset_bytes(self): 
        return sys.getsizeof(self.ds)

    def random_rows(self, random_samples = 10_000):
        L = self.ds.number_of_rows()   
        rIdx = np.sort(np.random.choice(L, size=(random_samples), replace=True))
        return [self.ds[v] for v in rIdx]
    
    def random_values(self, random_samples = 10_000):
        rows = self.ds.number_of_rows() 
        cols = self.ds.shape()[1][0]  
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.ds.get_row_column(r,c) for r,c in zip(rIdx, cIdx)]

    def iterate_dl(self, batch_size = 8):
        model = lambda x : x

        dataloader = DataLoader(self.ds, batch_size=batch_size, shuffle=True, collate_fn=collate_sparse_matrix_batch)
        n_epochs = 1
        for e in range(n_epochs):
            for batch in dataloader:
                model(batch)


In [9]:
anndatapath = "examples/hdf5s/9da4d19f-f6ac-4bf0-a47e-2935b1164569.h5ad"
anndata_m = AnnDataMetrics(anndatapath)
anndata_m.load_backed()
anndata_m.load_whole()
anndata_m.max()
anndata_m.min()
anndata_m.mean()
print(f"Disk size: {anndata_m.size_disk_bytes()/(1_024**2)} MB")
print(f"Adata size: {anndata_m.size_adata_bytes()/(1_024**2)} MB")
print(f"Adata size backed: {anndata_m.size_adata_backed_bytes()/(1_024**2)} MB")



Method __init__ took 0.0000 seconds
Method load_backed took 1.0470 seconds
Method load_whole took 38.1525 seconds
Method max took 0.3304 seconds
Method min took 0.1310 seconds
Method mean took 4.8601 seconds
Method size_disk_bytes took 0.0031 seconds
Disk size: 5768.38676071167 MB
Method size_adata_bytes took 0.0639 seconds
Adata size: 5795.636876106262 MB
Method size_adata_backed_bytes took 0.0612 seconds
Adata size backed: 207.6631441116333 MB


In [47]:
#print(len(anndata_m.adata.obs))
print(anndata_m.adata.X.nnz)
print(anndata_m.adata.X.shape)
print(len(anndata_m.adata.X.data))
print(len(anndata_m.adata.X.indices))
print(len(anndata_m.adata.X.indptr))


732248790
(356213, 18024)
732248790
732248790
356214


In [21]:
anndata_m.random_rows_whole()
rows = anndata_m.random_rows_backed()


Method random_rows_whole took 0.0996 seconds
Method random_rows_backed took 0.0642 seconds
<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 20490362 stored elements and shape (10000, 18024)>
  Coords	Values
  (0, 10)	1.2342560291290283
  (0, 14)	0.6766462326049805
  (0, 20)	0.8552476763725281
  (0, 21)	2.3526313304901123
  (0, 23)	0.5917707085609436
  (0, 25)	0.7402293086051941
  (0, 43)	3.662249803543091
  (0, 49)	0.9115815162658691
  (0, 74)	1.364393949508667
  (0, 78)	0.5674358606338501
  (0, 87)	1.489476203918457
  (0, 90)	2.027841329574585
  (0, 114)	2.5541796684265137
  (0, 115)	1.8312854766845703
  (0, 128)	1.532244086265564
  (0, 130)	0.5525400638580322
  (0, 131)	1.5425962209701538
  (0, 139)	3.0524656772613525
  (0, 141)	0.582703173160553
  (0, 146)	1.2656664848327637
  (0, 159)	0.7641072273254395
  (0, 163)	0.9830567836761475
  (0, 174)	1.135430932044983
  (0, 183)	1.8649704456329346
  (0, 184)	0.9066843390464783
  :	:
  (9999, 17759)	0.4280794560909271
  (9999,

In [None]:
anndata_m.random_values_whole(random_samples = 100)
anndata_m.random_values_backed(random_samples = 100)

In [14]:
scdl_path = "memmap_9da4d19f"
scdl_m = SCDLMetrics(memmap_dir=scdl_path, adatapath=anndatapath)
scdl_m.create_from_adata()
scdl_m.save()

Method __init__ took 0.0000 seconds
Method create_from_adata took 46.5701 seconds
Method save took 0.0192 seconds


In [15]:
scdl_m.load_backed()
scdl_m.max()
scdl_m.min()
scdl_m.mean()
scdl_m.sparsity_stats()
print(f"Disk size: {scdl_m.size_disk_bytes()/(1_024**2)} MB")
print(f"SCDataset size: {scdl_m.size_mem_dataset_bytes()/(1_024**2)} MB")


x = scdl_m.random_rows()
y = scdl_m.random_values(random_samples = 100)
scdl_m.iterate_dl()

Method load_backed took 0.0358 seconds
Method max took 0.2057 seconds
Method min took 0.1164 seconds
Method mean took 0.1368 seconds
Method sparsity_stats took 0.0000 seconds
Method size_disk_bytes took 0.0025 seconds
Disk size: 5590.160111427307 MB
Method size_mem_dataset_bytes took 0.0000 seconds
SCDataset size: 4.57763671875e-05 MB
Method random_rows took 1.4874 seconds
Method random_values took 0.0510 seconds


  batch_sparse_tensor = torch.sparse_csr_tensor(batch_rows, batch_cols, batch_values, size=(len(batch), max_pointer))


Method iterate_dl took 9.4375 seconds


array([0.000e+00, 1.000e+00, 4.000e+00, 7.000e+00, 2.900e+01, 9.000e+01,
       1.536e+03])