In [25]:
import sys
import time
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix, issparse
from scipy.stats import iqr
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from torch.utils.data import DataLoader
from functools import wraps
import signal

import subprocess
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch


In [31]:
def get_disk_size(directory):
    result = subprocess.run(['du', '-sb', directory], stdout=subprocess.PIPE, text=True)
    size_in_bytes = int(result.stdout.split()[0])
    return size_in_bytes
def timeit(method):
    @wraps(method)
    def timed(*args, **kwargs):
        start_time = time.time()
        result = method(*args, **kwargs)
        end_time = time.time()
        run_time = end_time - start_time
        print(f"Method {method.__name__} took {run_time:.4f} seconds")
        return result, run_time
    return timed

def time_all_methods(cls):
    for attr_name, attr_value in cls.__dict__.items():
        if callable(attr_value) and attr_name != "__init__":  # Check if the attribute is a method
            setattr(cls, attr_name, timeit(attr_value))
    return cls


In [32]:
@time_all_methods
class AnnDataMetrics:
    def __init__(self, adatapath):
        self.adatapath = adatapath
    def load_backed(self):
        self.adata_backed = ad.read_h5ad(self.adatapath, backed=True)

    def load_whole(self):
        self.adata = ad.read_h5ad(self.adatapath, backed=False)

    def max(self):
        return self.adata.X.data.max()

    def min(self):
        return self.adata.X.data.min()

    def mean(self):
        return self.adata.X.data.mean()

    def median(self):
        return np.median(self.adata.X.data)

    def interQuartRange(self):
        return iqr(self.adata.X.data)

    def num_values(self):
        return self.adata.X.shape[0] * self.adata.X.shape[1]

    def sparsity_stats(self):
        num_non_zero = 0
        # Get the number of non-zero values
        if issparse(self.adata.X):
            num_non_zero = self.adata.X.nnz
        else:
            num_non_zero = (self.adata.X.round() != 0).sum()
            
        num_vals = self.num_values(self.adata)
        num_zeros = num_vals - num_non_zero
        sparsity = num_zeros / num_vals
        
        return num_zeros, num_non_zero, sparsity

    def size_disk_bytes(self):
        return get_disk_size(self.adatapath)
    def size_adata_bytes(self): 
        return sys.getsizeof(self.adata)
    def size_adata_backed_bytes(self): 
        return sys.getsizeof(self.adata_backed)

    def random_rows_whole(self, random_samples = 10_000):
        L = self.adata.X.shape[0]     
        rIdx = np.sort(np.random.choice(L, size=(random_samples,), replace=True))
        x = self.adata[rIdx, :].X
        return x
    def random_rows_backed(self, random_samples = 10_000):
        L = self.adata_backed.X.shape[0]     
        rIdx = np.sort(np.random.choice(L, size=(random_samples,), replace=True))
        x = self.adata[rIdx, :].X
        return x
    def random_values_whole(self, random_samples = 10_000):
        rows = self.adata.X.shape[0]
        cols = self.adata.X.shape[1]
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.adata.X[r,c] for r,c in zip(rIdx, cIdx)]
    
    def random_values_backed(self, random_samples = 10_000):
        rows = self.adata.X.shape[0]
        cols = self.adata.X.shape[1]
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.adata_backed.X[r,c] for r,c in zip(rIdx, cIdx)]


In [28]:
@time_all_methods
class SCDLMetrics:
    def __init__(self, adatapath, memmap_dir):
        self.adatapath = adatapath
        self.memmap_dir = memmap_dir
    
    def create_from_adata(self):
        self.first_ds = SingleCellMemMapDataset(self.memmap_dir, self.adatapath)
    def save(self):
        self.first_ds.save()
    def load_backed(self):
        self.ds = SingleCellMemMapDataset(self.memmap_dir)
    def max(self):
        return np.max(self.ds.data)

    def min(self):
        return np.min(self.ds.data)

    def mean(self):
        return np.mean(self.ds.data)

    def median(self):
        return np.median(self.ds.data)

    def interQuartRange(self):
        return iqr(self.ds.data)

    def num_values(self):
        return self.ds.number_of_values()

    def sparsity_stats(self):
        return self.ds.sparsity()
    
    
    def size_disk_bytes(self):
        return get_disk_size(self.memmap_dir)
    def size_mem_dataset_bytes(self): 
        return sys.getsizeof(self.ds)

    def random_rows(self, random_samples = 10_000):
        L = self.ds.number_of_rows()   
        rIdx = np.sort(np.random.choice(L, size=(random_samples), replace=True))
        return [self.ds[v] for v in rIdx]
    
    def random_values(self, random_samples = 10_000):
        rows = self.ds.number_of_rows() 
        cols = self.ds.shape()[1][0]  
        rIdx = np.sort(np.random.choice(rows, size=(random_samples), replace=True))
        cIdx = np.sort(np.random.choice(cols, size=(random_samples), replace=True))
        return [self.ds.get_row_column(r,c) for r,c in zip(rIdx, cIdx)]

    def iterate_dl(self, batch_size = 8):
        model = lambda x : x

        dataloader = DataLoader(self.ds, batch_size=batch_size, shuffle=True, collate_fn=collate_sparse_matrix_batch)
        n_epochs = 1
        for e in range(n_epochs):
            for batch in dataloader:
                model(batch)


In [36]:
results_dict = {}

In [38]:
fn = "06a7ffec-2697-4d6f-96f6-d00a34bedb3d.h5ad"
anndatapath = "examples/hdf5s/" + fn
results_dict["anndata file"] = fn 
anndata_m = AnnDataMetrics(anndatapath)
results_dict["AnnData Dataset Backed Load Time (s)"] = anndata_m.load_backed()[1]
results_dict["AnnData Dataset Load Time (s)"] = anndata_m.load_whole()[1]

results_dict["AnnData Dataset Max Calculation Time (s)"] = anndata_m.max()[1]
results_dict["AnnData Dataset Min Calculation Time (s)"] = anndata_m.min()[1]
results_dict["AnnData Dataset Mean Calculation Time (s)"] = anndata_m.mean()[1]
results_dict["AnnData Dataset Size on Disk (MB)"] = anndata_m.size_disk_bytes()[0]/(1_024**2)

results_dict["AnnData Dataset Size on Disk (MB)"] = anndata_m.size_disk_bytes()[0]/(1_024**2)
results_dict["AnnData Dataset Size in Memory (MB)"] = anndata_m.size_adata_bytes()[0]/(1_024**2)
results_dict["AnnData Backed Dataset Size in Memory (MB)"] = anndata_m.size_adata_backed_bytes()/(1_024**2)


Method load_backed took 0.8858 seconds
Method load_whole took 1.2230 seconds
Method max took 0.0036 seconds
Method min took 0.0030 seconds
Method mean took 0.0038 seconds
Method size_disk_bytes took 0.0027 seconds
Method size_disk_bytes took 0.0030 seconds
Method size_adata_bytes took 0.0468 seconds
Method size_adata_backed_bytes took 0.0462 seconds


In [42]:
#anndata_m.random_values_whole(random_samples = 100)
#anndata_m.random_values_backed(random_samples = 100)
results_dict["AnnData Time to retrieve a random batch of 100 cells loaded in memory (s)"] = anndata_m.random_rows_whole(random_samples = 100)[1]
results_dict["AnnData Time to retrieve a random batch of 100 cells backed on disk (s)"] = anndata_m.random_rows_backed(random_samples = 100)[1]

Method random_rows_whole took 0.0133 seconds
Method random_rows_backed took 0.0122 seconds


In [None]:
scdl_path = "memmap_93bc4573"
scdl_m = SCDLMetrics(memmap_dir=scdl_path, adatapath=anndatapath)
results_dict["SCDL Dataset from AnnData Time (s)"] = scdl_m.create_from_adata()[1]
results_dict["SCDL Dataset save time (s)"] = scdl_m.save()[1]

Method __init__ took 0.0000 seconds
Method create_from_adata took 1.5763 seconds
Method save took 0.0050 seconds


In [None]:
results_dict["SCDL Dataset Load Time (s)"] = scdl_m.load_backed()[1]
results_dict["SCDL Dataset Max Calculation Time (s)"] = scdl_m.max()[1]
results_dict["SCDL Dataset Min Calculation Time (s)"] = scdl_m.min()[1]
results_dict["SCDL Dataset Mean Calculation Time (s)"] = scdl_m.mean()[1]
results_dict["Dataset Sparsity"] = scdl_m.sparsity_stats()[0]

results_dict["SCDL Dataset Size on Disk (MB)"] = scdl_m.size_disk_bytes()[0]/(1_024**2)
results_dict["SCDL Dataset Size in Memory (MB)"] = scdl_m.size_mem_dataset_bytes()[0]/(1_024**2)


results_dict["SCDL Time to retrieve a random batch of 100 cells backed on disk (s)"] =  scdl_m.random_rows()[1]
#y = scdl_m.random_values(random_samples = 100)
results_dict["SCDL Time to iterate over Dataset (s)"] = scdl_m.iterate_dl()[1]

Method load_backed took 0.0623 seconds
Method max took 0.0073 seconds
Method min took 0.0035 seconds
Method mean took 0.0043 seconds
Method sparsity_stats took 0.0000 seconds
Method size_disk_bytes took 0.0031 seconds
Disk size: 161.6428394317627 MB
Method size_mem_dataset_bytes took 0.0000 seconds
SCDataset size: 4.57763671875e-05 MB
Method random_rows took 0.0989 seconds
Method random_values took 0.0020 seconds


  batch_sparse_tensor = torch.sparse_csr_tensor(batch_rows, batch_cols, batch_values, size=(len(batch), max_pointer))


Method iterate_dl took 6.1821 seconds
