In this notebook, I demonstrate how to efficiently compute the per-gene mean and variance of a large, high-dimensional sparse dataset on the GPU using CuPy. The dataset is the Visium HD human colon cancer example (16 µm bins) from the 10x Genomics website. The notebook showcases the performance advantages of GPU acceleration with CuPy when working with large-scale genomic data, and includes a visualization of the calculated mean-variance relationship.

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import ListedColormap
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import os
import gzip
import numpy as np
import scanpy as sc
import squidpy as sq
import cupy as cp
import cupyx
from cupyx.scipy.sparse import csr_matrix 
import os
import time
import rapids_singlecell as rsc
import numpy as np
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator
import cupy

# Configure the RAPIDS Memory Manager (RMM), then route CuPy allocations through it.
rmm.reinitialize(
    managed_memory=False,  # managed memory is disabled here; True is the setting that would permit oversubscription
    pool_allocator=False,  # default is False
    devices=0,  # GPU device IDs to register. By default registers only GPU 0.
)
# From here on CuPy requests device memory via the RMM allocator.
cp.cuda.set_allocator(rmm_cupy_allocator)
import zarr
import pickle
from collections import OrderedDict
from scipy.sparse import csr_matrix
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
from scipy.sparse import csr_matrix
import scipy
import anndata
from collections import OrderedDict
from rsc_functions.utility.applyqc import applyqc
from rsc_functions.reports.plot import plot_spatial,plot_spatial_data, plot_dist
from rsc_functions.utility.rank_genes_groups import return_markers,rank_genes_groups
from rsc_functions.reports.plot import plot_expression

In [None]:
# Input directory handed to parquet_to_csv() and sc.read_visium() below
# (the 16 µm binned outputs, judging by the directory name).
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Visium_HD_Human_Colon_Cancer_binned_outputs/binned_outputs/square_016um"
# Output directory; not referenced by the cells shown in this part of the notebook.
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2"

In [None]:
def parquet_to_csv(path):
    '''
    Make sure ``spatial/tissue_positions_list.csv`` exists under ``path``.

    If the CSV is already present, nothing is read or written. Otherwise
    ``spatial/tissue_positions.parquet`` is loaded with pandas and written
    out as CSV without the index column.

    Parameters
    ----------
    path : str
        Directory that contains the ``spatial`` sub-directory.

    Returns
    -------
    None
    '''
    csv_target = os.path.join(path, 'spatial/tissue_positions_list.csv')

    # Guard clause: an existing CSV is left untouched.
    if os.path.exists(csv_target):
        return

    parquet_source = os.path.join(path, 'spatial/tissue_positions.parquet')
    positions = pd.read_parquet(parquet_source)
    positions.to_csv(csv_target, index=False)
# --- Load the dataset and run QC / filtering / normalisation -----------------
# The order of the calls below matters: each step operates on the result of
# the previous one.

# Create spatial/tissue_positions_list.csv from the parquet file if it is missing
# (presumably because read_visium looks for the CSV — confirm for the installed scanpy).
parquet_to_csv(path_016)
andata = sc.read_visium(path=path_016)
# Hand the AnnData object to rapids_singlecell so the rsc.* calls below work on GPU data.
rsc.get.anndata_to_GPU(andata)
# Store the spatial coordinates as a float64 NumPy array.
andata.obsm['spatial'] = np.array(andata.obsm['spatial'], dtype=np.float64)
andata.var_names_make_unique()
# Keep a copy of the variable names under uns['config'] (consumer not visible in this notebook section).
andata.uns['config'] = OrderedDict()
andata.uns["config"]["secondary_var_names"] = andata.var_names
# Flag genes whose name starts with "MT-" as the "MT" family and compute QC metrics including it.
rsc.pp.flag_gene_family(andata, gene_family_name="MT", gene_family_prefix="MT-")
rsc.pp.calculate_qc_metrics(andata, qc_vars=["MT"])
# Filtering thresholds: min_count=1000 on 'total_counts' for cells,
# then min_count=50 and max_count=50_000 for genes.
rsc.pp.filter_cells(andata, min_count=1000,qc_var = 'total_counts')
rsc.pp.filter_genes(andata, min_count=50)
rsc.pp.filter_genes(andata, max_count=50_000)
# Snapshot X in a separate layer before normalisation is applied to it.
andata.layers['counts'] = andata.X.copy()
rsc.pp.normalize_total(andata)
# Log-transform intentionally left disabled; X therefore holds normalised, non-log values.
# rsc.pp.log1p(andata)
# andata.layers['log'] = andata.X.copy()

In [None]:
import cupy as cp
import cupyx.scipy.sparse as sparse

# Matrix to analyse: andata.X as left by the preprocessing cell above.
# NOTE(review): the function below reads .data / .indices / .indptr, so X is
# expected to be a CSR sparse matrix — confirm.
sparse_data = andata.X

def _array_module(array):
    """Return NumPy for host arrays and CuPy for anything else."""
    return np if isinstance(array, np.ndarray) else cp


def _column_scatter_sum(xp, col_indices, weights, num_cols):
    """Sum ``weights`` per column index (count entries when ``weights`` is None)."""
    if col_indices.size == 0:
        # bincount cannot derive an output size from an empty index array on
        # every backend, so the all-zero result is built directly.
        return xp.zeros(num_cols, dtype=xp.float64)
    totals = xp.bincount(col_indices, weights=weights, minlength=num_cols)
    return totals.astype(xp.float64, copy=False)


def sparse_mean_variance_covariance_csr(sparse_data):
    """Column-wise statistics over the *stored* entries of a CSR matrix.

    All three statistics are computed per column using only the entries that
    are explicitly stored in the sparse matrix (normally the non-zero ones);
    implicit zeros are NOT included in the mean or the variance.

    The whole computation is vectorised (three scatter-sums via ``bincount``)
    instead of looping over rows in Python, which would launch several GPU
    kernels and force a device synchronisation for every single row.

    Parameters
    ----------
    sparse_data : CSR sparse matrix
        A ``cupyx.scipy.sparse`` CSR matrix (results are CuPy arrays) or a
        ``scipy.sparse`` CSR matrix (results are NumPy arrays).

    Returns
    -------
    col_mean : array of float64, shape (n_cols,)
        Sum of stored values divided by the number of stored values.
    col_variance : array of float64, shape (n_cols,)
        Population variance (divisor = number of stored values) of the
        stored values around ``col_mean``.
    col_covariance : array of float64, shape (n_cols,)
        ``variance / (mean + 1e-10) * 100``.  Despite the name this is a
        scaled variance-to-mean ratio, not a covariance; the name is kept
        for compatibility with existing callers.

    Columns without any stored entry yield NaN in all three outputs.

    Raises
    ------
    ValueError
        If ``sparse_data`` reports a sparse format other than ``"csr"``
        (its ``indices`` would then not be column indices).
    """
    matrix_format = getattr(sparse_data, "format", "csr")
    if matrix_format != "csr":
        raise ValueError(
            f"expected a CSR sparse matrix, got format {matrix_format!r}"
        )

    data = sparse_data.data        # stored values
    indices = sparse_data.indices  # column index of every stored value
    num_cols = sparse_data.shape[1]
    xp = _array_module(data)

    # Per-column number of stored entries and their sum.
    col_count = _column_scatter_sum(xp, indices, None, num_cols)
    col_sum = _column_scatter_sum(xp, indices, data, num_cols)

    # Divide by max(count, 1) and patch empty columns afterwards so the
    # result is NaN there (as 0/0 would give) without a division by zero.
    empty = col_count == 0
    safe_count = xp.maximum(col_count, 1.0)

    col_mean = col_sum / safe_count
    col_mean[empty] = xp.nan

    # Squared deviation of every stored value from its own column mean.
    # Stored entries only ever reference non-empty columns, so no NaN enters.
    sq_diff = (data - col_mean[indices]) ** 2
    col_sum_sq_diff = _column_scatter_sum(xp, indices, sq_diff, num_cols)

    col_variance = col_sum_sq_diff / safe_count
    col_variance[empty] = xp.nan

    # Variance-to-mean ratio in percent; epsilon guards against a zero mean.
    epsilon = 1e-10
    col_covariance = (col_variance / (col_mean + epsilon)) * 100

    return col_mean, col_variance, col_covariance

# Compute the column-wise statistics of sparse_data.
mean, variance, covariance = sparse_mean_variance_covariance_csr(sparse_data)

# Print the three result arrays.
# NOTE: the value labelled "covariance" is variance / mean scaled by 100
# (see the function's last computation), not a statistical covariance.
print("Column-wise mean:", mean)
print("Column-wise variance:", variance)
print("Column-wise covariance (variance/mean):", covariance)

In [None]:
# Sanity check: print the length of each result array (one entry per matrix column).
print("Column-wise mean:", len(mean))
print("Column-wise variance:", len(variance))
print("Column-wise covariance (variance/mean):", len(covariance))

In [None]:
# Collect the three result arrays in one DataFrame; .get() is called on each
# array first (for CuPy arrays this copies the data from the GPU to host memory).
df_var = pd.DataFrame({'mean':mean.get(),'variance':variance.get(),'covariance':covariance.get()})

In [None]:
# Scatter plot of the "covariance" column (variance / mean * 100) against the
# mean, one point per column of the matrix; s=1 keeps the markers small.
ax = sns.scatterplot(
    data=df_var, x='mean', y="covariance", s=1
)