# Distance Covariance Analysis (DCA) for COMMOT results

- X = SVG or HVG expression matrix
- Y = spatial coords

author: @emilyekstrum
1/26/26

In [1]:
from pathlib import Path

from dca import dca
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import anndata
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

In [2]:
# Load the Seurat object inside the embedded R session (via rpy2).
# First attach the Seurat package, then read the serialized .rds file into the
# R-side variable `obj`; the coordinates cell below reads `obj@meta.data`.
ro.r('library(Seurat)')
ro.r('obj <- readRDS("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/seurat_objs/humanbrain_seurat.rds")')

R callback write-console: Loading required package: SeuratObject
  
R callback write-console: Loading required package: sp
  
R callback write-console: 
Attaching package: ‘SeuratObject’

  
R callback write-console: The following objects are masked from ‘package:base’:

    intersect, t

  



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [4]:
# Pull per-cell image coordinates out of the human brain Seurat object's metadata.
# The R snippet returns the imagecol/imagerow columns of obj@meta.data, which are
# converted to pandas and relabelled x/y here.
coords = pandas2ri.rpy2py(
    ro.r('''
md <- obj@meta.data
md[, c("imagecol","imagerow")]
''')
)
coords.columns = ["x", "y"]
coords.index = coords.index.astype(str)  # cell barcodes as plain strings for later index matching

print(coords.head()) # cell ID by location

                             x           y
AAACAAGTATCTCCCA-1  440.639079  381.098123
AAACAATCTACTAGCA-1  259.630972  126.327637
AAACACCAATAACTGC-1  183.078314  427.767792
AAACAGCTTTCAGAAG-1  152.700275  341.269139
AAACAGGGTCTATATT-1  164.941500  362.916304


In [5]:
# Load the SVG (spatially variable gene) AnnData file written by the COMMOT step.
# Later cells read its .X, .obs_names and .var_names.
human_svgs = anndata.read_h5ad("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/COMMOT/human_svg_lr_sets/human_commot_LR_svg.h5ad")

In [6]:
# get counts matrix for svgs (cells x genes DataFrame)
# .X is either a dense ndarray or an object exposing .toarray() (e.g. a scipy sparse
# matrix). Dense input is copied so the AnnData is never aliased; sparse input is
# densified directly (toarray already allocates a fresh array, so no extra copy).
svgs_counts = human_svgs.X
svgs_counts = svgs_counts.copy() if isinstance(svgs_counts, np.ndarray) else svgs_counts.toarray()
svgs_counts = pd.DataFrame(
    svgs_counts,
    index=human_svgs.obs_names.astype(str),  # cell IDs as plain strings
    columns=human_svgs.var_names,            # gene names
)

In [7]:
# Sanity check: preview the first rows of the SVG matrix (rows = cell IDs, columns = genes).
print(svgs_counts.head())  # cell ID by gene expression

                    AL627309.1  AL669831.5  LINC00115  FAM41C  AL645608.1  \
AAACAAGTATCTCCCA-1         0.0         0.0        0.0     0.0         0.0   
AAACAATCTACTAGCA-1         0.0         0.0        0.0     0.0         0.0   
AAACACCAATAACTGC-1         0.0         0.0        0.0     0.0         0.0   
AAACAGCTTTCAGAAG-1         0.0         0.0        0.0     0.0         0.0   
AAACAGGGTCTATATT-1         0.0         0.0        0.0     0.0         0.0   

                    SAMD11     NOC2L  KLHL17  PERM1  AL645608.8  ...  \
AAACAAGTATCTCCCA-1     0.0  0.000000     0.0    0.0         0.0  ...   
AAACAATCTACTAGCA-1     0.0  0.000000     0.0    0.0         0.0  ...   
AAACACCAATAACTGC-1     0.0  0.693147     0.0    0.0         0.0  ...   
AAACAGCTTTCAGAAG-1     0.0  1.098612     0.0    0.0         0.0  ...   
AAACAGGGTCTATATT-1     0.0  0.000000     0.0    0.0         0.0  ...   

                      MT-ND6    MT-CYB  BX004987.1  AC145212.1  MAFIP  \
AAACAAGTATCTCCCA-1  0.693147  3

In [8]:
# Keep only the cells present in both the SVG matrix and the coordinate table,
# and index both by the same labels so their rows line up one-to-one.
common_cells = svgs_counts.index.intersection(coords.index)
svgs_counts, svg_coords = svgs_counts.loc[common_cells], coords.loc[common_cells]

In [9]:
# check that indices match (same labels, same order)
# Index.equals returns False on a length mismatch, so the assertion message is shown
# instead of the ValueError that an elementwise `==` comparison would raise.
assert svgs_counts.index.equals(svg_coords.index), "Indices do not match!"

In [10]:
# Load the HVG (highly variable gene) AnnData file written by the COMMOT step.
# Later cells read its .X, .obs_names and .var_names.
human_hvgs = anndata.read_h5ad("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/COMMOT/human_hvg_lr_sets/human_commot_LR_hvg.h5ad")

In [11]:
# get counts matrix for hvgs (cells x genes DataFrame)
# .X is either a dense ndarray or an object exposing .toarray() (e.g. a scipy sparse
# matrix). Dense input is copied so the AnnData is never aliased; sparse input is
# densified directly (toarray already allocates a fresh array, so no extra copy).
hvgs_counts = human_hvgs.X
hvgs_counts = hvgs_counts.copy() if isinstance(hvgs_counts, np.ndarray) else hvgs_counts.toarray()
hvgs_counts = pd.DataFrame(
    hvgs_counts,
    index=human_hvgs.obs_names.astype(str),  # cell IDs as plain strings
    columns=human_hvgs.var_names,            # gene names
)

In [12]:
# Sanity check: preview the first rows of the HVG matrix (rows = cell IDs, columns = genes).
print(hvgs_counts.head())  # cell ID by gene expression

                    AL627309.1  AL669831.5  LINC00115  FAM41C  AL645608.1  \
AAACAAGTATCTCCCA-1         0.0         0.0        0.0     0.0         0.0   
AAACAATCTACTAGCA-1         0.0         0.0        0.0     0.0         0.0   
AAACACCAATAACTGC-1         0.0         0.0        0.0     0.0         0.0   
AAACAGCTTTCAGAAG-1         0.0         0.0        0.0     0.0         0.0   
AAACAGGGTCTATATT-1         0.0         0.0        0.0     0.0         0.0   

                    SAMD11     NOC2L  KLHL17  PERM1  AL645608.8  ...  \
AAACAAGTATCTCCCA-1     0.0  0.000000     0.0    0.0         0.0  ...   
AAACAATCTACTAGCA-1     0.0  0.000000     0.0    0.0         0.0  ...   
AAACACCAATAACTGC-1     0.0  0.693147     0.0    0.0         0.0  ...   
AAACAGCTTTCAGAAG-1     0.0  1.098612     0.0    0.0         0.0  ...   
AAACAGGGTCTATATT-1     0.0  0.000000     0.0    0.0         0.0  ...   

                      MT-ND6    MT-CYB  BX004987.1  AC145212.1  MAFIP  \
AAACAAGTATCTCCCA-1  0.693147  3

In [13]:
# Keep only the cells present in both the HVG matrix and the coordinate table,
# and index both by the same labels so their rows line up one-to-one.
hvg_common_cells = hvgs_counts.index.intersection(coords.index)
hvgs_counts, hvg_coords = hvgs_counts.loc[hvg_common_cells], coords.loc[hvg_common_cells]

In [14]:
# check that indices match (same labels, same order)
# Index.equals returns False on a length mismatch, so the assertion message is shown
# instead of the ValueError that an elementwise `==` comparison would raise.
assert hvgs_counts.index.equals(hvg_coords.index), "Indices do not match!"

In [14]:
# SVG labels: column labels are gene names, row labels are cell IDs.
gene_names, cell_ids = svgs_counts.columns.to_list(), svgs_counts.index.to_list()

# preview the first five of each (genes on the first line, cells on the second)
print(gene_names[:5], cell_ids[:5], sep="\n")

['AL627309.1', 'AL669831.5', 'LINC00115', 'FAM41C', 'AL645608.1']
['AAACAAGTATCTCCCA-1', 'AAACAATCTACTAGCA-1', 'AAACACCAATAACTGC-1', 'AAACAGCTTTCAGAAG-1', 'AAACAGGGTCTATATT-1']


In [15]:
# HVG labels: column labels are gene names, row labels are cell IDs.
# These lists label the saved DCA loadings (genes) and scores (cells) further down.
hvg_gene_names, hvg_cell_ids = hvgs_counts.columns.to_list(), hvgs_counts.index.to_list()

# preview the first five of each (genes on the first line, cells on the second)
print(hvg_gene_names[:5], hvg_cell_ids[:5], sep="\n")

['AL627309.1', 'AL669831.5', 'LINC00115', 'FAM41C', 'AL645608.1']
['AAACAAGTATCTCCCA-1', 'AAACAATCTACTAGCA-1', 'AAACACCAATAACTGC-1', 'AAACAGCTTTCAGAAG-1', 'AAACAGGGTCTATATT-1']


In [16]:
# inputs for DCA
# X must be standardized (z-scored). StandardScaler works column-wise, so on the
# cells x genes table each gene is z-scored across cells; the result is still
# cells x genes here and is transposed to genes x cells in the next cell.

scaler = StandardScaler()
#X_svg = pd.DataFrame(scaler.fit_transform(svgs_counts), index=svgs_counts.index, columns=svgs_counts.columns)
X_hvg = pd.DataFrame(
    data=scaler.fit_transform(hvgs_counts),
    columns=hvgs_counts.columns,
    index=hvgs_counts.index,
)

In [17]:
# Flip to genes x cells (rows = genes, columns = cell IDs).
# Caution: this rebinds X_hvg to its own transpose, so running the cell a second
# time without re-running the scaling cell flips it back to cells x genes.
#X_svg = X_svg.T
X_hvg = X_hvg.transpose()

X_hvg.head()

Unnamed: 0,AAACAAGTATCTCCCA-1,AAACAATCTACTAGCA-1,AAACACCAATAACTGC-1,AAACAGCTTTCAGAAG-1,AAACAGGGTCTATATT-1,AAACAGTGTTCCTGGG-1,AAACATTTCCCGGATT-1,AAACCCGAACGAAATC-1,AAACCGGGTAGGTACC-1,AAACCGTTCGTCCAGG-1,...,TTGTGGTGGTACTAAG-1,TTGTGTATGCCACCAA-1,TTGTGTTTCCCGAAAG-1,TTGTTAGCAAATTCGA-1,TTGTTCAGTGTGCTAC-1,TTGTTGTGTGTCAAGA-1,TTGTTTCACATCCAGG-1,TTGTTTCATTAGTCTA-1,TTGTTTGTATTACACG-1,TTGTTTGTGTAAATTC-1
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,...,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086,-0.16086
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Convert the genes x cells DataFrame to a plain NumPy array.
# The trailing .T makes this copy cells x genes (see the displayed shape in the next cell).
# Note: the "run DCA" cell later rebinds X_hvg_array to X_hvg.values WITHOUT the
# transpose (genes x cells), and that later value is what gets passed to dca().
#X_svg_array = X_svg.values
X_hvg_array = X_hvg.values.T  # transpose to cell by gene for DCA

In [21]:
# Display the standardized expression array (NumPy truncates the repr and reports the shape).
X_hvg_array # expression values 

array([[ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ],
       [ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ],
       [ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ],
       ...,
       [ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ],
       [ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ],
       [ 0.        , -0.16086019,  0.        , ..., -0.41597751,
        -0.12561211,  0.        ]], shape=(2897, 17650))

In [22]:
# dependent distance matrix from coordinates (cells x cells)
# pdist returns the condensed pairwise Euclidean distances between rows of
# hvg_coords (one row per cell, columns x/y); squareform expands that into the
# full symmetric matrix with zeros on the diagonal.
# Note: the "run DCA" cell recomputes dist_matrix after re-ordering hvg_coords.

dist_matrix = squareform(pdist(hvg_coords.values, metric='euclidean'))
#dist_matrix = pd.DataFrame(dist_matrix, index=svg_coords.index, columns=svg_coords.index)

In [23]:
# Display the pairwise distance matrix (symmetric, zero diagonal; NumPy truncates the repr).
dist_matrix # distance matrix between cells

array([[  0.        , 312.52509541, 261.75485784, ..., 229.52913721,
        226.07327453, 280.58423024],
       [312.52509541,   0.        , 311.00880392, ..., 309.99075169,
        377.51293566,  32.90441618],
       [261.75485784, 311.00880392,   0.        , ...,  34.52974632,
        101.71767168, 297.41063321],
       ...,
       [229.52913721, 309.99075169,  34.52974632, ...,   0.        ,
         77.94339303, 293.07913395],
       [226.07327453, 377.51293566, 101.71767168, ...,  77.94339303,
          0.        , 357.20683064],
       [280.58423024,  32.90441618, 297.41063321, ..., 293.07913395,
        357.20683064,   0.        ]], shape=(2897, 2897))

In [24]:
# DCA parameters
# k is passed to dca() as num_dca_dimensions in the "run DCA" cell.
k = 15 # number of dims 

In [None]:
# run DCA
#
# Inputs : X_hvg (genes x cells, z-scored DataFrame), hvg_coords (cells x [x, y]), k
# Outputs: hvg_U_list (list with one loading matrix per dataset), hvg_dcovs
#          (distance covariance per DCA dimension), plus the re-aligned
#          X_hvg_array / hvg_coords / dist_matrix used for the fit.

# Seed NumPy's global RNG so the stochastic mini-batches are repeatable across runs.
# NOTE(review): assumes `dca` samples via np.random's global state - confirm against the dca package.
DCA_SEED = 0
np.random.seed(DCA_SEED)

# ensure consistent sample order between X and coords/dist
cell_ids = [str(cell) for cell in X_hvg.columns]   # cells, in X column order
X_hvg_array = X_hvg.values                         # genes × cells

# work on a copy so the string-cast index does not modify the frame from earlier cells in place
hvg_coords = hvg_coords.copy()
hvg_coords.index = hvg_coords.index.map(str)

missing = set(cell_ids) - set(hvg_coords.index)
assert len(missing) == 0, f"Missing coords for {len(missing)} cells (example: {next(iter(missing))})"

hvg_coords = hvg_coords.loc[cell_ids]        # reorder coords to match X columns

# Pairwise Euclidean distances between cells. pdist/squareform (same call as the
# earlier distance-matrix cell) avoids materialising an (n, n, 2) difference array.
xy = hvg_coords[["x", "y"]].to_numpy(dtype=float)
dist_matrix = squareform(pdist(xy, metric="euclidean"))

# X columns and D rows/columns must describe the same cells; this also catches
# duplicated cell IDs in hvg_coords, which would make .loc return extra rows.
n_cells = X_hvg_array.shape[1]
assert dist_matrix.shape == (n_cells, n_cells), (
    f"dist_matrix shape {dist_matrix.shape} does not match {n_cells} cells in X_hvg_array"
)

print("X_hvg_array shape (genes x cells):", X_hvg_array.shape)   # (17650, 2897)
print("dist_matrix shape (cells x cells):", dist_matrix.shape)   # (2897, 2897)

hvg_U_list, hvg_dcovs = dca(
    Xs=[X_hvg_array],               # list of datasets (genes x cells)
    Ds=[dist_matrix],               # list of dependent distance matrices (cells x cells)
    num_dca_dimensions=k,           # number of DCA dimensions to learn
    num_iters_per_dataset=1,        # number of times to run DCA per dataset, default 1
    num_iters_foreach_dim=30,       # number of iterations to run for each dimension, default 30
    num_stoch_batch_samples=100,    # number of cells to use in stochastic batches when estimating gradients, default 100
    num_samples_to_compute_stepwise_dcov=1000, # number of samples to use when computing distance covariance, default 1000
    percent_increase_criterion=0.01 # convergence criterion, default 0.01
)

X_hvg_array shape (genes x cells): (17650, 2897)
dist_matrix shape (cells x cells): (2897, 2897)
dca dimension 1
   step 1: dcov = 0.6958741302774908
     batches:
.   step 1: dcov = 0.6958741302774908
     batches:
.......................................................
   step 2: dcov = 15.741801806276321
     batches:
.
   step 2: dcov = 15.741801806276321
     batches:
.......................................................
   step 3: dcov = 15.612139520979564
     batches:
.
   step 3: dcov = 15.612139520979564
     batches:
.......................................................
   step 4: dcov = 14.257750176681107
     batches:
.
   step 4: dcov = 14.257750176681107
     batches:
.......................................................
   step 5: dcov = 15.685028403364987
     batches:
.
   step 5: dcov = 15.685028403364987
     batches:
.......................................................
   step 6: dcov = 16.997986894643613
     batches:
.
   step 6: dcov = 16.99798689464361

In [None]:
# Unpack the DCA result for the single HVG dataset passed in.
# U holds the loadings (genes x k); projecting the genes x cells matrix onto U
# gives the per-cell scores Z (cells x k).
hvg_U = hvg_U_list[0]
hvg_Z = np.matmul(X_hvg_array.T, hvg_U)

In [None]:
# look at outputs
print("HVG U shape (genes x k):", hvg_U.shape)
print("HVG Z shape (cells x k):", hvg_Z.shape)
# The recorded run shows hvg_dcovs as 2-D, (1, 15), not (k,), so the label reflects that.
# Presumably there is one row per dataset in Xs - confirm against the dca package.
# (The CSV-saving cell later flattens hvg_dcovs to 1-D.)
print("HVG distance covariances shape (datasets x k):", hvg_dcovs.shape)

HVG U shape (cells x k): (17650, 15)
HVG Z shape (genes x k): (2897, 15)
HVG distance covariances shape (k,): (1, 15)


In [21]:
# save to CSV files with labels
# Writes three files into the DCA output directory:
#   human_hvg_DCA_U.csv                    loadings, genes x k
#   human_hvg_DCA_Z.csv                    scores, cells x k
#   human_hvg_DCA_distance_covariance.csv  one row per DCA dimension
DCA_OUT_DIR = Path("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/DCA")
DCA_OUT_DIR.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories

k_actual = hvg_U.shape[1]  # use the number of dimensions actually returned, not the requested k
cols = [f"DCA_dim_{i+1}" for i in range(k_actual)]

hvg_U_df = pd.DataFrame(hvg_U, index=hvg_gene_names, columns=cols) # loadings (genes x k)
hvg_U_df.to_csv(DCA_OUT_DIR / "human_hvg_DCA_U.csv")

hvg_Z_df = pd.DataFrame(hvg_Z, index=hvg_cell_ids, columns=cols) # scores (cells x k)
hvg_Z_df.to_csv(DCA_OUT_DIR / "human_hvg_DCA_Z.csv")

# flatten to 1-D so each dimension gets one row; hvg_dcovs stays flattened for later cells
hvg_dcovs = np.asarray(hvg_dcovs).reshape(-1)
dc_df = pd.DataFrame({"dimension": [f"DCA{i+1}" for i in range(len(hvg_dcovs))], "distance_covariance": hvg_dcovs})
dc_df.to_csv(DCA_OUT_DIR / "human_hvg_DCA_distance_covariance.csv", index=False)

In [22]:
# save items as .npy arrays alongside the CSV exports
DCA_OUT_DIR = Path("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/DCA")
DCA_OUT_DIR.mkdir(parents=True, exist_ok=True)  # np.save does not create missing directories

np.save(DCA_OUT_DIR / "human_hvg_U_DCA.npy", hvg_U) # gene loadings
np.save(DCA_OUT_DIR / "human_hvg_Z_DCA.npy", hvg_Z) # cell embeddings
# Flatten explicitly so the saved array is 1-D whether or not the CSV cell
# (which also flattens hvg_dcovs) has already been run in this kernel.
np.save(DCA_OUT_DIR / "human_hvg_dcovs_DCA.npy", np.asarray(hvg_dcovs).reshape(-1)) # distance covariances per DCA dim