# Distance Covariance Analysis (DCA) for mouse COMMOT results 

- X = SVG or HVG expression matrix
- Y = spatial coords

author: @emilyekstrum
1/26/26

In [38]:
from dca import dca
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import anndata
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

In [None]:
# Attach Seurat inside the embedded R session, then read the mouse brain
# Seurat object into the R-side variable `obj`.
seurat_rds_path = "/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/seurat_objs/mousebrain_seurat.rds"
ro.r('library(Seurat)')
ro.r(f'obj <- readRDS("{seurat_rds_path}")')

R callback write-console: Loading required package: SeuratObject
  
R callback write-console: Loading required package: sp
  
R callback write-console: 
Attaching package: ‘SeuratObject’

  
R callback write-console: The following objects are masked from ‘package:base’:

    intersect, t

  



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [10]:
# Pull the imagecol / imagerow columns out of the Seurat metadata on the R side
coords_r = ro.r('''
md <- obj@meta.data
md[, c("imagecol","imagerow")]
''')

# Convert to a pandas DataFrame, key the rows by string cell ID,
# and rename the two columns to x / y
coords = pandas2ri.rpy2py(coords_r)
coords.index = coords.index.astype(str)
coords.columns = ["x", "y"]

print(coords.head())  # cell ID by location

                         x       y
AAACAAGTATCTCCCA-1  7410.0  8455.0
AAACAGAGCGACTCCT-1  3097.0  7905.0
AAACAGGGTCTATATT-1  7050.0  2327.0
AAACATTTCCCGGATT-1  8728.0  8111.0
AAACCCGAACGAAATC-1  6811.0  9351.0


In [6]:
# Read the AnnData (.h5ad) file holding the mouse SVG set
svg_h5ad_path = "/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/COMMOT/mouse_svg_lr_sets/mouse_commot_LR_svg.h5ad"
mouse_svgs = anndata.read_h5ad(svg_h5ad_path)

In [11]:
# Build a labelled expression DataFrame for the SVGs.
# .X is copied first; anything that is not already a NumPy array is
# densified with .toarray() before being wrapped in a DataFrame.
svg_matrix = mouse_svgs.X.copy()
if not isinstance(svg_matrix, np.ndarray):
    svg_matrix = svg_matrix.toarray()

svgs_counts = pd.DataFrame(
    svg_matrix,
    index=mouse_svgs.obs_names,
    columns=mouse_svgs.var_names,
)
# string row labels so they can be matched against the coordinate table
svgs_counts.index = svgs_counts.index.astype(str)

In [12]:
# check matrix: preview the first rows (rows are cell IDs, columns are genes)
print(svgs_counts.head())  # cell ID by gene expression

                    Xkr4  Sox17    Mrpl15  Lypla1     Tcea1     Rgs20  \
AAACAAGTATCTCCCA-1   0.0    0.0  0.000000     0.0  0.000000  0.000000   
AAACAGAGCGACTCCT-1   0.0    0.0  0.693147     0.0  0.693147  0.000000   
AAACAGGGTCTATATT-1   0.0    0.0  0.693147     0.0  0.000000  0.693147   
AAACATTTCCCGGATT-1   0.0    0.0  0.000000     0.0  1.098612  0.000000   
AAACCCGAACGAAATC-1   0.0    0.0  0.000000     0.0  0.000000  0.000000   

                     Atp6v1h  Oprk1  Npbwr1    Rb1cc1  ...    mt-Nd4  \
AAACAAGTATCTCCCA-1  0.000000    0.0     0.0  0.000000  ...  4.553877   
AAACAGAGCGACTCCT-1  0.693147    0.0     0.0  0.693147  ...  5.056246   
AAACAGGGTCTATATT-1  0.693147    0.0     0.0  0.693147  ...  4.787492   
AAACATTTCCCGGATT-1  0.000000    0.0     0.0  0.000000  ...  4.499810   
AAACCCGAACGAAATC-1  0.000000    0.0     0.0  0.693147  ...  4.317488   

                      mt-Nd5    mt-Nd6   mt-Cytb     Vamp7     Spry3  Tmlhe  \
AAACAAGTATCTCCCA-1  2.302585  0.000000  4.852030 

In [19]:
# Keep only the cells present in both tables, in the order they appear in
# the counts matrix, so expression rows and coordinate rows line up.
common_cells = svgs_counts.index.intersection(coords.index)
svgs_counts, svg_coords = svgs_counts.loc[common_cells], coords.loc[common_cells]

In [20]:
# sanity check: counts rows and coordinate rows must list the same cells in the same order
assert (svgs_counts.index == svg_coords.index).all(), "Indices do not match!"

In [7]:
# Read the AnnData (.h5ad) file holding the mouse HVG set
hvg_h5ad_path = "/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/COMMOT/mouse_hvg_lr_sets/mouse_commot_LR_hvg.h5ad"
mouse_hvgs = anndata.read_h5ad(hvg_h5ad_path)

In [13]:
# Build a labelled expression DataFrame for the HVGs.
# .X is copied first; anything that is not already a NumPy array is
# densified with .toarray() before being wrapped in a DataFrame.
hvg_matrix = mouse_hvgs.X.copy()
if not isinstance(hvg_matrix, np.ndarray):
    hvg_matrix = hvg_matrix.toarray()

hvgs_counts = pd.DataFrame(
    hvg_matrix,
    index=mouse_hvgs.obs_names,
    columns=mouse_hvgs.var_names,
)
# string row labels so they can be matched against the coordinate table
hvgs_counts.index = hvgs_counts.index.astype(str)

In [17]:
# check matrix: preview the first rows (rows are cell IDs, columns are genes)
print(hvgs_counts.head())  # cell ID by gene expression

                    Xkr4  Sox17    Mrpl15  Lypla1     Tcea1     Rgs20  \
AAACAAGTATCTCCCA-1   0.0    0.0  0.000000     0.0  0.000000  0.000000   
AAACAGAGCGACTCCT-1   0.0    0.0  0.693147     0.0  0.693147  0.000000   
AAACAGGGTCTATATT-1   0.0    0.0  0.693147     0.0  0.000000  0.693147   
AAACATTTCCCGGATT-1   0.0    0.0  0.000000     0.0  1.098612  0.000000   
AAACCCGAACGAAATC-1   0.0    0.0  0.000000     0.0  0.000000  0.000000   

                     Atp6v1h  Oprk1  Npbwr1    Rb1cc1  ...    mt-Nd4  \
AAACAAGTATCTCCCA-1  0.000000    0.0     0.0  0.000000  ...  4.553877   
AAACAGAGCGACTCCT-1  0.693147    0.0     0.0  0.693147  ...  5.056246   
AAACAGGGTCTATATT-1  0.693147    0.0     0.0  0.693147  ...  4.787492   
AAACATTTCCCGGATT-1  0.000000    0.0     0.0  0.000000  ...  4.499810   
AAACCCGAACGAAATC-1  0.000000    0.0     0.0  0.693147  ...  4.317488   

                      mt-Nd5    mt-Nd6   mt-Cytb     Vamp7     Spry3  Tmlhe  \
AAACAAGTATCTCCCA-1  2.302585  0.000000  4.852030 

In [21]:
# Keep only the cells present in both tables, in the order they appear in
# the counts matrix, so expression rows and coordinate rows line up.
hvg_common_cells = hvgs_counts.index.intersection(coords.index)
hvgs_counts, hvg_coords = hvgs_counts.loc[hvg_common_cells], coords.loc[hvg_common_cells]

In [22]:
# sanity check: counts rows and coordinate rows must list the same cells in the same order
assert (hvgs_counts.index == hvg_coords.index).all(), "Indices do not match!"

In [None]:
# inputs for DCA
# DCA needs X as a standardized (z scored) gene x cell matrix.
# Here each column (gene) of the cells x genes counts frame is z scored;
# the frames are still cells x genes after this cell.

scaler = StandardScaler()

svg_scaled = scaler.fit_transform(svgs_counts)
X_svg = pd.DataFrame(svg_scaled, index=svgs_counts.index, columns=svgs_counts.columns)

# the same scaler object is refit from scratch on the HVG matrix
hvg_scaled = scaler.fit_transform(hvgs_counts)
X_hvg = pd.DataFrame(hvg_scaled, index=hvgs_counts.index, columns=hvgs_counts.columns)

In [26]:
# transpose matrices to get gene x cell
# A bare `X = X.T` flips the matrix back to cells x genes if this cell is
# executed a second time, so only transpose while the rows are still the
# cell IDs of the corresponding counts frame. Re-running is then a no-op.
if X_svg.index.equals(svgs_counts.index):
    X_svg = X_svg.T
if X_hvg.index.equals(hvgs_counts.index):
    X_hvg = X_hvg.T

X_svg.head()

Unnamed: 0,AAACAAGTATCTCCCA-1,AAACAGAGCGACTCCT-1,AAACAGGGTCTATATT-1,AAACATTTCCCGGATT-1,AAACCCGAACGAAATC-1,AAACCGGAAATGTTAA-1,AAACCGGGTAGGTACC-1,AAACGAGACGGTTGAT-1,AAACTGCTGGCTCCAA-1,AAACTTGCAAACGTAT-1,...,TTGTGAGGCATGACGC-1,TTGTGCAGCCACGTCA-1,TTGTGGCCCTGACAGT-1,TTGTGTATGCCACCAA-1,TTGTGTTTCCCGAAAG-1,TTGTTAGCAAATTCGA-1,TTGTTCAGTGTGCTAC-1,TTGTTGTGTGTCAAGA-1,TTGTTTCATTAGTCTA-1,TTGTTTCCATACAACT-1
Xkr4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sox17,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,2.737599,-0.319774,-0.319774,-0.319774,-0.319774,...,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774,-0.319774
Mrpl15,-0.73481,0.869913,0.869913,-0.73481,-0.73481,0.869913,-0.73481,0.869913,-0.73481,-0.73481,...,0.869913,-0.73481,-0.73481,-0.73481,-0.73481,0.869913,-0.73481,0.869913,-0.73481,0.869913
Lypla1,-0.575309,-0.575309,-0.575309,-0.575309,-0.575309,-0.575309,-0.575309,1.353987,-0.575309,-0.575309,...,1.353987,-0.575309,-0.575309,-0.575309,-0.575309,1.353987,-0.575309,1.353987,-0.575309,-0.575309
Tcea1,-0.81253,0.716311,-0.81253,1.610626,-0.81253,-0.81253,0.716311,0.716311,-0.81253,-0.81253,...,-0.81253,-0.81253,0.716311,-0.81253,-0.81253,0.716311,0.716311,-0.81253,1.610626,1.610626


In [33]:
# Drop the labels: plain NumPy arrays (genes x cells) for the DCA call
X_svg_array = X_svg.to_numpy()
X_hvg_array = X_hvg.to_numpy()

In [34]:
# display the SVG array; the repr ends with its shape
X_svg_array

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.31977388, -0.31977388, -0.31977388, ..., -0.31977388,
        -0.31977388, -0.31977388],
       [-0.73481012,  0.8699128 ,  0.8699128 , ...,  0.8699128 ,
        -0.73481012,  0.8699128 ],
       ...,
       [-1.24452374, -0.09148907, -0.09148907, ...,  1.06154561,
        -0.09148907, -0.09148907],
       [-0.37689703, -0.37689703, -0.37689703, ..., -0.37689703,
        -0.37689703, -0.37689703],
       [-0.10234466, -0.10234466, -0.10234466, ..., -0.10234466,
        -0.10234466, -0.10234466]], shape=(16112, 1640))

In [30]:
# dependent distance matrix from coordinates (cells x cells)

# pairwise Euclidean distances between spot coordinates (condensed form) ...
condensed_dists = pdist(svg_coords.values, metric='euclidean')
# ... expanded to the full symmetric square matrix
dist_matrix = squareform(condensed_dists)
#dist_matrix = pd.DataFrame(dist_matrix, index=svg_coords.index, columns=svg_coords.index)

In [32]:
# display the pairwise distance matrix; the repr ends with its shape
dist_matrix

array([[   0.        , 4347.92697731, 6138.56530469, ..., 2854.21968321,
        5099.71107025, 5198.62452962],
       [4347.92697731,    0.        , 6836.6872826 , ..., 2348.73093393,
        7056.3992234 , 5923.07285115],
       [6138.56530469, 6836.6872826 ,    0.        , ..., 4805.88576643,
        1949.00102617,  993.18527979],
       ...,
       [2854.21968321, 2348.73093393, 4805.88576643, ...,    0.        ,
        4748.40194171, 3830.13485402],
       [5099.71107025, 7056.3992234 , 1949.00102617, ..., 4748.40194171,
           0.        , 1808.8830808 ],
       [5198.62452962, 5923.07285115,  993.18527979, ..., 3830.13485402,
        1808.8830808 ,    0.        ]], shape=(1640, 1640))

In [36]:
# DCA parameters
# k is passed to dca() as num_dca_dimensions in the SVG run cell
k = 15 # number of dims 

In [None]:
# run DCA for SVGs

# All settings for the run are collected here and then unpacked into dca().
svg_dca_kwargs = dict(
    Xs=[X_svg_array],                           # list of datasets to project (genes x cells)
    Ds=[dist_matrix],                           # list of dependent distance matrices (cells x cells)
    num_dca_dimensions=k,                       # number of DCA dimensions to learn
    num_iters_per_dataset=1,                    # number of times to run DCA per dataset, default 1
    num_iters_foreach_dim=30,                   # number of iterations to run for each dimension, default 30
    num_stoch_batch_samples=100,                # number of cells to use in stochastic batches when estimating gradients, default 100
    num_samples_to_compute_stepwise_dcov=1000,  # number of samples to use when computing distance covariance, default 1000
    percent_increase_criterion=0.01,            # convergence criterion, default 0.01
)

# NOTE(review): the batches are stochastic and no random seed is set in this
# notebook, so results may differ between runs; confirm which RNG dca uses.
svg_U_list, svg_dcovs = dca(**svg_dca_kwargs)

dca dimension 1
   step 1: dcov = 3.9832745582685694
     batches:
................
   step 2: dcov = 65.16871748277676
     batches:
................
   step 3: dcov = 75.6528686969988
     batches:
................
   step 4: dcov = 65.29375698016224
     batches:
................
   step 5: dcov = 81.22455809925835
     batches:
................
   step 6: dcov = 83.38422241755038
     batches:
................
   step 7: dcov = 80.42404378625741
     batches:
................
   step 8: dcov = 82.3290576310698
     batches:
................
   step 9: dcov = 84.45226525669729
     batches:
................
   step 10: dcov = 87.03867446451189
     batches:
................
   step 11: dcov = 91.14038286708497
     batches:
................
   step 12: dcov = 87.74680718661202
     batches:
................
   step 13: dcov = 86.14837627799318
     batches:
................
   step 14: dcov = 88.21368595151864
     batches:
................
   step 15: dcov = 91.18770366822066
     

In [None]:
# get outputs
# X_svg_array is genes x cells, so the product below only conforms when
# svg_U has one row per gene; the projection then has one row per cell.
svg_U = svg_U_list[0]  # genes x k (loadings)
svg_Z = (X_svg_array.T @ svg_U)  # cells x k (scores)

In [None]:
# save to CSV files with labels
k_actual = svg_U.shape[1]
cols = [f"DCA_dim_{i+1}" for i in range(k_actual)]

# svgs_counts is cells x genes, so its columns are the gene names and its
# index holds the cell IDs. svg_U has one row per gene and svg_Z has one row
# per cell; labelling them the other way round raises a shape-mismatch error.
svg_U_df = pd.DataFrame(svg_U, index=svgs_counts.columns, columns=cols) # loadings (genes x k)
svg_U_df.to_csv("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/DCA/mouse_svg_DCA_U.csv")

svg_Z_df = pd.DataFrame(svg_Z, index=svgs_counts.index, columns=cols) # scores (cells x k)
svg_Z_df.to_csv("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/DCA/mouse_svg_DCA_Z.csv")

# flatten to 1-D so there is one distance covariance value per table row
svg_dcovs = np.asarray(svg_dcovs).reshape(-1)
dc_df = pd.DataFrame({
    "dimension": [f"DCA{i+1}" for i in range(len(svg_dcovs))],
    "distance_covariance": svg_dcovs,
})
dc_df.to_csv("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/DCA/mouse_svg_DCA_distance_covariance.csv", index=False)

In [None]:
# save items as .npy files (relative paths, so they land in the kernel's working directory)
np.save("svg_U_DCA.npy", svg_U) # gene loadings
np.save("svg_Z_DCA.npy", svg_Z) # cell embeddings
# the DCA run stores its distance covariances in `svg_dcovs`; no variable named `dcovs` exists
np.save("svg_dcovs_DCA.npy", svg_dcovs) # distance covariances per DCA dim