In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.spatial.distance import pdist, squareform
from dca import dca

In [None]:
# INPUTS
# NOTE: the two paths below are machine-specific absolute paths (they point
# into two different users' home directories) and the files may not exist on
# your machine -- edit them before running the notebook.
SCT_CSV = "/Users/emilyekstrum/Desktop/zhangLab/data/DLPFC_Data/Sample151673/Sim3_2454hvgs_matrix.csv"  # expression matrix: rows=genes, cols=cells
COORDS_CSV = "/Users/nanxi/Documents/ZhangLab/Spatial_transcriptomics/Spatial_clusters/Output/Sim3_151673/Sim3_cell_location.csv"  # rows=cells, cols include x,y (or X,Y / row,col)
K = 15  # number of DCA dimensions to extract



In [None]:
# Standardize features (z-score each gene across cells).
def zscore_rows(A):
    """
    Z-score each row of a 2-D array.

    A is (genes x cells); every gene (row) is centered on its own mean and
    scaled by its own sample standard deviation (ddof=1) across cells.

    Parameters
    ----------
    A : array-like, shape (n_genes, n_cells)
        Numeric matrix; it is converted to a float ndarray, so integer
        counts or nested lists are accepted.

    Returns
    -------
    numpy.ndarray, shape (n_genes, n_cells)
        Row-wise standardized copy of ``A``. Rows whose standard deviation
        is zero or undefined (constant genes, or fewer than two cells) are
        only centered, i.e. they come out as all zeros instead of NaN/inf.
    """
    A = np.asarray(A, dtype=float)
    mu = A.mean(axis=1, keepdims=True)
    if A.shape[1] > 1:
        sd = A.std(axis=1, ddof=1, keepdims=True)
    else:
        # ddof=1 needs at least two observations; with fewer, numpy warns and
        # returns NaN, so fall back to a neutral scale of 1.
        sd = np.ones_like(mu)
    # Neutralize degenerate scales so the division below cannot create NaN/inf
    # for otherwise finite data.
    sd[(sd == 0) | ~np.isfinite(sd)] = 1.0
    return (A - mu) / sd



In [17]:
# 1) Read both tables; the first CSV column becomes the index in each case.
sct = pd.read_csv(SCT_CSV, index_col=0)           # expression, (genes x cells)
coords_df = pd.read_csv(COORDS_CSV, index_col=0)  # one row per cell

# 2) Pick the coordinate columns: try the known (x, y) spellings in order and,
#    if none is present, fall back to the first two columns of the table.
for x_name, y_name in (("x", "y"), ("X", "Y")):
    if x_name in coords_df.columns and y_name in coords_df.columns:
        xy = coords_df[[x_name, y_name]].astype(float)
        break
else:
    xy = coords_df.iloc[:, :2].astype(float)

In [None]:
# Peek at the cell IDs that index the coordinate table (compare with sct.columns).
xy.index


Index(['AAACAAGTATCTCCCA-1', 'AAACAATCTACTAGCA-1', 'AAACACCAATAACTGC-1',
       'AAACAGAGCGACTCCT-1', 'AAACAGCTTTCAGAAG-1', 'AAACAGGGTCTATATT-1',
       'AAACAGTGTTCCTGGG-1', 'AAACATTTCCCGGATT-1', 'AAACCCGAACGAAATC-1',
       'AAACCGGGTAGGTACC-1',
       ...
       'TTGTGTATGCCACCAA-1', 'TTGTGTTTCCCGAAAG-1', 'TTGTTAGCAAATTCGA-1',
       'TTGTTCAGTGTGCTAC-1', 'TTGTTGTGTGTCAAGA-1', 'TTGTTTCACATCCAGG-1',
       'TTGTTTCATTAGTCTA-1', 'TTGTTTCCATACAACT-1', 'TTGTTTGTATTACACG-1',
       'TTGTTTGTGTAAATTC-1'],
      dtype='object', length=3611)

In [23]:
# Peek at the cell IDs that label the expression-matrix columns (compare with xy.index).
sct.columns

Index(['AAACAAGTATCTCCCA-1', 'AAACAATCTACTAGCA-1', 'AAACACCAATAACTGC-1',
       'AAACAGAGCGACTCCT-1', 'AAACAGCTTTCAGAAG-1', 'AAACAGGGTCTATATT-1',
       'AAACAGTGTTCCTGGG-1', 'AAACATTTCCCGGATT-1', 'AAACCCGAACGAAATC-1',
       'AAACCGGGTAGGTACC-1',
       ...
       'TTGTGTATGCCACCAA-1', 'TTGTGTTTCCCGAAAG-1', 'TTGTTAGCAAATTCGA-1',
       'TTGTTCAGTGTGCTAC-1', 'TTGTTGTGTGTCAAGA-1', 'TTGTTTCACATCCAGG-1',
       'TTGTTTCATTAGTCTA-1', 'TTGTTTCCATACAACT-1', 'TTGTTTGTATTACACG-1',
       'TTGTTTGTGTAAATTC-1'],
      dtype='object', length=3611)

In [24]:
# 3) align cells
# Keep only the cell IDs present in both tables and put BOTH tables in that
# same order: column j of the expression matrix must describe the same cell as
# row j of the coordinates, because they are turned into X_gc and D_coords and
# handed to dca() together.
cells = sct.columns.intersection(xy.index)
if len(cells) == 0:
    raise ValueError("No overlapping cell IDs between SCT and coords.")
sct = sct.loc[:, cells]   # cells are the COLUMNS of sct (genes x cells)
xy = xy.loc[cells]        # cells are the ROWS of xy
# Guard against duplicated IDs (or any other mismatch) that would leave the
# two tables misaligned without raising anywhere downstream.
if not sct.columns.equals(xy.index):
    raise ValueError("Cell IDs of SCT and coords could not be aligned one-to-one (duplicate IDs?).")

In [25]:
# Keep the row (gene) and column (cell) labels as plain Python lists; they are
# used to label the saved loadings and scores.
gene_names, cell_ids = sct.index.tolist(), sct.columns.tolist()

In [None]:
# 4) Assemble the two inputs for dca()
#    Expression matrix as a float array (genes x cells), standardized so that
#    each gene is z-scored across cells.
X_gc = zscore_rows(sct.to_numpy(dtype=float))

#    Pairwise Euclidean distances between cell coordinates, expanded from the
#    condensed pdist vector into a full symmetric (cells x cells) matrix.
coords_arr = xy.to_numpy(dtype=float)
D_coords = squareform(pdist(coords_arr, metric="euclidean"))


In [41]:
# Spot-check the standardized matrix: genes 1-4, columns 3608-3610
# (the last three columns when there are 3611 cells, as in the outputs above).
X_gc[1:5,3608:3611]

array([[ 0.57559585,  0.41838561, -0.52487585],
       [-0.15203232, -0.00305302, -0.74794951],
       [ 1.06652056,  0.855888  , -0.1972748 ],
       [-0.38534272, -0.38534272, -0.38534272]])

In [38]:
# Display the (cells x cells) Euclidean distance matrix built from the coordinates.
D_coords

array([[  0.        , 312.52509541, 261.75485784, ..., 233.98605801,
        226.07327453, 280.58423024],
       [312.52509541,   0.        , 311.00880392, ..., 231.83050434,
        377.51293566,  32.90441618],
       [261.75485784, 311.00880392,   0.        , ...,  79.48418109,
        101.71767168, 297.41063321],
       ...,
       [233.98605801, 231.83050434,  79.48418109, ...,   0.        ,
        157.10789965, 217.95469838],
       [226.07327453, 377.51293566, 101.71767168, ..., 157.10789965,
          0.        , 357.20683064],
       [280.58423024,  32.90441618, 297.41063321, ..., 217.95469838,
        357.20683064,   0.        ]])

In [42]:
# 5) run DCA
# Optimizer settings gathered in one place so they are easy to find and tweak.
dca_settings = dict(
    num_dca_dimensions=K,
    num_iters_per_dataset=1,
    num_iters_foreach_dim=30,
    num_stoch_batch_samples=100,
    num_samples_to_compute_stepwise_dcov=1000,
    percent_increase_criterion=0.01,
)

# NOTE(review): the parameter names suggest stochastic mini-batches and no seed
# is set in this notebook -- confirm which RNG dca() uses and seed it if
# run-to-run reproducibility matters.
U_list, dcovs = dca(
    Xs=[X_gc],      # list of datasets to project (genes x cells)
    Ds=[D_coords],  # list of dependent distance matrices (cells x cells)
    **dca_settings
)

dca dimension 1
   step 1: dcov = 0.9792576886112597
     batches:
....................................
   step 2: dcov = 14.382052696963436
     batches:
....................................
   step 3: dcov = 14.723123976332015
     batches:
....................................
   step 4: dcov = 14.012817165392418
     batches:
....................................
   step 5: dcov = 14.534091479931611
     batches:
....................................
   step 6: dcov = 13.124548934605906
     batches:
....................................
   step 7: dcov = 14.259298599777608
     batches:
....................................
   step 8: dcov = 15.125225968384552
     batches:
....................................
   step 9: dcov = 13.353159294634816
     batches:
....................................
   step 10: dcov = 15.184130585074234
     batches:
....................................
   step 11: dcov = 15.293092306990394
     batches:
....................................
   step 12: dc

In [43]:

# 6) outputs
# Only one dataset was passed to dca(), so its loadings are the first entry.
U = U_list[0]               # loadings, (genes x K)
# Project every cell onto the DCA dimensions:
# (cells x genes) times (genes x K) gives (cells x K).
Z = np.matmul(X_gc.T, U)

In [45]:
# 7) Save to CSV with labels
# Single place to change the output location (machine-specific absolute path).
out_dir = Path("/Users/nanxi/Documents/ZhangLab/Spatial_transcriptomics/Spatial_clusters/DCA/Output")
# DataFrame.to_csv does not create missing directories, so make sure it exists.
out_dir.mkdir(parents=True, exist_ok=True)

# Name the columns after the number of dimensions actually returned.
k_actual = U.shape[1]
cols = [f"DCA{i+1}" for i in range(k_actual)]

U_df = pd.DataFrame(U, index=gene_names, columns=cols)   # loadings (genes x K)
Z_df = pd.DataFrame(Z, index=cell_ids,  columns=cols)    # scores   (cells x K)

# Flatten to 1-D so there is one distance-covariance value per table row.
dcovs = np.asarray(dcovs).reshape(-1)
dc_df = pd.DataFrame({"dimension": [f"DCA{i+1}" for i in range(len(dcovs))],
                      "distance_covariance": dcovs})

U_df.to_csv(out_dir / "Sim3_U_DCA.csv")
Z_df.to_csv(out_dir / "Sim3_Z_DCA.csv")
dc_df.to_csv(out_dir / "Sim3_dcovs.csv", index=False)

In [None]:
# 8) Also save the raw arrays as .npy files in the current working directory.
npy_outputs = (
    ("U_DCA.npy", U),                  # gene loadings
    ("Z_DCA.npy", Z),                  # cell embeddings
    ("dcovs.npy", np.array(dcovs)),    # distance-covariance per DCA dim
)
for npy_name, npy_array in npy_outputs:
    np.save(npy_name, npy_array)
print("Saved: U_DCA.npy, Z_DCA.npy, dcovs.npy")