## Applies COMMOT to HVGs, SVGs, and unfiltered expression data set for human brain ST 

- matches cell barcodes in seurat and HVG/SVG matrices
- creates anndata object for COMMOT input
- runs COMMOT on different LR filtered SVG/HVG/unfiltered genes

author: @emilyekstrum
1/20/26

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import commot as ct
import os
import pyreadr
import rpy2
from rpy2.robjects.packages import importr
import anndata as ad
import copy
from collections import defaultdict
from scipy import sparse
import math
import matplotlib.pyplot as plt

np.random.seed(42)

### Prepare data for COMMOT input
- align by barcode 
- get non negative expression matrix

In [2]:
# load in human data HVGs and SVGs - using unLR filtered data
# sorted by p value in ascending order
hvg_gene_cell_mat = os.path.join("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/hvgs", "human_hvg_gene_cell_matrix.csv")
svg_gene_cell_mat = os.path.join("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/svgs", "nnSVG_human_svg_gene_cell_matrix.csv")

In [3]:
# check first lines of HVG and SVG gene cell matrices
hvg_df = pd.read_csv(hvg_gene_cell_mat, index_col=0)
svg_df = pd.read_csv(svg_gene_cell_mat, index_col=0)

hvg_df.head() # gene by spot matrix
#svg_df.head()

Unnamed: 0,AAACAAGTATCTCCCA.1,AAACAATCTACTAGCA.1,AAACACCAATAACTGC.1,AAACAGCTTTCAGAAG.1,AAACAGGGTCTATATT.1,AAACAGTGTTCCTGGG.1,AAACATTTCCCGGATT.1,AAACCCGAACGAAATC.1,AAACCGGGTAGGTACC.1,AAACCGTTCGTCCAGG.1,...,TTGTGGTGGTACTAAG.1,TTGTGTATGCCACCAA.1,TTGTGTTTCCCGAAAG.1,TTGTTAGCAAATTCGA.1,TTGTTCAGTGTGCTAC.1,TTGTTGTGTGTCAAGA.1,TTGTTTCACATCCAGG.1,TTGTTTCATTAGTCTA.1,TTGTTTGTATTACACG.1,TTGTTTGTGTAAATTC.1
MBP,1.609438,2.197225,4.70953,2.639057,3.401197,3.73767,1.386294,0.0,1.791759,4.477337,...,1.609438,3.178054,2.890372,2.484907,1.791759,2.995732,3.89182,4.110874,3.850148,0.0
PLP1,1.386294,0.693147,3.871201,1.609438,3.218876,3.7612,1.386294,1.098612,2.197225,3.988984,...,0.693147,1.94591,1.791759,1.098612,1.609438,1.098612,4.59512,3.610918,2.890372,1.098612
IGKC,1.098612,0.0,0.693147,2.197225,0.693147,0.0,2.079442,0.0,1.098612,0.0,...,1.098612,0.0,0.0,0.693147,0.693147,1.386294,0.0,1.791759,0.0,0.0
NPY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GFAP,0.0,2.302585,2.995732,0.0,1.386294,2.70805,0.0,0.693147,1.386294,3.044522,...,1.386294,0.693147,2.079442,0.0,0.0,0.0,1.94591,3.583519,2.564949,0.693147


In [4]:
# convert matrices to gene lists
SVG = set(svg_df.index.tolist())
HVG = set(hvg_df.index.tolist())

In [5]:
# look at lists
print(f"First 10 HVGs: {list(HVG)[:10]}")
print(f"First 10 SVGs: {list(SVG)[:10]}")

First 10 HVGs: ['TAF7', 'GTF2H1', 'PCSK7', 'CSF1', 'TBRG1', 'TAZ', 'MRPS12', 'SRSF8', 'ENPP6', 'FBXW7']
First 10 SVGs: ['TAF7', 'TBRG1', 'SRSF8', 'MRPS12', 'FBXW7', 'ZBTB44', 'IMP4', 'KCNMB4', 'MAF1', 'CDK17']


In [6]:
# get non negative expression and coordiantes from human seurat

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

# load in seurat object
ro.r('library(Seurat)')
ro.r('obj <- readRDS("/Users/emilyekstrum/repos/zhangLab_Rotation/data/processed/seurat_objs/humanbrain_seurat.rds")')

R callback write-console: Loading required package: SeuratObject
  
R callback write-console: Loading required package: sp
  
R callback write-console: 
Attaching package: ‘SeuratObject’

  
R callback write-console: The following objects are masked from ‘package:base’:

    intersect, t

  



    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [7]:
# look for images slot
print(ro.r('head(colnames(obj@meta.data), 50)'))  # show first 50 meta columns

 [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
 [4] "...1"              "imagerow"          "imagecol"         
 [7] "Manual.annotation" "percent.mt"        "nCount_SCT"       
[10] "nFeature_SCT"     



In [None]:
# crashing kernel 

pat = ro.r('''
md <- obj@meta.data
cells <- colnames(obj)
bc <- as.character(md[["...1"]])

bc <- sub("^X", "", bc)
bc <- gsub("\\.", "-", bc)

list(
  cell_example = head(cells, 10),
  bc_example = head(bc, 10),
  cell_has_prefix = any(grepl("_", cells)),
  bc_has_suffix = any(grepl("-1$", bc)),
  cell_has_suffix = any(grepl("-1$", cells))
)
''')

print("cell examples:", list(pat.rx2("cell_example")))
print("bc examples:", list(pat.rx2("bc_example")))
print("cell_has_prefix '_' :", bool(pat.rx2("cell_has_prefix")[0]))
print("bc_has_suffix '-1':", bool(pat.rx2("bc_has_suffix")[0]))
print("cell_has_suffix '-1':", bool(pat.rx2("cell_has_suffix")[0]))

: 

In [None]:


# 3) (Optional) Pull aligned barcodes + coords into Python
barcodes = pd.Index([str(x) for x in ro.r('colnames(obj)')])

coords = ro.r('''
md <- obj@meta.data
md[, c("imagecol","imagerow")]
''')
coords = pandas2ri.rpy2py(coords)
coords.index = coords.index.astype(str)
coords = coords.loc[barcodes, :]  # enforce identical ordering
coords.columns = ["x", "y"]

print("barcodes n:", len(barcodes))
print("coords shape:", coords.shape)
print("coords head:\n", coords.head())


In [10]:
# get seurat barcodes
seurat_barcodes = list(ro.r('rownames(obj@meta.data)'))
seurat_barcodes = [str(x) for x in seurat_barcodes]

print("Seurat n spots:", len(seurat_barcodes))
print("CSV n spots:", svg_df.shape[1])

Seurat n spots: 2897
CSV n spots: 2897


In [11]:
# map csv columns to seurat barcodes by order
assert svg_df.shape[1] == len(seurat_barcodes), "Counts differ —> can't map by order"

# rename csv columns to seurat barcodes
svg_df.columns = seurat_barcodes
hvg_df.columns = seurat_barcodes

In [12]:
# Get meta.data with rownames kept as a column
md = ro.r('''
md <- obj@meta.data
md$barcode <- rownames(md)
md[, c("barcode", "cell_id", "imagecol", "imagerow", "x", "y")]
''')
md = pandas2ri.rpy2py(md)

# check types 
md["barcode"] = md["barcode"].astype(str)
md["cell_id"] = md["cell_id"].astype(str)
md = md.dropna(subset=["cell_id"])

# check matches between svg spot ids and meta data cell ids/barcodes
spot_ids_csv = pd.Index(svg_df.columns.astype(str))

# check matches
n_match_cellid = spot_ids_csv.isin(pd.Index(md["cell_id"])).sum()
n_match_barcode = spot_ids_csv.isin(pd.Index(md["barcode"])).sum()

print("CSV -> cell_id matches:", n_match_cellid, "of", len(spot_ids_csv))
print("CSV -> barcode matches:", n_match_barcode, "of", len(spot_ids_csv))

R callback write-console: Error in `[.data.frame`(md, , c("barcode", "cell_id", "imagecol", "imagerow",  : 
  undefined columns selected
  


RRuntimeError: Error in `[.data.frame`(md, , c("barcode", "cell_id", "imagecol", "imagerow",  : 
  undefined columns selected
