## Notebook to compute gene.activity using cicero

Last updated: 2/19/2024
Author: Yang-Joon Kim

- Goal: find the description from the cicero webpage.

- inputs: 
    - a Seurat object
    - cicero output (connections)
    - genome annotation (GTF)
    
- output: 


In [2]:
.libPaths("/hpc/scratch/group.data.science/yangjoon.kim/.local/R_lib")
withr::with_libpaths(new = "/hpc/scratch/group.data.science/yangjoon.kim/.local/R_lib", library(monocle3))
withr::with_libpaths(new = "/hpc/scratch/group.data.science/yangjoon.kim/.local/R_lib", library(cicero))

# load other libraries
#library(cicero)
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(readr)

Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading

In [3]:
# parallelization in Signac: https://stuartlab.org/signac/articles/future
library(future)
plan()

plan("multicore", workers = 20)
plan()

# set the max memory size for the future
options(future.globals.maxSize = 256 * 1024 ^ 3) # for 256 Gb RAM

In [4]:
seurat_object_path <- "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/01_Signac_processed/TDR118reseq/TDR118_processed.RDS" 
cicero_path <- "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/"
gref_path <- "/hpc/reference/sequencing_alignment/alignment_references/zebrafish_genome_GRCz11/genes/genes.gtf.gz"
assay <- "peaks_merged" 
data_id <- "TDR118reseq"
peaktype <- "peaks_merged"

In [5]:
# Step 1. import a seurat object and convert it to CDS object
seurat_object <- readRDS(seurat_object_path)
DefaultAssay(seurat_object) <- assay
print(paste0("default assay is ", assay))

# conver to CellDataSet (CDS) format
seurat_object.cds <- as.cell_data_set(x=seurat_object) # a function from SeuratWrappers
print("cds object created")

[1] "default assay is peaks_merged"


"non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor."
"non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor."
"non-NULL 'rownames(value[[2]])' should be the same as 'colnames(x)' for
'reducedDims<-'. This will be an error in the next release of
Bioconductor."
"non-NULL 'rownames(value[[4]])' should be the same as 'colnames(x)' for
'reducedDims<-'. This will be an error in the next release of
Bioconductor."


[1] "cds object created"


In [6]:
# Reformat the CDS object with feature-level metadata
# Step 1: Extract and process row names
site_names <- rownames(seurat_object.cds)
chr_bp_info <- strsplit(site_names, "-")
chr <- sapply(chr_bp_info, function(x) x[1])
bp1 <- sapply(chr_bp_info, function(x) x[2])
bp2 <- sapply(chr_bp_info, function(x) x[3])

# Step 2: Calculate num_cells_expressed
# Assuming 'counts' assay is used to calculate expression
counts_matrix <- counts(seurat_object.cds)
num_cells_expressed <- rowSums(counts_matrix > 0)

# Step 3: Update rowData
rowData(seurat_object.cds)$site_name <- site_names
rowData(seurat_object.cds)$chr <- chr #paste0("chr", chr)
rowData(seurat_object.cds)$bp1 <- as.numeric(bp1)
rowData(seurat_object.cds)$bp2 <- as.numeric(bp2)
rowData(seurat_object.cds)$num_cells_expressed <- num_cells_expressed

# Verify the updated rowData
head(rowData(seurat_object.cds))

DataFrame with 6 rows and 5 columns
                  site_name         chr       bp1       bp2 num_cells_expressed
                <character> <character> <numeric> <numeric>           <integer>
1-3427-4032     1-3427-4032           1      3427      4032                 461
1-4473-6136     1-4473-6136           1      4473      6136                1734
1-11020-12944 1-11020-12944           1     11020     12944                4789
1-13368-13575 1-13368-13575           1     13368     13575                 393
1-14640-15050 1-14640-15050           1     14640     15050                1565
1-16769-17193 1-16769-17193           1     16769     17193                1898

In [7]:
# head(rowData(seurat_object.cds))

In [8]:
# Step 2. import the cicero connections (result of run_cicero)
library(readr)
conns_filepath = paste0(cicero_path, "02_", data_id, "_cicero_connections_",peaktype, "_peaks.csv")
conns <- read_csv(conns_filepath, col_types = cols(.default = col_guess(), `...1` = col_skip()))
head(conns)

[1m[22mNew names:
[36m•[39m `` -> `...1`


Peak1,Peak2,coaccess
<chr>,<chr>,<dbl>
1-10000286-10000789,1-9753075-9753596,0.0
1-10000286-10000789,1-9759496-9760011,-0.005966881
1-10000286-10000789,1-9764309-9764532,-0.005358352
1-10000286-10000789,1-9768426-9768767,0.002992008
1-10000286-10000789,1-9770155-9770487,-0.069031148
1-10000286-10000789,1-9771797-9771996,-0.008605915


In [9]:
# Step 3. import the gene annotation
gene_anno <- rtracklayer::readGFF(gref_path)
head(gene_anno)

Unnamed: 0_level_0,seqid,source,type,start,end,score,strand,phase,gene_id,gene_version,⋯,transcript_version,transcript_name,transcript_source,transcript_biotype,exon_number,exon_id,exon_version,protein_id,protein_version,tag
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,4,ensembl,gene,17308,18211,,-,,ENSDARG00000102141,2,⋯,,,,,,,,,,
2,4,ensembl,transcript,17308,18211,,-,,ENSDARG00000102141,2,⋯,2.0,ptpn12-201,ensembl,protein_coding,,,,,,
3,4,ensembl,exon,18134,18211,,-,,ENSDARG00000102141,2,⋯,2.0,ptpn12-201,ensembl,protein_coding,1.0,ENSDARE00001173708,2.0,,,
4,4,ensembl,CDS,18134,18211,,-,0.0,ENSDARG00000102141,2,⋯,2.0,ptpn12-201,ensembl,protein_coding,1.0,,,ENSDARP00000130978,1.0,
5,4,ensembl,exon,17948,18046,,-,,ENSDARG00000102141,2,⋯,2.0,ptpn12-201,ensembl,protein_coding,2.0,ENSDARE00001162488,1.0,,,
6,4,ensembl,CDS,17948,18046,,-,0.0,ENSDARG00000102141,2,⋯,2.0,ptpn12-201,ensembl,protein_coding,2.0,,,ENSDARP00000130978,1.0,


In [10]:
head(gene_anno$transcript_id)

In [11]:
colnames(gene_anno)

In [12]:
# rename some columns to match requirements
gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
gene_anno$gene <- gene_anno$gene_id
#gene_anno$transcript_id <- gene_anno$gene_id # rename the transcript_id using the gene_id
gene_anno$transcript <- gene_anno$transcript_id
gene_anno$symbol <- gene_anno$gene_name


In [13]:
# # visualize the connections to get a rough sense of the data
# plot_connections(conns, "chr4", 21630000, 21750000,
#                  gene_model = gene_anno, 
#                  coaccess_cutoff = .8, 
#                  connection_width = .5, 
#                  collapseTranscripts = "longest" )

## NOTES on CCANs
The function generate_ccans has one optional input called coaccess_cutoff_override. When coaccess_cutoff_override is NULL, the function will determine and report an appropriate co-accessibility score cutoff value for CCAN generation based on the number of overall CCANs at varying cutoffs. You can also set coaccess_cutoff_override to be a numeric between 0 and 1, to override the cutoff-finding part of the function. This option is useful if you feel that the cutoff found automatically was too strict or loose, or for speed if you are rerunning the code and know what the cutoff will be, since the cutoff finding procedure can be slow.

In [14]:
# Step 4. compute CCANs (cis-Co-Accessibility Networks: a community/cluster of highly co-accessible peaks)
CCAN_assigns <- generate_ccans(conns)

head(CCAN_assigns)

[1] "Coaccessibility cutoff used: 0.42"


Unnamed: 0_level_0,Peak,CCAN
Unnamed: 0_level_1,<chr>,<dbl>
1-10079409-10080205,1-10079409-10080205,1
1-10084036-10084471,1-10084036-10084471,2
1-10090633-10091664,1-10090633-10091664,1
1-10094078-10094478,1-10094078-10094478,1
1-10097926-10098708,1-10097926-10098708,2
1-10100886-10101536,1-10100886-10101536,2


In [15]:
# Step 5. compute the gene activity score using cicero results

# Step 5-1. reformat the gene_anno to anntoate the CDS for each gene. 
# If not annotated, we'll use the first exon

# Add a column for the pData table indicating the gene if a peak is a promoter ####
# Create a gene annotation set that only marks the transcription start sites of 
# the genes. We use this as a proxy for promoters.
# To do this we need the first exon of each transcript
pos <- subset(gene_anno, strand == "+")
pos <- pos[order(pos$start),] 
# remove all but the first exons per transcript
pos <- pos[!duplicated(pos$transcript),] 
# make a 1 base pair marker of the TSS
pos$end <- pos$start + 1 

neg <- subset(gene_anno, strand == "-")
neg <- neg[order(neg$start, decreasing = TRUE),] 
# remove all but the first exons per transcript
neg <- neg[!duplicated(neg$transcript),] 
neg$start <- neg$end - 1

gene_annotation_sub <- rbind(pos, neg)

# Make a subset of the TSS annotation columns containing just the coordinates 
# and the gene name
gene_annotation_sub <- gene_annotation_sub[,c("chromosome", "start", "end", "symbol")]

# Remove the 'chr' prefix from the 'chromosome' column in 'gene_annotation_sub'
gene_annotation_sub$chromosome <- gsub("chr", "", gene_annotation_sub$chr)

# Rename the gene symbol column to "gene"
names(gene_annotation_sub)[4] <- "gene"

gene_annotation_sub

Unnamed: 0_level_0,chromosome,start,end,gene
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
1153638,MT,951,952,NC_002333.23
1153639,MT,951,952,NC_002333.23
1153642,MT,1020,1021,NC_002333.17
533251,8,1239,1240,tmed7
533264,8,1239,1240,tmed7
533277,8,1314,1315,tmed7
533290,8,1314,1315,tmed7
440575,16,1471,1472,CABZ01090785.1
709896,18,1615,1616,homer2
533303,8,1715,1716,tmed7


In [16]:
subset_df <- subset(gene_annotation_sub, gene == "myf5")
subset_df

Unnamed: 0_level_0,chromosome,start,end,gene
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
26367,4,21741228,21741229,myf5
26378,4,21741228,21741229,myf5


In [17]:
seurat_object.cds <- annotate_cds_by_site(seurat_object.cds, gene_annotation_sub)

tail(fData(seurat_object.cds))

DataFrame with 6 rows and 7 columns
                              site_name         chr       bp1       bp2
                            <character> <character> <numeric> <numeric>
9-56297368-56298245 9-56297368-56298245           9  56297368  56298245
9-56298513-56299501 9-56298513-56299501           9  56298513  56299501
9-56396807-56397687 9-56396807-56397687           9  56396807  56397687
9-56400365-56401096 9-56400365-56401096           9  56400365  56401096
9-56411957-56412701 9-56411957-56412701           9  56411957  56412701
9-56433462-56434391 9-56433462-56434391           9  56433462  56434391
                    num_cells_expressed   overlap        gene
                              <integer> <integer> <character>
9-56297368-56298245                 145        NA          NA
9-56298513-56299501                 132        NA          NA
9-56396807-56397687                 129        NA          NA
9-56400365-56401096                 465        NA          NA
9-56411957-56412

In [18]:
head(fData(seurat_object.cds))

DataFrame with 6 rows and 7 columns
                  site_name         chr       bp1       bp2 num_cells_expressed
                <character> <character> <numeric> <numeric>           <integer>
1-3427-4032     1-3427-4032           1      3427      4032                 461
1-4473-6136     1-4473-6136           1      4473      6136                1734
1-11020-12944 1-11020-12944           1     11020     12944                4789
1-13368-13575 1-13368-13575           1     13368     13575                 393
1-14640-15050 1-14640-15050           1     14640     15050                1565
1-16769-17193 1-16769-17193           1     16769     17193                1898
                overlap        gene
              <integer> <character>
1-3427-4032          NA          NA
1-4473-6136          NA          NA
1-11020-12944         2       rpl24
1-13368-13575        NA          NA
1-14640-15050        NA          NA
1-16769-17193        NA          NA

In [19]:
# Check which entries got their genes(TSS) mapped
# Access fData from the CDS object
cds_fData <- fData(seurat_object.cds)

# Find indices where 'overlap' is not NA
non_na_indices <- which(!is.na(cds_fData$overlap))

# Subset the CDS object to keep only those features
filtered_cds <- seurat_object.cds[non_na_indices, ]

tail(fData(filtered_cds))

DataFrame with 6 rows and 7 columns
                              site_name         chr       bp1       bp2
                            <character> <character> <numeric> <numeric>
9-30160875-30161584 9-30160875-30161584           9  30160875  30161584
9-31346542-31347347 9-31346542-31347347           9  31346542  31347347
9-33477233-33477679 9-33477233-33477679           9  33477233  33477679
9-35017696-35018558 9-35017696-35018558           9  35017696  35018558
9-38369524-38370248 9-38369524-38370248           9  38369524  38370248
9-48455731-48456378 9-48455731-48456378           9  48455731  48456378
                    num_cells_expressed   overlap        gene
                              <integer> <integer> <character>
9-30160875-30161584                  98         2     abi3bpa
9-31346542-31347347                 292         2  BX571774.1
9-33477233-33477679                 131         2       caska
9-35017696-35018558                 167         2       gabpa
9-38369524-38370

### Checking the gRanges objects from CDS object and gene_anno object

- This was for the debugging steps ("chr" in the dataframe) - extracted from the annotate_cds_by_site source code (cicero).

In [20]:
# granges <- ranges_for_coords(rownames(fData(seurat_object.cds)), with_names = TRUE)
# granges

In [21]:
# feature_data <- gene_annotation_sub

# names(feature_data)[c(1,2,3)] <- c("chr", "start", "stop")
# # Remove the 'chr' prefix from the 'chr' column in 'feature_data'
# feature_data$chr <- gsub("chr", "", feature_data$chr)

# dtt <- GenomicRanges::makeGRangesFromDataFrame(feature_data,
#                                                    keep.extra.columns = TRUE)
# dtt

In [22]:
# ol <- GenomicRanges::findOverlaps(granges, dtt, select = "all",
#                                   maxgap = 0)
# ol

In [23]:
# Check if there are any NA values in the 'coaccess' column
anyNA_conns <- any(is.na(conns$coaccess))
print(paste("Are there any NA values in 'coaccess'? ", anyNA_conns))

# Check if there are any infinite values in the 'coaccess' column
anyInf_conns <- any(is.infinite(conns$coaccess))
print(paste("Are there any infinite values in 'coaccess'? ", anyInf_conns))

[1] "Are there any NA values in 'coaccess'?  TRUE"
[1] "Are there any infinite values in 'coaccess'?  FALSE"


In [24]:
# # Subset 'conns' to only include rows where 'coaccess' is NA
# na_coaccess_entries <- conns[is.na(conns$coaccess), ]

# # View the entries with NA 'coaccess' values
# head(na_coaccess_entries)


In [25]:
# debug(cicero:::make_sparse_matrix)

In [50]:
# undebug(cicero:::make_sparse_matrix)


In [None]:
1

In [26]:
# Step 5-2.Generate gene activity scores

# Check if there are any NA values in the 'coaccess' column
anyNA_conns <- any(is.na(conns$coaccess))
print(paste("Are there any NA values in 'coaccess'? ", anyNA_conns))

# Check if there are any infinite values in the 'coaccess' column
anyInf_conns <- any(is.infinite(conns$coaccess))
print(paste("Are there any infinite values in 'coaccess'? ", anyInf_conns))

# Remove rows with NA or infinite values in 'coaccess'
conns_clean <- conns[!is.na(conns$coaccess) & !is.infinite(conns$coaccess), ]

[1] "Are there any NA values in 'coaccess'?  TRUE"
[1] "Are there any infinite values in 'coaccess'?  FALSE"


In [37]:
unnorm_ga <- build_gene_activity_matrix(seurat_object.cds, conns_clean)

ERROR: Error in if (!class(data$x) %in% c("numeric", "integer")) stop("x.name column must be numeric"): the condition has length > 1


In [37]:
# Step 5-2.Generate gene activity scores

# Check if there are any NA values in the 'coaccess' column
anyNA_conns <- any(is.na(conns$coaccess))
print(paste("Are there any NA values in 'coaccess'? ", anyNA_conns))

# Check if there are any infinite values in the 'coaccess' column
anyInf_conns <- any(is.infinite(conns$coaccess))
print(paste("Are there any infinite values in 'coaccess'? ", anyInf_conns))

# Remove rows with NA or infinite values in 'coaccess'
conns_clean <- conns[!is.na(conns$coaccess) & !is.infinite(conns$coaccess), ]

# make sure that the "conns_clean" has numeric values in "coaccess"

# generate unnormalized gene activity matrix
unnorm_ga <- build_gene_activity_matrix(seurat_object.cds, conns_clean)

# remove any rows/columns with all zeroes
unnorm_ga <- unnorm_ga[!Matrix::rowSums(unnorm_ga) == 0, 
                       !Matrix::colSums(unnorm_ga) == 0]

# make a list of num_genes_expressed
num_genes <- pData(seurat_object.cds)$num_genes_expressed
names(num_genes) <- row.names(pData(seurat_object.cds))

# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)

# # if you had two datasets to normalize, you would pass both:
# # num_genes should then include all cells from both sets
# unnorm_ga2 <- unnorm_ga
# cicero_gene_activities <- normalize_gene_activities(list(unnorm_ga, unnorm_ga2), 
#                                                     num_genes)

[1] "Are there any NA values in 'coaccess'?  TRUE"
[1] "Are there any infinite values in 'coaccess'?  FALSE"
debugging in: make_sparse_matrix(nonneg_cons, x.name = "coaccess")
debug: {
    if (!i.name %in% names(data) | !j.name %in% names(data) | 
        !x.name %in% names(data)) {
        stop("i.name, j.name, and x.name must be columns in data")
    }
    data$i <- as.character(data[, i.name])
    data$j <- as.character(data[, j.name])
    data$x <- data[, x.name]
    if (!class(data$x) %in% c("numeric", "integer")) 
        stop("x.name column must be numeric")
    peaks <- data.frame(Peak = unique(c(data$i, data$j)), index = seq_len(length(unique(c(data$i, 
        data$j)))))
    data <- data[, c("i", "j", "x")]
    data <- rbind(data, data.frame(i = peaks$Peak, j = peaks$Peak, 
        x = 0))
    data <- data[!duplicated(data[, c("i", "j", "x")]), ]
    data <- data.table::as.data.table(data)
    peaks <- data.table::as.data.table(peaks)
    data.table::setkey(data, "i")
    da

ERROR: Error in if (!class(data$x) %in% c("numeric", "integer")) stop("x.name column must be numeric"): the condition has length > 1


## Debugging "build_gene_activity_matrix" function in cicero

In [67]:
save(seurat_object.cds, 
     file='/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/10_TDR118_monocle3.cds')


In [68]:
write.table(conns_clean, file = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/11_TDR118_cicero_conns_noNA.csv")

In [61]:
# save the intermediate objects
# 1) CDS object
# save_monocle_objects(cds=seurat_object.cds, 
#                      directory_path='/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/10_TDR118_monocle3.cds', 
#                      comment='This is the intermediate TDR118 cds. Stored 2024-02-21.')
save(seurat_object.cds, 
     file='/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/10_TDR118_monocle3.cds')

# 2) cicero object
write.table(conns_clean, file = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/02_cicero_processed/TDR118reseq_cicero/11_TDR118_cicero_conns_noNA.csv")

# 3) 

ERROR: Error in cds@reduce_dim_aux[[reduction_method]][["nn_index"]]: subscript out of bounds


In [60]:
head(conns_clean)

Peak1,Peak2,coaccess
<chr>,<chr>,<dbl>
1-10000286-10000789,1-9753075-9753596,0.0
1-10000286-10000789,1-9759496-9760011,-0.005966881
1-10000286-10000789,1-9764309-9764532,-0.005358352
1-10000286-10000789,1-9768426-9768767,0.002992008
1-10000286-10000789,1-9770155-9770487,-0.069031148
1-10000286-10000789,1-9771797-9771996,-0.008605915


In [None]:
unnorm_ga <- build_gene_activity_matrix(seurat_object.cds, conns_clean)

In [52]:
build_composite_gene_activity_matrix <- function(input_cds,
                                                 site_weights,
                                                 cicero_cons_info,
                                                 dist_thresh=250000,
                                                 coaccess_cutoff=0.25) {
    accessibility_mat <- exprs(input_cds)
    promoter_peak_table <- fData(input_cds)
    promoter_peak_table$peak <- as.character(row.names(promoter_peak_table))
    promoter_peak_table <-
        promoter_peak_table[!is.na(promoter_peak_table$gene),]
    promoter_peak_table <- promoter_peak_table[,c("peak", "gene")]
    promoter_peak_table$gene <- as.character(promoter_peak_table$gene)

    # Make site_weight matrix
    site_names <- names(site_weights)
    site_weights <- as(Matrix::Diagonal(x=as.numeric(site_weights)),
                      "sparseMatrix")
    row.names(site_weights) <- site_names
    colnames(site_weights) <- site_names

    # Find distance between cicero peaks. If distance already calculated, skip
    if ("dist" %in% colnames(cicero_cons_info) == FALSE) {
        Peak1_cols <- split_peak_names(cicero_cons_info$Peak1)
        Peak2_cols <- split_peak_names(cicero_cons_info$Peak2)
        Peak1_bp <- round((as.integer(Peak1_cols[,3]) +
                          as.integer(Peak1_cols[,2])) / 2)
        Peak2_bp <- round((as.integer(Peak2_cols[,3]) +
                          as.integer(Peak2_cols[,2])) / 2)
        cicero_cons_info$dist <- abs(Peak2_bp - Peak1_bp)
    }

    # Get connections between promoters and distal sites above coaccess
    # threshold
    nonneg_cons <-
        cicero_cons_info[(cicero_cons_info$Peak1 %in%
                          promoter_peak_table$peak |
                          cicero_cons_info$Peak2 %in%
                          promoter_peak_table$peak) &
                          cicero_cons_info$coaccess >= coaccess_cutoff &
                          cicero_cons_info$dist < dist_thresh,]
    nonneg_cons <- nonneg_cons[,c("Peak1", "Peak2", "coaccess")]
    nonneg_cons <- nonneg_cons[!duplicated(nonneg_cons),]

    nonneg_cons$Peak1 <- as.character(nonneg_cons$Peak1)
    nonneg_cons$Peak2 <- as.character(nonneg_cons$Peak2)

    nonneg_cons <- rbind(nonneg_cons,
                        data.frame(Peak1=unique(promoter_peak_table$peak),
                                   Peak2=unique(promoter_peak_table$peak),
                                   coaccess=0))

    # Make square matrix of connections from distal to proximal
    distal_connectivity_matrix <- make_sparse_matrix(nonneg_cons,
                                                    x.name="coaccess")

    # Make connectivity matrix of promoters versus all
    promoter_conn_matrix <-
        distal_connectivity_matrix[unique(promoter_peak_table$peak),]

    # Get list of promoter and distal sites in accessibility mat
    promoter_safe_sites <- intersect(rownames(promoter_conn_matrix),
                                     row.names(accessibility_mat))
    distal_safe_sites <- intersect(colnames(promoter_conn_matrix),
                                     row.names(accessibility_mat))
    distal_safe_sites <- setdiff(distal_safe_sites, promoter_safe_sites)

    # Get accessibility info for promoters
    promoter_access_mat_in_cicero_map <- accessibility_mat[promoter_safe_sites,, drop=FALSE]

    # Get accessibility for distal sites
    distal_activity_scores <- accessibility_mat[distal_safe_sites,, drop=FALSE]

    # Scale connectivity matrix by site_weights
    scaled_site_weights <- site_weights[distal_safe_sites,distal_safe_sites, drop=FALSE]
    total_linked_site_weights <- promoter_conn_matrix[,distal_safe_sites, drop=FALSE] %*%
        scaled_site_weights
    total_linked_site_weights <- 1/Matrix::rowSums(total_linked_site_weights,
                                                na.rm=TRUE)
    total_linked_site_weights[is.finite(total_linked_site_weights) == FALSE] <- 0
    total_linked_site_weights[is.na(total_linked_site_weights)] <- 0
    total_linked_site_weights[is.nan(total_linked_site_weights)] <- 0
    site_names <- names(total_linked_site_weights)
    total_linked_site_weights <- Matrix::Diagonal(x=total_linked_site_weights)
    row.names(total_linked_site_weights) <- site_names
    colnames(total_linked_site_weights) <- site_names
    scaled_site_weights <- total_linked_site_weights %*%
        promoter_conn_matrix[,distal_safe_sites, drop=FALSE] %*%
        scaled_site_weights
    scaled_site_weights@x[scaled_site_weights@x > 1] <- 1

    # Multiply distal accessibility by site weights
    distal_activity_scores <- scaled_site_weights %*% distal_activity_scores

    distal_activity_scores <-
        distal_activity_scores[row.names(promoter_access_mat_in_cicero_map),, drop=FALSE]

    # Sum distal and promoter scores
    promoter_activity_scores <- distal_activity_scores +
        promoter_access_mat_in_cicero_map

    # Make and populate final matrix
    promoter_gene_mat <-
        Matrix::sparseMatrix(j=as.numeric(factor(promoter_peak_table$peak)),
                             i=as.numeric(factor(promoter_peak_table$gene)),
                             x=1)
    colnames(promoter_gene_mat) = levels(factor(promoter_peak_table$peak))
    row.names(promoter_gene_mat) = levels(factor(promoter_peak_table$gene))
    promoter_gene_mat <- promoter_gene_mat[,row.names(promoter_activity_scores)]
    gene_activity_scores <- promoter_gene_mat %*% promoter_activity_scores

    return(gene_activity_scores)
}


## 02/21/2024 --- up to this point
bug report:     distal_activity_scores <-
        distal_activity_scores[row.names(promoter_access_mat_in_cicero_map),, drop=FALSE]

    # Sum distal and promoter scores
    promoter_activity_scores <- distal_activity_scores +
        promoter_access_mat_in_cicero_map

In [None]:
# Make connectivity matrix of promoters versus all
promoter_conn_matrix <-
    distal_connectivity_matrix[unique(promoter_peak_table$peak),]

# Get list of promoter and distal sites in accessibility mat
promoter_safe_sites <- intersect(rownames(promoter_conn_matrix),
                                 row.names(accessibility_mat))
distal_safe_sites <- intersect(colnames(promoter_conn_matrix),
                                 row.names(accessibility_mat))
distal_safe_sites <- setdiff(distal_safe_sites, promoter_safe_sites)

# Get accessibility info for promoters
promoter_access_mat_in_cicero_map <- accessibility_mat[promoter_safe_sites,, drop=FALSE]

# Get accessibility for distal sites
distal_activity_scores <- accessibility_mat[distal_safe_sites,, drop=FALSE]

# Scale connectivity matrix by site_weights
scaled_site_weights <- site_weights[distal_safe_sites,distal_safe_sites, drop=FALSE]
total_linked_site_weights <- promoter_conn_matrix[,distal_safe_sites, drop=FALSE] %*%
    scaled_site_weights
total_linked_site_weights <- 1/Matrix::rowSums(total_linked_site_weights,
                                            na.rm=TRUE)
total_linked_site_weights[is.finite(total_linked_site_weights) == FALSE] <- 0
total_linked_site_weights[is.na(total_linked_site_weights)] <- 0
total_linked_site_weights[is.nan(total_linked_site_weights)] <- 0
site_names <- names(total_linked_site_weights)
total_linked_site_weights <- Matrix::Diagonal(x=total_linked_site_weights)
row.names(total_linked_site_weights) <- site_names
colnames(total_linked_site_weights) <- site_names
scaled_site_weights <- total_linked_site_weights %*%
    promoter_conn_matrix[,distal_safe_sites, drop=FALSE] %*%
    scaled_site_weights
scaled_site_weights@x[scaled_site_weights@x > 1] <- 1

# Multiply distal accessibility by site weights
distal_activity_scores <- scaled_site_weights %*% distal_activity_scores

distal_activity_scores <-
    distal_activity_scores[row.names(promoter_access_mat_in_cicero_map),, drop=FALSE]

# Sum distal and promoter scores
promoter_activity_scores <- distal_activity_scores +
    promoter_access_mat_in_cicero_map

# Make and populate final matrix
promoter_gene_mat <-
    Matrix::sparseMatrix(j=as.numeric(factor(promoter_peak_table$peak)),
                         i=as.numeric(factor(promoter_peak_table$gene)),
                         x=1)
colnames(promoter_gene_mat) = levels(factor(promoter_peak_table$peak))
row.names(promoter_gene_mat) = levels(factor(promoter_peak_table$gene))
promoter_gene_mat <- promoter_gene_mat[,row.names(promoter_activity_scores)]
gene_activity_scores <- promoter_gene_mat %*% promoter_activity_scores

return(gene_activity_scores)

In [55]:
# split_peak_names <- function(peak_names) {
#   # Split the peak names at each '-' and convert the result into a data frame
#   split_names <- strsplit(as.character(peak_names), "-")
  
#   # Extract components: chromosome, start, and end positions
#   # Assuming each peak name is in the format "chromosome-start-end"
#   chromosome <- sapply(split_names, function(x) x[1])
#   start <- sapply(split_names, function(x) as.integer(x[2]))
#   end <- sapply(split_names, function(x) as.integer(x[3]))
  
#   # Combine these components into a new data frame
#   df <- data.frame(
#     chromosome = chromosome,
#     start = start,
#     end = end,
#     stringsAsFactors = FALSE  # Ensure character columns aren't converted to factors
#   )
  
#   return(df)
# }

In [None]:
split_peak_names <- function(inp) {
  out <- stringr::str_split_fixed(stringi::stri_reverse(inp), 
                                  ":|-|_", 3)
  out[,1] <- stringi::stri_reverse(out[,1])
  out[,2] <- stringi::stri_reverse(out[,2])
  out[,3] <- stringi::stri_reverse(out[,3])
  out[,c(3,2,1), drop=FALSE]
}


In [36]:
library(data.table)
library(stringr)
setDF(conns_clean)


Attaching package: 'data.table'


The following object is masked from 'package:SummarizedExperiment':

    shift


The following object is masked from 'package:GenomicRanges':

    shift


The following object is masked from 'package:IRanges':

    shift


The following objects are masked from 'package:S4Vectors':

    first, second




In [32]:
head(conns_clean)

Peak1,Peak2,coaccess
<chr>,<chr>,<dbl>
1-10000286-10000789,1-9753075-9753596,0.0
1-10000286-10000789,1-9759496-9760011,-0.005966881
1-10000286-10000789,1-9764309-9764532,-0.005358352
1-10000286-10000789,1-9768426-9768767,0.002992008
1-10000286-10000789,1-9770155-9770487,-0.069031148
1-10000286-10000789,1-9771797-9771996,-0.008605915


In [56]:
# Let's break down this function piece by piece
gene_promoter_activity <- build_composite_gene_activity_matrix(seurat_object.cds,
                                         site_weights=NULL,
                                         conns_clean,
                                         dist_thresh=250000,
                                         coaccess_cutoff=0.25)
gene_promoter_activity

In [57]:
# debugging by breaking apart the "build_composite_gene_activity_matrix" function

# define the input parameters
input_cds <- seurat_object.cds
site_weights <- NULL
cicero_cons_info <- conns_clean
dist_thresh=250000
coaccess_cutoff=0.25

accessibility_mat <- exprs(input_cds)
promoter_peak_table <- fData(input_cds)
promoter_peak_table$peak <- as.character(row.names(promoter_peak_table))
promoter_peak_table <-
    promoter_peak_table[!is.na(promoter_peak_table$gene),]
promoter_peak_table <- promoter_peak_table[,c("peak", "gene")]
promoter_peak_table$gene <- as.character(promoter_peak_table$gene)


In [58]:
promoter_peak_table

DataFrame with 24928 rows and 2 columns
                                   peak        gene
                            <character> <character>
1-11020-12944             1-11020-12944       rpl24
1-18477-20032             1-18477-20032      nfkbiz
1-27143-28515             1-27143-28515         eed
1-36500-38297             1-36500-38297     hikeshi
1-43648-44789             1-43648-44789     tmem39a
...                                 ...         ...
9-31346542-31347347 9-31346542-31347347  BX571774.1
9-33477233-33477679 9-33477233-33477679       caska
9-35017696-35018558 9-35017696-35018558       gabpa
9-38369524-38370248 9-38369524-38370248      plcd4b
9-48455731-48456378 9-48455731-48456378       lrp2a

In [None]:
cicero_cons_info

In [None]:
cicero_cons_info <- cicero_cons_info[,c("Peak1","Peak2","coaccess")]

In [59]:
# Make site_weight matrix
site_names <- names(site_weights)
site_weights <- as(Matrix::Diagonal(x=as.numeric(site_weights)),
                  "sparseMatrix")
row.names(site_weights) <- site_names
colnames(site_weights) <- site_names

# Find distance between cicero peaks. If distance already calculated, skip
if ("dist" %in% colnames(cicero_cons_info) == FALSE) {
    Peak1_cols <- split_peak_names(cicero_cons_info$Peak1)
    Peak2_cols <- split_peak_names(cicero_cons_info$Peak2)
    Peak1_bp <- round((as.integer(Peak1_cols[,3]) +
                      as.integer(Peak1_cols[,2])) / 2)
    Peak2_bp <- round((as.integer(Peak2_cols[,3]) +
                      as.integer(Peak2_cols[,2])) / 2)
    cicero_cons_info$dist <- abs(Peak2_bp - Peak1_bp)
}
cicero_cons_info

In [76]:
cicero_cons_info_sub = cicero_cons_info[1:10]
cicero_cons_info_sub

Peak1,Peak2,coaccess,peak1_mid,peak2_mid,dist
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1-10000286-10000789,1-9753075-9753596,0.0,27363475,27363475,0
1-10000286-10000789,1-9759496-9760011,-0.005966881,27363475,27363475,0
1-10000286-10000789,1-9764309-9764532,-0.005358352,27363475,27363475,0
1-10000286-10000789,1-9768426-9768767,0.002992008,27363475,27363475,0
1-10000286-10000789,1-9770155-9770487,-0.069031148,27363475,27363475,0
1-10000286-10000789,1-9771797-9771996,-0.008605915,27363475,27363475,0
1-10000286-10000789,1-9780918-9781236,-0.007685523,27363475,27363475,0
1-10000286-10000789,1-9781927-9782126,-0.032186761,27363475,27363475,0
1-10000286-10000789,1-9785384-9785711,-0.098083798,27363475,27363475,0
1-10000286-10000789,1-9788934-9789194,0.009972235,27363475,27363475,0


In [None]:
# Find distance between cicero peaks. If distance already calculated, skip
if ("dist" %in% colnames(cicero_cons_info_sub) == FALSE) {
    Peak1_cols <- split_peak_names(cicero_cons_info_sub$Peak1)
    Peak2_cols <- split_peak_names(cicero_cons_info_sub$Peak2)
    Peak1_bp <- round((as.integer(Peak1_cols[,3]) +
                      as.integer(Peak1_cols[,2])) / 2)
    Peak2_bp <- round((as.integer(Peak2_cols[,3]) +
                      as.integer(Peak2_cols[,2])) / 2)
    cicero_cons_info_sub$dist <- abs(Peak2_bp - Peak1_bp)
}
cicero_cons_info_sub

In [77]:
# library(data.table)
# library(stringr)

# # Convert the cicero_cons_info to a data.table if it's not already
# setDT(cicero_cons_info_sub)

# # Function to calculate midpoint, revised to ensure correct numeric conversion
# get_midpoint <- function(peak) {
#   coords <- str_split(peak, "-", simplify = TRUE)
#   coords <- as.numeric(coords[, 2:3])
#   mean(coords)
# }

# # Apply the function to calculate midpoints for Peak1 and Peak2
# cicero_cons_info_sub[, peak1_mid := sapply(Peak1, get_midpoint)]
# cicero_cons_info_sub[, peak2_mid := sapply(Peak2, get_midpoint)]

# # Calculate distances between peaks
# cicero_cons_info_sub[, dist := abs(peak1_mid - peak2_mid)]

# # Inspect the first few rows
# head(cicero_cons_info_sub)

Peak1,Peak2,coaccess,peak1_mid,peak2_mid,dist
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1-10000286-10000789,1-9753075-9753596,0.0,10000538,9753336,247202.0
1-10000286-10000789,1-9759496-9760011,-0.005966881,10000538,9759754,240784.0
1-10000286-10000789,1-9764309-9764532,-0.005358352,10000538,9764420,236117.0
1-10000286-10000789,1-9768426-9768767,0.002992008,10000538,9768596,231941.0
1-10000286-10000789,1-9770155-9770487,-0.069031148,10000538,9770321,230216.5
1-10000286-10000789,1-9771797-9771996,-0.008605915,10000538,9771896,228641.0


In [None]:
# Ensure cicero_cons_info_sub is a data.table
setDT(cicero_cons_info)

# Split 'Peak1' and 'Peak2' into columns using vectorized operations
cicero_cons_info[, c("start1", "end1") := tstrsplit(Peak1, "-", type.convert = TRUE)[2:3]]
cicero_cons_info[, c("start2", "end2") := tstrsplit(Peak2, "-", type.convert = TRUE)[2:3]]

# Calculate midpoints in a vectorized manner
cicero_cons_info[, peak1_mid := (start1 + end1) / 2]
cicero_cons_info[, peak2_mid := (start2 + end2) / 2]

# Calculate distances between peaks
cicero_cons_info[, dist := abs(peak1_mid - peak2_mid)]

# Inspect the first few rows to confirm
head(cicero_cons_info)

In [None]:
nonneg_cons <-
    cicero_cons_info[(cicero_cons_info$Peak1 %in%
                      promoter_peak_table$peak |
                      cicero_cons_info$Peak2 %in%
                      promoter_peak_table$peak) &
                      cicero_cons_info$coaccess >= coaccess_cutoff &
                      cicero_cons_info$dist < dist_thresh,]

In [79]:
# Get connections between promoters and distal sites above coaccess
# threshold
nonneg_cons <-
    cicero_cons_info_sub[(cicero_cons_info_sub$Peak1 %in%
                      promoter_peak_table$peak |
                      cicero_cons_info_sub$Peak2 %in%
                      promoter_peak_table$peak) &
                      cicero_cons_info_sub$coaccess >= coaccess_cutoff &
                      cicero_cons_info_sub$dist < dist_thresh,]
nonneg_cons <- nonneg_cons[,c("Peak1", "Peak2", "coaccess")]
nonneg_cons <- nonneg_cons[!duplicated(nonneg_cons),]

nonneg_cons$Peak1 <- as.character(nonneg_cons$Peak1)
nonneg_cons$Peak2 <- as.character(nonneg_cons$Peak2)

nonneg_cons <- rbind(nonneg_cons,
                    data.frame(Peak1=unique(promoter_peak_table$peak),
                               Peak2=unique(promoter_peak_table$peak),
                               coaccess=0))
nonneg_cons

Peak1,Peak2,coaccess
<chr>,<chr>,<dbl>
1-11020-12944,1-11020-12944,0
1-18477-20032,1-18477-20032,0
1-27143-28515,1-27143-28515,0
1-36500-38297,1-36500-38297,0
1-43648-44789,1-43648-44789,0
1-47922-48323,1-47922-48323,0
1-48901-50727,1-48901-50727,0
1-64580-65338,1-64580-65338,0
1-70906-72712,1-70906-72712,0
1-109028-109937,1-109028-109937,0


In [83]:
#' Make a symmetric square sparse matrix from data frame
#'
#' Convert a data frame into a square sparse matrix (all versus all)
#'
#' @param data data frame
#' @param i.name name of i column
#' @param j.name name of j column
#' @param x.name name of value column
#'
#' @return sparse matrix
#'
#'
make_sparse_matrix <- function(data,
                               i.name = "Peak1",
                               j.name = "Peak2",
                               x.name = "value") {
  if(!i.name %in% names(data) |
     !j.name %in% names(data) |
     !x.name %in% names(data)) {
    stop('i.name, j.name, and x.name must be columns in data')
  }
  
  data$i <- as.character(data[,i.name])
  data$j <- as.character(data[,j.name])
  data$x <- data[,x.name]
  
  if(!class(data$x) %in%  c("numeric", "integer"))
    stop('x.name column must be numeric')
  
  peaks <- data.frame(Peak = unique(c(data$i, data$j)),
                      index = seq_len(length(unique(c(data$i, data$j)))))
  
  data <- data[,c("i", "j", "x")]
  
  data <- rbind(data, data.frame(i=peaks$Peak, j = peaks$Peak, x = 0))
  data <- data[!duplicated(data[,c("i", "j", "x")]),]
  data <- data.table::as.data.table(data)
  peaks <- data.table::as.data.table(peaks)
  data.table::setkey(data, "i")
  data.table::setkey(peaks, "Peak")
  data <- data[peaks]
  data.table::setkey(data, "j")
  data <- data[peaks]
  data <- as.data.frame(data)
  
  data <- data[,c("index", "i.index", "x")]
  data2 <- data
  names(data2) <- c("i.index", "index", "x")
  
  data <- rbind(data, data2)
  
  data <- data[!duplicated(data[,c("index", "i.index")]),]
  data <- data[data$index >= data$i.index,]
  
  sp_mat <- Matrix::sparseMatrix(i=as.numeric(data$index),
                                 j=as.numeric(data$i.index),
                                 x=data$x,
                                 symmetric = TRUE)
  
  colnames(sp_mat) <- peaks[order(peaks$index),]$Peak
  row.names(sp_mat) <- peaks[order(peaks$index),]$Peak
  return(sp_mat)
}


In [86]:
colnames(nonneg_cons)

In [87]:
setDF(nonneg_cons)

In [88]:
# Make square matrix of connections from distal to proximal
distal_connectivity_matrix <- make_sparse_matrix(nonneg_cons,
                                                x.name="coaccess")
distal_connectivity_matrix

  [[ suppressing 32 column names '1-11020-12944', '1-18477-20032', '1-27143-28515' ... ]]

  [[ suppressing 32 column names '1-11020-12944', '1-18477-20032', '1-27143-28515' ... ]]

  [[ suppressing 32 column names '1-11020-12944', '1-18477-20032', '1-27143-28515' ... ]]



24928 x 24928 sparse Matrix of class "dsCMatrix"
                                                                                          
1-11020-12944       0 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-18477-20032       . 0 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-27143-28515       . . 0 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-36500-38297       . . . 0 . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-43648-44789       . . . . 0 . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-47922-48323       . . . . . 0 . . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-48901-50727       . . . . . . 0 . . . . . . . . . . . . . . . . . . . . . . . . . ......
1-64580-65338       . . . . . . . 0 . . . . . . . . . . . . . . . . . . . . . . . . ......
1-70906-72712       . . . . . . . . 0 . . . . . . . . . . . . . . . . . . . . . . . ......
1-109028-109937     . . . . . . . . . 0 .

In [None]:
# Get connections between promoters and distal sites above coaccess
# threshold
nonneg_cons <-
    cicero_cons_info[(cicero_cons_info$Peak1 %in%
                      promoter_peak_table$peak |
                      cicero_cons_info$Peak2 %in%
                      promoter_peak_table$peak) &
                      cicero_cons_info$coaccess >= coaccess_cutoff &
                      cicero_cons_info$dist < dist_thresh,]
nonneg_cons <- nonneg_cons[,c("Peak1", "Peak2", "coaccess")]
nonneg_cons <- nonneg_cons[!duplicated(nonneg_cons),]

nonneg_cons$Peak1 <- as.character(nonneg_cons$Peak1)
nonneg_cons$Peak2 <- as.character(nonneg_cons$Peak2)

nonneg_cons <- rbind(nonneg_cons,
                    data.frame(Peak1=unique(promoter_peak_table$peak),
                               Peak2=unique(promoter_peak_table$peak),
                               coaccess=0))

# Make square matrix of connections from distal to proximal
distal_connectivity_matrix <- make_sparse_matrix(nonneg_cons,
                                                x.name="coaccess")

In [34]:
small_conns <- head(conns_clean, 1000)
test_matrix <- cicero::build_gene_activity_matrix(seurat_object.cds, small_conns)

ERROR: Error in if (!class(data$x) %in% c("numeric", "integer")) stop("x.name column must be numeric"): the condition has length > 1


In [35]:
# This mimics the problematic check in the error message
if (!class(small_conns$coaccess) %in% c("numeric", "integer")) {
  stop("coaccess column must be numeric")
}

In [None]:
# save the normalized cicero_gene_activities
gene.activity.path = paste0(cicero_path, "06_", data_id, "_gene_activities_",peaktype, "_peaks.csv")
write.csv(cicero_gene_activities, gene.activity.path, row.names=TRUE, col.names=TRUE)

In [None]:

# # old script to import GTF file
# gtf_zf <- rtracklayer::import(gref_path)

# # make a gene.coord object
# gene.coords.zf <- gtf_zf
# # filter out the entries without the gene_name
# gene.coords.zf <- gene.coords.zf[! is.na(gene.coords.zf$gene_name),]

# # only keep the regions within standard chromosomes
# gene.coords.zf <- keepStandardChromosomes(gene.coords.zf, pruning.mode = 'coarse')
# # name the genome - GRCz11
# genome(gene.coords.zf) <- 'GRCz11'

# # copy the "gene_id" for the "tx_id" and "transcript_id" 
# gene.coords.zf$tx_id <- gene.coords.zf$gene_id
# gene.coords.zf$transcript_id <- gene.coords.zf$gene_id