# Matching genes to proximal peaks

A common first step in peak–gene correlation analysis is to find all peaks within 50 kb of each gene. Here we build an adjacency matrix matching peaks to genes.

In [None]:
import numpy as np
import scanpy as sc 
import pandas as pd
import anndata
import anndata2ri ## For sparse matrix conversion from r 2 py

#### r2py set-up

In [None]:
import rpy2.rinterface_lib.callbacks
import logging

In [None]:
# Ignore R warning messages by raising the rpy2 callback logger level to ERROR.
# Note: this line can be commented out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

Loading the `rpy2` extension enables the `%%R` cell magic, which runs R code in Jupyter notebook cells.

In [None]:
%load_ext rpy2.ipython

### Load data

In [None]:
# Directory containing the .h5ad file, and the prefix of its file name.
# NOTE(review): the prefix ends with "_" and the file-name suffix appended in
# the read/write cells also starts with "_", so the resulting path contains a
# double underscore ("lungAdult__ATAC...") -- confirm this matches the file on disk.
outdir = "/home/jovyan/data/lung_adult_scATAC/"
experiment_prefix = 'lungAdult_'

In [None]:
# Load the AnnData object whose variables (var_names) are the peaks.
adata = sc.read_h5ad(outdir + experiment_prefix + "_ATAC.wCisTopic.h5ad")

In [None]:
# Peak identifiers, passed to R below and parsed there with ":" and "-" as
# separators (i.e. expected to look like "chr:start-end").
peaks = adata.var_names

### Match peaks 2 genes

In [None]:
%%R 
library(Matrix)
library(GenomicRanges)
library(ensembldb)
library(EnsDb.Hsapiens.v86) ## Remember to pick your genome!
library(tidyr)
# library(Signac)

In [None]:
%%R
## String - GRanges conversion
## Borrowed from Signac functions 
## https://satijalab.org/signac/reference/GRangesToString.html
# Convert a character vector of region strings into a GRanges object.
#
# @param regions vector of region strings (chromosome, start, end joined by `sep`)
# @param sep length-2 vector: separator between chromosome and start, and
#   between start and end. The two are combined into one regular expression,
#   so they are interpreted as regex patterns.
# @param ... further arguments forwarded to `makeGRangesFromDataFrame()`
#
# @return GRanges object built from the parsed coordinates
StringToGRanges <- function(regions, sep = c("-", "-"), ...) {
  # Regex alternation that matches either of the two separators
  split_pattern <- paste0(sep[[1]], "|", sep[[2]])
  coords <- separate(
    data = data.frame(ranges = regions),
    col = "ranges",
    sep = split_pattern,
    into = c("chr", "start", "end")
  )
  makeGRangesFromDataFrame(df = coords, ...)
}

# Convert a GRanges object into a character vector of region strings.
#
# @param grange GRanges object
# @param sep length-2 vector: separator placed between chromosome and start,
#   and between start and end
#
# @return character vector of the form <seqname><sep1><start><sep2><end>
GRangesToString <- function(grange, sep = c("-", "-")) {
  chrom <- as.character(x = seqnames(x = grange))
  range_start <- start(x = grange)
  range_end <- end(x = grange)
  paste0(chrom, sep[[1]], range_start, sep[[2]], range_end)
}

# Extend GenomicRanges in a strand-aware way
#
# @param x GRanges object to extend
# @param upstream number of bps to add upstream of each range (default 0)
# @param downstream number of bps to add downstream of each range (default 0)
#
# Ranges on the "-" strand have upstream/downstream swapped relative to their
# coordinates; unstranded ("*") ranges are handled like "+" (with a warning).
#
# @return `x` with extended ranges, passed through `trim()`
extend <- function(x, upstream = 0, downstream = 0) {
  strands <- strand(x)
  if (any(strands == "*")) {
    warning("'*' ranges were treated as '+'")
  }
  plus_like <- strands == "+" | strands == "*"
  # Amount subtracted from the start / added to the end of each range
  left_pad <- ifelse(plus_like, upstream, downstream)
  right_pad <- ifelse(plus_like, downstream, upstream)
  ranges(x) <- IRanges(start(x) - left_pad, end(x) + right_pad)
  trim(x)
}


# Find peaks close to features of interest
#
# Every feature range is extended by `d` bps on both sides (via `extend()`),
# and each peak overlapping an extended range is recorded as a hit.
#
# @param peaks_gr GenomicRanges object containing peaks
# @param features_gr GenomicRanges object containing features (e.g. genes)
# @param d distance to include peak, in bps (default 50000)
# @param feat_anno name of the metadata column of `features_gr` containing the
#   annotation used to name features (if NULL converts GRanges to string)
#
# @return Sparse adjacency matrix (peaks x features) indicating hits with 1;
#   rows are named by peak string ("chr:start-end"), columns by feature name
peak2feature <- function(peaks_gr, features_gr, d = 50000, feat_anno = NULL) {
  seqlevelsStyle(features_gr) <- seqlevelsStyle(peaks_gr)

  ## Find peaks overlapping the search range around the features
  ext_gr <- extend(features_gr, upstream = d, downstream = d)
  ovs <- findOverlaps(peaks_gr, ext_gr)

  ## Define identifiers for peaks and features
  all_peaks <- GRangesToString(peaks_gr, sep = c(":", "-"))
  if (is.null(feat_anno)) {
    all_feats <- GRangesToString(features_gr, sep = c(":", "-"))
  } else {
    if (!feat_anno %in% colnames(mcols(features_gr))) {
      stop(
        "'", feat_anno, "' is not a metadata column of `features_gr`",
        call. = FALSE
      )
    }
    all_feats <- as.character(mcols(features_gr)[[feat_anno]])
  }

  ## Build the adjacency matrix for hits in a single call from the overlap
  ## indices: filling a sparse matrix column by column in a loop rebuilds its
  ## internal structure on every assignment and is very slow for many features.
  ## `use.last.ij = TRUE` keeps entries at 1 even if an (i, j) pair were repeated
  ## (by default sparseMatrix() sums repeated pairs).
  sparseMatrix(
    i = queryHits(ovs),
    j = subjectHits(ovs),
    x = 1,
    dims = c(length(all_peaks), length(all_feats)),
    dimnames = list(all_peaks, all_feats),
    use.last.ij = TRUE
  )
}

In [10]:
%%R  -i peaks -o adj_mat
## Gene ranges from the Ensembl annotation package loaded above
genes_gr <- genes(EnsDb.Hsapiens.v86)
## Parse the peak names passed in from Python ("chr:start-end") into GRanges
peaks_gr <- StringToGRanges(peaks, sep=c(":", "-"))

## Compute peak2gene adjacency matrix (peaks x genes): a peak is matched to a
## gene when it lies within 50kb of it; columns are named by Ensembl `gene_id`
adj_mat <- peak2feature(peaks_gr, genes_gr, feat_anno = "gene_id", d=50000)

In [31]:
%%R -o genes
## Gene identifiers labelling the columns of the adjacency matrix (exported to Python via -o)
genes <- colnames(adj_mat)

In [22]:
## Convert the R sparse matrix returned by the R cell into a Python sparse matrix with anndata2ri
adj_mat = anndata2ri.r2py.rmat_to_spmat(adj_mat)

We can store the adjacency matrix in the `.varm` slot of the AnnData object, and the matching gene identifiers in `.uns`.

In [32]:
# Peak x gene adjacency matrix; its rows were built from adata.var_names (the peaks passed to R).
adata.varm["peak2gene"] = adj_mat
# Gene identifiers for the columns of adata.varm["peak2gene"].
adata.uns["peak2gene_genes"] = genes

### Save anndata object

In [36]:
# NOTE: this is the same path the object was read from, so the input file is overwritten.
adata.write_h5ad(outdir + experiment_prefix + "_ATAC.wCisTopic.h5ad")