# Motif analysis with chromVAR

In [2]:
import numpy as np
import scanpy as sc 
import pandas as pd
import anndata

### rpy2 set-up

In [3]:
import rpy2.rinterface_lib.callbacks
import logging

In [4]:
# Ignore R warning messages by raising the rpy2 callback logger level to ERROR.
# Note: this line can be commented out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

I use `anndata2ri` to handle the conversion of the sparse matrix between Python and R (this is not straightforward with `rpy2` alone).

In [5]:
import anndata2ri
# Activate the anndata2ri conversion so that the AnnData object can later be
# handed to an R cell (see the `%%R -i adata_chromvar` cell further down).
anndata2ri.activate()

Loading the `rpy2` extension enables the `%%R` cell magic, which runs R code in Jupyter notebook cells.

In [6]:
%load_ext rpy2.ipython

Load R packages

In [7]:
%%R
library(tidyverse)
library(chromVAR)
library(chromVARmotifs)
library(BSgenome.Hsapiens.UCSC.hg38)
library(SummarizedExperiment)
library(motifmatchr)
# Use cisBP motifs (curated set to remove redundancy)
data("human_pwms_v2")
# Use ENCODE motifs
# NOTE(review): encode_pwms is loaded here but not referenced in the cells
# shown in this notebook (only human_pwms_v2 is passed to matchMotifs).
data("encode_pwms")

### Load dataset

AnnData generated in `N2_add_cistopic.ipynb`

In [8]:
# Data directory; the trailing slash matters because the file name below is
# appended with plain string concatenation.
outdir = "/home/jovyan/data/lung_adult_scATAC/"
# Read the AnnData object from the .h5ad file in that directory.
adata = sc.read_h5ad(outdir + "lungAdult__ATAC.wCisTopic.h5ad")

### Data setup to run chromVAR

Import data in R using `anndata2ri`

In [9]:
# Build a minimal AnnData from the matrix in `adata.X` only; no other part of
# `adata` is passed to the constructor.
adata_chromvar = anndata.AnnData(adata.X)

# Carry over the variable and observation names so the object keeps its labels
# (they show up as the row and column names of the object printed on the R side).
adata_chromvar.var_names = adata.var_names
adata_chromvar.obs_names = adata.obs_names

In [10]:
%%R -i adata_chromvar
## Print the object imported through `-i` to inspect its class and dimensions
adata_chromvar

class: SingleCellExperiment 
dim: 96161 53585 
metadata(0):
assays(1): X
rownames(96161): chr1:816942-817444 chr1:826600-827805 ...
  chrY:21574860-21575895 chrY:22366897-22367799
rowData names(0):
colnames(53585): 59-AAACGAAAGATATGAC-1 59-AAACGAAAGGTAGGAA-1 ...
  04-TTTGTGTTCGGATAGG-1 04-TTTGTGTTCTCGTAGA-1
colData names(0):
reducedDimNames(0):
spikeNames(0):
altExpNames(0):


Transform the `SingleCellExperiment` object into a `RangedSummarizedExperiment` object

In [13]:
%%R
## Extract peak ids: the row names of the imported object,
## which are strings such as "chr1:816942-817444"
peaks_ids <- rownames(adata_chromvar)

[1] "chr1:816942-817444" "chr1:826600-827805" "chr1:858056-859079"
[4] "chr1:869401-870227" "chr1:876302-877674" "chr1:904112-905616"


In [14]:
%%R
## String - GRanges conversion
## Borrowed from Signac functions 
## https://satijalab.org/signac/reference/GRangesToString.html
## Convert region strings into a GRanges object.
##
## regions: vector of region strings made of chromosome, start and end joined
##          by the two separators (e.g. "chr1-100-200" with the defaults).
## sep:     two separators; the first sits between chromosome and start, the
##          second between start and end. They are combined into one pattern
##          of the form "<sep1>|<sep2>" used to split each string.
## ...:     extra arguments forwarded to makeGRangesFromDataFrame().
##
## Returns the value of makeGRangesFromDataFrame() for the split regions.
StringToGRanges <- function(regions, sep = c("-", "-"), ...) {
  # Pattern that matches either of the two separators
  split_pattern <- paste0(sep[[1]], "|", sep[[2]])
  # Split every region string into its chr / start / end columns
  region_parts <- separate(
    data = data.frame(ranges = regions),
    col = "ranges",
    sep = split_pattern,
    into = c("chr", "start", "end")
  )
  makeGRangesFromDataFrame(df = region_parts, ...)
}

## Convert a GRanges object into region strings.
##
## grange: object whose seqnames(), start() and end() are read.
## sep:    two separators; the first is placed between the sequence name and
##         the start, the second between the start and the end.
##
## Returns a character vector with one "<seqname><sep1><start><sep2><end>"
## string per range.
GRangesToString <- function(grange, sep = c("-", "-")) {
  chrom <- as.character(x = seqnames(x = grange))
  range_start <- start(x = grange)
  range_end <- end(x = grange)
  paste0(chrom, sep[[1]], range_start, sep[[2]], range_end)
}

# Extend genomicRanges
#
# x:          ranges object providing strand(), start(), end() and ranges().
# upstream:   amount subtracted from the start of "+" / "*" ranges and added
#             to the end of all other ranges.
# downstream: amount added to the end of "+" / "*" ranges and subtracted from
#             the start of all other ranges.
#
# A warning is raised when any range has strand "*", because those ranges are
# handled like "+" ranges. The resized object is passed through trim() and
# that result is returned.
extend <- function(x, upstream = 0, downstream = 0) {
  strands <- strand(x)
  if (any(strands == "*")) {
    warning("'*' ranges were treated as '+'")
  }
  is_forward <- strands == "+" | strands == "*"
  # Which amount goes to which side depends on the strand
  shifted_start <- start(x) - ifelse(is_forward, upstream, downstream)
  shifted_end <- end(x) + ifelse(is_forward, downstream, upstream)
  ranges(x) <- IRanges(shifted_start, shifted_end)
  trim(x)
}

## Make RangedSummarizedExperiment object:
## - the "X" assay of the imported object is stored under the name "counts"
## - row ranges are parsed from the peak id strings, using ":" between
##   chromosome and start and "-" between start and end
atac_se <- SummarizedExperiment(assays = list(counts=assay(adata_chromvar, "X")), rowRanges = StringToGRanges(peaks_ids, sep=c(":","-")))

Match peaks to motifs

In [15]:
%%R
## Add GC bias for the peaks in atac_se, using the hg38 genome sequence
atac_se <- addGCBias(atac_se, genome = BSgenome.Hsapiens.UCSC.hg38)
## Match the cisBP motifs (human_pwms_v2) to the peaks, using the hg38 genome
## sequence and a p-value cutoff of 5e-5
motif_ix <- matchMotifs(human_pwms_v2, atac_se, genome = BSgenome.Hsapiens.UCSC.hg38, p.cutoff=5e-5)
## Label the rows of both objects with the original peak id strings
rownames(motif_ix) <- peaks_ids
rownames(atac_se) <- peaks_ids

## Print a summary of the motif match object
motif_ix

class: RangedSummarizedExperiment 
dim: 96161 870 
metadata(0):
assays(1): motifMatches
rownames(96161): chr1:816942-817444 chr1:826600-827805 ...
  chrY:21574860-21575895 chrY:22366897-22367799
rowData names(1): bias
colnames(870): ENSG00000008196_LINE2_TFAP2B_D_N1
  ENSG00000008197_LINE6_TFAP2D_D ...
  ENSG00000112837_LINE19949_TBX18_I_N1
  ENSG00000122145_LINE20002_TBX22_I_N1
colData names(1): name


Compute motif deviations (i.e. how much TF motif accessibility deviates from the expected value)

In [17]:
%%R
## Get background peaks for atac_se (all other arguments left at their defaults)
## NOTE(review): the recorded output of this cell is the error
## "object 'as.SimpleList' of mode 'function' was not found", so `bg` was not
## created in the recorded run. Possibly a package version mismatch -- TODO confirm.
bg <- getBackgroundPeaks(object = atac_se)



Error in get(as.character(FUN), mode = "function", envir = envir) : 
  object 'as.SimpleList' of mode 'function' was not found


In [None]:
%%R
## Preview the motif match object
head(motif_ix)
## Compute deviations from the counts in atac_se, the motif matches in motif_ix
## and the background peaks in bg.
## NOTE(review): relies on `bg` from the previous cell, whose recorded run
## ended in an error; this cell has no recorded execution.
motif_dev <- computeDeviations(object = atac_se, annotations = motif_ix, background_peaks = bg)

In [None]:
# NOTE(review): `motif_dev` is only assigned inside an R cell above and this
# cell has no `%%R` magic, nor is the object exported with `-o`; as written
# this looks like it would be evaluated as a Python name -- TODO confirm and
# either add `%%R` to this cell or export the object to Python.
motif_dev