# ATAC-seq - Get promoter regions for integrative analysis of promoter-gene pairs
- goal: corresponding promoter regions from RNA features
- input: RNA genomic metadata
- output: promoter regions for accesibility quantification

In [1]:
# set correct working directory -> project folder
getwd()
setwd('..')
getwd()

In [2]:
# load libraries
library(Rsamtools)
library(GenomicAlignments)
library(GenomicRanges)
library(glue)
library(rtracklayer)

Loading required package: GenomeInfoDb

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Att

In [3]:
# configs
data_path <- file.path('results','ATAC','all')
gtf_path <- file.path("resources/RNA/genome.gtf")

## load data

In [4]:
# load RNA genome annotation
RNA_gtf <- rtracklayer::import(gtf_path)
RNA_gtf_df <- as.data.frame(RNA_gtf)
head(RNA_gtf)

GRanges object with 6 ranges and 26 metadata columns:
      seqnames          ranges strand |   source       type     score     phase
         <Rle>       <IRanges>  <Rle> | <factor>   <factor> <numeric> <integer>
  [1]        1 3073253-3074322      + |  havana  gene              NA      <NA>
  [2]        1 3073253-3074322      + |  havana  transcript        NA      <NA>
  [3]        1 3073253-3074322      + |  havana  exon              NA      <NA>
  [4]        1 3102016-3102125      + |  ensembl gene              NA      <NA>
  [5]        1 3102016-3102125      + |  ensembl transcript        NA      <NA>
  [6]        1 3102016-3102125      + |  ensembl exon              NA      <NA>
                 gene_id gene_version     gene_name gene_source gene_biotype
             <character>  <character>   <character> <character>  <character>
  [1] ENSMUSG00000102693            1 4933401J01Rik      havana          TEC
  [2] ENSMUSG00000102693            1 4933401J01Rik      havana          TE

In [6]:
# filter for genes
RNA_gtf_genes <- (RNA_gtf[elementMetadata(RNA_gtf)[,'type']=='gene'])
names(RNA_gtf_genes) <- RNA_gtf_genes$gene_id
head(RNA_gtf_genes)

GRanges object with 6 ranges and 26 metadata columns:
                     seqnames          ranges strand |         source     type
                        <Rle>       <IRanges>  <Rle> |       <factor> <factor>
  ENSMUSG00000102693        1 3073253-3074322      + | havana             gene
  ENSMUSG00000064842        1 3102016-3102125      + | ensembl            gene
  ENSMUSG00000051951        1 3205901-3671498      - | ensembl_havana     gene
  ENSMUSG00000102851        1 3252757-3253236      + | havana             gene
  ENSMUSG00000103377        1 3365731-3368549      - | havana             gene
  ENSMUSG00000104017        1 3375556-3377788      - | havana             gene
                         score     phase            gene_id gene_version
                     <numeric> <integer>        <character>  <character>
  ENSMUSG00000102693        NA      <NA> ENSMUSG00000102693            1
  ENSMUSG00000064842        NA      <NA> ENSMUSG00000064842            1
  ENSMUSG00000051951  

## export whole genome regions as .bed file

In [7]:
genome_df <- data.frame(seqnames=paste0('chr',seqnames(RNA_gtf_genes)),
  starts=start(RNA_gtf_genes)-1,
  ends=end(RNA_gtf_genes),
  names=RNA_gtf_genes$gene_id,
  scores=c(rep(".", length(RNA_gtf_genes))),
  strands=strand(RNA_gtf_genes))

# rename mitochondrial chromosome to fit chromsize file 
genome_df[genome_df$seqnames=='chrMT', 'seqnames'] <- 'chrM'

write.table(genome_df, file=file.path(data_path,"mm10_regions.bed"), quote=F, sep="\t", row.names=F, col.names=F)

In [9]:
dim(genome_df)
head(genome_df)

Unnamed: 0_level_0,seqnames,starts,ends,names,scores,strands
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>,<fct>
1,chr1,3073252,3074322,ENSMUSG00000102693,.,+
2,chr1,3102015,3102125,ENSMUSG00000064842,.,+
3,chr1,3205900,3671498,ENSMUSG00000051951,.,-
4,chr1,3252756,3253236,ENSMUSG00000102851,.,+
5,chr1,3365730,3368549,ENSMUSG00000103377,.,-
6,chr1,3375555,3377788,ENSMUSG00000104017,.,-


## get promoter regions

In [8]:
# get promotor regions by default 2000bp upstream and 200bp downstream of TSS
RNA_promoters <- promoters(RNA_gtf_genes)
head(RNA_promoters)

GRanges object with 6 ranges and 26 metadata columns:
                     seqnames          ranges strand |         source     type
                        <Rle>       <IRanges>  <Rle> |       <factor> <factor>
  ENSMUSG00000102693        1 3071253-3073452      + | havana             gene
  ENSMUSG00000064842        1 3100016-3102215      + | ensembl            gene
  ENSMUSG00000051951        1 3671299-3673498      - | ensembl_havana     gene
  ENSMUSG00000102851        1 3250757-3252956      + | havana             gene
  ENSMUSG00000103377        1 3368350-3370549      - | havana             gene
  ENSMUSG00000104017        1 3377589-3379788      - | havana             gene
                         score     phase            gene_id gene_version
                     <numeric> <integer>        <character>  <character>
  ENSMUSG00000102693        NA      <NA> ENSMUSG00000102693            1
  ENSMUSG00000064842        NA      <NA> ENSMUSG00000064842            1
  ENSMUSG00000051951  

## export promoter regions as .bed file

In [28]:
promoter_df <- data.frame(seqnames=paste0('chr',seqnames(RNA_promoters)),
  starts=start(RNA_promoters)-1,
  ends=end(RNA_promoters),
  names=RNA_promoters$gene_id,
  scores=c(rep(".", length(RNA_promoters))),
  strands=strand(RNA_promoters))

# gene at the start of a chromosome can have negative start coordinates
promoter_df[promoter_df$starts<0, 'starts'] <- 0

# rename mitochondrial chromosome to fit chromsize file 
promoter_df[promoter_df$seqnames=='chrMT', 'seqnames'] <- 'chrM'

write.table(promoter_df, file=file.path(data_path,"promoter_regions.bed"), quote=F, sep="\t", row.names=F, col.names=F)

In [29]:
head(promoter_df)

Unnamed: 0_level_0,seqnames,starts,ends,names,scores,strands
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>,<chr>,<fct>
1,chr1,3071252,3073452,ENSMUSG00000102693,.,+
2,chr1,3100015,3102215,ENSMUSG00000064842,.,+
3,chr1,3671298,3673498,ENSMUSG00000051951,.,-
4,chr1,3250756,3252956,ENSMUSG00000102851,.,+
5,chr1,3368349,3370549,ENSMUSG00000103377,.,-
6,chr1,3377588,3379788,ENSMUSG00000104017,.,-
