Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add getGenomicTiles #54

Merged
merged 9 commits into from
Feb 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: swissknife
Title: Handy code shared in the FMI CompBio group
Version: 0.26
Version: 0.27
Authors@R: c(person("Michael", "Stadler", email = "michael.stadler@fmi.ch", role = c("aut", "cre")),
person("Charlotte", "Soneson", email = "charlotte.soneson@fmi.ch", role = "aut"),
person("Panagiotis", "Papasaikas", email = "panagiotis.papasaikas@fmi.ch", role = "aut"),
Expand Down
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export(calcAndCountDist)
export(calcPhasogram)
export(col2hex)
export(estimateNRL)
export(getGenomicTiles)
export(getMappableRegions)
export(labelCells)
export(normGenesetExpression)
Expand All @@ -25,17 +26,25 @@ importFrom(BiocGenerics,end)
importFrom(BiocGenerics,start)
importFrom(BiocGenerics,strand)
importFrom(BiocGenerics,subset)
importFrom(BiocGenerics,unlist)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bplapply)
importFrom(Biostrings,fasta.seqlengths)
importFrom(Biostrings,oligonucleotideFrequency)
importFrom(Biostrings,readDNAStringSet)
importFrom(GenomeInfoDb,seqlengths)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicAlignments,cigarWidthAlongReferenceSpace)
importFrom(GenomicRanges,GRanges)
importFrom(GenomicRanges,distance)
importFrom(GenomicRanges,findOverlaps)
importFrom(GenomicRanges,gaps)
importFrom(GenomicRanges,nearest)
importFrom(GenomicRanges,ranges)
importFrom(GenomicRanges,seqnames)
importFrom(GenomicRanges,sort)
importFrom(GenomicRanges,tileGenome)
importFrom(GenomicRanges,width)
importFrom(Gviz,DataTrack)
importFrom(Gviz,GeneRegionTrack)
importFrom(Gviz,GenomeAxisTrack)
Expand All @@ -56,6 +65,8 @@ importFrom(Rsamtools,scanBamFlag)
importFrom(Rsamtools,scanBamHeader)
importFrom(S4Vectors,"%in%")
importFrom(S4Vectors,mcols)
importFrom(S4Vectors,queryHits)
importFrom(S4Vectors,subjectHits)
importFrom(SingleCellExperiment,counts)
importFrom(SingleCellExperiment,logcounts)
importFrom(SingleCellExperiment,sizeFactors)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# swissknife 0.27

* Add getGenomicTiles to obtain annotated regions tiling a given genome

# swissknife 0.26

* speed-up getMappableRegions by dumping k-mers in C++
Expand Down
226 changes: 226 additions & 0 deletions R/getGenomicTiles.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
#' @title Get regions tiling a genome.
#'
#' @description Get sequential, potentially annotated regions of a fixed lengths
#' (tiles) along chromosomes of a genome.
#'
#' @author Michael Stadler
#'
#' @param genome The genome to work on. Either a \code{\link[BSgenome]{BSgenome}}
#' object, a \code{character} scalar with the name of an installed \code{\link[BSgenome]{BSgenome}}
#' or with a file path and name pointing to a fasta file with the genome sequence,
#' or a named \code{numeric} vector giving the names and lengths of chromosomes.
#' @param tileWidth \code{numeric} scalar with the tile length.
#' @param hasOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, a logical vector "X.hasOverlap" will be added to the
#' \code{mcols} of the result, with \code{TRUE} for each tile that overlaps
#' any region in that element. "X" is obtained from \code{names(hasOverlap)}.
#' @param fracOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, a numeric vector "X.fracOverlap" will be added to the
#' \code{mcols} of the result, with a value between 0 and 1 giving the fraction
#' of bases in a tile that overlaps with any region in that element. "X" is
#' obtained from \code{names(fracOverlap)}.
#' @param numOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, two numeric vectors "X.numOverlapWithin" and
#' "X.numOverlapAny" will be added to the \code{mcols} of the result, giving
#' the number of ranges in that element that are fully contained within
#' a tile, or that overlap with a tile in any way, respectively. "X" is
#' obtained from \code{names(numOverlap)}.
#' @param nearest Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, two numeric vectors "X.nearestName" and
#' "X.nearestDistance" will be added to the \code{mcols} of the result, giving
#' the name and distance of the nearest range in that element for each tile. "X" is
#' obtained from \code{names(nearest)}, and the values of "X.nearestName" from
#' \code{names(nearest$X)}.
#' @param addSeqComp \code{logical} scalar. If \code{TRUE} and primary sequence
#' can be obtained from \code{genome}, also add sequence composition features
#' for each tile to the annotations. Currently, the following features are
#' included: percent of G+C bases ("percGC"), CpG observed-over-expected ratio
#' ("CpGoe").
#'
#' @details The last tile in each chromosome is dropped if it would be shorter
#' than \code{tileWidth}. Generated tiles are unstranded (\code{*}) and
mbstadler marked this conversation as resolved.
Show resolved Hide resolved
#' therefore overlaps or searching for nearest neighbors are ignoring
#' strands of annotations (\code{ignore.strand=TRUE}). If multiple nearest
#' ranges are at the same distance from a tile, an arbitrary one is
#' reported in "X.nearestName".
mbstadler marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @return A \code{\link[GenomicRanges]{GRanges}} object with genome tiling regions.
#' Optional tile annotations are contained in its metadata columns (\code{mcols}).
#'
#' @examples
#' library(GenomicRanges)
#'
#' tss <- GRanges("chr1", IRanges(c(1, 10, 30), width = 1,
#' names = paste0("t", 1:3)))
#' blacklist <- GRanges("chr1", IRanges(20, width = 5))
#' getGenomicTiles(c(chr1 = 45, chr2 = 12), tileWidth = 10,
#' hasOverlap = list(Blacklist = blacklist),
#' fracOverlap = list(Blacklist = blacklist),
#' numOverlap = list(TSS = tss),
#' nearest = list(TSS = tss))
#'
#' @seealso \code{\link[GenomicRanges]{tileGenome}}, \code{\link[GenomicRanges]{findOverlaps}}
#' and \code{\link[GenomicRanges]{nearest}} in package \pkg{GenomicRanges} used by
#' \code{getGenomicTiles} internally.
#'
#' @importFrom GenomicRanges GRanges tileGenome width findOverlaps nearest distance
#' @importFrom BSgenome seqinfo getSeq
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom Biostrings fasta.seqlengths readDNAStringSet oligonucleotideFrequency
#' @importFrom methods is
#' @importFrom IRanges overlapsAny
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom BiocGenerics unlist
#'
#' @export
getGenomicTiles <- function(genome,
tileWidth,
hasOverlap = list(),
fracOverlap = list(),
numOverlap = list(),
nearest = list(),
addSeqComp = TRUE) {
## check arguments
## ... genome and addSeqComp
stopifnot(exprs = {
is.logical(addSeqComp)
length(addSeqComp) == 1L
})
if(is(genome, "BSgenome")) {
chrlens <- GenomeInfoDb::seqlengths(BSgenome::seqinfo(genome))
genomeobj <- genome
} else if (is.character(genome) && length(genome) == 1L) {
if (suppressWarnings(require(genome, character.only = TRUE, quietly = TRUE))) {
genomeobj <- get(genome)
chrlens <- GenomeInfoDb::seqlengths(BSgenome::seqinfo(genomeobj))
} else if (file.exists(genome)) {
chrlens <- Biostrings::fasta.seqlengths(genome)
genomeobj <- Biostrings::readDNAStringSet(genome)
} else {
stop("'genome' is neither a valid file nor a BSgenome object.")
}
} else if (is.numeric(genome) && !is.null(names(genome))) {
chrlens <- genome
if (addSeqComp) {
warning("ignoring 'addSeqComp' (sequence not available from 'genome')")
addSeqComp <- FALSE
}
} else {
stop("'genome' is not a valid argument for getGenomicTiles()")
}
## ... other arguments
stopifnot(exprs = {
# tileWidth
is.numeric(tileWidth)
length(tileWidth) == 1L
tileWidth > 0
# hasOverlap
is.list(hasOverlap)
length(hasOverlap) == 0L || !is.null(names(hasOverlap))
all(unlist(lapply(hasOverlap, is, "GRanges"))) || all(unlist(lapply(hasOverlap, is, "GRangesList")))
# fracOverlap
is.list(fracOverlap)
length(fracOverlap) == 0L || !is.null(names(fracOverlap))
all(unlist(lapply(fracOverlap, is, "GRanges"))) || all(unlist(lapply(fracOverlap, is, "GRangesList")))
# numOverlap
is.list(numOverlap)
length(numOverlap) == 0L || !is.null(names(numOverlap))
all(unlist(lapply(numOverlap, is, "GRanges"))) || all(unlist(lapply(numOverlap, is, "GRangesList")))
# nearest
is.list(nearest)
length(nearest) == 0L || !is.null(names(nearest))
all(unlist(lapply(nearest, is, "GRanges"))) || all(unlist(lapply(nearest, is, "GRangesList")))
all(unlist(lapply(nearest, function(x) !is.null(names(x)))))
})

## create tiles
gr <- GenomicRanges::tileGenome(seqlengths = chrlens, tilewidth = tileWidth,
cut.last.tile.in.chrom = TRUE)
mbstadler marked this conversation as resolved.
Show resolved Hide resolved
gr <- gr[GenomicRanges::width(gr) == tileWidth]

## annotate tiles
df <- S4Vectors::mcols(gr)

## ... addSeqComp
if (addSeqComp) {
grSeq <- BSgenome::getSeq(x = genomeobj, names = gr)
grF1 <- Biostrings::oligonucleotideFrequency(x = grSeq, width = 1L,
as.prob = TRUE,
simplify.as = "matrix")
grF2 <- Biostrings::oligonucleotideFrequency(x = grSeq, width = 2L,
as.prob = TRUE,
simplify.as = "matrix")
df[["percGC"]] <- rowSums(grF1[,c("C","G")]) * 100
df[["CpGoe"]] <- grF2[, "CG"] / (grF1[, "C"] * grF1[, "G"])
}

## ... hasOverlap
for (i in seq_along(hasOverlap)) {
nm <- paste0(names(hasOverlap)[i], ".hasOverlap")
df[[nm]] <- IRanges::overlapsAny(query = gr, subject = hasOverlap[[i]],
ignore.strand = TRUE)
}

## ... fracOverlap
for (i in seq_along(fracOverlap)) {
gr2 <- fracOverlap[[i]]
if (is(gr2, "GRangesList")) {
gr2 <- BiocGenerics::unlist(gr2)
}
ov <- GenomicRanges::findOverlaps(query = gr, subject = gr2,
ignore.strand = TRUE)
tilesov <- pmin(GenomicRanges::end(gr[S4Vectors::queryHits(ov)]),
mbstadler marked this conversation as resolved.
Show resolved Hide resolved
GenomicRanges::end(gr2[S4Vectors::subjectHits(ov)])) -
pmax(GenomicRanges::start(gr[S4Vectors::queryHits(ov)]),
GenomicRanges::start(gr2[S4Vectors::subjectHits(ov)])) + 1
tilesovSum <- tapply(tilesov, S4Vectors::queryHits(ov), sum)
posPerTile <- rep(0, length(gr))
posPerTile[as.numeric(names(tilesovSum))] <- tilesovSum
nm <- paste0(names(fracOverlap)[i], ".fracOverlap")
df[[nm]] <- posPerTile / tileWidth
}

## ... numOverlap
for (i in seq_along(numOverlap)) {
gr2 <- numOverlap[[i]]
nm1 <- paste0(names(numOverlap)[i], ".numOverlapWithin")
nm2 <- paste0(names(numOverlap)[i], ".numOverlapAny")
# remark: findOverlaps(..., type = "within") refers to the query
ov1 <- GenomicRanges::findOverlaps(query = gr2, subject = gr,
type = "within",
ignore.strand = TRUE)
ov1perGr <- table(S4Vectors::subjectHits(ov1))
nwithin <- rep(0L, length(gr))
nwithin[as.numeric(names(ov1perGr))] <- as.integer(ov1perGr)
df[[nm1]] <- nwithin
df[[nm2]] <- GenomicRanges::countOverlaps(query = gr, subject = gr2,
type = "any",
ignore.strand = TRUE)
}

## ... nearest
for (i in seq_along(nearest)) {
gr2 <- nearest[[i]]
if (is(gr2, "GRangesList")) {
gr2 <- BiocGenerics::unlist(gr2)
}
nm1 <- paste0(names(nearest)[i], ".nearestName")
nm2 <- paste0(names(nearest)[i], ".nearestDistance")
j <- GenomicRanges::nearest(x = gr, subject = gr2, select = "arbitrary",
ignore.strand = TRUE)
df[[nm1]] <- names(gr2)[j]
dists <- rep(NA_integer_, length(gr))
ok <- !is.na(j)
dists[ok] <- GenomicRanges::distance(x = gr[ok], y = gr2[j[ok]],
ignore.strand = TRUE)
df[[nm2]] <- dists
}

## return result
S4Vectors::mcols(gr) <- df
return(gr)
}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ reference:
- '`parsePkgVersions`'
- '`prepareGTF`'
- '`getMappableRegions`'
- '`getGenomicTiles`'
- title: General package information
contents:
- '`swissknife-package`'
96 changes: 96 additions & 0 deletions man/getGenomicTiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading