In [1]:
library(GenomicRanges)
# install.packages("car", verbose=TRUE)

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomeInfoDb



In [2]:
# Locations of the inputs and of the output for this chunk of genes.
output.f <- "test"
covariates.f <- "covariates.df.RData"
gene.loc.chunk <- "nominal_in.596"
indexed.geno.f <- "genotype.tsv.bgz"
trans.expr.p.f <- "tre.df.RData"

# Transcript expression: load() brings `tre.df` into the workspace.
load(trans.expr.p.f)

# Gene coordinates for this chunk (no header line in the file).
genes.bed <- read.table(gene.loc.chunk, header = FALSE, as.is = TRUE)
genes.bed <- stats::setNames(genes.bed, c("chr", "start", "end", "geneId"))
genes <- genes.bed[["geneId"]]

# Keep only the transcripts of the genes listed in the chunk.
tre.df <- tre.df[tre.df$geneId %in% genes, , drop = FALSE]

# Covariates: load() brings `covariates.df` into the workspace.
load(covariates.f)


In [3]:
# The call being unrolled for debugging (kept for reference):
# <!-- res.df <- sqtl.seeker(tre.df, indexed.geno.f, genes.bed,
#                       covariates = covariates.df,
#                       genic.window = opt$window,
#                       min.nb.ext.scores = opt$min_nb_ext_scores,
#                       nb.perm.max = opt$nb_perm_max,
#                       nb.perm.max.svQTL = opt$nb_perm_max_svqtl,
#                       svQTL = opt$svqtl, asympt = opt$asympt,
#                       min.nb.ind.geno = opt$min_nb_ind_geno,
#                       ld.filter = LD, verbose = opt$verbose) -->

# Data arguments, bound to the objects prepared in the previous cell.
genotype.f <- indexed.geno.f
gene.loc <- genes.bed
covariates <- covariates.df

# Genomic window and genotype filtering.
genic.window <- 5000
min.nb.ind.geno <- 10
ld.filter <- NULL

# Testing mode and permutation budgets.
asympt <- TRUE
svQTL <- FALSE
min.nb.ext.scores <- 1000
nb.perm.max <- 1e6
nb.perm.max.svQTL <- 1e4

verbose <- TRUE


In [4]:
# Read a tabix-indexed, tab-delimited file, optionally restricted to regions.
#
# Arguments:
#   file        Path to the file. If it does not exist as given, ".bgz" is
#               appended and the path is tried again. A "<file>.tbi" index
#               must sit next to it.
#   subset.reg  NULL (read the whole file with read.table), a data.frame with
#               columns 'chr', 'start' and 'end', or a GRanges object.
#   header      Does the first line of the file hold the column names?
#
# Returns a data.frame ordered by its 'chr' and 'start' columns, or NULL when
# the query returns no record. The whole-file path (subset.reg = NULL) returns
# the read.table() result unsorted.
read.bedix <- function(file, subset.reg = NULL, header = TRUE)
{
    if(!is.character(file)){
        file <- as.character(file)
    }
    if(!file.exists(file)){
        file <- paste0(file, ".bgz")
    }
    if(!file.exists(file)){
        stop(file, ": Input file not found (with and without .bgz extension).")
    }
    if(!file.exists(paste0(file, ".tbi"))){
        stop(paste0(file, ".tbi"), ": Index file not found.")
    }
    if(is.null(subset.reg)){
        return(utils::read.table(file, as.is = TRUE, header = header))
    }
    if (is.data.frame(subset.reg)) {
        if(!all(c("chr", "start", "end") %in% colnames(subset.reg))){
            stop("Missing column in 'subset.reg'. 'chr', 'start' and 'end' are required.")
        }
        subset.reg <- with(subset.reg,
                           GenomicRanges::GRanges(chr, IRanges::IRanges(start, end)))
    } else if (!methods::is(subset.reg, "GRanges")) {
        # is() instead of class() ==: safe for multi-element class vectors
        # and accepts subclasses of GRanges.
        stop("'subset.reg' must be a data.frame or a GRanges object.")
    }
    subset.reg <- subset.reg[order(as.character(GenomicRanges::seqnames(subset.reg)),
                                   GenomicRanges::start(subset.reg))]

    # Query the index for one set of ranges and parse the records returned.
    read.chunk <- function(gr){
        # Restrict the query to the requested ranges; overlapping ranges are
        # merged first so that a record is not returned twice.
        bed <- tryCatch(
            unlist(Rsamtools::scanTabix(file, param = GenomicRanges::reduce(gr)),
                   use.names = FALSE, recursive = FALSE),
            error = function(e){
                # A failed query is treated as "no record", but it is
                # reported rather than silently swallowed.
                warning("Tabix query failed on ", file, ": ",
                        conditionMessage(e), call. = FALSE)
                NULL
            })
        if (length(bed) == 0) {
            return(NULL)
        }
        ncol <- length(strsplit(bed[1], "\t", fixed = TRUE)[[1]])
        bed <- matrix(unlist(strsplit(bed, "\t", fixed = TRUE), use.names = FALSE,
                             recursive = FALSE),
                      length(bed), ncol, byrow = TRUE)
        bed <- data.table::data.table(bed)
        if (header) {
            # A region query returns data records only, so the column names
            # are read separately from the first line of the file.
            data.table::setnames(bed,
                                 as.character(utils::read.table(file, nrows = 1, as.is = TRUE)))
        }
        # Every field was read as text: let R guess the column types.
        bed <- bed[, lapply(.SD, function(ee)utils::type.convert(as.character(ee), as.is = TRUE))]
        bed <- as.data.frame(bed)
        return(bed)
    }

    if (length(subset.reg) > 10000) {
        # Query in batches of roughly 10000 ranges.
        chunks <- cut(seq_along(subset.reg), ceiling(length(subset.reg)/10000))
        bed.df <- plyr::ldply(levels(chunks), function(ch.id){
            read.chunk(subset.reg[which(chunks == ch.id)])
        })
    } else {
        bed.df <- read.chunk(subset.reg)
    }
    if(!is.null(bed.df)){
        bed.df <- bed.df[order(bed.df$chr, bed.df$start), ]
    }
    return(bed.df)
}

In [7]:
. <- nb.groups <- snpId <- NULL ## Uglily appease R checks (dplyr)

# Test the SNPs in and around one gene against that gene's transcript values.
#
# Argument:
#   tre.gene  Rows of the transcript expression table for a single gene, with
#             columns 'trId', 'geneId' and one column per sample.
#
# Settings are not passed as arguments: the function reads gene.loc,
# genotype.f, covariates, genic.window, ld.filter, min.nb.ind.geno, svQTL,
# asympt, min.nb.ext.scores, nb.perm.max, nb.perm.max.svQTL and verbose from
# the enclosing environment. It also calls check.genotype, LD.filter,
# compFscore and compPvalue, which are defined outside this block.
#
# Returns data.frame(done = TRUE, <results ordered by 'pv'>) when at least one
# sub-range produced results, and data.frame(done = FALSE) otherwise.
analyze.gene.f <- function(tre.gene){
    if(verbose) message(tre.gene$geneId[1])
    # NOTE(review): this only fires from the second duplicated entry onwards
    # ('> 1'), and it looks at the whole location table rather than at the
    # current gene. Left unchanged; confirm the intended threshold.
    if(sum(duplicated(gene.loc$geneId)) > 1){
        stop(tre.gene$geneId[1], " Repeated gene in gene location file.")
    }
    gr.gene <- with(gene.loc[which(gene.loc$geneId == tre.gene$geneId[1]), ],
                    GenomicRanges::GRanges(chr, IRanges::IRanges(start, end)))
    if(genic.window > 0){
        # Extend the gene body by 'genic.window' on each side.
        gr.gene <- GenomicRanges::resize(gr.gene, GenomicRanges::width(gr.gene) +
                                           2 * genic.window, fix = "center")
    }
    if(length(gr.gene) == 0){
        if(verbose) warning("Issue with the gene location.")
        return(data.frame(done = FALSE))
    }

    if(!is.null(covariates)){
        if(!all(rownames(covariates) %in% colnames(tre.gene))){
            stop("All samples should have covariate information (either a value or NA).")
        }
        cov.na <- apply(covariates, 1, function(x){any(is.na(x))})
        covariates <- covariates[!cov.na, , drop = FALSE]
        if(sum(cov.na) > 0){
            warning(sprintf("%s samples with NA values for at least one covariate have been removed.",
                            sum(cov.na)))
        }
        # The fraction is taken over ALL samples (length(cov.na)), not over
        # the samples left after the NA rows were dropped.
        if(length(cov.na) > 0 && mean(cov.na) > 0.05){
            stop("More than 5% of the samples contain NA values for at least one covariate")
        }
    }

    # Drop the columns that are NA in the first transcript row.
    tre.gene <- tre.gene[, !is.na(tre.gene[1, ])]
    genotype.headers <- as.character(utils::read.table(genotype.f,
                                                       as.is = TRUE, nrows = 1))
    if(!is.null(covariates)){
        com.samples <- Reduce(intersect, list(colnames(tre.gene),
                                              genotype.headers, rownames(covariates)))
        if (length(com.samples) == 0) {
            stop("No common samples between genotype, covariate and transcript files.")
        }
    } else{
        com.samples <- intersect(colnames(tre.gene), genotype.headers)
        if (length(com.samples) == 0) {
            stop("No common samples between genotype and transcript files.")
        }
    }
    tre.gene <- tre.gene[, c("trId", "geneId", com.samples)]

    # drop = FALSE keeps a table even when a single sample is shared.
    allzero <- rowSums(tre.gene[, com.samples, drop = FALSE]) == 0
    if (any(allzero)){
        tr.out <- tre.gene$trId[allzero]
        tre.gene <- tre.gene[!allzero,]
        warning(sprintf("Transcript(s) %s is(are) removed due to zero expression in all common samples.",
                        paste(tr.out, collapse = ", ")) )
    }

    # Samples in rows, transcripts in columns, square-root transformed.
    tre.tc <- t(sqrt(tre.gene[, com.samples, drop = FALSE]))
    colnames(tre.tc) <- tre.gene$trId
    if(!is.null(covariates)){
        covariates <- covariates[com.samples, , drop = FALSE]
        # A covariate with a single observed value cannot be fitted.
        multiclass <- apply(covariates, 2, function(x){length(table(x)) > 1})
        covariates <- covariates[, multiclass, drop = FALSE]
        if (verbose && any(!multiclass)){
            message("\t", "Covariates removed due to only one value: ",
                    paste(names(multiclass)[!multiclass], collapse = ", "))
        }
        fit <- stats::lm(tre.tc ~ ., data = covariates)
        if (ncol(covariates) > 1){
            vifs <- car::vif(stats::lm(tre.tc[, 1] ~ ., data = covariates))
            if (verbose){
                message("\t", "Covariates VIF - ",
                        paste(names(vifs), round(vifs, 2), sep = ": ", collapse = ", "))
            }
            if (any(vifs > 5)){
                warning("Check multicollinearity. VIF > 5 for some covariates:", "\n",
                        paste(names(vifs), round(vifs, 2), sep = ": ", collapse = ", "))
            }
        }
        # Downstream tests run on the residuals of the covariate regression.
        tre.tc <- fit$residuals
    }

    if(GenomicRanges::width(gr.gene) > 20000 && is.null(ld.filter)){
        # Cut wide regions into consecutive, non-overlapping sub-ranges of
        # about 10 kb so that genotypes are read piece by piece.
        pos.breaks <- unique(round(seq(GenomicRanges::start(gr.gene),
                                       GenomicRanges::end(gr.gene),
                                       length.out = floor(GenomicRanges::width(gr.gene)/10000) + 1)))
        gr.gene.spl <- rep(gr.gene, length(pos.breaks) - 1)
        GenomicRanges::start(gr.gene.spl) <- pos.breaks[-length(pos.breaks)]
        # Shift the last break so that the final sub-range keeps its end.
        pos.breaks[length(pos.breaks)] <- pos.breaks[length(pos.breaks)] + 1
        GenomicRanges::end(gr.gene.spl) <- pos.breaks[-1] - 1
    } else {
        gr.gene.spl <- gr.gene
    }

    res.df <- lapply(seq_along(gr.gene.spl), function(ii){
        res.range <- data.frame()
        if(verbose) message("  Sub-range ", ii)
        genotype.gene <- read.bedix(genotype.f, gr.gene.spl[ii])
        if(is.null(genotype.gene)){
            if(verbose) message("\tNo SNPs in the genomic range.")
            return(res.range)
        }
        snps.to.keep <- check.genotype(genotype.gene[, com.samples],
                                       tre.gene[, com.samples],
                                       min.nb.ind.geno = min.nb.ind.geno)
        if(verbose){
            snps.to.keep.t <- table(snps.to.keep)
            message("\t", paste(names(snps.to.keep.t), snps.to.keep.t,
                                sep = ": ", collapse=", "))
        }
        if(any(snps.to.keep == "PASS")){
            genotype.gene <- genotype.gene[snps.to.keep == "PASS", ]
            if(!is.null(ld.filter)){
                if(verbose) message("\tLD filtering")
                genotype.gene <- LD.filter(genotype.gene = genotype.gene,
                                           tre.mt = tre.tc, th = ld.filter,
                                           svQTL = svQTL)
            }
            res.range <- dplyr::do(dplyr::group_by(genotype.gene, snpId),
                                   compFscore(., tre.tc, svQTL = svQTL,
                                              asympt = asympt,
                                              res = !is.null(covariates)))
        }
        return(res.range)
    })

    # Keep only the sub-ranges that produced at least one result row.
    range.done <- which(unlist(lapply(res.df, nrow)) > 0)
    if(length(range.done) > 0){
        res.df <- res.df[range.done]
        res.df <- do.call(rbind, res.df)
        if (!is.null(ld.filter)){
            # Set the LD column aside; it is merged back after the p-values.
            ld <- res.df[,c("snpId","LD")]
            res.df$LD <- NULL
        }
        if(!asympt){
            res.df <- dplyr::do(dplyr::group_by(res.df, nb.groups),
                                compPvalue(., tre.tc, min.nb.ext.scores = min.nb.ext.scores,
                                           nb.perm.max = nb.perm.max))
        }
        if(svQTL){
            res.df <- dplyr::do(dplyr::group_by(res.df, nb.groups),
                                compPvalue(., tre.tc, svQTL = TRUE,
                                           min.nb.ext.scores = min.nb.ext.scores,
                                           nb.perm.max = nb.perm.max.svQTL))
        }
        if (!is.null(ld.filter)){
            res.df <- merge(res.df, ld, by = "snpId")
        }
        res.df <- dplyr::arrange(res.df, pv)
        return(data.frame(done = TRUE, res.df))
    }
    return(data.frame(done = FALSE))
}

In [9]:
# ret.df <- lapply(c("ENSG00000102189.17"), function(gene.i){
#        df <- tre.df[which(tre.df$geneId == gene.i), ]
#        data.frame(geneId = gene.i, analyze.gene.f(df))
#     })

In [13]:
# thing = read.bedix(indexed.geno.f)

# Build the genomic range of a single gene from the location table.
geneId <- "ENSG00000102189.17"
gr.gene <- with(
    gene.loc[gene.loc$geneId %in% geneId, ],
    GenomicRanges::GRanges(chr, IRanges::IRanges(start, end))
)
gr.gene

# Peek at the first two lines of the genotype file, one string per column.
temp <- as.character(
    utils::read.table(indexed.geno.f, nrows = 2, as.is = TRUE)
)
head(temp)

GRanges object with 1 range and 0 metadata columns:
      seqnames            ranges strand
         <Rle>         <IRanges>  <Rle>
  [1]    chr12 92770636-92929327      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [None]:

<!--
ret.df <- lapply(unique(tre.df$geneId), function(gene.i){
       df <- tre.df[which(tre.df$geneId == gene.i), ]
       data.frame(geneId = gene.i, analyze.gene.f(df))
    })

    done <- which(unlist(lapply(ret.df, ncol)) > 2)
    if(length(done) > 0){
        ret.df <- ret.df[done]
        ret.df <- do.call(rbind, ret.df)
        ret.df$done <- NULL
        return(ret.df)
    } else {
        return(NULL)
    } -->
