In [2]:
library(RPostgreSQL)
library(dplyr)
library(GenomicRanges)
library(doParallel)

In [3]:
source("~/git-repos/BDDS/footprints/testdb/src/dbFunctions.R")

In [4]:
# connect to databases

# load chipseq traditionally -- will just put this in memory eventually
# Open the chipseq PostgreSQL connection only if this session does not already
# have one. The braces are required: without them only the first assignment
# was conditional, so every re-run of this cell opened a new connection and
# dropped the reference to the previous one.
if (!exists("db.chipseq")) {
    dbname <- "chipseq"
    port <- "5432"
    driver <- dbDriver("PostgreSQL")
    user <- "trena"
    password <- "trena"
    host <- "whovian"
    db.chipseq <- dbConnect(drv=driver, user=user, password=password, dbname=dbname, host=host, port=port)
}

## create map: M[TF] = c(motif1,motif2,...)

In [5]:
# chipseq hits table can fit in memory so load it up
# Pull the whole `hits` table into memory as a tibble.
chipseq.hits <- as.tbl(dbGetQuery(db.chipseq, "select * from hits"))

# Pull the `regions` table and rewrite chromosome labels from e.g. "chr10" to
# "10" (the notation the original author notes is used by fimo).
# Only a leading "chr" is removed: a label that is already bare is left as is,
# whereas dropping the first three characters blindly would have mangled it.
chipseq.regions <- as.tbl(dbGetQuery(db.chipseq, "select * from regions"))
chipseq.regions$chrom <- sub("^chr", "", chipseq.regions$chrom)

In [6]:
# see which of the TFs in the chipseq data are mapped to any motifs
# Read the motif-to-TF mapping table, then partition the distinct ChIP-seq TF
# names by whether they appear in the mapping's `tfs` column.
TF.motif.pairs <- read.csv(
    "/local/sament/tfbs/human_brain/motif_to_tf_mappings_with_tfclass_include_multiple.csv",
    stringsAsFactors=FALSE
)

unique.cs.tfs <- unique(chipseq.hits$name)
cs.tf.nomatches <- setdiff(unique.cs.tfs, TF.motif.pairs$tfs)
cs.tf.matches <- intersect(unique.cs.tfs, TF.motif.pairs$tfs)

In [7]:
# number of distinct TF names in the ChIP-seq hits table
length(unique.cs.tfs)
# how many of those are absent from the mapping table's `tfs` column
length(cs.tf.nomatches)
# how many of those are present in the mapping table's `tfs` column
length(cs.tf.matches)

In [8]:
# create a list of lists (poor sub for dict) for TFs -> motifs and motifs -> TFs
# only use jaspar motifs

# Re-read the mapping table as a tibble and keep only JASPAR rows, i.e. rows
# whose motif ID starts with "MA" followed by a digit.
TF.motif.pairs <- as.tbl(read.csv("/local/sament/tfbs/human_brain/motif_to_tf_mappings_with_tfclass_include_multiple.csv",
                            stringsAsFactors=FALSE))

TF.motif.pairs.jaspar <- TF.motif.pairs[grepl("^MA[0-9]", TF.motif.pairs$motif), ]

# TF name -> character vector of JASPAR motif IDs mapped to it, restricted to
# the ChIP-seq TFs found in the mapping; TFs with no JASPAR motif are dropped.
TFs.to.motifs <- lapply(
    setNames(cs.tf.matches, cs.tf.matches),
    function(tf.name) {
        TF.motif.pairs.jaspar$motif[TF.motif.pairs.jaspar$tfs %in% tf.name]
    }
)
TFs.to.motifs <- TFs.to.motifs[lengths(TFs.to.motifs) >= 1]

# JASPAR motif ID -> character vector of TF names mapped to it.
motifs.to.TFs <- lapply(
    setNames(nm = unique(TF.motif.pairs.jaspar$motif)),
    function(motif.id) {
        TF.motif.pairs.jaspar$tfs[TF.motif.pairs.jaspar$motif %in% motif.id]
    }
)
motifs.to.TFs <- motifs.to.TFs[lengths(motifs.to.TFs) >= 1]

In [9]:
# number of ChIP-seq TFs that have at least one JASPAR motif
length(TFs.to.motifs)
# number of distinct JASPAR motif IDs in the mapping table
length(motifs.to.TFs)

In [10]:
# Flatten the per-TF motif vectors into one vector in a single call (instead of
# growing it with c() on every loop iteration), then report how many distinct
# motifs are covered by the ChIP-seq TFs. An empty map yields NULL, as before.
allmots <- unlist(TFs.to.motifs, use.names = FALSE)
length(unique(allmots))

## function to output df containing pos and neg examples for each TF

In [11]:
#' Build the labelled FIMO example set for one transcription factor.
#'
#' Fetches every row of the `fimo_hg38` table whose motif is mapped to `TF`,
#' labels the rows that overlap one of the TF's ChIP-seq regions as positives
#' (cs_hit = 1), and samples negatives (cs_hit = 0) from the remaining rows,
#' motif by motif, at up to `neg.pos.ratio` negatives per positive.
#'
#' Reads the globals `chipseq.hits`, `chipseq.regions` and `TFs.to.motifs`, and
#' opens its own connection to the `fimo` database on every call.
#' NOTE(review): that connection is never closed explicitly here -- confirm
#' whether the installed dplyr version disconnects it on garbage collection.
#'
#' @param TF Name of the TF; must be a name in `TFs.to.motifs`.
#' @param neg.pos.ratio Maximum number of negatives sampled per positive, per
#'   motif. Fewer are returned when a motif has fewer negative candidates.
#' @param verbose If TRUE, emit a message before the database query.
#' @return A tibble holding the selected FIMO rows (positives first, then
#'   negatives) with a numeric `cs_hit` column appended.
create.TF.df <- function(TF, neg.pos.ratio=10, verbose=FALSE) {

    motifs.TF <- TFs.to.motifs[[TF]]
    if (length(motifs.TF) == 0) {
        stop("no motifs mapped to TF '", TF, "' in TFs.to.motifs", call. = FALSE)
    }

    db.fimo.dplyr <- src_postgres(drv=dbDriver("PostgreSQL"),
                                  user="trena",
                                  password="trena",
                                  dbname="fimo",
                                  host="whovian",
                                  port="5432")
    tbl.fimo.dplyr <- tbl(db.fimo.dplyr, "fimo_hg38")

    # regions carry the coordinates, hits carry the TF name, so join via `loc`
    chipseq.hits.TF <- subset(chipseq.hits, name == TF)
    chipseq.regions.TF <- subset(chipseq.regions, loc %in% chipseq.hits.TF$loc)

    # the next step is slow, so give some context first
    if (verbose == TRUE) {
        motif.word <- if (length(motifs.TF) == 1) "motif" else "motifs"
        message(paste(TF, "- querying fimo database for", length(motifs.TF), motif.word))
    }

    # this is the slow step -- the filter runs as SQL against the whole fimo table
    # branch kept from the original: %in% translation to SQL was reported not to
    # work for a length-1 vector
    if (length(motifs.TF) > 1) {
        fimo.motifs.for.TF <- as.tbl(as.data.frame(filter(tbl.fimo.dplyr, motifname %in% motifs.TF)))
    } else {
        fimo.motifs.for.TF <- as.tbl(as.data.frame(filter(tbl.fimo.dplyr, motifname == motifs.TF)))
    }

    # intersect motif hits with ChIP-seq regions using GenomicRanges
    gr.fimo.TF <- with(fimo.motifs.for.TF, GRanges(chrom, IRanges(start=start, end=endpos)))
    gr.chipseq.TF <- with(chipseq.regions.TF, GRanges(chrom, IRanges(start=start, end=endpos)))
    overlaps.gr.TF <- findOverlaps(gr.chipseq.TF, gr.fimo.TF, type="any")

    # row numbers in fimo.motifs.for.TF whose motif overlaps a ChIP-seq region
    positive.rows <- unique(as.data.frame(overlaps.gr.TF)$subjectHits)
    is.positive <- seq_len(nrow(fimo.motifs.for.TF)) %in% positive.rows

    # Negative candidates are exactly the rows that are not positives. Selecting
    # them by row index (rather than by matching start/end values) keeps sites on
    # other chromosomes that merely share a coordinate with some positive.
    # Counts are taken within each motif's own rows, so positive and candidate
    # counts can never be misaligned between motifs.
    sample.negative.rows <- function(motif.rows) {
        candidate.rows <- motif.rows[!is.positive[motif.rows]]
        n.positive <- sum(is.positive[motif.rows])
        # don't try to sample more than the population
        n.wanted <- min(floor(n.positive * neg.pos.ratio), length(candidate.rows))
        candidate.rows[sample.int(length(candidate.rows), n.wanted)]
    }
    rows.by.motif <- split(seq_len(nrow(fimo.motifs.for.TF)), fimo.motifs.for.TF$motifname)
    negative.rows <- as.integer(unlist(lapply(rows.by.motif, sample.negative.rows),
                                       use.names = FALSE))

    # collect positives then negatives and annotate; subsetting the fetched table
    # once keeps its columns even when either group is empty
    all.fimo.examples.TF.df <- fimo.motifs.for.TF[c(positive.rows, negative.rows), ]
    all.fimo.examples.TF.df$cs_hit <- rep(c(1, 0),
                                          c(length(positive.rows), length(negative.rows)))

    return(as.tbl(all.fimo.examples.TF.df))

}

## create df of pos/neg samples for all TFs all together

In [54]:
# ratio of negative to positive examples in the data set
pnr <- 49

In [19]:
# Build the per-TF example tables in parallel on an 11-worker PSOCK cluster.
cl <- makePSOCKcluster(11)
registerDoParallel(cl)

sorted.TF.names <- sort(names(TFs.to.motifs))
N.TF <- length(sorted.TF.names)

# The loop runs inside tryCatch(finally=) so the workers are always shut down
# and the sequential backend restored, even if one TF fails.
TF.df.foreach <- tryCatch({
    invisible(clusterEvalQ(cl, {
        library(DBI)
        library(RPostgreSQL)
        library(dplyr)
        library(GenomicRanges)
    }))

    # create.TF.df reads chipseq.hits, chipseq.regions and TFs.to.motifs as
    # globals. foreach only auto-exports names used directly in the loop body,
    # so PSOCK workers (fresh R sessions) need these listed in .export.
    # seq_len() keeps the loop empty when there are no TFs (1:0 would not).
    foreach(i.TF=seq_len(N.TF), .inorder=FALSE,
            .packages=c("DBI", "RPostgreSQL", "dplyr", "GenomicRanges"),
            .export=c("chipseq.hits", "chipseq.regions", "TFs.to.motifs")) %dopar% {
        TFname <- sorted.TF.names[[i.TF]]
        create.TF.df(TFname, neg.pos.ratio=pnr, verbose=TRUE)
    }
}, finally = {
    # clean up after parallel setup
    stopCluster(cl)
    registerDoSEQ()
})

In [26]:
# Stack the per-TF tibbles in one call. This avoids re-copying the accumulated
# table on every iteration and no longer leaves a global `df` behind that
# shadows stats::df. An empty result list still yields an empty tibble.
all.TF.df <- bind_rows(TF.df.foreach)

In [56]:
# Write the combined table to an .RData file whose name embeds the ratio.
fname <- paste0("/local/rory/all.TF.fimo.samples.ratio.", pnr, ".df.RData")
save(all.TF.df, file=fname)

## serial version of production loop

In [57]:
# Serial variant of the production loop. Original author's note: it hung on
# the last TF for an unknown reason.
# The accumulated table is re-saved after every TF.

all.TF.df <- tibble()

sorted.TF.names <- sort(names(TFs.to.motifs))
for (TFnum in seq_along(sorted.TF.names)) {

    TFname <- sorted.TF.names[[TFnum]]
    message("Processing TF ", TFnum, " / ", length(sorted.TF.names))

    TF.df <- create.TF.df(TFname, neg.pos.ratio=9, verbose=TRUE)
    all.TF.df <- rbind(all.TF.df, TF.df)

    save(all.TF.df, file="/local/rory/all.TF.fimo.samples.ratio.9.df.RData")

}

## old code used to construct main function

In [14]:
# # eventual function parameters
# TF="E2F4"
# neg.pos.ratio=10

In [15]:
# # regions locs we can compute on but hits have TF info so need both
# chipseq.hits.TF <- subset(chipseq.hits, name == TF)
# locs.TF <- chipseq.hits.TF$loc
# chipseq.regions.TF <- subset(chipseq.regions, loc %in% locs.TF)

In [16]:
# str(chipseq.regions.TF)

In [17]:
# # this is the slow step -- doing SQL queries on tbl.fimo.dplyr = call to whole fimo database
# fimo.motifs.for.TF <- as.tbl(as.data.frame(filter(tbl.fimo.dplyr, motifname %in% TFs.to.motifs[[TF]])))

In [18]:
# str(fimo.motifs.for.TF)

In [19]:
# # find intersect using fast genomic ranges data structure
# gr.fimo.TF <- with(fimo.motifs.for.TF, GRanges(chrom, IRanges(start=start, end=endpos)))
# gr.chipseq.TF <- with(chipseq.regions.TF, GRanges(chrom, IRanges(start=start, end=endpos)))
# overlaps.gr.TF <- findOverlaps(gr.chipseq.TF, gr.fimo.TF, type="any")

In [20]:
# overlaps.TF <- as.tbl(as.data.frame(overlaps.gr.TF))

In [21]:
# str(overlaps.TF)

In [1]:
# # row numbers in fimo.motifs.for.TF where motifs overlap with chipseq peaks
# positive.fimo.examples.rows.TF <- unique(overlaps.TF$subjectHits)

In [2]:
# str(positive.fimo.examples.rows.TF)

In [None]:
# positive.fimo.examples.TF.df <- fimo.motifs.for.TF[positive.fimo.examples.rows.TF,]

In [None]:
# # figure out how many negative samples for each motif we want
# tot.motif.counts.TF <- table(fimo.motifs.for.TF$motifname)
# pos.motif.counts.TF <- table(fimo.motifs.for.TF[positive.fimo.examples.rows.TF,]$motifname)
# nx.pos.motif.counts.TF <- pos.motif.counts.TF*neg.pos.ratio

# # want neg samples in fimo.motifs.for.TF to be non overlapping with chipseq peaks or pos examples
# neg.cands.for.single.TF.df <- subset(fimo.motifs.for.TF,
#                                      !(start %in% positive.fimo.examples.TF.df$start) &
#                                      !(endpos %in% positive.fimo.examples.TF.df$endpos))
# neg.motif.counts.TF <- table(neg.cands.for.single.TF.df$motifname)

# # don't try to sample more than the population
# neg.sample.counts.TF <- pmin(nx.pos.motif.counts.TF,neg.motif.counts.TF)

In [23]:
# # for each motif this TF matches, sample some examples where no CS hit
# negative.fimo.examples.TF.df <- tibble()
# for (motname in names(neg.sample.counts.TF)) {
#     neg.cands.for.single.TF.df.single.motif <- subset(neg.cands.for.single.TF.df, motifname == motname)
#     negative.fimo.examples.TF.df <- rbind(negative.fimo.examples.TF.df,
#                                 sample_n(neg.cands.for.single.TF.df.single.motif, neg.sample.counts.TF[[motname]]))
# }

In [24]:
# str(negative.fimo.examples.TF.df)

In [28]:
# positive.fimo.examples.TF.df <- as.tbl(cbind(positive.fimo.examples.TF.df, "cs_hit"=1))
# negative.fimo.examples.TF.df <- as.tbl(cbind(negative.fimo.examples.TF.df, "cs_hit"=0))
# all.fimo.examples.TF.df <- as.tbl(rbind(positive.fimo.examples.TF.df,negative.fimo.examples.TF.df))

In [29]:
# str(all.fimo.examples.TF.df)

In [30]:
# sample_n(all.fimo.examples.TF.df,50)

In [31]:
# need to compile these pos+neg hits into new table and save -- per TF or all together?