In [62]:
# CRAN packages used throughout this notebook
libs <- c(
    "dplyr",
    "tidyr",
    "stringr"
)

# Install any package that is missing, then attach every package.
# A failed require() followed only by install.packages() would leave the
# freshly installed package unattached, so attaching is done unconditionally.
for (lib in libs) {
    if (!requireNamespace(lib, quietly = TRUE)) {
        install.packages(lib, repos = "http://cran.us.r-project.org")
    }
    library(lib, character.only = TRUE, quietly = TRUE)
}

# The genome package comes from Bioconductor, not CRAN. The old biocLite()
# installer has been retired; if the package is missing, install it with:
# install.packages("BiocManager")
# BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
library(BSgenome.Hsapiens.UCSC.hg38)
hg38 <- BSgenome.Hsapiens.UCSC.hg38

## cleaning and splitting functions

In [53]:
# Randomly split a data set into an independent classifier (ic) work set and
# an ensemble classifier (ec) work set.
#
# Arguments:
#   df  anything coercible to a tibble; rows are the units being split
#   p   fraction of rows (between 0 and 1) assigned to the ic set;
#       the remaining rows form the ec set
#
# Returns a list with two tibbles, `ic` and `ec`, that together contain every
# row of `df` exactly once, each keeping the original row order.
# The split is random: call set.seed() beforehand for a reproducible result.
split.ic.ec <- function(df, p=0.5) {
    stopifnot(is.numeric(p), length(p) == 1, p >= 0, p <= 1)

    df <- tibble::as_tibble(df)
    n.all <- nrow(df)

    # positions drawn (without replacement) for the ic set
    ic.inds <- sample(n.all, round(p * n.all), replace = FALSE)

    # seq_len() yields an empty mask for empty input, whereas 1:0 would
    # produce c(1, 0) and a mask of the wrong length
    ic.inds.bool <- seq_len(n.all) %in% ic.inds

    list(
        ic = df[ic.inds.bool, ],
        ec = df[!ic.inds.bool, ]
    )
}


# Randomly split a data set into training, testing, and validation sets.
#
# Arguments:
#   df       anything coercible to a tibble; rows are the units being split
#   p.tt     fraction (between 0 and 1) of the rows left after removing the
#            validation set that is used for training; the rest is for testing
#   p.valid  fraction (between 0 and 1) of all rows set aside for validation
#
# Returns a list with three tibbles, `train`, `test` and `valid`, that
# together contain every row of `df` exactly once.
# The split is random: call set.seed() beforehand for a reproducible result.
split.test.train.valid <- function(df, p.tt=0.75, p.valid=0.25) {
    stopifnot(
        is.numeric(p.tt), length(p.tt) == 1, p.tt >= 0, p.tt <= 1,
        is.numeric(p.valid), length(p.valid) == 1, p.valid >= 0, p.valid <= 1
    )

    df <- tibble::as_tibble(df)
    n.all <- nrow(df)

    # stage 1: carve the validation rows out of the full data set
    valid.inds <- sample(n.all, round(p.valid * n.all), replace = FALSE)
    # seq_len() stays empty for empty input, whereas 1:0 would give c(1, 0)
    valid.inds.bool <- seq_len(n.all) %in% valid.inds

    valid.df <- df[valid.inds.bool, ]
    tt.df <- df[!valid.inds.bool, ]

    # stage 2: divide what is left between training and testing
    n.tt <- nrow(tt.df)
    train.inds <- sample(n.tt, round(p.tt * n.tt), replace = FALSE)
    train.inds.bool <- seq_len(n.tt) %in% train.inds

    list(
        train = tt.df[train.inds.bool, ],
        test = tt.df[!train.inds.bool, ],
        valid = valid.df
    )
}

## Load Data

In [49]:
# load() restores the saved objects into the workspace; the feature table
# arrives as the variable "tbl"
load("~/git-repos/BDDS/trenadb/featureTable/lymphoblast/featureTableLymphoblast-chr19-entire.RData")

# keep only the first occurrence of each fully identical row
tbl.nodupes <- tbl[!duplicated(tbl), ]

# fix spurious zeros in piq score 2: recode every exact 0 to -999
tbl.nodupes$score2.p <- replace(
    tbl.nodupes$score2.p,
    tbl.nodupes$score2.p == 0,
    -999
)

In [55]:
# cory's table mapping each motif to its tf class;
# text columns are kept as character rather than converted to factors
tf.fams.file <- "/local/Cory/for_Paul/motif_class_fam"
tf.fams.table <- read.delim(file = tf.fams.file, stringsAsFactors = FALSE)

In [64]:
# check overlap of motif names from cory and paul

# strip all whitespace from cory's motif IDs so they compare cleanly
# (argument spelled out in full: `repl` only worked via partial matching)
motif.names.from.cory <- str_replace_all(
    string = tf.fams.table$ID,
    pattern = "[[:space:]]",
    replacement = ""
)
motif.names.from.paul <- unique(tbl.nodupes$motif)

# motifs in cory's table that never occur in the sample data
from.cory.not.in.paul <- motif.names.from.cory[!(motif.names.from.cory %in% motif.names.from.paul)]

# motifs in the sample data that are not mapped in cory's table
from.paul.not.in.cory <- motif.names.from.paul[!(motif.names.from.paul %in% motif.names.from.cory)]

# motifs present in both sources (sorted); these are the ones used downstream
from.paul.in.cory <- sort(motif.names.from.paul[motif.names.from.paul %in% motif.names.from.cory])
length(from.paul.in.cory)

In [65]:
# drop the rows of paul's table whose motif has no tf class in cory's table
ft.clean.corymotifs <- filter(tbl.nodupes, motif %in% from.paul.in.cory)

In [66]:
# find unique tf class names from cory's list

# strip all whitespace from the class names
# (argument spelled out in full: `repl` only worked via partial matching)
class.names.nospaces <- str_replace_all(
    string = tf.fams.table$class,
    pattern = "[[:space:]]",
    replacement = ""
)
unique.class.names.nospaces <- unique(class.names.nospaces)

# entries containing "::" name more than one class; break them into their parts
double.entry.classes <- grep("::", unique.class.names.nospaces, value = TRUE, fixed = TRUE)
dedoubled.classes <- unique(unlist(strsplit(double.entry.classes, "::", fixed = TRUE)))

# final list: single-class entries plus the parts of the multi-class entries
class.names.nospaces.minus.doubles <- unique.class.names.nospaces[!(unique.class.names.nospaces %in% double.entry.classes)]
class.names.final <- sort(unique(c(class.names.nospaces.minus.doubles, dedoubled.classes)))
length(class.names.final)

In [67]:
# cory's map: arrange it so each motif name used in paul's table can be
# looked up by row name to obtain its class string

tf.fams.table.renamed <- tf.fams.table
# strip whitespace from the IDs so they match the cleaned motif names
# (argument spelled out in full: `repl` only worked via partial matching)
tf.fams.table.renamed$ID <- str_replace_all(
    string = tf.fams.table.renamed$ID,
    pattern = "[[:space:]]",
    replacement = ""
)
tf.fams.table.renamed <- tf.fams.table.renamed %>% filter(ID %in% from.paul.in.cory)
# row names must be unique, so this errors if an ID occurs more than once
rownames(tf.fams.table.renamed) <- tf.fams.table.renamed$ID

# drop the first column (assumed to be ID -- TODO confirm against the file);
# drop = FALSE keeps a data frame even if only one column remains
tf.class.foreach.motif <- tf.fams.table.renamed[, -1, drop = FALSE]

tf.class.foreach.motif$class <- str_replace_all(
    string = tf.class.foreach.motif$class,
    pattern = "[[:space:]]",
    replacement = ""
)

In [68]:
# make class / tf indicator matrix: one row per motif, one column per class
# all zeros to start, fill in ones below
tf.motif.class.matrix <- matrix(0L, nrow = length(from.paul.in.cory), ncol = length(class.names.final))
rownames(tf.motif.class.matrix) <- from.paul.in.cory
colnames(tf.motif.class.matrix) <- class.names.final

# look up every motif's class string once, then split multi-class entries
# ("A::B") into their component class names
motif.class.strings <- tf.class.foreach.motif[rownames(tf.motif.class.matrix), "class"]
motif.class.parts <- strsplit(motif.class.strings, "::", fixed = TRUE)

# set a one wherever a class name is exactly one of the motif's components.
# Exact comparison (rather than substring search) keeps a class whose name is
# contained inside a longer class name from being flagged by mistake.
for (motif.row in seq_along(motif.class.parts)) {
    class.hits <- colnames(tf.motif.class.matrix) %in% motif.class.parts[[motif.row]]
    tf.motif.class.matrix[motif.row, class.hits] <- 1
}

In [69]:
# compare the matrix dimensions with the total number of ones:
# the number of motifs carrying extra class entries is small
dim(tf.motif.class.matrix)
sum(tf.motif.class.matrix)

# list any motifs whose row is all zeros, i.e. that matched no class
names(which(rowSums(tf.motif.class.matrix) == 0))

In [70]:
# turn the indicator matrix into a data frame and expose the motif names
# (its row names) as an ordinary column so it can serve as the join key
tf.motif.class.df <- as.data.frame(tf.motif.class.matrix)
tf.motif.class.df[["motif"]] <- row.names(tf.motif.class.df)
dim(tf.motif.class.df)

# attach the class columns to the feature table by motif name;
# all.x = TRUE keeps every feature-table row
ft.clean.corymotifs.tfclasses <- merge(
    x = ft.clean.corymotifs,
    y = tf.motif.class.df,
    by = "motif",
    all.x = TRUE
)
dim(ft.clean.corymotifs.tfclasses)

In [71]:
# check to see if any loc/motifs didn't get a class.
# Rows left unmatched by the merge hold NA (not 0) in the class columns, so
# NAs are dropped from the sum to make those rows show up here as well.
rownames(ft.clean.corymotifs.tfclasses)[
    rowSums(
        ft.clean.corymotifs.tfclasses[, colnames(tf.motif.class.matrix), drop = FALSE],
        na.rm = TRUE
    ) == 0
]

In [72]:
# most entries just have one class:
# compare the number of rows with the total count of class flags
nrow(ft.clean.corymotifs.tfclasses[, colnames(tf.motif.class.matrix)])
sum(ft.clean.corymotifs.tfclasses[, colnames(tf.motif.class.matrix)])

## Split data into different sets

In [75]:
# first divide the table between independent classifier (ic) and
# ensemble classifier (ec) work ...
ic.ec.list <- split.ic.ec(df = ft.clean.corymotifs.tfclasses)

ic.df <- ic.ec.list[["ic"]]
ec.df <- ic.ec.list[["ec"]]

# ... then cut each half into train, test, and validation sets
ic.train.test.valid.list <- split.test.train.valid(df = ic.df)
ec.train.test.valid.list <- split.test.train.valid(df = ec.df)

# unpack the ic sets
ic.train.df <- ic.train.test.valid.list[["train"]]
ic.test.df <- ic.train.test.valid.list[["test"]]
ic.valid.df <- ic.train.test.valid.list[["valid"]]

# unpack the ec sets
ec.train.df <- ec.train.test.valid.list[["train"]]
ec.test.df <- ec.train.test.valid.list[["test"]]
ec.valid.df <- ec.train.test.valid.list[["valid"]]

In [82]:
# sanity check: the two totals printed below should be equal

# rows in the ic and ec halves combined
nrow(ic.df) + nrow(ec.df)

# rows across all six train/test/validation sets
nrow(ic.train.df) + nrow(ic.test.df) + nrow(ic.valid.df) +
    nrow(ec.train.df) + nrow(ec.test.df) + nrow(ec.valid.df)

In [84]:
# write all six data sets into a single file in the working directory
save(
    list = c(
        "ic.train.df", "ic.test.df", "ic.valid.df",
        "ec.train.df", "ec.test.df", "ec.valid.df"
    ),
    file = "train_test_valid_data_sets.Rdata"
)