diff --git a/bon_wilsonrnaseq_hsc.R b/bon_wilsonrnaseq_hsc.R index ed89ea2..b246809 100644 --- a/bon_wilsonrnaseq_hsc.R +++ b/bon_wilsonrnaseq_hsc.R @@ -2,7 +2,7 @@ library(BTR) library(doParallel) -path='~/bool_final/' +path='~/btr_output/' setwd(path) #Setting up for parallel processing. @@ -20,14 +20,11 @@ bmodel = initialise_model(bon_bmodel) istate = initialise_data(bon_istate) #(3)Filtering expression data. -tmp_data = initialise_raw_data(wilson_raw_rnaseq) #do not filter data at this stage. keep the whole matrix. -cdata = tmp_data[[1]] #continuous data -ddata = tmp_data[[2]] #discretised data +cdata = initialise_raw_data(wilson_raw_rnaseq) #do not filter data at this stage. keep the whole matrix. fcdata = cdata[grepl('hsc', rownames(cdata)),] #select only HSC cells. -fddata = ddata[grepl('hsc', rownames(ddata)),] #(3)Filtering expression data. -result = model_train(cdata=fcdata, ddata=fddata, bmodel=bmodel, istate=istate, verbose=T) +result = model_train(cdata=fcdata, bmodel=bmodel, istate=istate, verbose=T) filename = 'bon_wilson_rnaseq_HSC_trained_bmodel.rda' save(result, file=filename) diff --git a/gnw_bnlearn_btr_scoring.R b/gnw_bnlearn_btr_scoring.R index db8bdce..f363383 100644 --- a/gnw_bnlearn_btr_scoring.R +++ b/gnw_bnlearn_btr_scoring.R @@ -1,183 +1,263 @@ #(1) Setup paths and environment. +#library(BoolTraineR) library(BTR) library(bnlearn) library(ggplot2) library(gridExtra) #for plotting multiple graphs library(reshape2) -path='~/res1/' +path='~/btr_output/' setwd(path) -inter_bool = T max_varperrule = 6 -acyclic=T nonoise=F +subset_bool = F -for(file_ind in 1:5) +for(acyclic in c(T,F)) { - #Load data. - load(paste(path, ifelse(acyclic, 'acyclic', 'cyclic'), '_data/test_data_', ifelse(nonoise, 'nonoise_', ''),'yeast', file_ind,'_20n30s10g.rda', sep='')) #test_data. - - #score the true model under Bayesian framework. - if(acyclic) - { - true_ndamat = abs(test_data$true_amat) - true_bngraph = empty.graph(colnames(test_data$true_amat)) - amat(true_bngraph) = true_ndamat #if this step throws error, that means the true graph is not DAG. Has to regenerate the random data using GNW. - true_bnscore = score(true_bngraph, as.data.frame(test_data$cdata)) - } - - #score the true model under Boolean model framework. - true_bmmodel = amat_to_bm(test_data$true_amat) - overlap_gene = unname(colnames(test_data$cdata)) - fddata = test_data$ddata - fcdata = test_data$cdata - fcdata = unique_raw_data(fddata, fcdata) #removes duplicates in continuous data. - true_bmscore = calc_mscore(bmodel=true_bmmodel, istate=test_data$istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule) - - #score the other bn models under Bayesian framework using GNW data. - bnmod_gnw_bnscore = c() - if(acyclic) + for(single_cell in c(T,F)) { - for(i in 1:length(test_data$bn_modamat)) + for(file_ind in 1:5) { - for(j in 1:length(test_data$bn_modamat[[i]])) + #Load data. + load(paste(path, ifelse(acyclic, 'acyclic', 'cyclic'), '_data/test_data_', ifelse(nonoise, 'nonoise_', ''),'yeast', file_ind,'_20n30s10g.rda', sep='')) #test_data. + + #score the true model under Bayesian framework. + if(acyclic) { - tmp_bngraph = empty.graph(colnames(test_data$bn_modamat[[i]][[j]])) - amat(tmp_bngraph) = test_data$bn_modamat[[i]][[j]] - tmp_score = score(tmp_bngraph, as.data.frame(test_data$cdata)) - names(tmp_score) = test_data$bn_step[[i]][[j]] - - bnmod_gnw_bnscore = c(bnmod_gnw_bnscore, tmp_score) + true_ndamat = abs(test_data$true_amat) + true_bngraph = empty.graph(colnames(test_data$true_amat)) + amat(true_bngraph) = true_ndamat #if this step throws error, that means the true graph is not DAG. Has to regenerate the random data using GNW. + true_bnscore = score(true_bngraph, as.data.frame(test_data$cdata)) } - } - bnmod_gnw_bnscore = split(bnmod_gnw_bnscore, names(bnmod_gnw_bnscore)) - bnmod_gnw_bnscore = bnmod_gnw_bnscore[order(as.numeric(names(bnmod_gnw_bnscore)))] - bnmod_gnw_bnscore = do.call(cbind, bnmod_gnw_bnscore) - #Add in true score. - bnmod_gnw_bnscore = cbind('0'=rep(true_bnscore, nrow(bnmod_gnw_bnscore)), bnmod_gnw_bnscore) - rownames(bnmod_gnw_bnscore) = NULL - colnames(bnmod_gnw_bnscore) = paste(seq(0,40,2)) - } - - #score the other bm models under Boolean model framework using GNW data. - bmmod_gnw_bmscore = c() - y_bmscore = c() - za_bmscore = c() - for(i in 1:length(test_data$bm_modmodel)) - { - for(j in 1:length(test_data$bm_modmodel[[i]])) - { - tmp_bmmodel = test_data$bm_modmodel[[i]][[j]] + #score the true model under Boolean model framework. + true_bmmodel = amat_to_bm(test_data$true_amat) overlap_gene = unname(colnames(test_data$cdata)) - tmp_bmmodel@target = overlap_gene - tmp_score = calc_mscore(bmodel=tmp_bmmodel, istate=test_data$istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule, detail=T) + if(single_cell) + { + if(subset_bool) + { + set.seed(1) + sind = sample(1:nrow(test_data$sc_cdata), 50) + fcdata = test_data$sc_cdata[sind, ] + } else + { + fcdata = test_data$sc_cdata + } + } else + { + if(subset_bool) + { + set.seed(1) + sind = sample(1:nrow(test_data$cdata), 50) + fcdata = test_data$cdata[sind, ] + } else + { + fcdata = test_data$cdata + } + } + true_bmscore = calc_mscore(bmodel=true_bmmodel, istate=test_data$istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule) + + #score the other bn models under Bayesian framework using GNW data. + bnmod_gnw_bnscore = c() + if(acyclic) + { + for(i in 1:length(test_data$bn_modamat)) + { + for(j in 1:length(test_data$bn_modamat[[i]])) + { + tmp_bngraph = empty.graph(colnames(test_data$bn_modamat[[i]][[j]])) + amat(tmp_bngraph) = test_data$bn_modamat[[i]][[j]] + tmp_score = score(tmp_bngraph, as.data.frame(test_data$cdata)) + #tmp_score = score(tmp_bngraph, as.data.frame(bm_cdata)) + names(tmp_score) = test_data$bn_step[[i]][[j]] + + bnmod_gnw_bnscore = c(bnmod_gnw_bnscore, tmp_score) + } + } + bnmod_gnw_bnscore = split(bnmod_gnw_bnscore, names(bnmod_gnw_bnscore)) + bnmod_gnw_bnscore = bnmod_gnw_bnscore[order(as.numeric(names(bnmod_gnw_bnscore)))] + bnmod_gnw_bnscore = do.call(cbind, bnmod_gnw_bnscore) + #Add in true score. + bnmod_gnw_bnscore = cbind('0'=rep(true_bnscore, nrow(bnmod_gnw_bnscore)), bnmod_gnw_bnscore) + rownames(bnmod_gnw_bnscore) = NULL + colnames(bnmod_gnw_bnscore) = paste(seq(0,40,2)) + } - names(tmp_score) = rep(j, length(tmp_score)) + #score the other bm models under Boolean model framework using GNW data. + bmmod_gnw_bmscore = c() + y_bmscore = c() + za_bmscore = c() + zb_bmscore = c() + for(i in 1:length(test_data$bm_modmodel)) + { + for(j in 1:length(test_data$bm_modmodel[[i]])) + { + + tmp_bmmodel = test_data$bm_modmodel[[i]][[j]] + overlap_gene = unname(colnames(test_data$cdata)) + tmp_bmmodel@target = overlap_gene + tmp_score = calc_mscore(bmodel=tmp_bmmodel, istate=test_data$istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule, detail=T) + + names(tmp_score) = rep(j, length(tmp_score)) + + #c('f', 'y', 'zb') + bmmod_gnw_bmscore = c(bmmod_gnw_bmscore, tmp_score[1]) + y_bmscore = c(y_bmscore, tmp_score[2]) + za_bmscore = c(za_bmscore, tmp_score[3]) + zb_bmscore = c(zb_bmscore, tmp_score[4]) + } + } + bmmod_gnw_bmscore = split(bmmod_gnw_bmscore, names(bmmod_gnw_bmscore)) + bmmod_gnw_bmscore = bmmod_gnw_bmscore[order(as.numeric(names(bmmod_gnw_bmscore)))] + bmmod_gnw_bmscore = do.call(cbind, bmmod_gnw_bmscore) + #Add in true score. + bmmod_gnw_bmscore = cbind('0'=rep(true_bmscore, nrow(bmmod_gnw_bmscore)), bmmod_gnw_bmscore) + rownames(bmmod_gnw_bmscore) = NULL + colnames(bmmod_gnw_bmscore) = paste(seq(0,40,2)) - bmmod_gnw_bmscore = c(bmmod_gnw_bmscore, tmp_score[1]) + y_bmscore = split(y_bmscore, names(y_bmscore)) + y_bmscore = y_bmscore[order(as.numeric(names(y_bmscore)))] + y_bmscore = do.call(cbind, y_bmscore) + rownames(y_bmscore) = NULL + colnames(y_bmscore) = paste(seq(2,40,2)) + + za_bmscore = split(za_bmscore, names(za_bmscore)) + za_bmscore = za_bmscore[order(as.numeric(names(za_bmscore)))] + za_bmscore = do.call(cbind, za_bmscore) + rownames(za_bmscore) = NULL + colnames(za_bmscore) = paste(seq(2,40,2)) + + zb_bmscore = split(zb_bmscore, names(zb_bmscore)) + zb_bmscore = zb_bmscore[order(as.numeric(names(zb_bmscore)))] + zb_bmscore = do.call(cbind, zb_bmscore) + rownames(zb_bmscore) = NULL + colnames(zb_bmscore) = paste(seq(2,40,2)) + + save(bnmod_gnw_bnscore, bmmod_gnw_bmscore, y_bmscore, za_bmscore, zb_bmscore, file=paste(ifelse(acyclic, 'acyclic', 'cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_yeast', file_ind, '_scores.rda', sep='')) + #save(bnmod_gnw_bnscore, bmmod_gnw_bmscore, y_bmscore, zb_bmscore, file=paste(ifelse(acyclic, 'acyclic', 'cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_yeast', file_ind, '_scores.rda', sep='')) } - } - bmmod_gnw_bmscore = split(bmmod_gnw_bmscore, names(bmmod_gnw_bmscore)) - bmmod_gnw_bmscore = bmmod_gnw_bmscore[order(as.numeric(names(bmmod_gnw_bmscore)))] - bmmod_gnw_bmscore = do.call(cbind, bmmod_gnw_bmscore) - #Add in true score. - bmmod_gnw_bmscore = cbind('0'=rep(true_bmscore, nrow(bmmod_gnw_bmscore)), bmmod_gnw_bmscore) - rownames(bmmod_gnw_bmscore) = NULL - colnames(bmmod_gnw_bmscore) = paste(seq(0,40,2)) - - save(bnmod_gnw_bnscore, bmmod_gnw_bmscore, file=paste(ifelse(acyclic, 'acyclic', 'cyclic'), '_yeast', file_ind, '_scores.rda', sep='')) -} - -################################################################################################################################## - -#Setup the true networks. -true_model_list = list() -for(file_ind in 1:5) -{ - #Load data. - load(paste(path, ifelse(acyclic, 'acyclic', 'cyclic'), '_data/test_data_', ifelse(nonoise, 'nonoise_', ''),'yeast', file_ind,'_20n30s10g.rda', sep='')) #test_data. - - true_bmmodel = amat_to_bm(test_data$true_amat) - true_model_list = c(true_model_list, list(true_bmmodel)) -} - -#Make plots of true models. -for(i in 1:length(true_model_list)) -{ - png(paste(i, ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(nonoise, '_nonoise', ''), '_true_network.png', sep=''), width=2000, height=2000, res=300) - plotBM(true_model_list[[i]]) - title(paste('True ', ifelse(acyclic, 'acyclic', 'cyclic'), ' network', i), cex.main=3) - dev.off() -} - -#Setup the scores for modified models. -bnscore_list = list() -bmscore_list = list() -for(file_ind in 1:5) -{ - load(paste(ifelse(acyclic, 'acyclic', 'cyclic'), '_yeast', file_ind, '_scores.rda', sep='')) #bnmod_gnw_bnscore, bmmod_gnw_bmscore - - bnscore_list = c(bnscore_list, list(bnmod_gnw_bnscore)) - bmscore_list = c(bmscore_list, list(bmmod_gnw_bmscore)) -} - -bnscore_df = melt(bnscore_list) -bnscore_df = bnscore_df[,-1] -bnscore_df[,3] = paste('Network', bnscore_df[,3]) -bmscore_df = melt(bmscore_list) -bmscore_df = bmscore_df[,-1] -bmscore_df[,3] = paste('Network', bmscore_df[,3]) -colnames(bnscore_df) = c('steps', 'score', 'network') -colnames(bmscore_df) = c('steps', 'score', 'network') - -bnscore_mid_df = melt(lapply(bnscore_list, function(x) colMeans(x))) -bnscore_mid_df[,2] = paste('Network', bnscore_mid_df[,2]) -bnscore_mid_df = cbind(rep(seq(0,40,2), 5), bnscore_mid_df) - -bnscore_se = melt(lapply(bnscore_list, function(x) apply(x, 2, function(x) sd(x)/sqrt(length(x))))) -bnscore_se = bnscore_se[,1,drop=F] -bnscore_mid_df = cbind(bnscore_mid_df, bnscore_mid_df[,2]-bnscore_se[,1], bnscore_mid_df[,2]+bnscore_se[,1]) -colnames(bnscore_mid_df) = c('steps', 'score', 'network', 'low', 'high') - -bmscore_mid_df = melt(lapply(bmscore_list, function(x) colMeans(x))) -bmscore_mid_df[,2] = paste('Network', bmscore_mid_df[,2]) -bmscore_mid_df = cbind(rep(seq(0,40,2), 5), bmscore_mid_df) + + ################################################################################################################################## + + #Setup the true networks. + true_model_list = list() + for(file_ind in 1:5) + { + #Load data. + load(paste(path, ifelse(acyclic, 'acyclic', 'cyclic'), '_data/test_data_', ifelse(nonoise, 'nonoise_', ''),'yeast', file_ind,'_20n30s10g.rda', sep='')) #test_data. + + true_bmmodel = amat_to_bm(test_data$true_amat) + true_model_list = c(true_model_list, list(true_bmmodel)) + } + + #Make plots of true models. + for(i in 1:length(true_model_list)) + { + png(paste(i, ifelse(acyclic, '_acyclic', '_cyclic'), '_true_network.png', sep=''), width=2000, height=2000, res=300) + plotBM(true_model_list[[i]]) + title(paste('True ', ifelse(acyclic, 'acyclic', 'cyclic'), ' network', i), cex.main=3) + dev.off() + } + + #Setup the scores for modified models. + bnscore_list = list() + bmscore_list = list() + for(file_ind in 1:5) + { + load(paste(ifelse(acyclic, 'acyclic', 'cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_yeast', file_ind, '_scores.rda', sep='')) #bnmod_gnw_bnscore, bmmod_gnw_bmscore -bmscore_se = melt(lapply(bmscore_list, function(x) apply(x, 2, function(x) sd(x)/sqrt(length(x))))) -bmscore_se = bmscore_se[,1,drop=F] -bmscore_mid_df = cbind(bmscore_mid_df, bmscore_mid_df[,2]-bmscore_se[,1], bmscore_mid_df[,2]+bmscore_se[,1]) -colnames(bmscore_mid_df) = c('steps', 'score', 'network', 'low', 'high') + bnscore_list = c(bnscore_list, list(bnmod_gnw_bnscore)) + bmscore_list = c(bmscore_list, list(bmmod_gnw_bmscore)) + } + + if(acyclic) + { + bnscore_df = melt(bnscore_list) + bnscore_df = bnscore_df[,-1] + bnscore_df[,3] = paste('Network', bnscore_df[,3]) + colnames(bnscore_df) = c('steps', 'score', 'network') + + bnscore_mid_df = melt(lapply(bnscore_list, function(x) colMeans(x))) + bnscore_mid_df[,2] = paste('Network', bnscore_mid_df[,2]) + bnscore_mid_df = cbind(rep(seq(0,40,2), 5), bnscore_mid_df) + + bnscore_se = melt(lapply(bnscore_list, function(x) apply(x, 2, function(x) sd(x)/sqrt(length(x))))) + bnscore_se = bnscore_se[,1,drop=F] + bnscore_mid_df = cbind(bnscore_mid_df, bnscore_mid_df[,2]-bnscore_se[,1], bnscore_mid_df[,2]+bnscore_se[,1]) + colnames(bnscore_mid_df) = c('steps', 'score', 'network', 'low', 'high') + } -#Make plot objects of scoring functions. -if(acyclic) -{ - p1_bn_mid = ggplot(bnscore_mid_df, aes(x=bnscore_mid_df[, 'steps'], y=bnscore_mid_df[,'score'])) + - geom_errorbar(aes(ymin=bnscore_mid_df[, 'low'], ymax=bnscore_mid_df[, 'high'])) + - geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BIC scoring function') + - facet_wrap(~network, scales='free_y', ncol=1) + - theme(text = element_text(size=20)) - - p2_bm_mid = ggplot(bmscore_mid_df, aes(x=bmscore_mid_df[, 'steps'], y=bmscore_mid_df[,'score'])) + - geom_errorbar(aes(ymin=bmscore_mid_df[, 'low'], ymax=bmscore_mid_df[, 'high'])) + - geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + - facet_wrap(~network, scales='free_y', ncol=1) + - theme(text = element_text(size=20)) - - png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(nonoise, '_nonoise', ''), '_mean_compare_score.png', sep=''), width=3000, height=5000, res=300) - grid.arrange(p1_bn_mid, p2_bm_mid, ncol=2) - dev.off() -} else -{ - p2_bm_mid = ggplot(bmscore_mid_df, aes(x=bmscore_mid_df[, 'steps'], y=bmscore_mid_df[,'score'])) + - geom_errorbar(aes(ymin=bmscore_mid_df[, 'low'], ymax=bmscore_mid_df[, 'high'])) + - geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + - facet_wrap(~network, scales='free_y', ncol=1) + - theme(text = element_text(size=20)) - - png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(nonoise, '_nonoise', ''), '_mean_compare_score.png', sep=''), width=3000, height=5000, res=300) - grid.arrange(p2_bm_mid, ncol=2) - dev.off() + bmscore_df = melt(bmscore_list) + bmscore_df = bmscore_df[,-1] + bmscore_df[,3] = paste('Network', bmscore_df[,3]) + colnames(bmscore_df) = c('steps', 'score', 'network') + + bmscore_mid_df = melt(lapply(bmscore_list, function(x) colMeans(x))) + bmscore_mid_df[,2] = paste('Network', bmscore_mid_df[,2]) + bmscore_mid_df = cbind(rep(seq(0,40,2), 5), bmscore_mid_df) + + bmscore_se = melt(lapply(bmscore_list, function(x) apply(x, 2, function(x) sd(x)/sqrt(length(x))))) + bmscore_se = bmscore_se[,1,drop=F] + bmscore_mid_df = cbind(bmscore_mid_df, bmscore_mid_df[,2]-bmscore_se[,1], bmscore_mid_df[,2]+bmscore_se[,1]) + colnames(bmscore_mid_df) = c('steps', 'score', 'network', 'low', 'high') + + #Make plot objects of scoring functions. + if(acyclic) + { + p1_bn_box = ggplot(bnscore_df, aes(x=factor(bnscore_df[,'steps']), y=bnscore_df[,'score'])) + + geom_boxplot() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BIC scoring function') + + scale_x_discrete(labels=unique(bnscore_df[,'steps'])) + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20), axis.text.x = element_text(size=10)) + + p2_bm_box = ggplot(bmscore_df, aes(x=factor(bmscore_df[,'steps']), y=bmscore_df[,'score'])) + + geom_boxplot() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + + scale_x_discrete(labels=unique(bmscore_df[,'steps'])) + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20), axis.text.x = element_text(size=10)) + + png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_boxplot_compare_score.png', sep=''), width=3000, height=5000, res=300) + grid.arrange(p1_bn_box, p2_bm_box, ncol=2) + dev.off() + + p1_bn_mid = ggplot(bnscore_mid_df, aes(x=bnscore_mid_df[, 'steps'], y=bnscore_mid_df[,'score'])) + + geom_errorbar(aes(ymin=bnscore_mid_df[, 'low'], ymax=bnscore_mid_df[, 'high'])) + + geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BIC scoring function') + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20)) + + p2_bm_mid = ggplot(bmscore_mid_df, aes(x=bmscore_mid_df[, 'steps'], y=bmscore_mid_df[,'score'])) + + geom_errorbar(aes(ymin=bmscore_mid_df[, 'low'], ymax=bmscore_mid_df[, 'high'])) + + geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20)) + + png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_mean_compare_score.png', sep=''), width=3000, height=5000, res=300) + grid.arrange(p1_bn_mid, p2_bm_mid, ncol=2) + dev.off() + } else + { + p2_bm_box = ggplot(bmscore_df, aes(x=factor(bmscore_df[,'steps']), y=bmscore_df[,'score'])) + + geom_boxplot() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + + scale_x_discrete(labels=unique(bmscore_df[,'steps'])) + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20), axis.text.x = element_text(size=10)) + + png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_boxplot_compare_score.png', sep=''), width=3000, height=5000, res=300) + grid.arrange(p2_bm_box, ncol=2) + dev.off() + + p2_bm_mid = ggplot(bmscore_mid_df, aes(x=bmscore_mid_df[, 'steps'], y=bmscore_mid_df[,'score'])) + + geom_errorbar(aes(ymin=bmscore_mid_df[, 'low'], ymax=bmscore_mid_df[, 'high'])) + + geom_line() + xlab('Number of different edges') + ylab('Scores') + ggtitle('BSS scoring function') + + facet_wrap(~network, scales='free_y', ncol=1) + + theme(text = element_text(size=20)) + + png(paste('boolbaye', ifelse(acyclic, '_acyclic', '_cyclic'), ifelse(single_cell, '_sc', '_nonsc'), '_mean_compare_score.png', sep=''), width=3000, height=5000, res=300) + grid.arrange(p2_bm_mid, ncol=2) + dev.off() + } + } } - diff --git a/gnw_btr_model_inference.R b/gnw_btr_model_inference.R old mode 100644 new mode 100755 index 2363e2f..c7095e7 --- a/gnw_btr_model_inference.R +++ b/gnw_btr_model_inference.R @@ -2,21 +2,24 @@ library(BTR) library(doParallel) -path='~/bool_final/' +path='~/btr_output/' setwd(path) inter_bool = T max_varperrule = 6 acyclic = T -initial_model = T +initial_model = F +single_cell = F #Setting up for parallel processing. -registerDoParallel(cores=4) #this automatically calls mclapply() when no cl is given. +num_cores = 4 +registerDoParallel(cores=num_cores) #this automatically calls mclapply() when no cl is given. #Setting random seed for reproducibility. set.seed(1) test_result = c() +#file_ind = 1 for(file_ind in 1:5) { #Load data. @@ -27,11 +30,23 @@ for(file_ind in 1:5) if(initial_model) { ind_i = sample(seq(1,length(test_data$bm_modmodel)), 1) - ind_j = 10 - result = model_train(cdata=test_data$cdata, ddata=test_data$ddata, bmodel=test_data$bm_modmodel[[ind_i]][[ind_j]], istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) #Very time consuming + ind_j = 5 + if(single_cell) + { + result = model_train(cdata=test_data$sc_cdata, bmodel=test_data$bm_modmodel[[ind_i]][[ind_j]], istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) + } else + { + result = model_train(cdata=test_data$cdata, bmodel=test_data$bm_modmodel[[ind_i]][[ind_j]], istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) + } } else { - result = model_train(cdata=test_data$cdata, ddata=test_data$ddata, istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) #Very time consuming + if(single_cell) + { + result = model_train(cdata=test_data$sc_cdata, istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) + } else + { + result = model_train(cdata=test_data$cdata, istate=test_data$istate, and_bool=inter_bool, verbose=T, self_loop=F) + } } adj_mat = abs(bm_to_amat(result)) end_time = proc.time() @@ -48,7 +63,7 @@ for(file_ind in 1:5) } #Open file connection for writing output. -file_name = paste('result_btr-', ifelse(initial_model, 'wi', 'wo'), '_train_', ifelse(acyclic, 'acyclic', 'cyclic'), '_yeast.rda', sep='') +file_name = paste('result_btr-', ifelse(initial_model, 'wi', 'wo'), '_train_', ifelse(acyclic, 'acyclic_', 'cyclic_'), ifelse(single_cell, 'sc', 'nonsc'), '_yeast1.rda', sep='') save(test_result, file=file_name) #Cleaning up. diff --git a/gnw_comparison_network_inference.R b/gnw_comparison_network_inference.R index 0c6a4af..6b9aaa5 100644 --- a/gnw_comparison_network_inference.R +++ b/gnw_comparison_network_inference.R @@ -6,159 +6,162 @@ library(reshape2) library(Hmisc) #for capitalisation. #(1) Load results and combine them. -inter_bool = T -max_varperrule = 6 -acyclic = 'cyclic' #acyclic, cyclic, both - -# sect_1 ------------------------------------------------------------------ - -path='~/res1/' -setwd(path) - -if(acyclic=='acyclic') -{ - files = list.files(path=path, pattern='^result.+_train_acyclic_yeast[.]rda', full.names = T) -} else if(acyclic=='cyclic') -{ - files = list.files(path=path, pattern='^result.+_train_cyclic_yeast[.]rda', full.names = T) -} else if(acyclic=='both') -{ - files = list.files(path=path, pattern='^result.+_train_[a-z]+_yeast[.]rda', full.names = T) -} - -# sect_2 ------------------------------------------------------------------ - -all_true_adj_mat = list() -all_adj_mat = list() -all_perf_score = c() -all_time_taken = list() -for(i in 1:length(files)) +for(acyclic in c('acyclic', 'cyclic', 'both')) { - if(acyclic=='acyclic') + for(single_cell in c(T, F)) { - tmp_name = gsub('.+result_(.+)_train_acyclic_yeast[.]rda', '\\1', files[i]) #extract name of method. - } else if(acyclic=='cyclic') - { - tmp_name = gsub('.+result_(.+)_train_cyclic_yeast[.]rda', '\\1', files[i]) #extract name of method. - } else if(acyclic=='both') - { - tmp_name = gsub('.+result_(.+)_train_.+_yeast[.]rda', '\\1', files[i]) #extract name of method. - } - - load(files[i]) #test_result. "true_adj_mat", "adj_mat", "perf_score", "time_taken" - - algo_true_adj_mat = list() - algo_adj_mat = list() - algo_perf_score = c() - algo_time_taken = list() - for(j in 1:length(test_result)) - { - algo_true_adj_mat = c(algo_true_adj_mat, list(test_result[[j]]$true_adj_mat)) - algo_adj_mat = c(algo_adj_mat, list(test_result[[j]]$adj_mat)) - algo_perf_score = c(algo_perf_score, list(test_result[[j]]$perf_score)) - algo_time_taken = c(algo_time_taken, list(as.vector(test_result[[j]]$time_taken))) - } - algo_perf_score = do.call(rbind, algo_perf_score) - algo_time_taken = do.call(rbind, algo_time_taken) - - if(tmp_name %in% names(all_true_adj_mat)) #only happen in acyclic=both. - { - all_true_adj_mat[[tmp_name]] = c(all_true_adj_mat[[tmp_name]], algo_true_adj_mat) - all_adj_mat[[tmp_name]] = c(all_adj_mat[[tmp_name]], algo_adj_mat) - all_perf_score[[tmp_name]] = rbind(all_perf_score[[tmp_name]], algo_perf_score) - all_time_taken[[tmp_name]] = rbind(all_time_taken[[tmp_name]], algo_time_taken) - } else - { - all_true_adj_mat = c(all_true_adj_mat, setNames(list(algo_true_adj_mat), tmp_name)) - all_adj_mat = c(all_adj_mat, setNames(list(algo_adj_mat), tmp_name)) - all_perf_score = c(all_perf_score, setNames(list(algo_perf_score), tmp_name)) - all_time_taken = c(all_time_taken, setNames(list(algo_time_taken), tmp_name)) + # sect_1 ------------------------------------------------------------------ + + path='~/btr_output' + setwd(path) + + if(acyclic=='acyclic') + { + files = list.files(path=path, pattern=sprintf('^result.+_train_acyclic_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), full.names = T) + } else if(acyclic=='cyclic') + { + files = list.files(path=path, pattern=sprintf('^result.+_train_cyclic_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), full.names = T) + } else if(acyclic=='both') + { + files = list.files(path=path, pattern=sprintf('^result.+_train_[a-z]+_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), full.names = T) + } + + # sect_2 ------------------------------------------------------------------ + + all_true_adj_mat = list() + all_adj_mat = list() + all_perf_score = c() + all_time_taken = list() + for(i in 1:length(files)) + { + if(acyclic=='acyclic') + { + tmp_name = gsub(sprintf('.+result_(.+)_train_acyclic_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), '\\1', files[i]) #extract name of method. + } else if(acyclic=='cyclic') + { + tmp_name = gsub(sprintf('.+result_(.+)_train_cyclic_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), '\\1', files[i]) #extract name of method. + } else if(acyclic=='both') + { + tmp_name = gsub(sprintf('.+result_(.+)_train_.+_%s_yeast[.]rda', ifelse(single_cell, 'sc', 'nonsc')), '\\1', files[i]) #extract name of method. + } + + load(files[i]) #test_result. "true_adj_mat", "adj_mat", "perf_score", "time_taken" + + algo_true_adj_mat = list() + algo_adj_mat = list() + algo_perf_score = c() + algo_time_taken = list() + for(j in 1:length(test_result)) + { + algo_true_adj_mat = c(algo_true_adj_mat, list(test_result[[j]]$true_adj_mat)) + algo_adj_mat = c(algo_adj_mat, list(test_result[[j]]$adj_mat)) + algo_perf_score = c(algo_perf_score, list(test_result[[j]]$perf_score)) + algo_time_taken = c(algo_time_taken, list(as.vector(test_result[[j]]$time_taken))) + } + algo_perf_score = do.call(rbind, algo_perf_score) + algo_time_taken = do.call(rbind, algo_time_taken) + + if(tmp_name %in% names(all_true_adj_mat)) #only happen in acyclic=both. + { + all_true_adj_mat[[tmp_name]] = c(all_true_adj_mat[[tmp_name]], algo_true_adj_mat) + all_adj_mat[[tmp_name]] = c(all_adj_mat[[tmp_name]], algo_adj_mat) + all_perf_score[[tmp_name]] = rbind(all_perf_score[[tmp_name]], algo_perf_score) + all_time_taken[[tmp_name]] = rbind(all_time_taken[[tmp_name]], algo_time_taken) + } else + { + all_true_adj_mat = c(all_true_adj_mat, setNames(list(algo_true_adj_mat), tmp_name)) + all_adj_mat = c(all_adj_mat, setNames(list(algo_adj_mat), tmp_name)) + all_perf_score = c(all_perf_score, setNames(list(algo_perf_score), tmp_name)) + all_time_taken = c(all_time_taken, setNames(list(algo_time_taken), tmp_name)) + } + } + + # sect_3 ------------------------------------------------------------------ + + #Prepare data for plotting. + method_name = toupper(as.vector(sapply(names(all_perf_score), function(x) rep(x, nrow(all_perf_score[[1]]))))) + network_name = rep(paste('net_', seq(1,nrow(all_perf_score[[1]])), sep=''), length(all_perf_score)) + comb_df = do.call(rbind, all_perf_score) + + stopifnot(nrow(comb_df)==length(method_name)) + + comb_df = data.frame(method_name, network_name, comb_df, stringsAsFactors=F) + comb_df[is.na(comb_df)] = 0 + colnames(comb_df) = c(colnames(comb_df)[1:2], toupper(colnames(comb_df)[3:10])) + colnames(comb_df)[10] = 'F-score' + + #Setting up consistent colour. + plot_col = rainbow_hcl(length(unique(comb_df[,'method_name'])), alpha=0.8) + names(plot_col) = unique(comb_df[,'method_name']) + + plot_df = melt(comb_df) + + #1st plot. + comb_p1 = ggplot(plot_df, aes(x=plot_df[,'method_name'], y=plot_df[,"value"])) + + geom_boxplot() + geom_jitter() + + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + + xlab('Inference algorithms') + ylab('') + ggtitle(Hmisc:::capitalize(acyclic)) + + facet_wrap(~variable, scales='free_y', ncol=2) + + png(paste('all_algorithms_', acyclic, ifelse(single_cell, '_sc', '_nonsc'), '_score.png', sep=''), width=3000, height=4000, res=300) + plot(comb_p1) + dev.off() + + #2nd plot + tmp_list = list() + for(i in unique(method_name)) + { + tmp_row = c(i, colMeans(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i)), + apply(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i), 2, function(x) sd(x)/sqrt(length(x)))) + #tmp_row = c(i, apply(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i), 2, sum)) + names(tmp_row) = c('method_name', 'p', 'np', 'r', 'a', 's', 'plr', 'nlr', 'f', 'p_se', 'np_se', 'r_se', 'a_se', 's_se', 'plr_se', 'nlr_se', 'f_se') + tmp_list = c(tmp_list, list(tmp_row)) + } + combsum_df = do.call(rbind, tmp_list) + tmp_rownames = combsum_df[,1] + combsum_df = combsum_df[,-1] + combsum_df = apply(combsum_df, 2, as.numeric) + combsum_df = data.frame(combsum_df) + combsum_df = cbind(method_name=tmp_rownames, combsum_df) + + plot_mid_df = melt(combsum_df) + plot_val_df = subset(plot_mid_df, !grepl('[a-z]_se', plot_mid_df[,2])) + plot_se_df = subset(plot_mid_df, grepl('[a-z]_se', plot_mid_df[,2])) + plot_val_df = cbind(plot_val_df, plot_val_df[,3]-plot_se_df[,3], plot_val_df[,3]+plot_se_df[,3]) + colnames(plot_val_df) = c('method_name', 'variable', 'value', 'low', 'high') + + combmid_p1 = ggplot(plot_val_df, aes(x=plot_val_df[,'method_name'], y=plot_val_df[,'value'], fill=method_name)) + + geom_bar(stat="identity") + + geom_errorbar(aes(ymin=plot_val_df[,"low"], ymax=plot_val_df[,"high"]), width=.5) + + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + guides(fill=FALSE) + + scale_fill_manual("Legend", values = plot_col) + + xlab('Inference algorithms') + ylab('') + ggtitle(Hmisc:::capitalize(acyclic)) + + facet_wrap(~variable, scales='free_y', ncol=2) + + png(paste('all_algorithms_', acyclic, ifelse(single_cell, '_sc', '_nonsc'), '_meanscore.png', sep=''), width=3000, height=4000, res=300) + plot(combmid_p1) + dev.off() + + #Individual plots for F-scores + comb_p8 = ggplot(comb_df, aes(x=reorder(comb_df[,'method_name'], -comb_df[,"F-score"], FUN=median), y=comb_df[,"F-score"])) + + geom_boxplot() + geom_jitter() + + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + + xlab('Inference algorithms') + ylab('F-score') + ggtitle(Hmisc:::capitalize(acyclic)) + + png(paste('all_algorithms_', acyclic, ifelse(single_cell, '_sc', '_nonsc'), '_fscore.png', sep=''), width=2000, height=2000, res=300) + plot(comb_p8) + dev.off() + + combmid_p8 = ggplot(combsum_df, aes(x=reorder(combsum_df[,'method_name'], -combsum_df[,"f"]), y=f, fill=method_name)) + + geom_bar(stat="identity") + + geom_errorbar(aes(ymin=combsum_df[,"f"] - combsum_df[,'f_se'], ymax=combsum_df[,"f"] + combsum_df[,'f_se']), width=.5) + + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + guides(fill=FALSE) + + scale_fill_manual("Legend", values = plot_col) + + xlab('Inference algorithms') + ylab('F-score') + ggtitle(Hmisc:::capitalize(acyclic)) + + png(paste('all_algorithms_', acyclic, ifelse(single_cell, '_sc', '_nonsc'), '_fmeanscore.png', sep=''), width=2000, height=2000, res=300) + plot(combmid_p8) + dev.off() } } -# sect_3 ------------------------------------------------------------------ - -#Prepare data for plotting. -method_name = toupper(as.vector(sapply(names(all_perf_score), function(x) rep(x, nrow(all_perf_score[[1]]))))) -network_name = rep(paste('net_', seq(1,nrow(all_perf_score[[1]])), sep=''), length(all_perf_score)) -comb_df = do.call(rbind, all_perf_score) - -stopifnot(nrow(comb_df)==length(method_name)) - -comb_df = data.frame(method_name, network_name, comb_df, stringsAsFactors=F) -comb_df[is.na(comb_df)] = 0 -colnames(comb_df) = c(colnames(comb_df)[1:2], toupper(colnames(comb_df)[3:10])) -colnames(comb_df)[10] = 'F-score' - -#Setting up consistent colour. -plot_col = rainbow_hcl(length(unique(comb_df[,'method_name'])), alpha=0.8) -names(plot_col) = unique(comb_df[,'method_name']) - -plot_df = melt(comb_df) - -#1st plot. -comb_p1 = ggplot(plot_df, aes(x=plot_df[,'method_name'], y=plot_df[,"value"])) + - geom_boxplot() + geom_jitter() + - theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + - xlab('Inference algorithms') + ylab('') + ggtitle(capitalize(acyclic)) + - facet_wrap(~variable, scales='free_y', ncol=2) - -png(paste('all_algorithms_', acyclic, '_score.png', sep=''), width=3000, height=4000, res=300) -plot(comb_p1) -dev.off() - -#2nd plot -tmp_list = list() -for(i in unique(method_name)) -{ - tmp_row = c(i, colMeans(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i)), - apply(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i), 2, function(x) sd(x)/sqrt(length(x)))) - #tmp_row = c(i, apply(subset(comb_df[,c(-1,-2)], comb_df[,'method_name']==i), 2, sum)) - names(tmp_row) = c('method_name', 'p', 'np', 'r', 'a', 's', 'plr', 'nlr', 'f', 'p_se', 'np_se', 'r_se', 'a_se', 's_se', 'plr_se', 'nlr_se', 'f_se') - tmp_list = c(tmp_list, list(tmp_row)) -} -combsum_df = do.call(rbind, tmp_list) -tmp_rownames = combsum_df[,1] -combsum_df = combsum_df[,-1] -combsum_df = apply(combsum_df, 2, as.numeric) -combsum_df = data.frame(combsum_df) -combsum_df = cbind(method_name=tmp_rownames, combsum_df) - -plot_mid_df = melt(combsum_df) -plot_val_df = subset(plot_mid_df, !grepl('[a-z]_se', plot_mid_df[,2])) -plot_se_df = subset(plot_mid_df, grepl('[a-z]_se', plot_mid_df[,2])) -plot_val_df = cbind(plot_val_df, plot_val_df[,3]-plot_se_df[,3], plot_val_df[,3]+plot_se_df[,3]) -colnames(plot_val_df) = c('method_name', 'variable', 'value', 'low', 'high') - -combmid_p1 = ggplot(plot_val_df, aes(x=plot_val_df[,'method_name'], y=plot_val_df[,'value'], fill=method_name)) + - geom_bar(stat="identity") + - geom_errorbar(aes(ymin=plot_val_df[,"low"], ymax=plot_val_df[,"high"]), width=.5) + - theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + guides(fill=FALSE) + - scale_fill_manual("Legend", values = plot_col) + - xlab('Inference algorithms') + ylab('') + ggtitle(capitalize(acyclic)) + - facet_wrap(~variable, scales='free_y', ncol=2) - -png(paste('all_algorithms_', acyclic, '_meanscore.png', sep=''), width=3000, height=4000, res=300) -plot(combmid_p1) -dev.off() - -#Individual plots for F-scores -comb_p8 = ggplot(comb_df, aes(x=reorder(comb_df[,'method_name'], -comb_df[,"F-score"], FUN=median), y=comb_df[,"F-score"])) + - geom_boxplot() + geom_jitter() + - theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + - xlab('Inference algorithms') + ylab('F-score') + ggtitle(capitalize(acyclic)) - -png(paste('all_algorithms_', acyclic, '_fscore.png', sep=''), width=2000, height=2000, res=300) -plot(comb_p8) -dev.off() - -combmid_p8 = ggplot(combsum_df, aes(x=reorder(combsum_df[,'method_name'], -combsum_df[,"f"]), y=f, fill=method_name)) + - geom_bar(stat="identity") + - geom_errorbar(aes(ymin=combsum_df[,"f"] - combsum_df[,'f_se'], ymax=combsum_df[,"f"] + combsum_df[,'f_se']), width=.5) + - theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=20)) + guides(fill=FALSE) + - scale_fill_manual("Legend", values = plot_col) + - xlab('Inference algorithms') + ylab('F-score') + ggtitle(capitalize(acyclic)) - -png(paste('all_algorithms_', acyclic, '_fmeanscore.png', sep=''), width=2000, height=2000, res=300) -plot(combmid_p8) -dev.off() diff --git a/krum_wilson_grow_hsc.R b/krum_wilson_grow_hsc.R index c17dd93..747004b 100644 --- a/krum_wilson_grow_hsc.R +++ b/krum_wilson_grow_hsc.R @@ -2,7 +2,7 @@ library(BTR) library(doParallel) -path='~/bool_final/' +path='~/btr_output/' setwd(path) #Setting up for parallel processing. @@ -20,12 +20,9 @@ bmodel = initialise_model(krum_bmodel) istate = initialise_data(krum_istate) #(3)Filtering expression data. -tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') #do not filter data at this stage. keep the whole matrix. -cdata = tmp_data[[1]] #continuous data -ddata = tmp_data[[2]] #discretised data +cdata = initialise_raw_data(wilson_raw_data, max_expr = 'low') #do not filter data at this stage. keep the whole matrix. cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) fcdata = cdata[cell_ind,] #select only relevant cells. -fddata = ddata[cell_ind,] #(4)Adding extra genes into the model. extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) @@ -38,7 +35,7 @@ tmp_istate = matrix(round(tmp_istate), nrow=1, dimnames=list(1, names(tmp_istate grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) -result = model_train(cdata=fcdata, ddata=fddata, bmodel=grown_bmodel, istate=grown_istate, verbose=T) +result = model_train(cdata=fcdata, bmodel=grown_bmodel, istate=grown_istate, verbose=T) filename = 'krum_wilson_myeloid_grow_trained_bmodel.rda' save(result, file=filename)