diff --git a/DESCRIPTION b/DESCRIPTION index aaa4b0b..368900a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,13 @@ Package: BoolTraineR Type: Package Title: Tools For Training and Analysing Asynchronous Boolean Models -Version: 1.0.1 +Version: 1.1.1 Date: 2015-10-22 Author: Chee Yee Lim -Maintainer: Chee Yee Lim -Description: This package contains tools for Boolean model manipulation, as well as the search for the best Boolean model. -Depends: +Maintainer: Chee Yee Lim +Description: This package contains tools for inferring asynchronous Boolean + models from single-cell expression data. +Depends: R (>= 3.0.3), methods Imports: @@ -15,8 +16,13 @@ Imports: foreach (>= 1.4.1), doParallel (>= 1.0.8), poweRlaw (>= 0.30.0), - MASS (>= 7.3-44), - diptest (>= 0.75-7) + diptest (>= 0.75-7), + igraph (>= 1.0.1) LinkingTo: Rcpp License: GPL-3 LazyData: true +Suggests: + knitr, + rmarkdown +VignetteBuilder: knitr +RoxygenNote: 5.0.1 diff --git a/NAMESPACE b/NAMESPACE index eb35b6c..e2ec62b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,16 +2,13 @@ export(BoolModel) export(amat_to_bm) -export(bin_to_real) export(bm_to_amat) export(bm_to_df) export(calc_mscore) export(calc_roc) -export(check_bmodel) export(compress_bmodel) export(decompress_bmodel) export(df_to_bm) -export(equi_model) export(gen_randata) export(gen_randata_bn) export(gen_two_rmodel) @@ -21,14 +18,15 @@ export(grow_bmodel) export(initialise_data) export(initialise_model) export(initialise_raw_data) -export(model_consensus) +export(minmod_model) export(model_dist) export(model_setdiff) export(model_train) -export(outcyto_model) -export(outcyto_stategraph) +export(model_train_sa) export(outgenysis_model) -export(param_bimodal) +export(outgraph_model) +export(outstate_graph) +export(plotBM) export(printBM) export(simulate_model) export(unique_raw_data) diff --git a/R/RcppExports.R b/R/RcppExports.R index ee594f4..ae6d217 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,49 +1,6 @@ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#' @title Find a match between two data frames. -#' -#' @description -#' (&&&Not for public use&&&)This function finds a match between two df of states. Used in match_state(). Return an row index vector indicating which row of mstate matches the rows in xstate. -#' -#' @param mstate data frame. It should be a state(row) x gene(column) df. -#' @param xstate data frame. It should be a state(row) x gene(column) df. -match_state_loop <- function(mstate, xstate) { - .Call('BoolTraineR_match_state_loop', PACKAGE = 'BoolTraineR', mstate, xstate) -} - -#' @title Calculating pairwise scores between model and data states. -#' -#' @description -#' This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -#' -#' @param x_df matrix. It should be numerical matrix of model states. -#' @param y_df matrix. It should be numerical matrix of data states. -rcpp_man_dist <- function(x_df, y_df) { - .Call('BoolTraineR_rcpp_man_dist', PACKAGE = 'BoolTraineR', x_df, y_df) -} - -#' @title Calculating Hamming pairwise scores between model and data states. -#' -#' @description -#' This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -#' -#' @param x_df matrix. It should be logical matrix of model states. -#' @param y_df matrix. It should be logical matrix of data states. -rcpp_ham_dist <- function(x_df, y_df) { - .Call('BoolTraineR_rcpp_ham_dist', PACKAGE = 'BoolTraineR', x_df, y_df) -} - -#' @title Inner core for m_score() -#' -#' @description -#' This function takes in a df with columns ranked wrt each row, and try to assign each row to a unique column without repetition. -#' -#' @param x_df matrix. Matrix with columns ranked wrt each row. -rcpp_m_score <- function(x_df) { - .Call('BoolTraineR_rcpp_m_score', PACKAGE = 'BoolTraineR', x_df) -} - #' @title Calculating validation scores between two adjacency matrices #' #' @description diff --git a/R/compression.R b/R/compression.R index 29a03d3..68b5fc0 100644 --- a/R/compression.R +++ b/R/compression.R @@ -5,39 +5,53 @@ #' This function limits the number of possible variables in the model to 999. #' #' @param bmodel S4 BoolModel object. -#' @param inter_bool logical. Indicate whether to consider AND terms. #' #' @export -get_encodings = function(bmodel, inter_bool) +get_encodings = function(bmodel) { + and_bool = check_and(bmodel) + #Get all possible terms. svar = bmodel@target_var - if(inter_bool) + if(and_bool) { dvar = sapply(combn(svar, 2, simplify=F), function(x) paste(x, collapse='&')) #get all possible interacting pairs. dvar = c(dvar, sapply(combn(svar, 2, simplify=F), function(x) paste(rev(x), collapse='&'))) #get the reversed pattern as well. + + term_pool = c(svar, dvar) + term_pool = c('0', term_pool, '!0', paste('!', term_pool, sep='')) #add in inh terms. + + #Generate index for activation terms. + num_pool = seq(1, length(svar)+1) #get numbers for svar. + num_pool = c(num_pool, as.vector(replicate(2, seq(max(num_pool)+1, max(num_pool)+(length(dvar)/2))))) #get numbers for both forward and reverse dvar. + + #Generate index for inhibition terms. + num_pool = c(num_pool, seq(max(num_pool)+1, max(num_pool)+length(svar)+1)) #get numbers for svar. + num_pool = c(num_pool, as.vector(replicate(2, seq(max(num_pool)+1, max(num_pool)+(length(dvar)/2))))) #get numbers for both forward and reverse dvar. + + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==1, paste('0', x, sep=''), x))) #convert single digit to double digit. + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==2, paste('0', x, sep=''), x))) #convert double digit to triple digit. + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==3, paste('0', x, sep=''), x))) #convert triple digit to quadruple digit. + + names(num_pool) = term_pool } else { - dvar = c() + term_pool = svar + term_pool = c('0', term_pool, '!0', paste('!', term_pool, sep='')) #add in inh terms. + + #Generate index for activation terms. + num_pool = seq(1, length(svar)+1) #get numbers for svar. + + #Generate index for inhibition terms. + num_pool = c(num_pool, seq(max(num_pool)+1, max(num_pool)+length(svar)+1)) #get numbers for svar. + + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==1, paste('0', x, sep=''), x))) #convert single digit to double digit. + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==2, paste('0', x, sep=''), x))) #convert double digit to triple digit. + num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==3, paste('0', x, sep=''), x))) #convert triple digit to quadruple digit. + + names(num_pool) = term_pool } - term_pool = c(svar, dvar) - term_pool = c('0', term_pool, '!0', paste('!', term_pool, sep='')) #add in inh terms. - - #Generate index for activation terms. - num_pool = seq(1, length(svar)+1) #get numbers for svar. - num_pool = c(num_pool, as.vector(replicate(2, seq(max(num_pool)+1, max(num_pool)+(length(dvar)/2))))) #get numbers for both forward and reverse dvar. - - #Generate index for inhibition terms. - num_pool = c(num_pool, seq(max(num_pool)+1, max(num_pool)+length(svar)+1)) #get numbers for svar. - num_pool = c(num_pool, as.vector(replicate(2, seq(max(num_pool)+1, max(num_pool)+(length(dvar)/2))))) #get numbers for both forward and reverse dvar. - - num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==1, paste('0', x, sep=''), x))) #convert single digit to double digit. - num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==2, paste('0', x, sep=''), x))) #convert double digit to triple digit. - num_pool = unname(sapply(num_pool, function(x) ifelse(nchar(x)==3, paste('0', x, sep=''), x))) #convert triple digit to quadruple digit. - - names(num_pool) = term_pool - stopifnot(all(!is.na(names(num_pool)))) stopifnot(all(!is.na(term_pool))) diff --git a/R/data_desc.R b/R/data_desc.R index ec8744d..2fbbb5c 100644 --- a/R/data_desc.R +++ b/R/data_desc.R @@ -31,24 +31,6 @@ NULL #' @usage data(bon_istate) NULL -#' @title Initial state from Moignard et al. -#' -#' @description -#' An intial state obtained from data in Moignard et al, determined by taking colMeans over unique rows, and rounding the means to 0-1. -#' Values for genes that are missing in Moignard et al, but are present in Bonzanni et al, are determined by taking values from the original initial state supplied in Bonzanni et al. -#' It contains a set of Boolean values for 20 genes. -#' -#' @format -#' A data frame with 1 row and 20 columns. -#' -#' Rows: each row consists of 1 set of Boolean state. -#' Columns: each column is for 1 gene/variable. -#' -#' @docType data -#' @name bon_moig_istate -#' @usage data(bon_moig_istate) -NULL - #' @title Myeloid Boolean Model from Krumsiek et al. #' #' @description @@ -82,47 +64,47 @@ NULL #' @usage data(krum_istate) NULL -#' @title Raw single cell qRT-PCR expression data from Moignard et al. +#' @title Raw single cell qRT-PCR expression data from Wilson et al. #' #' @description #' A raw single cell expression data obtained from multiple cell types. #' #' @format -#' A data frame with 597 rows and 18 columns. +#' A data frame with 1626 rows and 44 columns. #' #' Rows: each row consists of raw expression values from 1 cell. #' Columns: each column is for 1 gene/variable. #' #' @docType data -#' @name moig_raw_data -#' @usage data(moig_raw_data) +#' @name wilson_raw_data +#' @usage data(wilson_raw_data) NULL -#' @title Discretised single cell qRT-PCR expression data from Moignard et al. +#' @title Raw single cell RNAseq expression data from Wilson et al. #' #' @description -#' A discretised single cell expression data obtained from multiple cell types. +#' A raw single cell expression data obtained from multiple cell types. #' #' @format -#' A data frame with 597 rows and 18 columns. +#' A data frame with 96 rows and 38498 columns. #' -#' Rows: each row consists of discretised expression values from 1 cell. +#' Rows: each row consists of raw expression values from 1 cell. #' Columns: each column is for 1 gene/variable. #' #' @docType data -#' @name moig_data -#' @usage data(moig_data) +#' @name wilson_raw_rnaseq +#' @usage data(wilson_raw_rnaseq) NULL -#' @title Estimated parameters from Wilson et al. data +#' @title Example Boolean Models used in the vignette #' #' @description -#' A list of parameters (based on log normal distribution) estimated from Wilson et al. single-cell qPCR expression data. +#' 3 Boolean models used in the examples of the vignette. #' #' @format -#' A list with 4 numeric vectors, all_mu1, all_mu2, all_sig1, all_sig2. Note that each element in the vector is estimated from a single gene. +#' Each Boolean model is a BoolModel object. #' #' @docType data -#' @name real_param -#' @usage data(real_param) +#' @name example_models +#' @usage data(example_models) NULL \ No newline at end of file diff --git a/R/general.R b/R/general.R index d1a3cf4..3569cfd 100644 --- a/R/general.R +++ b/R/general.R @@ -15,6 +15,27 @@ vcat = function(string, bool) invisible() } +#' @title Check if containing AND terms +#' +#' @description +#' This function checks if a particular Boolean model contains AND terms. +#' +#' @param bmodel BoolModel object. +check_and = function(bmodel) +{ + and_bool = F + + if(any(grepl('&', bmodel@rule_act))) + { + and_bool = T + } else if(any(grepl('&', bmodel@rule_inh))) + { + and_bool = T + } + + return(and_bool) +} + #' @title Extract Boolean terms #' #' @description @@ -99,26 +120,6 @@ match_term = function(t1, t2, mode='logic') } } -#' @title Check for matching states -#' -#' @description -#' This function finds a match between two df of states. Returns a row index vector indicating for each row of mstate, what is the corresponding row in xstate. If a match cannot be found, a 0 will be return. -#' Only columns that are present in both df will be used in comparison. Note that the row index starts from 1 (as in R), not from 0 (as in cpp). -#' -#' @param mstate data frame. It should be a state(row) x gene(column) df. colnames will be used in comparison. -#' @param xstate data frame. It should be a state(row) x gene(column) df. colnames will be used in comparison. -match_state = function(mstate, xstate) -{ - #Filtering the columns in mstate and xstate. - same_col = intersect(colnames(mstate), colnames(xstate)) - fmstate = mstate[, same_col] - fxstate = xstate[, same_col] - - ind = match_state_loop(as.matrix(fmstate), as.matrix(fxstate)) - - return(ind) #zeroes are present due to mismatching in cpp code. -} - #' @title Pick a random minimum value #' #' @description @@ -167,191 +168,6 @@ filter_dflist = function(x, y, uniq_bool=T) } } -#' @title Check if the Boolean model violates constraints. -#' -#' @description -#' This function checks if the Boolean model violates contraints. Return logical value. -#' (1) Each gene rule should not have more terms than max_varperrule. -#' (2) The same term should not occur twice in the same rule. -#' -#' @param bmodel S4 BoolModel object. -#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' -#' @export -check_bmodel = function(bmodel, max_varperrule) -{ - check_bool = F - - act_rule = lapply(bmodel@rule_act, function(x) unlist(strsplit(x, '&'))) - inh_rule = lapply(bmodel@rule_inh, function(x) unlist(strsplit(x, '&'))) - - #(1) Check 1 : The same term should not occur twice in the same rule. - check_bool = check_bool | any(sapply(act_rule, function(x) any(duplicated(x)))) - check_bool = check_bool | any(sapply(inh_rule, function(x) any(duplicated(x)))) - - #(2) Check 2 : Each gene rule should not have more terms than max_varperrule. - act_rule = sapply(act_rule, function(x) unique(x)) - act_rule = sapply(act_rule, function(x) x[!(x %in% '0')]) #remove zeroes. - - inh_rule = sapply(inh_rule, function(x) unique(x)) - inh_rule = sapply(inh_rule, function(x) x[!(x %in% '0')]) #remove zeroes. - - check_bool = check_bool | any(sapply(act_rule, function(x) length(x)>max_varperrule)) - check_bool = check_bool | any(sapply(inh_rule, function(x) length(x)>max_varperrule)) - - if(check_bool) - { - return(FALSE) - } else - { - return(TRUE) - } -} - -#' @title Obtain parameters for bimodal distribution from real data -#' -#' @description -#' This function obtains parameters for bimodal distribution. Returns 4 parameters: mu1, mu2, sig1, sig2. -#' -#' @param x matrix. Input expression data. Col-genes, row-samples. -#' @param data_type character. Specify data types: qpcr, rnaseq. -#' -#' @export -param_bimodal = function(x, data_type='qpcr') -{ - require(MASS) - - #(1) Initialise data. - tmp = initialise_raw_data(x, data_type) - x_con = tmp[[1]] #continuous - x_bin = tmp[[2]] #discrete - - #rescale the data to remove zeroes. - x_con = x_con + 0.0001 - - all_mu1 = c() - all_mu2 = c() - all_sig1 = c() - all_sig2 = c() - for(i in 1:ncol(x_bin)) - { - #(2) Extract the parameters for the two modal distributions. - #First modal - low expression, 0s. Second modal - high expression, 1s. - x_lowmode = x_con[x_bin[,i]!=1, i] - x_highmode = x_con[x_bin[,i]==1, i] - - #(3) Estimate parameters - param1 = MASS::fitdistr(x_lowmode, 'lognormal') - param2 = MASS::fitdistr(x_highmode, 'lognormal') - - #For checking. - #hist(rlnorm(1000, param1$estimate[1], param1$estimate[2])) - #hist(rlnorm(1000, param2$estimate[1], param2$estimate[2])) - - all_mu1 = c(all_mu1, param1$estimate[1]) - all_mu2 = c(all_mu2, param2$estimate[1]) - all_sig1 = c(all_sig1, param1$estimate[2]) - all_sig2 = c(all_sig2, param2$estimate[2]) - } - - return(list(all_mu1=all_mu1, all_mu2=all_mu2, all_sig1=all_sig1, all_sig2=all_sig2)) -} - - -#' @title Generate random real numbers from binary values -#' -#' @description -#' This function generates random real numbers from binary values, with supplied parameters. Returns a vector of real values. -#' -#' @param x logical or 0/1 numeric matrix. Col-genes, row-samples. -#' @param param list of parameters given by param_bimodal(). -#' -#' @export -bin_to_real = function(x, param) -{ - require(MASS) - - #(1) Convert logical to numeric. - x = x + 0 - - #(2) Estimate the distribution for the parameters. - mu1_dist = MASS::fitdistr(-param$all_mu1, 'lognormal') - mu2_dist = MASS::fitdistr(-param$all_mu2, 'lognormal') - sig1_dist = MASS::fitdistr(param$all_sig1, 'lognormal') - sig2_dist = MASS::fitdistr(param$all_sig2, 'lognormal') - - y = matrix(NA, ncol=ncol(x), nrow=nrow(x)) - for(i in 1:ncol(x)) - { - #(3) Generating random values from the distribution. - mu1_est = -rlnorm(1, mu1_dist$estimate[1], mu1_dist$estimate[2]) - mu2_est = -rlnorm(1, mu2_dist$estimate[1], mu2_dist$estimate[2]) - sig1_est = rlnorm(1, sig1_dist$estimate[1], sig1_dist$estimate[2]) - sig2_est = rlnorm(1, sig2_dist$estimate[1], sig2_dist$estimate[2]) - - #(4) For each gene, generate random expression values using the obtained random parameters. - for(j in 1:nrow(x)) - { - if(x[j,i]==0) - { - y[j,i] = rlnorm(1, mu1_est, sig1_est) - - if(y[j,i] > 1) - { - y[j,i] = 1 - } else if(y[j,i] < 0) - { - stop('Error in generating continuous values.') - } - } else - { - y[j,i] = rlnorm(1, mu2_est, sig2_est) - - if(y[j,i] > 1) - { - y[j,i] = 1 - } else if(y[j,i] < 0) - { - stop('Error in generating continuous values.') - } - } - } - } - - return(y) -} - -#' @title Check for equivalent models -#' -#' @description -#' This function checks if the two models have the same rules. Return a logical value. Only TRUE if each rule for each gene is the same. -#' -#' @param bmodel1 S4 BoolModel object. -#' @param bmodel2 S4 BoolModel object. -#' @param inter_bool logical. Indicate whether to consider AND terms. -#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' -#' @export -equi_model = function(bmodel1, bmodel2, inter_bool, max_varperrule) -{ - stopifnot(length(bmodel1@target)==length(bmodel2@target)) - stopifnot(get_encodings(bmodel1, inter_bool)==get_encodings(bmodel2, inter_bool)) - - ind = get_encodings(bmodel1, inter_bool) - - dist = model_dist(bmodel1, bmodel2, inter_bool, max_varperrule) - - if(dist==0) - { - match = T - } else - { - match = F - } - - return(match) -} - #' @title Calculate distance between Boolean models #' #' @description @@ -359,13 +175,12 @@ equi_model = function(bmodel1, bmodel2, inter_bool, max_varperrule) #' #' @param x S4 BoolModel object. Test model. #' @param y S4 BoolModel object. Reference model. -#' @param inter_bool logical. Indicate whether to consider AND terms. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. #' #' @export -model_dist = function(x, y, inter_bool, max_varperrule) +model_dist = function(x, y, max_varperrule) { - set_diff = unlist(model_setdiff(x, y, inter_bool, max_varperrule)) + set_diff = unlist(model_setdiff(x, y, max_varperrule)) #Calculate total dist. t_dist = length(set_diff) @@ -381,17 +196,16 @@ model_dist = function(x, y, inter_bool, max_varperrule) #' #' @param x S4 BoolModel object. Test model. #' @param y S4 BoolModel object. Reference model. -#' @param inter_bool logical. Indicate whether to consider AND terms. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. #' @param directed logical. If TRUE, return the difference in terms with respect to x. #' #' @export -model_setdiff = function(x, y, inter_bool, max_varperrule, directed=F) +model_setdiff = function(x, y, max_varperrule, directed=F) { stopifnot(length(x@target) == length(y@target)) - stopifnot(get_encodings(x, inter_bool)==get_encodings(y, inter_bool)) + stopifnot(get_encodings(x)==get_encodings(y)) - ind = get_encodings(x, inter_bool) + ind = get_encodings(x) x1 = compress_bmodel(x, ind, max_varperrule) x2 = compress_bmodel(y, ind, max_varperrule) diff --git a/R/methods.R b/R/methods.R index 0fb4c9a..5a132a1 100644 --- a/R/methods.R +++ b/R/methods.R @@ -62,6 +62,46 @@ writeBM = function(bmodel, file, gene.names=F, rownames=F) write.csv(out_df, file=file, quote=F, row.names=F) } +#' @title Plot Boolean Model +#' +#' @description +#' This method plots the network underlying Boolean models by using igraph for quick visualisation. +#' +#' @param bmodel S4 BoolModel object. +#' @param makePlot logical. Whether to make plot or just return the object. Default to T. +#' @param ... Additional parameters to plot.igraph. +#' +#' @export +plotBM = function(bmodel, makePlot=T, ...) +{ + require(igraph) + + #Convert to amat. + am = bm_to_amat(bmodel) + + #Convert into a graph. + g = graph.adjacency(am, mode='directed', weighted=T) + + #Setup edge colour for plotting. + #Activation = black, inhibition = red + E(g)$color = sapply(E(g)$weight, function(x) ifelse(x==1, 'black', 'red')) + + #Setup other colours. + V(g)$frame.color = "white" + V(g)$color = rgb(255, 165, 0, 200, maxColorValue = 255) + + #Setup vertex font size. + V(g)$label.cex = 1.5 + + if(makePlot) + { + #Make the plot. + plot(g, layout=layout_in_circle, ...) + } + + invisible(g) +} + #' @title Convert BoolModel into adjacency matrix #' #' @description diff --git a/R/model_modification.R b/R/model_modification.R index 974320d..0199d3b 100644 --- a/R/model_modification.R +++ b/R/model_modification.R @@ -5,7 +5,9 @@ #' #' @param bm S4 BoolModel object. #' @param index integer. Specifying rule of which gene to modify. If NULL, modifies all rules in the model. Defaults to NULL. -#' @param overlap_gene character vector. Specify which genes are present in both model and data inputs. Only needed when index=NULL. +#' @param overlap_gene character vector. Specify which genes are present in both model and data inputs. +#' +#' @export minmod_model = function(bm, index=NULL, overlap_gene=NULL) { if(is.null(index)) @@ -46,6 +48,8 @@ minmod_model = function(bm, index=NULL, overlap_gene=NULL) #' @param index integer. Specifying rule of which gene to modify. minmod_internal = function(bm, index) { + and_bool = check_and(bm) + arule = bm@rule_act[[index]] irule = bm@rule_inh[[index]] @@ -75,6 +79,7 @@ minmod_internal = function(bm, index) } } } + dellist_arule[sapply(dellist_arule, length)==0] = list('0') #if there is any empty term at the end, add in '0' #Deletion of inh rule. dellist_irule = list() @@ -102,38 +107,45 @@ minmod_internal = function(bm, index) } } } + dellist_irule[sapply(dellist_irule, length)==0] = list('0') #if there is any empty term at the end, add in '0' #Addition of act rule. (single) pos_actterm = bm@target_var[!bm@target_var %in% unlist(strsplit(c(arule, irule), '&'))] - addlist_arule = list() - if(arule[1] == '0') + if(length(pos_actterm) != 0) { - for(i in 1:length(pos_actterm)) + addlist_arule = list() + if(arule[1] == '0') { - addlist_arule = c(addlist_arule, list(pos_actterm[i])) - } - } else - { - for(i in 1:length(pos_actterm)) + for(i in 1:length(pos_actterm)) + { + addlist_arule = c(addlist_arule, list(pos_actterm[i])) + } + } else { - addlist_arule = c(addlist_arule, list(c(arule, pos_actterm[i]))) + for(i in 1:length(pos_actterm)) + { + addlist_arule = c(addlist_arule, list(c(arule, pos_actterm[i]))) + } } - } - - #Addition of act rule. (double) - if(arule[1] != '0') - { - for(i in 1:length(pos_actterm)) + + if(and_bool) { - for(j in 1:length(arule)) + #Addition of act rule. (double) + if(arule[1] != '0') { - if(grepl('&', arule[j])) - { - next - } else + for(i in 1:length(pos_actterm)) { - tmp = sprintf('%s&%s', pos_actterm[i], arule[j]) - addlist_arule = c(addlist_arule, list(c(arule[-j], tmp))) + for(j in 1:length(arule)) + { + if(grepl('&', arule[j])) + { + next + } else + { + tmp = sprintf('%s&%s', pos_actterm[i], arule[j]) + addlist_arule = c(addlist_arule, list(c(arule[-j], tmp))) + } + } } } } @@ -141,35 +153,42 @@ minmod_internal = function(bm, index) #Addition of inh rule. (single) pos_inhterm = bm@target_var[!bm@target_var %in% unlist(strsplit(c(arule, irule), '&'))] - addlist_irule = list() - if(irule[1] == '0') + #pos_inhterm = pos_inhterm[!is.na(pos_inhterm)] + if(length(pos_inhterm)!=0) { - for(i in 1:length(pos_inhterm)) + addlist_irule = list() + if(irule[1] == '0') { - addlist_irule = c(addlist_irule, list(pos_inhterm[i])) - } - } else - { - for(i in 1:length(pos_inhterm)) + for(i in 1:length(pos_inhterm)) + { + addlist_irule = c(addlist_irule, list(pos_inhterm[i])) + } + } else { - addlist_irule = c(addlist_irule, list(c(irule, pos_inhterm[i]))) + for(i in 1:length(pos_inhterm)) + { + addlist_irule = c(addlist_irule, list(c(irule, pos_inhterm[i]))) + } } - } - - #Addition of inh rule. (double) - if(irule[1] != '0') - { - for(i in 1:length(pos_inhterm)) + + if(and_bool) { - for(j in 1:length(irule)) + #Addition of inh rule. (double) + if(irule[1] != '0') { - if(grepl('&', irule[j])) - { - next - } else + for(i in 1:length(pos_inhterm)) { - tmp = sprintf('%s&%s', pos_inhterm[i], irule[j]) - addlist_irule = c(addlist_irule, list(c(irule[-j], tmp))) + for(j in 1:length(irule)) + { + if(grepl('&', irule[j])) + { + next + } else + { + tmp = sprintf('%s&%s', pos_inhterm[i], irule[j]) + addlist_irule = c(addlist_irule, list(c(irule[-j], tmp))) + } + } } } } @@ -226,14 +245,19 @@ minmod_internal = function(bm, index) #' @title Add extra genes to a Boolean model #' #' @description -#' This function adds extra genes to a Boolean model. Input model must be in data frame format, output model will be BoolModel object. +#' This function adds extra genes to a Boolean model. Return a list of BoolModel object and an initial state. #' -#' @param in_model data frame with 2 columns, which are targets and factors #' @param in_gene character vector. Genes to be added into the model. +#' @param in_model data frame or BoolModel object. If it is a data frame, it must have 2 columns, which are targets and update functions. #' #' @export -grow_bmodel = function(in_model, in_gene) +grow_bmodel = function(in_gene, in_model) { + if(class(in_model)=='BoolModel') + { + in_model = bm_to_df(in_model) + } + #Generate a new data frame to be added into the model df. empty_func = '(0) &! (0)' in_row = data.frame(in_gene, empty_func) diff --git a/R/output_format.R b/R/output_format.R index a3b871e..af4fa57 100644 --- a/R/output_format.R +++ b/R/output_format.R @@ -1,13 +1,15 @@ -#' @title Output a Boolean Model into Cytoscape readable format +#' @title Output a Boolean Model into Cytoscape & Gephi readable format #' #' @description -#' This function outputs a Boolean Model in a format that is readable by Cytoscape. Return invisibly the edges (with edge attributes) and node attributes. (i.e. list of 2 dfs) +#' This function outputs a Boolean Model in a format that is readable by Cytoscape and Gephi. Return invisibly the edges (with edge attributes) and node attributes. (i.e. list of 2 dfs) #' #' @param bmodel S4 BoolModel object. -#' @param filepath character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output. +#' @param path character. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output. +#' @param file character. Specify file name. Default to NULL for default file names. +#' @param and_node logical. Specify AND as an individual node. Default to T. #' #' @export -outcyto_model = function(bmodel, filepath=getwd()) +outgraph_model = function(bmodel, path=getwd(), file=NULL, and_node=T) { edge_vec = character() #setup output vector. @@ -38,50 +40,64 @@ outcyto_model = function(bmodel, filepath=getwd()) s_irule[sapply(s_irule, function(x) length(x)==0)] = '0' d_irule[sapply(d_irule, function(x) length(x)==0)] = '0' - #Start writing rules out for single terms. - for(i in 1:length(bmodel@target)) - { - edge_vec = c(edge_vec, paste(s_arule[[i]], 'activates', bmodel@target[i], sep=',')) - edge_vec = c(edge_vec, paste(s_irule[[i]], 'inhibits', bmodel@target[i], sep=',')) - } - - #Write rules out for double terms. - tmp_and = paste('and', seq(1, sum(grepl('&', unlist(d_arule)))+sum(grepl('&', unlist(d_irule)))), sep='_') - ind = 1 - for(i in 1:length(bmodel@target)) + if(and_node) { - #Write arule. - if(d_arule[[i]][1]!='0') + #Start writing rules out for single terms. + for(i in 1:length(bmodel@target)) { - tmp_rule = strsplit(d_arule[[i]], '&') - - for(j in 1:length(tmp_rule)) + edge_vec = c(edge_vec, paste(s_arule[[i]], 'activates', bmodel@target[i], sep=',')) + edge_vec = c(edge_vec, paste(s_irule[[i]], 'inhibits', bmodel@target[i], sep=',')) + } + + #Write rules out for double terms. + tmp_and = paste('and', seq(1, sum(grepl('&', unlist(d_arule)))+sum(grepl('&', unlist(d_irule)))), sep='_') + ind = 1 + for(i in 1:length(bmodel@target)) + { + #Write arule. + if(d_arule[[i]][1]!='0') { - #Join coproteins with ANDs. - edge_vec = c(edge_vec, paste(tmp_rule[[j]], 'activates', tmp_and[ind], sep=',')) + tmp_rule = strsplit(d_arule[[i]], '&') - #Join ANDs with target genes. - edge_vec = c(edge_vec, paste(tmp_and[ind], 'activates', bmodel@target[i], sep=',')) + for(j in 1:length(tmp_rule)) + { + #Join coproteins with ANDs. + edge_vec = c(edge_vec, paste(tmp_rule[[j]], 'activates', tmp_and[ind], sep=',')) + + #Join ANDs with target genes. + edge_vec = c(edge_vec, paste(tmp_and[ind], 'activates', bmodel@target[i], sep=',')) + + ind = ind + 1 + } + } + + #Write irule. + if(d_irule[[i]][1]!='0') + { + tmp_rule = strsplit(d_irule[[i]], '&') - ind = ind + 1 + for(j in 1:length(tmp_rule)) + { + #Join coproteins with ANDs. + edge_vec = c(edge_vec, paste(tmp_rule[[j]], 'activates', tmp_and[ind], sep=',')) #note that this should be activates. + + #Join ANDs with target genes. + edge_vec = c(edge_vec, paste(tmp_and[ind], 'inhibits', bmodel@target[i], sep=',')) + + ind = ind + 1 + } } } - - #Write irule. - if(d_irule[[i]][1]!='0') + } else + { + #Start writing rules out for both single and double terms. + for(i in 1:length(bmodel@target)) { - tmp_rule = strsplit(d_irule[[i]], '&') + edge_vec = c(edge_vec, paste(s_arule[[i]], 'activates', bmodel@target[i], sep=',')) + edge_vec = c(edge_vec, paste(s_irule[[i]], 'inhibits', bmodel@target[i], sep=',')) - for(j in 1:length(tmp_rule)) - { - #Join coproteins with ANDs. - edge_vec = c(edge_vec, paste(tmp_rule[[j]], 'activates', tmp_and[ind], sep=',')) #note that this should be activates. - - #Join ANDs with target genes. - edge_vec = c(edge_vec, paste(tmp_and[ind], 'inhibits', bmodel@target[i], sep=',')) - - ind = ind + 1 - } + edge_vec = c(edge_vec, paste(d_arule[[i]], 'activates', bmodel@target[i], sep=',')) + edge_vec = c(edge_vec, paste(d_irule[[i]], 'inhibits', bmodel@target[i], sep=',')) } } @@ -91,20 +107,42 @@ outcyto_model = function(bmodel, filepath=getwd()) #Convert the vector into a matrix. edge_df = data.frame(do.call(rbind, strsplit(edge_vec, ','))) - colnames(edge_df) = c('start_node', 'interaction', 'end_node') - - #Generating node attributes. (to distinguish gene nodes from AND nodes) - node_vec = c(bmodel@target, tmp_and) - node_df = data.frame(node_names=node_vec, node_types=ifelse(grepl('and', node_vec), 'ands', 'genes')) + colnames(edge_df) = c('Source', 'Directed', 'Target') + if(and_node) + { + #Generating node attributes. (to distinguish gene nodes from AND nodes) + node_vec = c(bmodel@target, tmp_and) + node_df = data.frame(Id=node_vec, node_types=ifelse(grepl('and', node_vec), 'ands', 'genes')) + } + #Output into files. - if(!is.null(filepath)) + if(!is.null(path)) { - write.csv(edge_df, file=paste(filepath, '/cytoscape_edges.csv', sep=''), quote=F) - write.csv(node_df, file=paste(filepath, '/cytoscape_nodes.csv', sep=''), quote=F) + if(is.null(file)) + { + write.csv(edge_df, file=paste(path, '/edges.csv', sep=''), quote=F, row.names=F) + if(and_node) + { + write.csv(node_df, file=paste(path, '/nodes.csv', sep=''), quote=F, row.names=F) + } + } else + { + write.csv(edge_df, file=paste(path, '/', file, '_edges.csv', sep=''), quote=F, row.names=F) + if(and_node) + { + write.csv(node_df, file=paste(path, '/', file, '_nodes.csv', sep=''), quote=F, row.names=F) + } + } } - invisible(list(edge_df, node_df)) + if(and_node) + { + invisible(list(edge_df, node_df)) + } else + { + invisible(list(edge_df)) + } } #' @title Output a Boolean Model into Genysis readable format @@ -113,10 +151,11 @@ outcyto_model = function(bmodel, filepath=getwd()) #' This function outputs a Boolean Model in a format that is readable by Genysis. Return invisibly the formatted vector. #' #' @param bmodel S4 BoolModel object. -#' @param filepath character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output. -#' +#' @param path character. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output. +#' @param file character. Specify file name. Default to NULL for default file names. +#' #' @export -outgenysis_model = function(bmodel, filepath=getwd()) +outgenysis_model = function(bmodel, path=getwd(), file=NULL) { gene = bmodel@target arule = bmodel@rule_act @@ -150,10 +189,15 @@ outgenysis_model = function(bmodel, filepath=getwd()) } #Output into files. - if(!is.null(filepath)) + if(!is.null(path)) { - exist_files = list.files(path=filepath, pattern='^genysis_input_[0-9]+.txt') - filename = sprintf('genysis_input_%s.txt', length(exist_files)+1) + if(is.null(file)) + { + filename = sprintf('genysis_input.txt') + } else + { + filename = sprintf('%s_genysis_input.txt', file) + } fcon = file(filename, open='w') writeLines(out_vec, fcon) @@ -163,10 +207,10 @@ outgenysis_model = function(bmodel, filepath=getwd()) invisible(out_vec) } -#' @title Generate state transition graph readable by Cytoscapes +#' @title Generate state transition graph #' #' @description -#' This function generates a state transition graph using a Boolean model and its state space. Each node represent a state. All nodes in this graph is linked by an edge only if the 2 states have different value in only 1 gene. The output is readable by Cytoscape. +#' This function generates a state transition graph using a Boolean model and its state space. Each node represent a state. All nodes in this graph is linked by an edge only if the 2 states have different value in only 1 gene. The output is readable by Cytoscape and Gephi. #' #' @param mstate data frame. It should be a state(row) x gene(column) df. #' @param bmodel S4 BoolModel object. @@ -175,7 +219,7 @@ outgenysis_model = function(bmodel, filepath=getwd()) #' @param filepath character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output. #' #' @export -outcyto_stategraph = function(mstate, bmodel, directed=F, record.both=F, filepath=getwd()) +outstate_graph = function(mstate, bmodel, directed=F, record.both=F, filepath=getwd()) { cat(sprintf('Generating state transition network...\n')) @@ -215,8 +259,8 @@ outcyto_stategraph = function(mstate, bmodel, directed=F, record.both=F, filepat } cat('.\n') - net_all = data.frame(start_node=character(1), interaction=character(1), - end_node=character(1), stringsAsFactors=F) #Both 1 and stringsAsFactors are essential, but 1 will give an empty first row. + net_all = data.frame(Source=character(1), Directed=character(1), + Target=character(1), stringsAsFactors=F) #Both 1 and stringsAsFactors are essential, but 1 will give an empty first row. for(i in 1:nrow(mstate)) { if(length(adj_cells[[i]] != 0)) #To exclude cell that does not have partner. @@ -237,8 +281,8 @@ outcyto_stategraph = function(mstate, bmodel, directed=F, record.both=F, filepat if(record.both) { - nonnet_all = data.frame(start_node=character(1), interaction=character(1), - end_node=character(1), stringsAsFactors=F) #Both 1 and stringsAsFactors are essential, but 1 will give an empty first row. + nonnet_all = data.frame(Source=character(1), Directed=character(1), + Target=character(1), stringsAsFactors=F) #Both 1 and stringsAsFactors are essential, but 1 will give an empty first row. for(i in 1:nrow(mstate)) { if(length(nonadj_cells[[i]] != 0)) #To exclude cell that does not have partner. @@ -260,8 +304,8 @@ outcyto_stategraph = function(mstate, bmodel, directed=F, record.both=F, filepat #Output into files. if(!is.null(filepath)) { - write.csv(net_all, paste(filepath, '/cytoscape_statespace_edges.txt', sep=''), quote=F, row.names=F) - write.csv(nonnet_all, paste(filepath, '/cytoscape_notstatespace_edges.txt', sep=''), quote=F, row.names=F) + write.csv(net_all, paste(filepath, '/statespace_edges.txt', sep=''), quote=F, row.names=F) + write.csv(nonnet_all, paste(filepath, '/notstatespace_edges.txt', sep=''), quote=F, row.names=F) } invisible(list(net_all, nonnet_all)) @@ -270,7 +314,7 @@ outcyto_stategraph = function(mstate, bmodel, directed=F, record.both=F, filepat #Output into files. if(!is.null(filepath)) { - write.csv(net_all, paste(filepath, '/cytoscape_statespace_edges.txt', sep=''), quote=F, row.names=F) + write.csv(net_all, paste(filepath, '/statespace_edges.txt', sep=''), quote=F, row.names=F) } invisible(net_all) diff --git a/R/rand_model.R b/R/rand_model.R index 07d7764..db07224 100644 --- a/R/rand_model.R +++ b/R/rand_model.R @@ -7,9 +7,9 @@ #' @param x character vector. A vector of all single terms to be used. #' @param np integer. Number of gene variables in a rule. NOT max_varperrule here. #' @param tar_ind numerical. Indicate which gene is the rule for. Used in preventing self-loop. -#' @param ibool logical. Indicates whether to include AND terms or not. Default to F. +#' @param and_bool logical. Indicates whether to include AND terms or not. #' @param self_loop logical. Indicates whether to allow self_loop. Default to F. -gen_singlerule = function(x, np, tar_ind, ibool=F, self_loop=F) +gen_singlerule = function(x, np, tar_ind, and_bool, self_loop=F) { #Convert x to the form of 'v1s'. if(!(all(grepl('v[0-9]+s', x)))) @@ -54,7 +54,7 @@ gen_singlerule = function(x, np, tar_ind, ibool=F, self_loop=F) rule_act = c() - if(ibool) + if(and_bool) { #Determine which single variables to combine into double variables. ra_ind = as.logical(replicate(rnum_act,round(runif(1)))) @@ -101,7 +101,7 @@ gen_singlerule = function(x, np, tar_ind, ibool=F, self_loop=F) tmp_rinh = sample(x[!(x %in% unlist(strsplit(rule_act, '&')))], rnum_inh) #exclude terms already used in rule_act. } - if(ibool) + if(and_bool) { #Determine which single variables to combine into double variables. ri_ind = as.logical(replicate(rnum_inh, round(runif(1)))) @@ -145,12 +145,12 @@ gen_singlerule = function(x, np, tar_ind, ibool=F, self_loop=F) #' #' @param var character vector. A vector of single genes/variables to be used in the model. #' @param mvar integer. Maximum number of variables in act or inh rule. Default to length(var). -#' @param inter_bool logical. Indicates whether to include AND terms or not. Default to F. +#' @param and_bool logical. Indicates whether to include AND terms or not. #' @param self_loop logical. Indicates whether to allow self_loop. Default to F. #' #' @details #' The number of terms in a function for a gene is modelled by power-law distribution. -gen_one_rmodel = function(var, mvar=length(var), inter_bool=F, self_loop=F) +gen_one_rmodel = function(var, mvar=length(var), and_bool, self_loop=F) { require(poweRlaw) @@ -164,7 +164,7 @@ gen_one_rmodel = function(var, mvar=length(var), inter_bool=F, self_loop=F) num_partner = poweRlaw::rpldis(length(var), xmin=2, alpha=3) #xmin = the minimum value of resulting random integer, alpha = scaling factor of the distribution. According to literature, for gene network, this should be 3. (or 2 mvar] = mvar #power law distribution can gives very high number, therefore must cap it. - arule = sapply(1:length(num_partner), function(x) gen_singlerule(var, num_partner[x], x, inter_bool, self_loop)) + arule = sapply(1:length(num_partner), function(x) gen_singlerule(var, num_partner[x], x, and_bool, self_loop)) arule = apply(arule, 1, c) #arule is a list of lists, with list[[1]] = all act rules, list[[2]] = all inh rules. bmodel = BoolModel(target=var, target_var=paste('v',seq(1,length(var)),'s', sep=''), rule_act=arule[[1]], rule_inh=arule[[2]]) @@ -181,10 +181,10 @@ gen_one_rmodel = function(var, mvar=length(var), inter_bool=F, self_loop=F) #' @param steps integer. Specify the number of steps between the two Boolean models. If steps=0, give completely random starting model. #' @param num_genes integer. Number of genes in the Boolean models. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' @param inter_bool logical. Indicate whether to consider AND terms. +#' @param and_bool logical. Indicate whether to consider AND terms. #' #' @export -gen_randata = function(n, steps, num_genes, max_varperrule, inter_bool) +gen_randata = function(n, steps, num_genes, max_varperrule, and_bool) { var = paste('v', seq(1, num_genes), 's', sep='') @@ -202,7 +202,7 @@ gen_randata = function(n, steps, num_genes, max_varperrule, inter_bool) istate = initialise_data(istate) #Setting the starting and ending models. - bmodel_pair = gen_two_rmodel(var, steps, max_varperrule, inter_bool) + bmodel_pair = gen_two_rmodel(var, steps, max_varperrule, and_bool) bmodel_start = bmodel_pair[[1]] bmodel_end = bmodel_pair[[2]] @@ -237,10 +237,9 @@ gen_randata = function(n, steps, num_genes, max_varperrule, inter_bool) #' @param steps integer. Specify the number of steps between the two Boolean models. If steps=0, give completely random starting model. #' @param num_genes integer. Number of genes in the Boolean models. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' @param inter_bool logical. Indicate whether to consider AND terms. #' #' @export -gen_randata_bn = function(n, steps, num_genes, max_varperrule, inter_bool) +gen_randata_bn = function(n, steps, num_genes, max_varperrule) { if (!requireNamespace("bnlearn", quietly = TRUE)) { stop("Package bnlearn needed for this function to work. Please install it.", @@ -434,7 +433,7 @@ gen_two_rmodel_dag = function(var, steps, mvar=length(var), in_amat=NULL, acycli #' @param var character vector. A vector of single genes/variables to be used in the model. #' @param steps integer. Number of steps apart between the two models. If steps=0, give completely random starting model. #' @param mvar integer. Maximum number of variables in act or inh rule. Default to length(var). -#' @param inter_bool logical. Indicates whether to include AND terms or not. Default to F. +#' @param and_bool logical. Indicates whether to include AND terms or not. Default to F. #' @param in_bmodel BoolModel object. The starting model supplied. #' @param self_loop logical. Indicates whether to allow self_loop. Default to F. #' @@ -442,7 +441,7 @@ gen_two_rmodel_dag = function(var, steps, mvar=length(var), in_amat=NULL, acycli #' The number of terms in a function for a gene is modelled by power-law distribution. #' #' @export -gen_two_rmodel = function(var, steps, mvar=length(var), inter_bool=F, in_bmodel=NULL, self_loop=F) +gen_two_rmodel = function(var, steps, mvar=length(var), and_bool=F, in_bmodel=NULL, self_loop=F) { if(mvar > length(var)) { @@ -451,7 +450,7 @@ gen_two_rmodel = function(var, steps, mvar=length(var), inter_bool=F, in_bmodel= if(is.null(in_bmodel)) { - bmodel1 = gen_one_rmodel(var, mvar, inter_bool, self_loop) + bmodel1 = gen_one_rmodel(var, mvar, and_bool, self_loop) } else { if(class(in_bmodel)!='BoolModel') @@ -465,14 +464,15 @@ gen_two_rmodel = function(var, steps, mvar=length(var), inter_bool=F, in_bmodel= if(steps==0) { - bmodel2 = gen_one_rmodel(var, mvar, inter_bool, self_loop) + bmodel2 = gen_one_rmodel(var, mvar, and_bool, self_loop) } else { cur_model = bmodel1 bmodel2 = bmodel1 - while(length(unlist(model_setdiff(cur_model, bmodel1, inter_bool, mvar))) steps) + # if(length(unlist(model_setdiff(cur_model, bmodel1, mvar))) > steps) # { # browser() # stop('Error in code.') @@ -531,12 +531,12 @@ gen_two_rmodel = function(var, steps, mvar=length(var), inter_bool=F, in_bmodel= next_model = sample(minmod_model(cur_model, rule_dind)$dellist, 1)[[1]] } - stopifnot(length(unlist(model_setdiff(cur_model, next_model, inter_bool, mvar)))==1) + stopifnot(length(unlist(model_setdiff(cur_model, next_model, mvar)))==1) cur_model = next_model } bmodel2 = cur_model - stopifnot(length(unlist(model_setdiff(bmodel2, bmodel1, inter_bool, mvar)))==steps) + stopifnot(length(unlist(model_setdiff(bmodel2, bmodel1, mvar)))==steps) } return(list(bmodel1, bmodel2)) diff --git a/R/score_calculation.R b/R/score_calculation.R index 502293d..5501305 100644 --- a/R/score_calculation.R +++ b/R/score_calculation.R @@ -71,8 +71,7 @@ m_score = function(x, bmodel, max_varperrule, detail=F) calc_mscore = function(bmodel, istate, fcdata, overlap_gene, max_varperrule, detail=F) { #(1) Simulate each of these models. - tmp = simulate_model(bmodel, istate) - mdata = tmp + mdata = simulate_model(bmodel, istate) #(2) Perform gene filtering on model state space. fmdata = filter_dflist(mdata, overlap_gene) @@ -84,13 +83,67 @@ calc_mscore = function(bmodel, istate, fcdata, overlap_gene, max_varperrule, det } #(3) Score each model state wrt to data state. - #return pairwise scores between each model and data states. - score_matrix = rcpp_man_dist(fmdata, fcdata) #The first must be the model state. This returns a matrix of row=data, col=model. - #score_matrix = rcpp_ham_dist(fmdata, fcdata) + score_matrix = man_dist(fcdata, fmdata) #The first must be the data state. This returns a matrix of row=data, col=model. - final_score = m_score(score_matrix, bmodel, max_varperrule, detail=detail) #return a matrix (1x2) of two model scores, penalised score & unpenalised score. the lower the better. + #(4) Calculate score for distance between states + y = mean(apply(score_matrix, 2, min)) #best - return(final_score) + #(5) Calculate penalty term + #(A) To penalise having too low or too high number of model states, when compared to the number of data states. + #Ideally, the number of model states >= the number of data states. + #abs(number of model states - number of data states) + za = abs(ncol(score_matrix) - nrow(score_matrix)) / (nrow(score_matrix) * length(bmodel@target)) #best + + #(B) To penalise having too many variables in the rules. + var_len = list() #combine act and inh rules for each variable. + for(i in 1:length(bmodel@target)) + { + var_len = c(var_len, list(c(bmodel@rule_act[[i]], bmodel@rule_inh[[i]]))) + } + + #calculate number and fraction of each variable. + var_len = sapply(var_len, function(x) x[!grepl('0', x)]) #to remove '0' + num_var = sapply(var_len, function(x) length(strsplit(paste(x, collapse=''), 'v')[[1]])-1 ) #to count the number of v[0-9]s terms. e.g. v1s&v2s will be counted as 2 terms. + num_var[num_var<0] = 0 #if the rule for the variable is completely empty, has to set the negative values to 0. + + frac_var = (num_var-max_varperrule) + zb_ind = num_var > max_varperrule + zb = sum(frac_var[zb_ind]) + + #(6 Calculate final score. + #Specify the constants for each penalty term. + a = 1 + b = 1 + + f = y + a*za + b*zb #f ranges from 0 to infinity. + + if(detail) + { + output = c(f, y, za, zb) + names(output) = c('f', 'y', 'za', 'zb') + } else + { + output = c(f) + names(output) = c('f') + } + + return(output) +} + +#' @title Calculates pairwise Manhattan distances between two matrices +#' +#' @description +#' This function calculates pairwise Manhattan distances between two matrices. +#' +#' @param x matrix +#' @param y matrix +man_dist = function(x, y) +{ + z = matrix(0, nrow = nrow(x), ncol = nrow(y)) + for (k in 1:nrow(y)) { + z[, k] <- colSums(abs(t(x) - y[k, ])) + } + return(z) } #' @title Calculate true positive, true negative, false positive and false negative diff --git a/R/search.R b/R/search.R index f296717..4812212 100644 --- a/R/search.R +++ b/R/search.R @@ -1,57 +1,230 @@ -#' @title Training Model +#' @title Training Model (using simulated annealing) #' #' @description #' This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. #' Note that if a model is supplied, and the genes in the model is different from the genes in the data, only the genes overlapping between model and data will be retained for further analysis. #' -#' @param bmodel Boolean model in data frame. If NULL, use a random Boolean model. Default to NULL. -#' @param edata list of 2 data frames. Initialised continuous and discretised expression data. Each data frame should have state(row) x gene(column). -#' @param istate data frame. Must have only 1 row, which represents 1 initial state. -#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' @param tol numeric. Specify the tolerance in ending condition. Default to 1e-6. It cannot be lower than .Machine$double.eps ^ 0.5. -#' @param inter_bool logical. Indicate whether to consider AND terms. Default to TRUE. -#' @param verbose logical. Specifies whether to give detailed output. Default to F. -#' @param self_loop logical. Indicates whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F. +#' @param edata data frame of expression data. Should have state(row) x gene(column). +#' @param bmodel Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL. +#' @param istate data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL. +#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6. +#' @param and_bool logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE. +#' @param self_loop logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F. +#' @param restart integer. Number of restart from the best solution. Defaults to 0. +#' @param verbose logical. Whether to give detailed output to the screen. Defaults to F. #' #' @export -model_train = function(bmodel=NULL, edata, istate, max_varperrule=6, tol=1e-6, inter_bool, verbose=F, self_loop=F) +model_train_sa = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bool=T, self_loop=F, restart=0, verbose=F) { + ##################Implement restart########################## + vcat('Preparing data for analysis.\n', verbose) - if(class(edata)!='list' | length(edata)!=2) + #Initialise expression data. + tmp_data = initialise_raw_data(edata) #returns a list of two data frames. + cdata = initialise_data(tmp_data[[1]]) #continuous data + ddata = initialise_data(tmp_data[[2]]) #discretised data + + #Initialise model. + if(is.null(bmodel)) + { + bmodel = gen_one_rmodel(colnames(edata), max_varperrule, and_bool, self_loop) + } else { - stop('edata: Supply two expression data frames in a list.') + if(class(bmodel) != 'BoolModel') + { + bmodel = initialise_model(bmodel) + } + + if(check_and(bmodel) != and_bool) + { + and_bool = check_and(bmodel) + } } - #Initialise input data. + #Initialise initial state. + if(is.null(istate)) + { + istate = rbinom(length(bmodel@target), 1, 0.5) + #Getting a random initial state. + while(mean(istate) > 0.9 | mean(istate) < 0.1) #do not want initial state that is too homogenous. + { + istate = rbinom(length(bmodel@target), 1, 0.5) + } + istate = data.frame(matrix(istate, nrow=1)) + colnames(istate) = bmodel@target + } istate = initialise_data(istate, aslogic=T) - cdata = initialise_data(edata[[1]]) - ddata = initialise_data(edata[[2]]) + + #Filtering expression data. + overlap_gene = intersect(colnames(cdata), y=bmodel@target) + nonoverlap_gene = bmodel@target[!(bmodel@target %in% overlap_gene)] + names(overlap_gene) = bmodel@target_var[bmodel@target %in% overlap_gene] + names(nonoverlap_gene) = bmodel@target_var[!(bmodel@target %in% overlap_gene)] + + fddata = filter_dflist(ddata, overlap_gene, F) + fcdata = filter_dflist(cdata, overlap_gene, F) + + fcdata = unique_raw_data(fddata, fcdata) #removes duplicates in continuous data. + fddata = unique(fddata) + + vcat('Start training.\n', verbose) + + #(3) Calling final combined search. + cur_score = NA + cur_model = bmodel + cur_step = 1 + cur_temp = 1 + min_temp = 0.00001 + alpha = 0.9 + max_ite = 100 #iterations in same step. + while(cur_temp > min_temp) + { + vcat(sprintf('Current iteration: %s.\n', cur_step), verbose) + + vcat('Stage 1: Exploring neighbouring models.\n', verbose) + mod_model = unlist(minmod_model(cur_model, overlap_gene=overlap_gene)) + vcat(sprintf('Total neighbouring models: %s.\n', length(mod_model)), verbose) + + vcat('Stage 2: Evaluating next model.\n', verbose) + cur_ite = 1 + while(cur_ite <= max_ite) + { + model_ind = sample(1:length(mod_model), 1) + next_model = mod_model[[model_ind]] + mod_model = mod_model[-model_ind] + + #print(printBM(next_model)) #debug + + next_score = calc_mscore(bmodel=next_model, istate=istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule) + + #Breaking conditions. + if(length(mod_model) == 0) + { + cur_score = next_score + cur_model = next_model + + if(cur_score < best_score) #store best solution ever, regardless of the final ending point. + { + best_score = next_score + best_model = next_model + } + break + } + + if(is.na(cur_score)) + { + #For first iteration. + cur_score = next_score + cur_model = next_model + + best_score = next_score + best_model = next_model + } else + { + #For subsequent iteration. + accept_prob = exp((cur_score - next_score)/cur_temp) #if next solution is better than current solution, accept_prob always more than 1. + + if(accept_prob > runif(1)) #move forward if the prob is more than a random number between 0-1. + { + cur_score = next_score + cur_model = next_model + + #writeBM(cur_model, 'tmp_model.csv') #debug + + if(cur_score < best_score) #store best solution ever, regardless of the final ending point. + { + best_score = next_score + best_model = next_model + } + } + } + cur_ite = cur_ite + 1 + } + + cur_temp = cur_temp*alpha #Reduce subsequent temperature. + cur_step = cur_step + 1 + } + vcat(sprintf('Final iteration: %s.\n', cur_step), verbose) + + output = list(best_score=best_score, best_model=best_model, + cur_score=cur_score, cur_model=cur_model, overlap_gene=overlap_gene, nonoverlap_gene=nonoverlap_gene) + + return(output) +} + +#' @title Training Model +#' +#' @description +#' This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. +#' Note that if a model is supplied, and the genes in the model is different from the genes in the data, only the genes overlapping between model and data will be retained for further analysis. +#' +#' @param edata data frame of expression data. Should have state(row) x gene(column). +#' @param bmodel Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL. +#' @param istate data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL. +#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6. +#' @param and_bool logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE. +#' @param self_loop logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F. +#' @param con_thre numerical. Threshold used to generating the final consensus model. Must be between 0 and 1. +#' @param tol numeric. Tolerance in ending condition. Default to 1e-6. It cannot be lower than .Machine$double.eps ^ 0.5. +#' @param verbose logical. Whether to give detailed output to the screen. Defaults to F. +#' @param detailed_output logical. Whether to return only the model inferred, or all the details obtained during optimisation. Defaults to F. +#' +#' @export +model_train = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bool=T, self_loop=F, con_thre=0.3, tol=1e-6, verbose=F, detailed_output=F) +{ + vcat('Preparing data for analysis.\n', verbose) + + #Initialise expression data. + tmp_data = initialise_raw_data(edata) #returns a list of two data frames. + cdata = initialise_data(tmp_data[[1]]) #continuous data + ddata = initialise_data(tmp_data[[2]]) #discretised data #Initialise model. if(is.null(bmodel)) { - bmodel = gen_one_rmodel(colnames(istate), max_varperrule, inter_bool, self_loop) - } else if(class(bmodel) != 'BoolModel') + bmodel = gen_one_rmodel(colnames(edata), max_varperrule, and_bool, self_loop) + } else { - bmodel = initialise_model(bmodel) + if(class(bmodel) != 'BoolModel') + { + bmodel = initialise_model(bmodel) + } + + if(check_and(bmodel) != and_bool) + { + and_bool = check_and(bmodel) + } } + #Initialise initial state. + if(is.null(istate)) + { + istate = rbinom(length(bmodel@target), 1, 0.5) + #Getting a random initial state. + while(mean(istate) > 0.9 | mean(istate) < 0.1) #do not want initial state that is too homogenous. + { + istate = rbinom(length(bmodel@target), 1, 0.5) + } + istate = data.frame(matrix(istate, nrow=1)) + colnames(istate) = bmodel@target + } + istate = initialise_data(istate, aslogic=T) + #Filtering expression data. - stopifnot(colnames(edata[[1]])==colnames(edata[[2]])) - overlap_gene = intersect(colnames(edata[[1]]), y=bmodel@target) + overlap_gene = intersect(colnames(cdata), y=bmodel@target) nonoverlap_gene = bmodel@target[!(bmodel@target %in% overlap_gene)] names(overlap_gene) = bmodel@target_var[bmodel@target %in% overlap_gene] names(nonoverlap_gene) = bmodel@target_var[!(bmodel@target %in% overlap_gene)] - + fddata = filter_dflist(ddata, overlap_gene, F) fcdata = filter_dflist(cdata, overlap_gene, F) - + fcdata = unique_raw_data(fddata, fcdata) #removes duplicates in continuous data. fddata = unique(fddata) vcat('Start training.\n', verbose) - + #(3) Calling final combined search. best_model = c() best_score = c() @@ -77,10 +250,7 @@ model_train = function(bmodel=NULL, edata, istate, max_varperrule=6, tol=1e-6, i } vcat('Stage 1: Exploring neighbouring models.\n', verbose) - mod_model = foreach(i=1:length(mod_model)) %dopar% { - c(mod_model[[i]], unlist(minmod_model(mod_model[[i]], overlap_gene=overlap_gene))) - } - mod_model = unlist(mod_model) + mod_model = sample(unlist(minmod_model(mod_model[[i]], overlap_gene=overlap_gene)), 1) vcat(sprintf('Total neighbouring models: %s.\n', length(mod_model)), verbose) if(length(mod_model)>1000000) @@ -140,96 +310,34 @@ model_train = function(bmodel=NULL, edata, istate, max_varperrule=6, tol=1e-6, i vcat(sprintf('Final iteration: %s.\n', cur_step), verbose) vcat('Stage 4: Performing consensus analysis.\n', verbose) - consensus = model_consensus(best_model, inter_bool=inter_bool, max_varperrule=max_varperrule) + consensus = model_consensus(best_model, max_varperrule=max_varperrule) + res_con = consensus[consensus > con_thre] + final_model = decompress_bmodel(as.numeric(names(res_con)), get_encodings(bmodel), gene=bmodel@target) - output = list(consensus=consensus, best_model=best_model, best_score=best_score, ite_score=all_best_score, overlap_gene=overlap_gene, nonoverlap_gene=nonoverlap_gene) - return(output) -} - -#' @title Simplifying Model -#' -#' @description -#' This method takes in a model and remove redundant terms wrt to a single initial state. -#' Note that this model simplification is random, and the simplified model is not guaranteed to be the simplest model possible. It is only guaranteed to be a simpler model that can give the same state space as the orignal input model. -#' -#' @param bmodel S4 BoolModel object. -#' @param istate data frame. Must have only 1 row, which represents 1 initial state. -#' @param inter_bool logical. Indicate whether to consider AND terms. -#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. -#' @param verbose logical. Specifies whether to give detailed output. Default to F. -model_simplify = function(bmodel, istate, inter_bool, max_varperrule, verbose=F) -{ - vcat('Stage 1: Calculating score of initial model.\n', verbose) - #Get the states of the original model. - overlap_gene = bmodel@target - fcdata = simulate_model(bmodel, istate) - fcdata = fcdata+0 #convert logical to numeric. - - ori_score = calc_mscore(bmodel, istate, fcdata, overlap_gene, max_varperrule, simplify_bool=T) - stopifnot(ori_score==0) - - next_bmodel = bmodel - ite = 1 - while(TRUE) + if(detailed_output) { - cat(sprintf('Simplification iteration: %s\n', ite)) - #Generate list of minimally deleted models. - - vcat('Stage 2: Exploring neighbouring models.\n', verbose) - - mod_model = c(next_bmodel, minmod_model(next_bmodel, ibool=inter_bool, overlap_gene=overlap_gene)$del_list) - #Breaking condition. - if(length(mod_model) == 1) #model can no longer be simplified. - { - final_bmodel = mod_model[[1]] - break - } - vcat(sprintf('Total neighbouring models: %s.\n', length(mod_model)), verbose) - - vcat('Stage 3: Simulating and calculating scores for models.\n', verbose) - model_res = foreach(i=1:length(mod_model), .combine='c') %dopar% { - model_score = calc_mscore(bmodel=mod_model[[i]], istate=istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule, simplify_bool=T) - names(model_score)=i - model_score - } - - all_final_score = unname(model_res) - - stopifnot(!is.null(model_res)) - stopifnot(length(all_final_score)==length(mod_model)) - - #Breaking condition. - if(!any(all_final_score[-1] == 0)) #model can no longer be simplified. - { - final_bmodel = mod_model[[1]] - break - } - - #Pick a random equivalent model for next iteration. - best_ind = which.random.min(all_final_score) - next_bmodel = mod_model[[best_ind]] - - ite = ite + 1 + output = list(consensus=consensus, final_model=final_model, best_model=best_model, + best_score=best_score, ite_score=all_best_score, overlap_gene=overlap_gene, nonoverlap_gene=nonoverlap_gene) + } else + { + output = final_model } - return(final_bmodel) + return(output) } -#' @title Intersection of input genes +#' @title Intersection of genes #' #' @description -#' This function finds the intersection of input genes and provide a score for them. Return a consensus model or a vector of scores. +#' This function finds the intersection of genes and provide a score for them. Return a consensus model or a vector of scores. #' #' @param bmodel_list list of BoolModel. -#' @param inter_bool logical. Indicate whether to consider AND terms. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6. #' @param format character. Specifies which format to return. Possible values: 'vec', 'df'. Default to 'vec'. -#' -#' @export -model_consensus = function(bmodel_list, inter_bool, max_varperrule, format='vec') +model_consensus = function(bmodel_list, max_varperrule, format='vec') { #(1) Convert all bmodels to encoded forms. - encoding = get_encodings(bmodel_list[[1]], inter_bool) + encoding = get_encodings(bmodel_list[[1]]) encmodel_list = lapply(bmodel_list, function(x) compress_bmodel(x, encoding, max_varperrule)) #(2) Check and remove duplicated models. diff --git a/README.md b/README.md deleted file mode 100644 index a13fc13..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# booltrainer diff --git a/booltrainer.Rproj b/booltrainer.Rproj new file mode 100644 index 0000000..8ecf4c0 --- /dev/null +++ b/booltrainer.Rproj @@ -0,0 +1,18 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,namespace diff --git a/data/bon_moig_istate.rda b/data/bon_moig_istate.rda deleted file mode 100644 index cd909b3..0000000 Binary files a/data/bon_moig_istate.rda and /dev/null differ diff --git a/data/bon_sstate.rda b/data/bon_sstate.rda deleted file mode 100644 index f9e727d..0000000 Binary files a/data/bon_sstate.rda and /dev/null differ diff --git a/data/bon_wilson_istate.rda b/data/bon_wilson_istate.rda deleted file mode 100644 index aed1c92..0000000 Binary files a/data/bon_wilson_istate.rda and /dev/null differ diff --git a/data/bonkrum_geneid_map.rda b/data/bonkrum_geneid_map.rda deleted file mode 100644 index 8622a3d..0000000 Binary files a/data/bonkrum_geneid_map.rda and /dev/null differ diff --git a/data/example_models.rda b/data/example_models.rda new file mode 100644 index 0000000..cadc607 Binary files /dev/null and b/data/example_models.rda differ diff --git a/data/krum_sstate.rda b/data/krum_sstate.rda deleted file mode 100644 index fec409b..0000000 Binary files a/data/krum_sstate.rda and /dev/null differ diff --git a/data/krum_wilson_istate.rda b/data/krum_wilson_istate.rda deleted file mode 100644 index 5363ea3..0000000 Binary files a/data/krum_wilson_istate.rda and /dev/null differ diff --git a/data/moig_raw_data.rda b/data/moig_raw_data.rda deleted file mode 100644 index 353b3fc..0000000 Binary files a/data/moig_raw_data.rda and /dev/null differ diff --git a/data/real_param.rda b/data/real_param.rda deleted file mode 100644 index 184a934..0000000 Binary files a/data/real_param.rda and /dev/null differ diff --git a/man/bin_to_real.Rd b/man/bin_to_real.Rd deleted file mode 100644 index c30e481..0000000 --- a/man/bin_to_real.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/general.R -\name{bin_to_real} -\alias{bin_to_real} -\title{Generate random real numbers from binary values} -\usage{ -bin_to_real(x, param) -} -\arguments{ -\item{x}{logical or 0/1 numeric matrix. Col-genes, row-samples.} - -\item{param}{list of parameters given by param_bimodal().} -} -\description{ -This function generates random real numbers from binary values, with supplied parameters. Returns a vector of real values. -} - diff --git a/man/bon_moig_istate.Rd b/man/bon_moig_istate.Rd deleted file mode 100644 index 00e8acc..0000000 --- a/man/bon_moig_istate.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/data_desc.R -\docType{data} -\name{bon_moig_istate} -\alias{bon_moig_istate} -\title{Initial state from Moignard et al.} -\format{A data frame with 1 row and 20 columns. - -Rows: each row consists of 1 set of Boolean state. -Columns: each column is for 1 gene/variable.} -\usage{ -data(bon_moig_istate) -} -\description{ -An intial state obtained from data in Moignard et al, determined by taking colMeans over unique rows, and rounding the means to 0-1. -Values for genes that are missing in Moignard et al, but are present in Bonzanni et al, are determined by taking values from the original initial state supplied in Bonzanni et al. -It contains a set of Boolean values for 20 genes. -} - diff --git a/man/check_and.Rd b/man/check_and.Rd new file mode 100644 index 0000000..d638251 --- /dev/null +++ b/man/check_and.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/general.R +\name{check_and} +\alias{check_and} +\title{Check if containing AND terms} +\usage{ +check_and(bmodel) +} +\arguments{ +\item{bmodel}{BoolModel object.} +} +\description{ +This function checks if a particular Boolean model contains AND terms. +} + diff --git a/man/check_bmodel.Rd b/man/check_bmodel.Rd deleted file mode 100644 index 49e14d2..0000000 --- a/man/check_bmodel.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/general.R -\name{check_bmodel} -\alias{check_bmodel} -\title{Check if the Boolean model violates constraints.} -\usage{ -check_bmodel(bmodel, max_varperrule) -} -\arguments{ -\item{bmodel}{S4 BoolModel object.} - -\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} -} -\description{ -This function checks if the Boolean model violates contraints. Return logical value. -(1) Each gene rule should not have more terms than max_varperrule. -(2) The same term should not occur twice in the same rule. -} - diff --git a/man/equi_model.Rd b/man/equi_model.Rd deleted file mode 100644 index 8406803..0000000 --- a/man/equi_model.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/general.R -\name{equi_model} -\alias{equi_model} -\title{Check for equivalent models} -\usage{ -equi_model(bmodel1, bmodel2, inter_bool, max_varperrule) -} -\arguments{ -\item{bmodel1}{S4 BoolModel object.} - -\item{bmodel2}{S4 BoolModel object.} - -\item{inter_bool}{logical. Indicate whether to consider AND terms.} - -\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} -} -\description{ -This function checks if the two models have the same rules. Return a logical value. Only TRUE if each rule for each gene is the same. -} - diff --git a/man/example_models.Rd b/man/example_models.Rd new file mode 100644 index 0000000..7355b88 --- /dev/null +++ b/man/example_models.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/data_desc.R +\docType{data} +\name{example_models} +\alias{example_models} +\title{Example Boolean Models used in the vignette} +\format{Each Boolean model is a BoolModel object.} +\usage{ +data(example_models) +} +\description{ +3 Boolean models used in the examples of the vignette. +} + diff --git a/man/gen_one_rmodel.Rd b/man/gen_one_rmodel.Rd index 79695ce..5cdbfea 100644 --- a/man/gen_one_rmodel.Rd +++ b/man/gen_one_rmodel.Rd @@ -4,14 +4,14 @@ \alias{gen_one_rmodel} \title{Generate a random Boolean model} \usage{ -gen_one_rmodel(var, mvar = length(var), inter_bool = F, self_loop = F) +gen_one_rmodel(var, mvar = length(var), and_bool, self_loop = F) } \arguments{ \item{var}{character vector. A vector of single genes/variables to be used in the model.} \item{mvar}{integer. Maximum number of variables in act or inh rule. Default to length(var).} -\item{inter_bool}{logical. Indicates whether to include AND terms or not. Default to F.} +\item{and_bool}{logical. Indicates whether to include AND terms or not.} \item{self_loop}{logical. Indicates whether to allow self_loop. Default to F.} } diff --git a/man/gen_randata.Rd b/man/gen_randata.Rd index 988fc93..cdc0960 100644 --- a/man/gen_randata.Rd +++ b/man/gen_randata.Rd @@ -4,7 +4,7 @@ \alias{gen_randata} \title{Generate sets of random data} \usage{ -gen_randata(n, steps, num_genes, max_varperrule, inter_bool) +gen_randata(n, steps, num_genes, max_varperrule, and_bool) } \arguments{ \item{n}{integer. Number of sets of random data required.} @@ -15,7 +15,7 @@ gen_randata(n, steps, num_genes, max_varperrule, inter_bool) \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} -\item{inter_bool}{logical. Indicate whether to consider AND terms.} +\item{and_bool}{logical. Indicate whether to consider AND terms.} } \description{ This function generates specified sets of random data, which include initial states, two Boolean models (starting and destination), and continuous+discrete data of the destination model. diff --git a/man/gen_randata_bn.Rd b/man/gen_randata_bn.Rd index 3c0f84f..d673660 100644 --- a/man/gen_randata_bn.Rd +++ b/man/gen_randata_bn.Rd @@ -4,7 +4,7 @@ \alias{gen_randata_bn} \title{Generate sets of random data} \usage{ -gen_randata_bn(n, steps, num_genes, max_varperrule, inter_bool) +gen_randata_bn(n, steps, num_genes, max_varperrule) } \arguments{ \item{n}{integer. Number of sets of random data required.} @@ -14,8 +14,6 @@ gen_randata_bn(n, steps, num_genes, max_varperrule, inter_bool) \item{num_genes}{integer. Number of genes in the Boolean models.} \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} - -\item{inter_bool}{logical. Indicate whether to consider AND terms.} } \description{ (Requires bnlearn) This function generates specified sets of random data, which include initial states, two Boolean models (starting and destination), and continuous+discrete data of the destination model. diff --git a/man/gen_singlerule.Rd b/man/gen_singlerule.Rd index 2744119..2968654 100644 --- a/man/gen_singlerule.Rd +++ b/man/gen_singlerule.Rd @@ -4,7 +4,7 @@ \alias{gen_singlerule} \title{Generate random act and inh rule for a single gene} \usage{ -gen_singlerule(x, np, tar_ind, ibool = F, self_loop = F) +gen_singlerule(x, np, tar_ind, and_bool, self_loop = F) } \arguments{ \item{x}{character vector. A vector of all single terms to be used.} @@ -13,7 +13,7 @@ gen_singlerule(x, np, tar_ind, ibool = F, self_loop = F) \item{tar_ind}{numerical. Indicate which gene is the rule for. Used in preventing self-loop.} -\item{ibool}{logical. Indicates whether to include AND terms or not. Default to F.} +\item{and_bool}{logical. Indicates whether to include AND terms or not.} \item{self_loop}{logical. Indicates whether to allow self_loop. Default to F.} } diff --git a/man/gen_two_rmodel.Rd b/man/gen_two_rmodel.Rd index 4840900..9ccfb22 100644 --- a/man/gen_two_rmodel.Rd +++ b/man/gen_two_rmodel.Rd @@ -4,7 +4,7 @@ \alias{gen_two_rmodel} \title{Generate two random Boolean models with a specified number of steps apart} \usage{ -gen_two_rmodel(var, steps, mvar = length(var), inter_bool = F, +gen_two_rmodel(var, steps, mvar = length(var), and_bool = F, in_bmodel = NULL, self_loop = F) } \arguments{ @@ -14,7 +14,7 @@ gen_two_rmodel(var, steps, mvar = length(var), inter_bool = F, \item{mvar}{integer. Maximum number of variables in act or inh rule. Default to length(var).} -\item{inter_bool}{logical. Indicates whether to include AND terms or not. Default to F.} +\item{and_bool}{logical. Indicates whether to include AND terms or not. Default to F.} \item{in_bmodel}{BoolModel object. The starting model supplied.} diff --git a/man/get_encodings.Rd b/man/get_encodings.Rd index 41f9a9f..f09f216 100644 --- a/man/get_encodings.Rd +++ b/man/get_encodings.Rd @@ -4,12 +4,10 @@ \alias{get_encodings} \title{Get corresponding encodings for compression or decompression.} \usage{ -get_encodings(bmodel, inter_bool) +get_encodings(bmodel) } \arguments{ \item{bmodel}{S4 BoolModel object.} - -\item{inter_bool}{logical. Indicate whether to consider AND terms.} } \description{ This function gets all possible terms (single or double terms) and their corresponding encodings. diff --git a/man/grow_bmodel.Rd b/man/grow_bmodel.Rd index 12c4e68..2599274 100644 --- a/man/grow_bmodel.Rd +++ b/man/grow_bmodel.Rd @@ -4,14 +4,14 @@ \alias{grow_bmodel} \title{Add extra genes to a Boolean model} \usage{ -grow_bmodel(in_model, in_gene) +grow_bmodel(in_gene, in_model) } \arguments{ -\item{in_model}{data frame with 2 columns, which are targets and factors} - \item{in_gene}{character vector. Genes to be added into the model.} + +\item{in_model}{data frame or BoolModel object. If it is a data frame, it must have 2 columns, which are targets and update functions.} } \description{ -This function adds extra genes to a Boolean model. Input model must be in data frame format, output model will be BoolModel object. +This function adds extra genes to a Boolean model. Return a list of BoolModel object and an initial state. } diff --git a/man/man_dist.Rd b/man/man_dist.Rd new file mode 100644 index 0000000..f2be266 --- /dev/null +++ b/man/man_dist.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/score_calculation.R +\name{man_dist} +\alias{man_dist} +\title{Calculates pairwise Manhattan distances between two matrices} +\usage{ +man_dist(x, y) +} +\arguments{ +\item{x}{matrix} + +\item{y}{matrix} +} +\description{ +This function calculates pairwise Manhattan distances between two matrices. +} + diff --git a/man/match_state.Rd b/man/match_state.Rd deleted file mode 100644 index cdcbcf4..0000000 --- a/man/match_state.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/general.R -\name{match_state} -\alias{match_state} -\title{Check for matching states} -\usage{ -match_state(mstate, xstate) -} -\arguments{ -\item{mstate}{data frame. It should be a state(row) x gene(column) df. colnames will be used in comparison.} - -\item{xstate}{data frame. It should be a state(row) x gene(column) df. colnames will be used in comparison.} -} -\description{ -This function finds a match between two df of states. Returns a row index vector indicating for each row of mstate, what is the corresponding row in xstate. If a match cannot be found, a 0 will be return. -Only columns that are present in both df will be used in comparison. Note that the row index starts from 1 (as in R), not from 0 (as in cpp). -} - diff --git a/man/match_state_loop.Rd b/man/match_state_loop.Rd deleted file mode 100644 index c1a00cd..0000000 --- a/man/match_state_loop.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{match_state_loop} -\alias{match_state_loop} -\title{Find a match between two data frames.} -\usage{ -match_state_loop(mstate, xstate) -} -\arguments{ -\item{mstate}{data frame. It should be a state(row) x gene(column) df.} - -\item{xstate}{data frame. It should be a state(row) x gene(column) df.} -} -\description{ -(&&&Not for public use&&&)This function finds a match between two df of states. Used in match_state(). Return an row index vector indicating which row of mstate matches the rows in xstate. -} - diff --git a/man/minmod_model.Rd b/man/minmod_model.Rd index ac9dc93..13ea234 100644 --- a/man/minmod_model.Rd +++ b/man/minmod_model.Rd @@ -11,7 +11,7 @@ minmod_model(bm, index = NULL, overlap_gene = NULL) \item{index}{integer. Specifying rule of which gene to modify. If NULL, modifies all rules in the model. Defaults to NULL.} -\item{overlap_gene}{character vector. Specify which genes are present in both model and data inputs. Only needed when index=NULL.} +\item{overlap_gene}{character vector. Specify which genes are present in both model and data inputs.} } \description{ This function generates all possible boolean models minimally modified. Returns a lists of 2 lists, deleted models and added models diff --git a/man/model_consensus.Rd b/man/model_consensus.Rd index 3011357..e77c7c1 100644 --- a/man/model_consensus.Rd +++ b/man/model_consensus.Rd @@ -2,20 +2,18 @@ % Please edit documentation in R/search.R \name{model_consensus} \alias{model_consensus} -\title{Intersection of input genes} +\title{Intersection of genes} \usage{ -model_consensus(bmodel_list, inter_bool, max_varperrule, format = "vec") +model_consensus(bmodel_list, max_varperrule, format = "vec") } \arguments{ \item{bmodel_list}{list of BoolModel.} -\item{inter_bool}{logical. Indicate whether to consider AND terms.} - \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} \item{format}{character. Specifies which format to return. Possible values: 'vec', 'df'. Default to 'vec'.} } \description{ -This function finds the intersection of input genes and provide a score for them. Return a consensus model or a vector of scores. +This function finds the intersection of genes and provide a score for them. Return a consensus model or a vector of scores. } diff --git a/man/model_dist.Rd b/man/model_dist.Rd index 4f37692..f785b0f 100644 --- a/man/model_dist.Rd +++ b/man/model_dist.Rd @@ -4,15 +4,13 @@ \alias{model_dist} \title{Calculate distance between Boolean models} \usage{ -model_dist(x, y, inter_bool, max_varperrule) +model_dist(x, y, max_varperrule) } \arguments{ \item{x}{S4 BoolModel object. Test model.} \item{y}{S4 BoolModel object. Reference model.} -\item{inter_bool}{logical. Indicate whether to consider AND terms.} - \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} } \description{ diff --git a/man/model_setdiff.Rd b/man/model_setdiff.Rd index 7d8fce1..2fa5646 100644 --- a/man/model_setdiff.Rd +++ b/man/model_setdiff.Rd @@ -4,15 +4,13 @@ \alias{model_setdiff} \title{Find the set difference between two Boolean models} \usage{ -model_setdiff(x, y, inter_bool, max_varperrule, directed = F) +model_setdiff(x, y, max_varperrule, directed = F) } \arguments{ \item{x}{S4 BoolModel object. Test model.} \item{y}{S4 BoolModel object. Reference model.} -\item{inter_bool}{logical. Indicate whether to consider AND terms.} - \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} \item{directed}{logical. If TRUE, return the difference in terms with respect to x.} diff --git a/man/model_simplify.Rd b/man/model_simplify.Rd deleted file mode 100644 index a9ef270..0000000 --- a/man/model_simplify.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/search.R -\name{model_simplify} -\alias{model_simplify} -\title{Simplifying Model} -\usage{ -model_simplify(bmodel, istate, inter_bool, max_varperrule, verbose = F) -} -\arguments{ -\item{bmodel}{S4 BoolModel object.} - -\item{istate}{data frame. Must have only 1 row, which represents 1 initial state.} - -\item{inter_bool}{logical. Indicate whether to consider AND terms.} - -\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} - -\item{verbose}{logical. Specifies whether to give detailed output. Default to F.} -} -\description{ -This method takes in a model and remove redundant terms wrt to a single initial state. -Note that this model simplification is random, and the simplified model is not guaranteed to be the simplest model possible. It is only guaranteed to be a simpler model that can give the same state space as the orignal input model. -} - diff --git a/man/model_train.Rd b/man/model_train.Rd index fc2fe63..6511567 100644 --- a/man/model_train.Rd +++ b/man/model_train.Rd @@ -4,25 +4,30 @@ \alias{model_train} \title{Training Model} \usage{ -model_train(bmodel = NULL, edata, istate, max_varperrule = 6, tol = 1e-06, - inter_bool, verbose = F, self_loop = F) +model_train(edata, bmodel = NULL, istate = NULL, max_varperrule = 6, + and_bool = T, self_loop = F, con_thre = 0.3, tol = 1e-06, + verbose = F, detailed_output = F) } \arguments{ -\item{bmodel}{Boolean model in data frame. If NULL, use a random Boolean model. Default to NULL.} +\item{edata}{data frame of expression data. Should have state(row) x gene(column).} -\item{edata}{list of 2 data frames. Initialised continuous and discretised expression data. Each data frame should have state(row) x gene(column).} +\item{bmodel}{Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL.} -\item{istate}{data frame. Must have only 1 row, which represents 1 initial state.} +\item{istate}{data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL.} -\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} +\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6.} -\item{tol}{numeric. Specify the tolerance in ending condition. Default to 1e-6. It cannot be lower than .Machine$double.eps ^ 0.5.} +\item{and_bool}{logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE.} -\item{inter_bool}{logical. Indicate whether to consider AND terms. Default to TRUE.} +\item{self_loop}{logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F.} -\item{verbose}{logical. Specifies whether to give detailed output. Default to F.} +\item{con_thre}{numerical. Threshold used to generating the final consensus model. Must be between 0 and 1.} -\item{self_loop}{logical. Indicates whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F.} +\item{tol}{numeric. Tolerance in ending condition. Default to 1e-6. It cannot be lower than .Machine$double.eps ^ 0.5.} + +\item{verbose}{logical. Whether to give detailed output to the screen. Defaults to F.} + +\item{detailed_output}{logical. Whether to return only the model inferred, or all the details obtained during optimisation. Defaults to F.} } \description{ This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. diff --git a/man/model_train_sa.Rd b/man/model_train_sa.Rd new file mode 100644 index 0000000..d83130e --- /dev/null +++ b/man/model_train_sa.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/search.R +\name{model_train_sa} +\alias{model_train_sa} +\title{Training Model (using simulated annealing)} +\usage{ +model_train_sa(edata, bmodel = NULL, istate = NULL, max_varperrule = 6, + and_bool = T, self_loop = F, restart = 0, verbose = F) +} +\arguments{ +\item{edata}{data frame of expression data. Should have state(row) x gene(column).} + +\item{bmodel}{Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL.} + +\item{istate}{data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL.} + +\item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6.} + +\item{and_bool}{logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE.} + +\item{self_loop}{logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F.} + +\item{restart}{integer. Number of restart from the best solution. Defaults to 0.} + +\item{verbose}{logical. Whether to give detailed output to the screen. Defaults to F.} +} +\description{ +This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. +Note that if a model is supplied, and the genes in the model is different from the genes in the data, only the genes overlapping between model and data will be retained for further analysis. +} + diff --git a/man/moig_data.Rd b/man/moig_data.Rd deleted file mode 100644 index ff6152d..0000000 --- a/man/moig_data.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/data_desc.R -\docType{data} -\name{moig_data} -\alias{moig_data} -\title{Discretised single cell qRT-PCR expression data from Moignard et al.} -\format{A data frame with 597 rows and 18 columns. - -Rows: each row consists of discretised expression values from 1 cell. -Columns: each column is for 1 gene/variable.} -\usage{ -data(moig_data) -} -\description{ -A discretised single cell expression data obtained from multiple cell types. -} - diff --git a/man/outcyto_model.Rd b/man/outcyto_model.Rd deleted file mode 100644 index fe31101..0000000 --- a/man/outcyto_model.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/output_format.R -\name{outcyto_model} -\alias{outcyto_model} -\title{Output a Boolean Model into Cytoscape readable format} -\usage{ -outcyto_model(bmodel, filepath = getwd()) -} -\arguments{ -\item{bmodel}{S4 BoolModel object.} - -\item{filepath}{character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output.} -} -\description{ -This function outputs a Boolean Model in a format that is readable by Cytoscape. Return invisibly the edges (with edge attributes) and node attributes. (i.e. list of 2 dfs) -} - diff --git a/man/outgenysis_model.Rd b/man/outgenysis_model.Rd index ed5a17a..8c0a9cc 100644 --- a/man/outgenysis_model.Rd +++ b/man/outgenysis_model.Rd @@ -4,12 +4,14 @@ \alias{outgenysis_model} \title{Output a Boolean Model into Genysis readable format} \usage{ -outgenysis_model(bmodel, filepath = getwd()) +outgenysis_model(bmodel, path = getwd(), file = NULL) } \arguments{ \item{bmodel}{S4 BoolModel object.} -\item{filepath}{character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output.} +\item{path}{character. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output.} + +\item{file}{character. Specify file name. Default to NULL for default file names.} } \description{ This function outputs a Boolean Model in a format that is readable by Genysis. Return invisibly the formatted vector. diff --git a/man/outgraph_model.Rd b/man/outgraph_model.Rd new file mode 100644 index 0000000..c3edd9d --- /dev/null +++ b/man/outgraph_model.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/output_format.R +\name{outgraph_model} +\alias{outgraph_model} +\title{Output a Boolean Model into Cytoscape & Gephi readable format} +\usage{ +outgraph_model(bmodel, path = getwd(), file = NULL, and_node = T) +} +\arguments{ +\item{bmodel}{S4 BoolModel object.} + +\item{path}{character. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output.} + +\item{file}{character. Specify file name. Default to NULL for default file names.} + +\item{and_node}{logical. Specify AND as an individual node. Default to T.} +} +\description{ +This function outputs a Boolean Model in a format that is readable by Cytoscape and Gephi. Return invisibly the edges (with edge attributes) and node attributes. (i.e. list of 2 dfs) +} + diff --git a/man/outcyto_stategraph.Rd b/man/outstate_graph.Rd similarity index 82% rename from man/outcyto_stategraph.Rd rename to man/outstate_graph.Rd index 0ceabb9..0ec40fe 100644 --- a/man/outcyto_stategraph.Rd +++ b/man/outstate_graph.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/output_format.R -\name{outcyto_stategraph} -\alias{outcyto_stategraph} -\title{Generate state transition graph readable by Cytoscapes} +\name{outstate_graph} +\alias{outstate_graph} +\title{Generate state transition graph} \usage{ -outcyto_stategraph(mstate, bmodel, directed = F, record.both = F, +outstate_graph(mstate, bmodel, directed = F, record.both = F, filepath = getwd()) } \arguments{ @@ -19,6 +19,6 @@ outcyto_stategraph(mstate, bmodel, directed = F, record.both = F, \item{filepath}{character vector. Specify path (AND NOT file name). Default to current working directory, i.e. getwd(). Set to NULL to disable file output.} } \description{ -This function generates a state transition graph using a Boolean model and its state space. Each node represent a state. All nodes in this graph is linked by an edge only if the 2 states have different value in only 1 gene. The output is readable by Cytoscape. +This function generates a state transition graph using a Boolean model and its state space. Each node represent a state. All nodes in this graph is linked by an edge only if the 2 states have different value in only 1 gene. The output is readable by Cytoscape and Gephi. } diff --git a/man/param_bimodal.Rd b/man/param_bimodal.Rd deleted file mode 100644 index f96000d..0000000 --- a/man/param_bimodal.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/general.R -\name{param_bimodal} -\alias{param_bimodal} -\title{Obtain parameters for bimodal distribution from real data} -\usage{ -param_bimodal(x, data_type = "qpcr") -} -\arguments{ -\item{x}{matrix. Input expression data. Col-genes, row-samples.} - -\item{data_type}{character. Specify data types: qpcr, rnaseq.} -} -\description{ -This function obtains parameters for bimodal distribution. Returns 4 parameters: mu1, mu2, sig1, sig2. -} - diff --git a/man/plotBM.Rd b/man/plotBM.Rd new file mode 100644 index 0000000..5028161 --- /dev/null +++ b/man/plotBM.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/methods.R +\name{plotBM} +\alias{plotBM} +\title{Plot Boolean Model} +\usage{ +plotBM(bmodel, makePlot = T, ...) +} +\arguments{ +\item{bmodel}{S4 BoolModel object.} + +\item{makePlot}{logical. Whether to make plot or just return the object. Default to T.} + +\item{...}{Additional parameters to plot.igraph.} +} +\description{ +This method plots the network underlying Boolean models by using igraph for quick visualisation. +} + diff --git a/man/rcpp_ham_dist.Rd b/man/rcpp_ham_dist.Rd deleted file mode 100644 index 3e91fcb..0000000 --- a/man/rcpp_ham_dist.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{rcpp_ham_dist} -\alias{rcpp_ham_dist} -\title{Calculating Hamming pairwise scores between model and data states.} -\usage{ -rcpp_ham_dist(x_df, y_df) -} -\arguments{ -\item{x_df}{matrix. It should be logical matrix of model states.} - -\item{y_df}{matrix. It should be logical matrix of data states.} -} -\description{ -This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -} - diff --git a/man/rcpp_m_score.Rd b/man/rcpp_m_score.Rd deleted file mode 100644 index d29e7d1..0000000 --- a/man/rcpp_m_score.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{rcpp_m_score} -\alias{rcpp_m_score} -\title{Inner core for m_score()} -\usage{ -rcpp_m_score(x_df) -} -\arguments{ -\item{x_df}{matrix. Matrix with columns ranked wrt each row.} -} -\description{ -This function takes in a df with columns ranked wrt each row, and try to assign each row to a unique column without repetition. -} - diff --git a/man/rcpp_man_dist.Rd b/man/rcpp_man_dist.Rd deleted file mode 100644 index 4f5b249..0000000 --- a/man/rcpp_man_dist.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{rcpp_man_dist} -\alias{rcpp_man_dist} -\title{Calculating pairwise scores between model and data states.} -\usage{ -rcpp_man_dist(x_df, y_df) -} -\arguments{ -\item{x_df}{matrix. It should be numerical matrix of model states.} - -\item{y_df}{matrix. It should be numerical matrix of data states.} -} -\description{ -This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -} - diff --git a/man/real_param.Rd b/man/real_param.Rd deleted file mode 100644 index f971605..0000000 --- a/man/real_param.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/data_desc.R -\docType{data} -\name{real_param} -\alias{real_param} -\title{Estimated parameters from Wilson et al. data} -\format{A list with 4 numeric vectors, all_mu1, all_mu2, all_sig1, all_sig2. Note that each element in the vector is estimated from a single gene.} -\usage{ -data(real_param) -} -\description{ -A list of parameters (based on log normal distribution) estimated from Wilson et al. single-cell qPCR expression data. -} - diff --git a/man/moig_raw_data.Rd b/man/wilson_raw_data.Rd similarity index 62% rename from man/moig_raw_data.Rd rename to man/wilson_raw_data.Rd index 76cd91f..f3e2fa5 100644 --- a/man/moig_raw_data.Rd +++ b/man/wilson_raw_data.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} -\name{moig_raw_data} -\alias{moig_raw_data} -\title{Raw single cell qRT-PCR expression data from Moignard et al.} -\format{A data frame with 597 rows and 18 columns. +\name{wilson_raw_data} +\alias{wilson_raw_data} +\title{Raw single cell qRT-PCR expression data from Wilson et al.} +\format{A data frame with 1626 rows and 44 columns. Rows: each row consists of raw expression values from 1 cell. Columns: each column is for 1 gene/variable.} \usage{ -data(moig_raw_data) +data(wilson_raw_data) } \description{ A raw single cell expression data obtained from multiple cell types. diff --git a/man/wilson_raw_rnaseq.Rd b/man/wilson_raw_rnaseq.Rd new file mode 100644 index 0000000..3c94553 --- /dev/null +++ b/man/wilson_raw_rnaseq.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/data_desc.R +\docType{data} +\name{wilson_raw_rnaseq} +\alias{wilson_raw_rnaseq} +\title{Raw single cell RNAseq expression data from Wilson et al.} +\format{A data frame with 96 rows and 38498 columns. + +Rows: each row consists of raw expression values from 1 cell. +Columns: each column is for 1 gene/variable.} +\usage{ +data(wilson_raw_rnaseq) +} +\description{ +A raw single cell expression data obtained from multiple cell types. +} + diff --git a/src/BoolTraineR.dll b/src/BoolTraineR.dll new file mode 100644 index 0000000..27ff8e1 Binary files /dev/null and b/src/BoolTraineR.dll differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0597841..7e0bd3a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,53 +5,6 @@ using namespace Rcpp; -// match_state_loop -Rcpp::NumericVector match_state_loop(Rcpp::NumericMatrix mstate, Rcpp::NumericMatrix xstate); -RcppExport SEXP BoolTraineR_match_state_loop(SEXP mstateSEXP, SEXP xstateSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type mstate(mstateSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type xstate(xstateSEXP); - __result = Rcpp::wrap(match_state_loop(mstate, xstate)); - return __result; -END_RCPP -} -// rcpp_man_dist -Rcpp::NumericVector rcpp_man_dist(Rcpp::NumericMatrix x_df, Rcpp::NumericMatrix y_df); -RcppExport SEXP BoolTraineR_rcpp_man_dist(SEXP x_dfSEXP, SEXP y_dfSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type x_df(x_dfSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type y_df(y_dfSEXP); - __result = Rcpp::wrap(rcpp_man_dist(x_df, y_df)); - return __result; -END_RCPP -} -// rcpp_ham_dist -Rcpp::NumericVector rcpp_ham_dist(Rcpp::LogicalMatrix x_df, Rcpp::LogicalMatrix y_df); -RcppExport SEXP BoolTraineR_rcpp_ham_dist(SEXP x_dfSEXP, SEXP y_dfSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< Rcpp::LogicalMatrix >::type x_df(x_dfSEXP); - Rcpp::traits::input_parameter< Rcpp::LogicalMatrix >::type y_df(y_dfSEXP); - __result = Rcpp::wrap(rcpp_ham_dist(x_df, y_df)); - return __result; -END_RCPP -} -// rcpp_m_score -Rcpp::IntegerVector rcpp_m_score(Rcpp::IntegerMatrix x_df); -RcppExport SEXP BoolTraineR_rcpp_m_score(SEXP x_dfSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type x_df(x_dfSEXP); - __result = Rcpp::wrap(rcpp_m_score(x_df)); - return __result; -END_RCPP -} // rcpp_validate Rcpp::NumericVector rcpp_validate(Rcpp::NumericMatrix inf_mat, Rcpp::NumericMatrix true_mat); RcppExport SEXP BoolTraineR_rcpp_validate(SEXP inf_matSEXP, SEXP true_matSEXP) { diff --git a/src/RcppExports.o b/src/RcppExports.o new file mode 100644 index 0000000..91c4697 Binary files /dev/null and b/src/RcppExports.o differ diff --git a/src/general.cpp b/src/general.cpp deleted file mode 100644 index fa6ffdf..0000000 --- a/src/general.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -//internal C++ functions. not exported. -bool all_cpp(Rcpp::LogicalVector x) { - /*#Function to test for equality between two vectors.*/ - - return is_true(Rcpp::all(x==TRUE)); //#is_true is used to return a bool type. all() is an Rcpp sugar. -} - -//external C++ functions. exported. -//' @title Find a match between two data frames. -//' -//' @description -//' (&&&Not for public use&&&)This function finds a match between two df of states. Used in match_state(). Return an row index vector indicating which row of mstate matches the rows in xstate. -//' -//' @param mstate data frame. It should be a state(row) x gene(column) df. -//' @param xstate data frame. It should be a state(row) x gene(column) df. -// [[Rcpp::export]] //#must be called in front of each C++ function to be exported. -Rcpp::NumericVector match_state_loop(Rcpp::NumericMatrix mstate, Rcpp::NumericMatrix xstate) { - int nrow_ms = mstate.nrow(); - int nrow_xs = xstate.nrow(); - Rcpp::NumericVector ind(nrow_ms); //#must specify length if using vector. - - int turn = 0; - for(auto i=0; i -#include -#include -#include -#include #include -#include -#include -#include #include #include //for std::invalid_argument -#include //for std::abs(float) -#include //for std::accumulate using namespace std; //external C++ functions. exported. -//Rcpp::NumericVector rcpp_test(Rcpp::NumericVector x) { -// std::vector y = Rcpp::as >(x); -// return(Rcpp::wrap(y)); -//} - -//' @title Calculating pairwise scores between model and data states. -//' -//' @description -//' This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -//' -//' @param x_df matrix. It should be numerical matrix of model states. -//' @param y_df matrix. It should be numerical matrix of data states. -// [[Rcpp::export]] -Rcpp::NumericVector rcpp_man_dist(Rcpp::NumericMatrix x_df, Rcpp::NumericMatrix y_df) { - int xdf_row = x_df.nrow(); - int ydf_row = y_df.nrow(); - - if(x_df.ncol() != y_df.ncol()) { - throw std::invalid_argument( "Two input matrices should have the same number of columns." ); - } - - std::vector tmpout(xdf_row * ydf_row); - int ind = 0; - for(auto i=0; i x = Rcpp::as >(xr); - std::vector y = Rcpp::as >(yr); - std::vector z; - - //Calculate the distance between the two vectors. (Manhattan distance) - //Calculates the absolute distance between two of each variables. - std::transform(x.begin(), x.end(), y.begin(), std::back_inserter(z), - [](double x, double y) { return std::abs(x-y); }); - - //Get the sum of all differences. - double o = std::accumulate(z.begin(), z.end(), 0.0); - - //Add the value to the output vector. - tmpout[ind] = o; - ind += 1; - - //std::copy(x.begin(), x.end(), std::ostream_iterator(std::cout, ",")); - //Rcpp::Rcout << std::endl; - } - } - - //Convert Cpp vector into R matrix. - Rcpp::NumericVector output = Rcpp::wrap(tmpout); - output.attr("dim") = Rcpp::Dimension(ydf_row, xdf_row); - - return output; -} - -//' @title Calculating Hamming pairwise scores between model and data states. -//' -//' @description -//' This function calculates the pairwise scores between each row of model and data states. The score is calculated using a custom binary distance measure. -//' -//' @param x_df matrix. It should be logical matrix of model states. -//' @param y_df matrix. It should be logical matrix of data states. -// [[Rcpp::export]] -Rcpp::NumericVector rcpp_ham_dist(Rcpp::LogicalMatrix x_df, Rcpp::LogicalMatrix y_df) { - int xdf_row = x_df.nrow(); - int ydf_row = y_df.nrow(); - - std::vector m_vec(xdf_row * ydf_row); - - int ind = 0; - for(auto i=0; i x = Rcpp::as >(xr); - std::vector y = Rcpp::as >(yr); - - //std::copy(x.begin(), x.end(), std::ostream_iterator(std::cout, ",")); - //Rcpp::Rcout << "After conversion!" << std::endl; - - //Calculate the summation of x and y between each element. - std::vector z; - if(x.size() == y.size()) { - for(unsigned k=0; k x = Rcpp::as >(xr); - - //Obtain the position for the minimum value. - std::vector::iterator min_val = std::min_element(std::begin(x), std::end(x)); //return the first minimum element. Note that the input result should not contain multiple minimum. - yr[i] = std::distance(std::begin(x), min_val); - - //Check if the value is already present, then pick the next min. - Rcpp::IntegerVector unique_yr = Rcpp::unique(yr); - Rcpp::IntegerVector sub_i = Rcpp::seq_len(i); - Rcpp::IntegerVector test_yr = yr[sub_i]; - bool dup_check = Rcpp::as(Rcpp::any(Rcpp::duplicated(test_yr))); - int ind = 2; //start with finding 2nd min value. - // if the model state space is smaller than data state space, it is not possible to assign a unique model state to each data state. - while (dup_check && unique_yr.size() < xdf_col && ind < xdf_row) { - //Rcpp::Rcout << i << " : " << ind << std::endl; - //Rcpp::Rcout << dup_check << std::endl; - //std::copy(yr.begin(), yr.end(), std::ostream_iterator(Rcpp::Rcout, " ")); - //Rcpp::Rcout << std::endl; - - std::vector::iterator alt_val = std::find(std::begin(x), std::end(x), ind); //find 2nd minimum. - yr[i] = std::distance(std::begin(x), alt_val); - - sub_i = Rcpp::seq_len(i); - test_yr = yr[sub_i]; - dup_check = Rcpp::as(Rcpp::any(Rcpp::duplicated(test_yr))); - unique_yr = Rcpp::unique(yr); - ind += 1; //for next iteration, look for the 3rd (and subsequent) values. - } - - //Checking results. - sub_i = Rcpp::seq_len(i); - test_yr = yr[sub_i]; - dup_check = Rcpp::as(Rcpp::any(Rcpp::duplicated(test_yr))); - if(xdf_col >= xdf_row && dup_check) { //if there are more columns than rows, there should not be any duplicated values. - throw std::invalid_argument( "Duplicated values are still present." ); - } - } - - return yr + 1; //change C++ positions to R positions. -} - //' @title Calculating validation scores between two adjacency matrices //' //' @description diff --git a/src/score_calculation.o b/src/score_calculation.o new file mode 100644 index 0000000..ff2f0b1 Binary files /dev/null and b/src/score_calculation.o differ diff --git a/src/simulation.o b/src/simulation.o new file mode 100644 index 0000000..06fd1bd Binary files /dev/null and b/src/simulation.o differ diff --git a/vignettes/booltrainer.Rmd b/vignettes/booltrainer.Rmd new file mode 100644 index 0000000..8dfaa71 --- /dev/null +++ b/vignettes/booltrainer.Rmd @@ -0,0 +1,479 @@ +--- +title: "Using BoolTraineR to reconstruct asynchronous Boolean models" +author: "Chee Yee Lim" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + number_sections: true + rmarkdown::pdf_document: + toc: true + number_sections: true + rmarkdown::md_document: + variant: markdown_github + toc: true +vignette: > + %\VignetteIndexEntry{Using BoolTraineR to reconstruct asynchronous Boolean models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo=FALSE} +library('BoolTraineR') +``` + +# Brief introduction + +`BoolTraineR` is a model learning algorithm for reconstructing and training asynchronous Boolean models using single-cell expression data. Refer to the paper for more details on the concepts behind the algorithm. This vignette serves as a tutorial to demonstrate example workflows that can be adapted to individual cases experienced by users. + +Running `BoolTraineR` is straightforward. However, note that depending on the (1) size of single-cell expression data and (2) complexity of Boolean model, `BoolTraineR` may take a long time to complete the computation. In such cases, it is advisable to use the built-in parallel processing capability of `BoolTraineR`. This can be easily achieved by using `doParallel` package, as illustrated in the example. + +Note that the examples presented in this vignette are different from the results presented in our paper. The examples presented here have been simplified to speed up the processing time. + +# Installation + +`BoolTraineR` can be installed from CRAN. + +```{r, eval = FALSE} +install.packages('BoolTraineR') +``` + +Or from Github for the latest version. + +```{r, eval = FALSE} +install_github("cheeyeelim/booltrainer") +``` + +Also install `doParallel` package if you intend to use parallel processing. + +# Input data format + +Depending on the analysis, only 3 types of data will ever be needed. The format of the data required is discussed below. + +1. Expression data. +A matrix with genes on the columns, and cells on the row. + +The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation. + +Use `initialise_raw_data` to convert expression data into a suitable format for model inference. + +```{r, eval=FALSE} +data(wilson_raw_data) +round(wilson_raw_data[1:5,1:5], 4) + +edata = initialise_raw_data(wilson_raw_data) +``` +```{r, echo=FALSE} +data(wilson_raw_data) +knitr::kable(round(wilson_raw_data[1:5,1:5], 4)) +``` + +2. Initial Boolean model. +A data frame with two columns, targets and update functions. + +Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples) + +Use `initialise_model` to convert the input Boolean model into a BoolModel object. + +```{r, eval=FALSE} +data(krum_bmodel) +head(krum_bmodel) + +bmodel = initialise_model(krum_bmodel) +``` +```{r, echo=FALSE} +data(krum_bmodel) +knitr::kable(head(krum_bmodel)) +``` + +3. Initial state. + +A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s. + +Note that all the genes that are present in the initial Boolean model must also be present here. + +```{r, eval=FALSE} +data(krum_istate) +head(krum_istate) +``` +```{r, echo=FALSE} +data(krum_istate) +knitr::kable(head(krum_istate)) +``` + +# Output format + +BoolTraineR supports several output formats for Boolean models, as shown below. + +* `outgraph_model` - Outputs a Boolean model in a tab-delimited file with each line being an edge (i.e. gene interaction). This function also outputs a node attribute file, which can be used to distinguish gene and AND nodes in a graph plotting software. This format is readable by both Cytoscape and Gephi. +* `outgenysis_model` - Outputs a Boolean model in a space-delimited file with each line being an edge (i.e. gene interaction). This format is readable by genYsis (used for steady state analysis). +* `writeBM` - Outputs a Boolean model in a comma-delimited file similar in format to the input file format (i.e. two columns: genes and update functions). + +BoolTraineR can also output a state transition graph. + +* `outstate_graph` - Outputs a state space of a Boolean model simulated with an initial state. This format is readable by both Cytoscape and Gephi. + +# Useful functions in BoolTraineR + +Besides training Boolean models, BoolTraineR can be used for simulating a Boolean model asynchronously and calculate the score of a Boolean model with respect to a data. + +* `model_train` - Core function in `BoolTraineR` that performs Boolean model inference. +* `simulate_model` - Simulate a Boolean model asynchronously using an initial state, and return its state space. +* `calc_mscore` - Calculate a distance score for a Boolean model with respect to an expression data. +* `model_dist` - Calculate the number of genes in the update functions that differ between two Boolean models. +* `model_setdiff` - Show the genes in the update functions that differ between two Boolean models. + +# Example workflows + +Three example workflows will be discussed in this vignette: (1) Inferring model without an initial model, (2) Inferring model with an initial model, (3) Extending model with more genes. The two workflows are largely similar, which only differ in the data preparation step. + +## Inferring model without an initial model + +This workflow is intended for use on inferring a Boolean model without an initial model. + +When no initial model is used, BoolTraineR will reconstruct gene interactions from a list of user-specified genes. If the number of genes in the expression data is low (e.g. in qPCR), it is also possible to use all the genes in the expression data. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +```{r, eval=FALSE, tidy=TRUE} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) + +#(2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +edata = wilson_raw_data + +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] + +#(4) Filter genes. +gene_ind = c('fli1', 'gata1', 'gata2', 'gfi1', 'scl', 'sfpi1') #select genes to be included. +edata = edata[, gene_ind] + +#(5) Inferring Boolean model. +final_model = model_train(edata, max_varperrule=4, verbose=T) + +#(6) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +```{r} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +Only the expression data is needed for inferring a Boolean model without an initial model. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +```{r, eval=FALSE} +#(2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. + +```{r, tidy=TRUE , eval=FALSE} +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] + +#(4) Filter genes. +gene_ind = c('fli1', 'gata1', 'gata2', 'gfi1', 'scl', 'sfpi1') #select genes to be included. +edata = edata[, gene_ind] +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +```{r, eval=FALSE} +#(5) Inferring Boolean model. +final_model = model_train(edata, max_varperrule=4, verbose=T) + +#(6) Visualise the Boolean model generated. +plotBM(final_model) +``` +```{r, echo=FALSE, fig.show='hold', message=FALSE, dpi=75, fig.width=6, fig.height=6} +data(example_models) +plotBM(emodel1) +``` + +## Inferring model with an initial model + +This workflow is intended for use on inferring a Boolean model with an initial model. + +When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +```{r, eval=FALSE, tidy=TRUE} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) + +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data + +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] + +#(4) Inferring Boolean model. +final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) + +#(5) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +```{r} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. + +```{r, eval=FALSE} +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. + +```{r, tidy=TRUE , eval=FALSE} +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +```{r, eval=FALSE} +#(4) Inferring Boolean model. +final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) + +#(5) Visualise the Boolean model generated. +plotBM(final_model) +``` +```{r, echo=FALSE, fig.show='hold', message=FALSE, dpi=75, fig.width=6, fig.height=6} +data(example_models) +plotBM(emodel2) +``` + +## Extending model with more genes + +This workflow is intended for use on extending an initial Boolean model with additional genes. + +When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +*Note that this example takes a few minutes to run on a single core. The use of parallel processing is recommended.* + +```{r, eval=FALSE, tidy=TRUE} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) + +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data + +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] + +#(4) Adding extra genes to the initial Boolean model. +#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. +print(extra_genes) #to view available genes to be added. +add_gene = 'ldb1' #genes to be added: ldb1 +grown_bmodel = grow_bmodel(add_gene, bmodel) + +#(5) Estimating initial state for the extra genes. +tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. +tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +tmp_istate = matrix(round(tmp_istate), nrow=1) +colnames(tmp_istate) = add_gene +grown_istate = cbind(istate, tmp_istate) +grown_istate = initialise_data(grown_istate) + +#(6) Inferring Boolean model. +final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) + +#(7) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +```{r} +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. + +```{r, eval=FALSE} +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. + +```{r, tidy=TRUE , eval=FALSE} +#(3) Filter cell types. +cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. +edata = edata[cell_ind,] +``` + +### Add extra genes to the initial Boolean model + +Extra genes can be added to the initial model using `grow_bmodel`. The function will add extra genes into the initial model with empty update functions. + +```{r, eval=FALSE} +#(4) Adding extra genes to the initial Boolean model. +#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. +add_gene = 'ldb1' #genes to be added: ldb1 +grown_bmodel = grow_bmodel(add_gene, bmodel) +``` + +### Estimate initial state for the extra genes + +Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs. + +```{r, eval=FALSE} +#(5) Estimating initial state for the extra genes. +tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. +tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +tmp_istate = matrix(round(tmp_istate), nrow=1) +colnames(tmp_istate) = add_gene +grown_istate = cbind(istate, tmp_istate) +grown_istate = initialise_data(grown_istate) +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +*Note that this example takes a long time to run. The use of parallel processing is recommended.* + +```{r, eval=FALSE} +#(6) Inferring Boolean model. +final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) + +#(7) Visualise the Boolean model generated. +plotBM(final_model) +``` +```{r, echo=FALSE, fig.show='hold', message=FALSE, dpi=75, fig.width=6, fig.height=6} +data(example_models) +plotBM(emodel3) +``` diff --git a/vignettes/booltrainer.html b/vignettes/booltrainer.html new file mode 100644 index 0000000..aad23b7 --- /dev/null +++ b/vignettes/booltrainer.html @@ -0,0 +1,594 @@ + + + + + + + + + + + + + + +Using BoolTraineR to reconstruct asynchronous Boolean models + + + + + + + + + + + + + + + + + + + + +
+

1 Brief introduction

+

BoolTraineR is a model learning algorithm for reconstructing and training asynchronous Boolean models using single-cell expression data. Refer to the paper for more details on the concepts behind the algorithm. This vignette serves as a tutorial to demonstrate example workflows that can be adapted to individual cases experienced by users.

+

Running BoolTraineR is straightforward. However, note that depending on the (1) size of single-cell expression data and (2) complexity of Boolean model, BoolTraineR may take a long time to complete the computation. In such cases, it is advisable to use the built-in parallel processing capability of BoolTraineR. This can be easily achieved by using doParallel package, as illustrated in the example.

+

Note that the examples presented in this vignette are different from the results presented in our paper. The examples presented here have been simplified to speed up the processing time.

+
+
+

2 Installation

+

BoolTraineR can be installed from CRAN.

+
install.packages('BoolTraineR')
+

Or from Github for the latest version.

+
install_github("cheeyeelim/booltrainer")
+

Also install doParallel package if you intend to use parallel processing.

+
+
+

3 Input data format

+

Depending on the analysis, only 3 types of data will ever be needed. The format of the data required is discussed below.

+
    +
  1. Expression data. A matrix with genes on the columns, and cells on the row.
  2. +
+

The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation.

+

Use initialise_raw_data to convert expression data into a suitable format for model inference.

+
data(wilson_raw_data)
+round(wilson_raw_data[1:5,1:5], 4)
+
+edata = initialise_raw_data(wilson_raw_data)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
bptfcbfa2t3hcsf1rdnmt3aeif2b1
lmpp_0021.02612.39442.68471.66362.0203
lmpp_0032.64961.78001.68211.59412.7736
lmpp_00410.30800.58894.2653-0.55650.0026
lmpp_0070.54191.863110.84680.17571.0873
lmpp_0080.92092.66372.85492.19652.3663
+
    +
  1. Initial Boolean model. A data frame with two columns, targets and update functions.
  2. +
+

Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples)

+

Use initialise_model to convert the input Boolean model into a BoolModel object.

+
data(krum_bmodel)
+head(krum_bmodel)
+
+bmodel = initialise_model(krum_bmodel)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
targetsfactors
gata2gata2 & ! ((gata1 & fog1) | sfpi1)
gata1(gata1 | gata2 | fli1) & ! sfpi1
fog1gata1
eklfgata1 & ! fli1
fli1gata1 & ! eklf
sclgata1 & ! sfpi1
+
    +
  1. Initial state.
  2. +
+

A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s.

+

Note that all the genes that are present in the initial Boolean model must also be present here.

+
data(krum_istate)
+head(krum_istate)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
cjuncebpafli1gata1gata2eklfsfpi1gfi1sclegrnabfog1
initial_state01001010000
+
+
+

4 Output format

+

BoolTraineR supports several output formats for Boolean models, as shown below.

+
    +
  • outgraph_model - Outputs a Boolean model in a tab-delimited file with each line being an edge (i.e. gene interaction). This function also outputs a node attribute file, which can be used to distinguish gene and AND nodes in a graph plotting software. This format is readable by both Cytoscape and Gephi.
  • +
  • outgenysis_model - Outputs a Boolean model in a space-delimited file with each line being an edge (i.e. gene interaction). This format is readable by genYsis (used for steady state analysis).
  • +
  • writeBM - Outputs a Boolean model in a comma-delimited file similar in format to the input file format (i.e. two columns: genes and update functions).
  • +
+

BoolTraineR can also output a state transition graph.

+
    +
  • outstate_graph - Outputs a state space of a Boolean model simulated with an initial state. This format is readable by both Cytoscape and Gephi.
  • +
+
+
+

5 Useful functions in BoolTraineR

+

Besides training Boolean models, BoolTraineR can be used for simulating a Boolean model asynchronously and calculate the score of a Boolean model with respect to a data.

+
    +
  • model_train - Core function in BoolTraineR that performs Boolean model inference.
  • +
  • simulate_model - Simulate a Boolean model asynchronously using an initial state, and return its state space.
  • +
  • calc_mscore - Calculate a distance score for a Boolean model with respect to an expression data.
  • +
  • model_dist - Calculate the number of genes in the update functions that differ between two Boolean models.
  • +
  • model_setdiff - Show the genes in the update functions that differ between two Boolean models.
  • +
+
+
+

6 Example workflows

+

Three example workflows will be discussed in this vignette: (1) Inferring model without an initial model, (2) Inferring model with an initial model, (3) Extending model with more genes. The two workflows are largely similar, which only differ in the data preparation step.

+
+

6.1 Inferring model without an initial model

+

This workflow is intended for use on inferring a Boolean model without an initial model.

+

When no initial model is used, BoolTraineR will reconstruct gene interactions from a list of user-specified genes. If the number of genes in the expression data is low (e.g. in qPCR), it is also possible to use all the genes in the expression data.

+
+

6.1.1 Full workflow

+

Full workflow is included here for easy referencing. Each step is discussed in further details below.

+
set.seed(0)  #use to ensure reproducibility. remove in actual use.
+
+# (1) Setup paths and environment.
+library(BoolTraineR)
+
+# If intending to use parallel processing, uncomment the following lines.
+# library(doParallel) num_core = 4 #specify the number of cores to be used.
+# doParallel::registerDoParallel(cores=num_core)
+
+# (2) Load data.
+data(wilson_raw_data)  #load a data frame of expression data.
+edata = wilson_raw_data
+
+# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+# (4) Filter genes.
+gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1")  #select genes to be included.
+edata = edata[, gene_ind]
+
+# (5) Inferring Boolean model.
+final_model = model_train(edata, max_varperrule = 4, verbose = T)
+
+# (6) Visualise the Boolean model generated.
+plotBM(final_model)
+
+
+

6.1.2 Initial setup

+

The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

+
set.seed(0) #use to ensure reproducibility. remove in actual use.
+
+#(1) Setup paths and environment.
+library(BoolTraineR)
+
+#If intending to use parallel processing, uncomment the following lines. 
+#library(doParallel)
+#num_core = 4 #specify the number of cores to be used.
+#doParallel::registerDoParallel(cores=num_core)
+
+
+

6.1.3 Data preparation

+

Only the expression data is needed for inferring a Boolean model without an initial model.

+

To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

+
#(2) Load data.
+data(wilson_raw_data) #load a data frame of expression data.
+edata = wilson_raw_data
+

Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete.

+
# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+# (4) Filter genes.
+gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1")  #select genes to be included.
+edata = edata[, gene_ind]
+
+
+

6.1.4 Run model training

+

To reconstruct a Boolean model from an expression data, run model_train.

+

In this example, model_train takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

+

You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

+
#(5) Inferring Boolean model.
+final_model = model_train(edata, max_varperrule=4, verbose=T)
+
+#(6) Visualise the Boolean model generated.
+plotBM(final_model)
+

+
+
+
+

6.2 Inferring model with an initial model

+

This workflow is intended for use on inferring a Boolean model with an initial model.

+

When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications.

+
+

6.2.1 Full workflow

+

Full workflow is included here for easy referencing. Each step is discussed in further details below.

+
set.seed(0)  #use to ensure reproducibility. remove in actual use.
+
+# (1) Setup paths and environment.
+library(BoolTraineR)
+
+# If intending to use parallel processing, uncomment the following lines.
+# library(doParallel) num_core = 4 #specify the number of cores to be used.
+# doParallel::registerDoParallel(cores=num_core)
+
+# (2) Load data.
+data(krum_bmodel)  #load a data frame of Boolean model.
+data(krum_istate)  #load a data frame of initial state.
+data(wilson_raw_data)  #load a data frame of expression data.
+
+bmodel = initialise_model(krum_bmodel)
+istate = krum_istate
+edata = wilson_raw_data
+
+# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+# (4) Inferring Boolean model.
+final_model = model_train(edata, bmodel, istate, max_varperrule = 4, verbose = T)
+
+# (5) Visualise the Boolean model generated.
+plotBM(final_model)
+
+
+

6.2.2 Initial setup

+

The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

+
set.seed(0) #use to ensure reproducibility. remove in actual use.
+
+#(1) Setup paths and environment.
+library(BoolTraineR)
+
+#If intending to use parallel processing, uncomment the following lines. 
+#library(doParallel)
+#num_core = 4 #specify the number of cores to be used.
+#doParallel::registerDoParallel(cores=num_core)
+
+
+

6.2.3 Data preparation

+

3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state.

+

To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

+

initialise_model converts the data frame containing the Boolean model into a BoolModel object.

+
#(2) Load data.
+data(krum_bmodel) #load a data frame of Boolean model.
+data(krum_istate) #load a data frame of initial state.
+data(wilson_raw_data) #load a data frame of expression data.
+
+bmodel = initialise_model(krum_bmodel)
+istate = krum_istate
+edata = wilson_raw_data
+

Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

+
# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+
+

6.2.4 Run model training

+

To reconstruct a Boolean model from an expression data, run model_train.

+

In this example, model_train takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

+

You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

+
#(4) Inferring Boolean model.
+final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T)
+
+#(5) Visualise the Boolean model generated.
+plotBM(final_model)
+

+
+
+
+

6.3 Extending model with more genes

+

This workflow is intended for use on extending an initial Boolean model with additional genes.

+

When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications.

+
+

6.3.1 Full workflow

+

Full workflow is included here for easy referencing. Each step is discussed in further details below.

+

Note that this example takes a few minutes to run on a single core. The use of parallel processing is recommended.

+
set.seed(0)  #use to ensure reproducibility. remove in actual use.
+
+# (1) Setup paths and environment.
+library(BoolTraineR)
+
+# If intending to use parallel processing, uncomment the following lines.
+# library(doParallel) num_core = 4 #specify the number of cores to be used.
+# doParallel::registerDoParallel(cores=num_core)
+
+# (2) Load data.
+data(krum_bmodel)  #load a data frame of Boolean model.
+data(krum_istate)  #load a data frame of initial state.
+data(wilson_raw_data)  #load a data frame of expression data.
+
+bmodel = initialise_model(krum_bmodel)
+istate = krum_istate
+edata = wilson_raw_data
+
+# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+# (4) Adding extra genes to the initial Boolean model. extra_genes =
+# setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes
+# to be added.
+print(extra_genes)  #to view available genes to be added.
+add_gene = "ldb1"  #genes to be added: ldb1
+grown_bmodel = grow_bmodel(add_gene, bmodel)
+
+# (5) Estimating initial state for the extra genes.
+tmp_data = initialise_raw_data(wilson_raw_data)[[1]]  #preprocess data.
+tmp_istate = mean(tmp_data[grepl("cmp", rownames(tmp_data)), add_gene])  #estimating initial state from CMPs.
+tmp_istate = matrix(round(tmp_istate), nrow = 1)
+colnames(tmp_istate) = add_gene
+grown_istate = cbind(istate, tmp_istate)
+grown_istate = initialise_data(grown_istate)
+
+# (6) Inferring Boolean model.
+final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule = 4, 
+    verbose = T)
+
+# (7) Visualise the Boolean model generated.
+plotBM(final_model)
+
+
+

6.3.2 Initial setup

+

The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

+
set.seed(0) #use to ensure reproducibility. remove in actual use.
+
+#(1) Setup paths and environment.
+library(BoolTraineR)
+
+#If intending to use parallel processing, uncomment the following lines. 
+#library(doParallel)
+#num_core = 4 #specify the number of cores to be used.
+#doParallel::registerDoParallel(cores=num_core)
+
+
+

6.3.3 Data preparation

+

3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state.

+

To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

+

initialise_model converts the data frame containing the Boolean model into a BoolModel object.

+
#(2) Load data.
+data(krum_bmodel) #load a data frame of Boolean model.
+data(krum_istate) #load a data frame of initial state.
+data(wilson_raw_data) #load a data frame of expression data.
+
+bmodel = initialise_model(krum_bmodel)
+istate = krum_istate
+edata = wilson_raw_data
+

Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

+
# (3) Filter cell types.
+cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
+    rownames(edata))  #select cells to be included.
+edata = edata[cell_ind, ]
+
+
+

6.3.4 Add extra genes to the initial Boolean model

+

Extra genes can be added to the initial model using grow_bmodel. The function will add extra genes into the initial model with empty update functions.

+
#(4) Adding extra genes to the initial Boolean model.
+#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added.
+add_gene = 'ldb1' #genes to be added: ldb1
+grown_bmodel = grow_bmodel(add_gene, bmodel)
+
+
+

6.3.5 Estimate initial state for the extra genes

+

Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs.

+
#(5) Estimating initial state for the extra genes.
+tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data.
+tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs.
+tmp_istate = matrix(round(tmp_istate), nrow=1)
+colnames(tmp_istate) = add_gene
+grown_istate = cbind(istate, tmp_istate)
+grown_istate = initialise_data(grown_istate)
+
+
+

6.3.6 Run model training

+

To reconstruct a Boolean model from an expression data, run model_train.

+

In this example, model_train takes a few minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

+

You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

+

Note that this example takes a long time to run. The use of parallel processing is recommended.

+
#(6) Inferring Boolean model.
+final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T)
+
+#(7) Visualise the Boolean model generated.
+plotBM(final_model)
+

+
+
+
+ + + + + + + + diff --git a/vignettes/booltrainer.md b/vignettes/booltrainer.md new file mode 100644 index 0000000..532c74c --- /dev/null +++ b/vignettes/booltrainer.md @@ -0,0 +1,494 @@ +- [Brief introduction](#brief-introduction) +- [Installation](#installation) +- [Input data format](#input-data-format) +- [Output format](#output-format) +- [Useful functions in BoolTraineR](#useful-functions-in-booltrainer) +- [Example workflows](#example-workflows) + - [Inferring model without an initial model](#inferring-model-without-an-initial-model) + - [Full workflow](#full-workflow) + - [Initial setup](#initial-setup) + - [Data preparation](#data-preparation) + - [Run model training](#run-model-training) + - [Inferring model with an initial model](#inferring-model-with-an-initial-model) + - [Full workflow](#full-workflow-1) + - [Initial setup](#initial-setup-1) + - [Data preparation](#data-preparation-1) + - [Run model training](#run-model-training-1) + - [Extending model with more genes](#extending-model-with-more-genes) + - [Full workflow](#full-workflow-2) + - [Initial setup](#initial-setup-2) + - [Data preparation](#data-preparation-2) + - [Add extra genes to the initial Boolean model](#add-extra-genes-to-the-initial-boolean-model) + - [Estimate initial state for the extra genes](#estimate-initial-state-for-the-extra-genes) + - [Run model training](#run-model-training-2) + +Brief introduction +================== + +`BoolTraineR` is a model learning algorithm for reconstructing and training asynchronous Boolean models using single-cell expression data. Refer to the paper for more details on the concepts behind the algorithm. This vignette serves as a tutorial to demonstrate example workflows that can be adapted to individual cases experienced by users. + +Running `BoolTraineR` is straightforward. However, note that depending on the (1) size of single-cell expression data and (2) complexity of Boolean model, `BoolTraineR` may take a long time to complete the computation. In such cases, it is advisable to use the built-in parallel processing capability of `BoolTraineR`. This can be easily achieved by using `doParallel` package, as illustrated in the example. + +Note that the examples presented in this vignette are different from the results presented in our paper. The examples presented here have been simplified to speed up the processing time. + +Installation +============ + +`BoolTraineR` can be installed from CRAN. + +``` r +install.packages('BoolTraineR') +``` + +Or from Github for the latest version. + +``` r +install_github("cheeyeelim/booltrainer") +``` + +Also install `doParallel` package if you intend to use parallel processing. + +Input data format +================= + +Depending on the analysis, only 3 types of data will ever be needed. The format of the data required is discussed below. + +1. Expression data. A matrix with genes on the columns, and cells on the row. + +The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation. + +Use `initialise_raw_data` to convert expression data into a suitable format for model inference. + +``` r +data(wilson_raw_data) +round(wilson_raw_data[1:5,1:5], 4) + +edata = initialise_raw_data(wilson_raw_data) +``` + +| | bptf| cbfa2t3h| csf1r| dnmt3a| eif2b1| +|-----------|--------:|---------:|--------:|--------:|-------:| +| lmpp\_002 | 1.0261| 2.3944| 2.6847| 1.6636| 2.0203| +| lmpp\_003 | 2.6496| 1.7800| 1.6821| 1.5941| 2.7736| +| lmpp\_004 | 10.3080| 0.5889| 4.2653| -0.5565| 0.0026| +| lmpp\_007 | 0.5419| 1.8631| 10.8468| 0.1757| 1.0873| +| lmpp\_008 | 0.9209| 2.6637| 2.8549| 2.1965| 2.3663| + +1. Initial Boolean model. A data frame with two columns, targets and update functions. + +Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples) + +Use `initialise_model` to convert the input Boolean model into a BoolModel object. + +``` r +data(krum_bmodel) +head(krum_bmodel) + +bmodel = initialise_model(krum_bmodel) +``` + +| targets | factors | +|:--------|:-----------------------------------| +| gata2 | gata2 & ! ((gata1 & fog1) | sfpi1) | +| gata1 | (gata1 | gata2 | fli1) & ! sfpi1 | +| fog1 | gata1 | +| eklf | gata1 & ! fli1 | +| fli1 | gata1 & ! eklf | +| scl | gata1 & ! sfpi1 | + +1. Initial state. + +A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s. + +Note that all the genes that are present in the initial Boolean model must also be present here. + +``` r +data(krum_istate) +head(krum_istate) +``` + +| | cjun| cebpa| fli1| gata1| gata2| eklf| sfpi1| gfi1| scl| egrnab| fog1| +|----------------|-----:|------:|-----:|------:|------:|-----:|------:|-----:|----:|-------:|-----:| +| initial\_state | 0| 1| 0| 0| 1| 0| 1| 0| 0| 0| 0| + +Output format +============= + +BoolTraineR supports several output formats for Boolean models, as shown below. + +- `outgraph_model` - Outputs a Boolean model in a tab-delimited file with each line being an edge (i.e. gene interaction). This function also outputs a node attribute file, which can be used to distinguish gene and AND nodes in a graph plotting software. This format is readable by both Cytoscape and Gephi. +- `outgenysis_model` - Outputs a Boolean model in a space-delimited file with each line being an edge (i.e. gene interaction). This format is readable by genYsis (used for steady state analysis). +- `writeBM` - Outputs a Boolean model in a comma-delimited file similar in format to the input file format (i.e. two columns: genes and update functions). + +BoolTraineR can also output a state transition graph. + +- `outstate_graph` - Outputs a state space of a Boolean model simulated with an initial state. This format is readable by both Cytoscape and Gephi. + +Useful functions in BoolTraineR +=============================== + +Besides training Boolean models, BoolTraineR can be used for simulating a Boolean model asynchronously and calculate the score of a Boolean model with respect to a data. + +- `model_train` - Core function in `BoolTraineR` that performs Boolean model inference. +- `simulate_model` - Simulate a Boolean model asynchronously using an initial state, and return its state space. +- `calc_mscore` - Calculate a distance score for a Boolean model with respect to an expression data. +- `model_dist` - Calculate the number of genes in the update functions that differ between two Boolean models. +- `model_setdiff` - Show the genes in the update functions that differ between two Boolean models. + +Example workflows +================= + +Three example workflows will be discussed in this vignette: (1) Inferring model without an initial model, (2) Inferring model with an initial model, (3) Extending model with more genes. The two workflows are largely similar, which only differ in the data preparation step. + +Inferring model without an initial model +---------------------------------------- + +This workflow is intended for use on inferring a Boolean model without an initial model. + +When no initial model is used, BoolTraineR will reconstruct gene interactions from a list of user-specified genes. If the number of genes in the expression data is low (e.g. in qPCR), it is also possible to use all the genes in the expression data. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +# (1) Setup paths and environment. +library(BoolTraineR) + +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) + +# (2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +edata = wilson_raw_data + +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] + +# (4) Filter genes. +gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. +edata = edata[, gene_ind] + +# (5) Inferring Boolean model. +final_model = model_train(edata, max_varperrule = 4, verbose = T) + +# (6) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +Only the expression data is needed for inferring a Boolean model without an initial model. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +``` r +#(2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. + +``` r +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] + +# (4) Filter genes. +gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. +edata = edata[, gene_ind] +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +``` r +#(5) Inferring Boolean model. +final_model = model_train(edata, max_varperrule=4, verbose=T) + +#(6) Visualise the Boolean model generated. +plotBM(final_model) +``` + +![](booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png) + +Inferring model with an initial model +------------------------------------- + +This workflow is intended for use on inferring a Boolean model with an initial model. + +When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +# (1) Setup paths and environment. +library(BoolTraineR) + +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) + +# (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data + +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] + +# (4) Inferring Boolean model. +final_model = model_train(edata, bmodel, istate, max_varperrule = 4, verbose = T) + +# (5) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. + +``` r +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. + +``` r +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +``` r +#(4) Inferring Boolean model. +final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) + +#(5) Visualise the Boolean model generated. +plotBM(final_model) +``` + +![](booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png) + +Extending model with more genes +------------------------------- + +This workflow is intended for use on extending an initial Boolean model with additional genes. + +When an initial model is used, note that only genes that are both present in the initial model and expression data will be used for reconstructing gene interactions. Any genes in the initial model that do not have corresponding expression values in the data will keep their original gene interactions as specified in the initial model without any modifications. + +### Full workflow + +Full workflow is included here for easy referencing. Each step is discussed in further details below. + +*Note that this example takes a few minutes to run on a single core. The use of parallel processing is recommended.* + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +# (1) Setup paths and environment. +library(BoolTraineR) + +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) + +# (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data + +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] + +# (4) Adding extra genes to the initial Boolean model. extra_genes = +# setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes +# to be added. +print(extra_genes) #to view available genes to be added. +add_gene = "ldb1" #genes to be added: ldb1 +grown_bmodel = grow_bmodel(add_gene, bmodel) + +# (5) Estimating initial state for the extra genes. +tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. +tmp_istate = mean(tmp_data[grepl("cmp", rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +tmp_istate = matrix(round(tmp_istate), nrow = 1) +colnames(tmp_istate) = add_gene +grown_istate = cbind(istate, tmp_istate) +grown_istate = initialise_data(grown_istate) + +# (6) Inferring Boolean model. +final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule = 4, + verbose = T) + +# (7) Visualise the Boolean model generated. +plotBM(final_model) +``` + +### Initial setup + +The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. + +``` r +set.seed(0) #use to ensure reproducibility. remove in actual use. + +#(1) Setup paths and environment. +library(BoolTraineR) + +#If intending to use parallel processing, uncomment the following lines. +#library(doParallel) +#num_core = 4 #specify the number of cores to be used. +#doParallel::registerDoParallel(cores=num_core) +``` + +### Data preparation + +3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state. + +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. + +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. + +``` r +#(2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. + +bmodel = initialise_model(krum_bmodel) +istate = krum_istate +edata = wilson_raw_data +``` + +Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. + +``` r +# (3) Filter cell types. +cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", + rownames(edata)) #select cells to be included. +edata = edata[cell_ind, ] +``` + +### Add extra genes to the initial Boolean model + +Extra genes can be added to the initial model using `grow_bmodel`. The function will add extra genes into the initial model with empty update functions. + +``` r +#(4) Adding extra genes to the initial Boolean model. +#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. +add_gene = 'ldb1' #genes to be added: ldb1 +grown_bmodel = grow_bmodel(add_gene, bmodel) +``` + +### Estimate initial state for the extra genes + +Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs. + +``` r +#(5) Estimating initial state for the extra genes. +tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. +tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +tmp_istate = matrix(round(tmp_istate), nrow=1) +colnames(tmp_istate) = add_gene +grown_istate = cbind(istate, tmp_istate) +grown_istate = initialise_data(grown_istate) +``` + +### Run model training + +To reconstruct a Boolean model from an expression data, run `model_train`. + +In this example, `model_train` takes a few minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. + +You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. + +*Note that this example takes a long time to run. The use of parallel processing is recommended.* + +``` r +#(6) Inferring Boolean model. +final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) + +#(7) Visualise the Boolean model generated. +plotBM(final_model) +``` + +![](booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png) diff --git a/vignettes/booltrainer.pdf b/vignettes/booltrainer.pdf new file mode 100644 index 0000000..3e317d7 Binary files /dev/null and b/vignettes/booltrainer.pdf differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png new file mode 100644 index 0000000..fe40a1b Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png new file mode 100644 index 0000000..fe40a1b Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png new file mode 100644 index 0000000..fe40a1b Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png differ