diff --git a/DESCRIPTION b/DESCRIPTION index 3173734..33ba5b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: BoolTraineR Type: Package Title: Tools For Training and Analysing Asynchronous Boolean Models -Version: 1.1.2 +Version: 1.1.3 Date: 2015-10-22 Author: Chee Yee Lim Maintainer: Chee Yee Lim diff --git a/NAMESPACE b/NAMESPACE index 6495ba8..64b9c48 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(BoolModel) export(amat_to_bm) @@ -20,7 +20,6 @@ export(minmod_model) export(model_dist) export(model_setdiff) export(model_train) -export(model_train_sa) export(outgenysis_model) export(outgraph_model) export(outstate_graph) diff --git a/R/initialisation.R b/R/initialisation.R index e3704af..7d7f773 100644 --- a/R/initialisation.R +++ b/R/initialisation.R @@ -5,12 +5,12 @@ #' Note that kmeans clustering as binarisation only works well if the data has a bimodal distribution. #' #' @param x matrix. Numeric data of gene expression. -#' @param data_type character. Specify data types: qpcr, rnaseq. +#' @param max_expr character. Specify whether max expression value is the lowest (as in qPCR), or the highest (as in RNAseq and microarray). Option: 'low', 'high'. Default to 'high'. #' @param uni_thre numerical. Speficy threshold for unimodality test. Default to 0.2. #' @param scale logical. Whether to scale the data to a range of 0-1. Default to T. #' #' @export -initialise_raw_data = function(x, data_type='qpcr', uni_thre=0.2, scale=T) +initialise_raw_data = function(x, max_expr='high', uni_thre=0.2, scale=T) { #(1) Convert negative to positive values. if(min(x)<0) @@ -23,7 +23,7 @@ initialise_raw_data = function(x, data_type='qpcr', uni_thre=0.2, scale=T) stopifnot(min(x)==0) - if(data_type=='qpcr') + if(max_expr=='low') { #(2) Invert qPCR values. Lowest expression should be close to 0, highest expression should be away from 0. x = abs(max(x) - x) diff --git a/R/search.R b/R/search.R index cde2ccf..378028d 100644 --- a/R/search.R +++ b/R/search.R @@ -1,165 +1,11 @@ -#' @title Training Model (using simulated annealing) -#' -#' @description -#' This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. -#' Note that if a model is supplied, and the genes in the model is different from the genes in the data, only the genes overlapping between model and data will be retained for further analysis. -#' -#' @param edata data frame of expression data. Should have state(row) x gene(column). -#' @param bmodel Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL. -#' @param istate data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL. -#' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6. -#' @param and_bool logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE. -#' @param self_loop logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F. -#' @param restart integer. Number of restart from the best solution. Defaults to 0. -#' @param verbose logical. Whether to give detailed output to the screen. Defaults to F. -#' -#' @export -model_train_sa = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bool=T, self_loop=F, restart=0, verbose=F) -{ - ##################Implement restart########################## - - vcat('Preparing data for analysis.\n', verbose) - - #Initialise expression data. - tmp_data = initialise_raw_data(edata) #returns a list of two data frames. - cdata = initialise_data(tmp_data[[1]]) #continuous data - ddata = initialise_data(tmp_data[[2]]) #discretised data - - #Initialise model. - if(is.null(bmodel)) - { - bmodel = gen_one_rmodel(colnames(edata), max_varperrule, and_bool, self_loop) - } else - { - if(class(bmodel) != 'BoolModel') - { - bmodel = initialise_model(bmodel) - } - - if(check_and(bmodel) != and_bool) - { - and_bool = check_and(bmodel) - } - } - - #Initialise initial state. - if(is.null(istate)) - { - istate = rbinom(length(bmodel@target), 1, 0.5) - #Getting a random initial state. - while(mean(istate) > 0.9 | mean(istate) < 0.1) #do not want initial state that is too homogenous. - { - istate = rbinom(length(bmodel@target), 1, 0.5) - } - istate = data.frame(matrix(istate, nrow=1)) - colnames(istate) = bmodel@target - } - istate = initialise_data(istate, aslogic=T) - - #Filtering expression data. - overlap_gene = intersect(colnames(cdata), y=bmodel@target) - nonoverlap_gene = bmodel@target[!(bmodel@target %in% overlap_gene)] - names(overlap_gene) = bmodel@target_var[bmodel@target %in% overlap_gene] - names(nonoverlap_gene) = bmodel@target_var[!(bmodel@target %in% overlap_gene)] - - fddata = filter_dflist(ddata, overlap_gene, F) - fcdata = filter_dflist(cdata, overlap_gene, F) - - fcdata = unique_raw_data(fddata, fcdata) #removes duplicates in continuous data. - fddata = unique(fddata) - - vcat('Start training.\n', verbose) - - #(3) Calling final combined search. - cur_score = NA - cur_model = bmodel - cur_step = 1 - cur_temp = 1 - min_temp = 0.00001 - alpha = 0.9 - max_ite = 100 #iterations in same step. - while(cur_temp > min_temp) - { - vcat(sprintf('Current iteration: %s.\n', cur_step), verbose) - - vcat('Stage 1: Exploring neighbouring models.\n', verbose) - mod_model = unlist(minmod_model(cur_model, overlap_gene=overlap_gene)) - vcat(sprintf('Total neighbouring models: %s.\n', length(mod_model)), verbose) - - vcat('Stage 2: Evaluating next model.\n', verbose) - cur_ite = 1 - while(cur_ite <= max_ite) - { - model_ind = sample(1:length(mod_model), 1) - next_model = mod_model[[model_ind]] - mod_model = mod_model[-model_ind] - - #print(printBM(next_model)) #debug - - next_score = calc_mscore(bmodel=next_model, istate=istate, fcdata=fcdata, overlap_gene=overlap_gene, max_varperrule=max_varperrule) - - #Breaking conditions. - if(length(mod_model) == 0) - { - cur_score = next_score - cur_model = next_model - - if(cur_score < best_score) #store best solution ever, regardless of the final ending point. - { - best_score = next_score - best_model = next_model - } - break - } - - if(is.na(cur_score)) - { - #For first iteration. - cur_score = next_score - cur_model = next_model - - best_score = next_score - best_model = next_model - } else - { - #For subsequent iteration. - accept_prob = exp((cur_score - next_score)/cur_temp) #if next solution is better than current solution, accept_prob always more than 1. - - if(accept_prob > runif(1)) #move forward if the prob is more than a random number between 0-1. - { - cur_score = next_score - cur_model = next_model - - #writeBM(cur_model, 'tmp_model.csv') #debug - - if(cur_score < best_score) #store best solution ever, regardless of the final ending point. - { - best_score = next_score - best_model = next_model - } - } - } - cur_ite = cur_ite + 1 - } - - cur_temp = cur_temp*alpha #Reduce subsequent temperature. - cur_step = cur_step + 1 - } - vcat(sprintf('Final iteration: %s.\n', cur_step), verbose) - - output = list(best_score=best_score, best_model=best_model, - cur_score=cur_score, cur_model=cur_model, overlap_gene=overlap_gene, nonoverlap_gene=nonoverlap_gene) - - return(output) -} - #' @title Training Model #' #' @description #' This function performs model training to find the best model, using information from data. It requires an initial state supplied to perform the search, and an initial model can also be supplied to be included in the initial population. #' Note that if a model is supplied, and the genes in the model is different from the genes in the data, only the genes overlapping between model and data will be retained for further analysis. #' -#' @param edata data frame of expression data. Should have state(row) x gene(column). +#' @param cdata data frame of expression data. Should have state(row) x gene(column). +#' @param ddata discretised data frame of expression data. Must supply when preprocess=F. Obtain from initialise_raw_data(). Defaults to NULL. #' @param bmodel Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL. #' @param istate data frame. Must have only 1 row, which represents 1 initial state. Defaults to NULL. #' @param max_varperrule integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6. @@ -171,19 +17,14 @@ model_train_sa = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and #' @param detailed_output logical. Whether to return only the model inferred, or all the details obtained during optimisation. Defaults to F. #' #' @export -model_train = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bool=T, self_loop=F, con_thre=0.3, tol=1e-6, verbose=F, detailed_output=F) +model_train = function(cdata, ddata=NULL, bmodel=NULL, istate=NULL, max_varperrule=6, and_bool=T, self_loop=F, con_thre=0.3, tol=1e-6, verbose=F, detailed_output=F) { vcat('Preparing data for analysis.\n', verbose) - #Initialise expression data. - tmp_data = initialise_raw_data(edata) #returns a list of two data frames. - cdata = initialise_data(tmp_data[[1]]) #continuous data - ddata = initialise_data(tmp_data[[2]]) #discretised data - #Initialise model. if(is.null(bmodel)) { - bmodel = gen_one_rmodel(colnames(edata), max_varperrule, and_bool, self_loop) + bmodel = gen_one_rmodel(colnames(cdata), max_varperrule, and_bool, self_loop) } else { if(class(bmodel) != 'BoolModel') @@ -226,6 +67,7 @@ model_train = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bo vcat('Start training.\n', verbose) #(3) Calling final combined search. + i = 0 #suppress check error on non-visible global binding. best_model = c() best_score = c() all_best_score = list() @@ -307,8 +149,6 @@ model_train = function(edata, bmodel=NULL, istate=NULL, max_varperrule=6, and_bo previous_score = best_score #store it for comparison. all_best_score = c(all_best_score, list(best_score)) cur_step = cur_step + 1 - - #browser() } vcat(sprintf('Final iteration: %s.\n', cur_step), verbose) diff --git a/README.md b/README.md index 5e36d2b..b2133c6 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,35 @@ -- [Brief introduction](#brief-introduction) -- [Installation](#installation) -- [Input data format](#input-data-format) -- [Output format](#output-format) -- [Useful functions in BoolTraineR](#useful-functions-in-booltrainer) -- [Example workflows](#example-workflows) - - [Inferring model without an initial model](#inferring-model-without-an-initial-model) - - [Full workflow](#full-workflow) - - [Initial setup](#initial-setup) - - [Data preparation](#data-preparation) - - [Run model training](#run-model-training) - - [Inferring model with an initial model](#inferring-model-with-an-initial-model) - - [Full workflow](#full-workflow-1) - - [Initial setup](#initial-setup-1) - - [Data preparation](#data-preparation-1) - - [Run model training](#run-model-training-1) - - [Extending model with more genes](#extending-model-with-more-genes) - - [Full workflow](#full-workflow-2) - - [Initial setup](#initial-setup-2) - - [Data preparation](#data-preparation-2) - - [Add extra genes to the initial Boolean model](#add-extra-genes-to-the-initial-boolean-model) - - [Estimate initial state for the extra genes](#estimate-initial-state-for-the-extra-genes) - - [Run model training](#run-model-training-2) - +- Brief introduction +- Installation +- Input data format +- Output format +- Useful functions in BoolTraineR +- Example workflows + - Inferring model without an initial model + - Full workflow + - Initial setup + - Data preparation + - Run model training + - Inferring model with an initial model + - Full workflow + - Initial setup + - Data preparation + - Run model training + - Extending model with more genes + - Full workflow + - Initial setup + - Data preparation + - Add extra genes to the initial Boolean model + - Estimate initial state for the extra genes + - Run model training + + Brief introduction ================== @@ -40,10 +48,11 @@ Installation install.packages('BoolTraineR') ``` -Or from Github for the latest version. +Or from Github for the latest version. To install from Gitbub, you will require the `devtools` package. ``` r -install_github("cheeyeelim/booltrainer") +install.packages('devtools') +devtools::install_github("cheeyeelim/booltrainer") ``` Also install `doParallel` package if you intend to use parallel processing. @@ -57,13 +66,11 @@ Depending on the analysis, only 3 types of data will ever be needed. The format The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation. -Use `initialise_raw_data` to convert expression data into a suitable format for model inference. +Use `initialise_raw_data` to convert expression data into a suitable format for model inference. It is recommended to use `initialise_raw_data` before subsetting the expression data for preferred cell types. ``` r data(wilson_raw_data) -round(wilson_raw_data[1:5,1:5], 4) - -edata = initialise_raw_data(wilson_raw_data) +round(wilson_raw_data[1:5, 1:5], 4) ``` | | bptf| cbfa2t3h| csf1r| dnmt3a| eif2b1| @@ -74,6 +81,10 @@ edata = initialise_raw_data(wilson_raw_data) | lmpp\_007 | 0.5419| 1.8631| 10.8468| 0.1757| 1.0873| | lmpp\_008 | 0.9209| 2.6637| 2.8549| 2.1965| 2.3663| +``` r +edata = initialise_raw_data(wilson_raw_data, max_expr='low') #max_expr='low' because this is qPCR data. +``` + 1. Initial Boolean model. A data frame with two columns, targets and update functions. Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples) @@ -83,8 +94,6 @@ Use `initialise_model` to convert the input Boolean model into a BoolModel objec ``` r data(krum_bmodel) head(krum_bmodel) - -bmodel = initialise_model(krum_bmodel) ``` | targets | factors | @@ -96,6 +105,10 @@ bmodel = initialise_model(krum_bmodel) | fli1 | gata1 & ! eklf | | scl | gata1 & ! sfpi1 | +``` r +bmodel = initialise_model(krum_bmodel) +``` + 1. Initial state. A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s. @@ -163,19 +176,24 @@ library(BoolTraineR) # (2) Load data. data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Filter genes. gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] # (5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, + verbose = T) # (6) Visualise the Boolean model generated. plotBM(final_model) @@ -186,15 +204,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -203,23 +220,29 @@ Only the expression data is needed for inferring a Boolean model without an init To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. +`initialise_raw_data` is used to preprocess the data. + ``` r -#(2) Load data. -data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +# (2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. +Once data is loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Filter genes. gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] ``` ### Run model training @@ -231,14 +254,15 @@ In this example, `model_train` takes a few seconds to be completed on a single c You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. ``` r -#(5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule=4, verbose=T) +# (5) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, + verbose = T) -#(6) Visualise the Boolean model generated. +# (6) Visualise the Boolean model generated. plotBM(final_model) ``` -![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-17-1.png) Inferring model with an initial model ------------------------------------- @@ -268,15 +292,19 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, + max_varperrule = 4, verbose = T) # (5) Visualise the Boolean model generated. plotBM(final_model) @@ -287,15 +315,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -304,45 +331,49 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. ``` r -#(2) Load data. -data(krum_bmodel) #load a data frame of Boolean model. -data(krum_istate) #load a data frame of initial state. -data(wilson_raw_data) #load a data frame of expression data. +# (2) Load data. (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] ``` ### Run model training To reconstruct a Boolean model from an expression data, run `model_train`. -In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. +In this example, `model_train` takes one or two minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. ``` r -#(4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) +# (4) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, + max_varperrule = 4, verbose = T) -#(5) Visualise the Boolean model generated. +# (5) Visualise the Boolean model generated. plotBM(final_model) ``` -![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-23-1.png) Extending model with more genes ------------------------------- @@ -374,31 +405,32 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Adding extra genes to the initial Boolean model. extra_genes = # setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes -# to be added. -print(extra_genes) #to view available genes to be added. +# to be added. print(extra_genes) #to view available genes to be added. add_gene = "ldb1" #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) -# (5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl("cmp", rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +# (5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene]) tmp_istate = matrix(round(tmp_istate), nrow = 1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) # (6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule = 4, - verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, + istate = grown_istate, verbose = T) # (7) Visualise the Boolean model generated. plotBM(final_model) @@ -409,15 +441,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -426,26 +457,29 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. ``` r -#(2) Load data. -data(krum_bmodel) #load a data frame of Boolean model. -data(krum_istate) #load a data frame of initial state. -data(wilson_raw_data) #load a data frame of expression data. +# (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] ``` ### Add extra genes to the initial Boolean model @@ -453,9 +487,10 @@ edata = edata[cell_ind, ] Extra genes can be added to the initial model using `grow_bmodel`. The function will add extra genes into the initial model with empty update functions. ``` r -#(4) Adding extra genes to the initial Boolean model. -#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. -add_gene = 'ldb1' #genes to be added: ldb1 +# (4) Adding extra genes to the initial Boolean model. extra_genes = +# setdiff(colnames(wilson_raw_data), bmodel@target) print(extra_genes) #to +# view available genes to be added. +add_gene = "ldb1" #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) ``` @@ -464,10 +499,9 @@ grown_bmodel = grow_bmodel(add_gene, bmodel) Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs. ``` r -#(5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. -tmp_istate = matrix(round(tmp_istate), nrow=1) +# (5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene]) +tmp_istate = matrix(round(tmp_istate), nrow = 1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) @@ -484,11 +518,12 @@ You will receive a BoolModel object at the end of the model training process. Th *Note that this example takes a long time to run. The use of parallel processing is recommended.* ``` r -#(6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) +# (6) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, + istate = grown_istate, verbose = T) -#(7) Visualise the Boolean model generated. +# (7) Visualise the Boolean model generated. plotBM(final_model) ``` -![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-31-1.png) diff --git a/data/example_models.rda b/data/example_models.rda index cadc607..5ab06ee 100644 Binary files a/data/example_models.rda and b/data/example_models.rda differ diff --git a/man/BoolModel-class.Rd b/man/BoolModel-class.Rd index f709c8b..dce6959 100644 --- a/man/BoolModel-class.Rd +++ b/man/BoolModel-class.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boolmodel_class.R \docType{class} \name{BoolModel-class} diff --git a/man/BoolTraineR.Rd b/man/BoolTraineR.Rd index 770934c..278193f 100644 --- a/man/BoolTraineR.Rd +++ b/man/BoolTraineR.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/booltrainer.R \docType{package} \name{BoolTraineR} diff --git a/man/amat_to_bm.Rd b/man/amat_to_bm.Rd index 1f40e2c..e718a6d 100644 --- a/man/amat_to_bm.Rd +++ b/man/amat_to_bm.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{amat_to_bm} \alias{amat_to_bm} diff --git a/man/bm_to_amat.Rd b/man/bm_to_amat.Rd index 9518b00..64726ee 100644 --- a/man/bm_to_amat.Rd +++ b/man/bm_to_amat.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{bm_to_amat} \alias{bm_to_amat} diff --git a/man/bm_to_df.Rd b/man/bm_to_df.Rd index f01f9a7..92bf2fe 100644 --- a/man/bm_to_df.Rd +++ b/man/bm_to_df.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{bm_to_df} \alias{bm_to_df} diff --git a/man/bon_bmodel.Rd b/man/bon_bmodel.Rd index 0fa5960..e87c3ef 100644 --- a/man/bon_bmodel.Rd +++ b/man/bon_bmodel.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{bon_bmodel} \alias{bon_bmodel} \title{HSC Boolean Model from Bonzanni et al.} -\format{A data frame with 11 rows and 2 columns. +\format{A data frame with 11 rows and 2 columns. Rows: each row consists of 1 gene and its associated Boolean rule. Column 1: target gene diff --git a/man/bon_istate.Rd b/man/bon_istate.Rd index 75e26f2..e4e2f26 100644 --- a/man/bon_istate.Rd +++ b/man/bon_istate.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{bon_istate} \alias{bon_istate} \title{Initial state from Bonzanni et al.} -\format{A data frame with 1 row and 11 columns. +\format{A data frame with 1 row and 11 columns. Rows: each row consists of 1 set of Boolean state. Columns: each column is for 1 gene/variable.} diff --git a/man/calc_mscore.Rd b/man/calc_mscore.Rd index a1224c5..fef9140 100644 --- a/man/calc_mscore.Rd +++ b/man/calc_mscore.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/score_calculation.R \name{calc_mscore} \alias{calc_mscore} diff --git a/man/calc_roc.Rd b/man/calc_roc.Rd index e89015f..bd11a3c 100644 --- a/man/calc_roc.Rd +++ b/man/calc_roc.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/score_calculation.R \name{calc_roc} \alias{calc_roc} diff --git a/man/check_and.Rd b/man/check_and.Rd index d638251..2d14d1a 100644 --- a/man/check_and.Rd +++ b/man/check_and.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{check_and} \alias{check_and} diff --git a/man/compress_bmodel.Rd b/man/compress_bmodel.Rd index a1c8f2d..a59b23f 100644 --- a/man/compress_bmodel.Rd +++ b/man/compress_bmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compression.R \name{compress_bmodel} \alias{compress_bmodel} @@ -14,7 +14,7 @@ compress_bmodel(bmodel, encoding, max_varperrule) \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must not be smaller than number of variables. Default to 6.} } \description{ -This function compresses S4 BoolModel object by representing variables using numbers, and also only the act rules and inh rules are kept. +This function compresses S4 BoolModel object by representing variables using numbers, and also only the act rules and inh rules are kept. Return a list of 3 vectors, corresponding to act rules and inh rules. } diff --git a/man/decompress_bmodel.Rd b/man/decompress_bmodel.Rd index e846067..cdd462d 100644 --- a/man/decompress_bmodel.Rd +++ b/man/decompress_bmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compression.R \name{decompress_bmodel} \alias{decompress_bmodel} @@ -16,7 +16,7 @@ decompress_bmodel(x, encoding, gene = NULL, format = "bmodel") \item{format}{character. Specifies which format to return. Possible values: 'bmodel', 'df', 'amat', 'simp_df'. Default to 'bmodel'.} } \description{ -This function decompresses the bmodel compressed by compress_bmodel(). +This function decompresses the bmodel compressed by compress_bmodel(). Return a S4 BoolModel object. } diff --git a/man/decreate_boolmodel.Rd b/man/decreate_boolmodel.Rd index 6b3688d..f5e7602 100644 --- a/man/decreate_boolmodel.Rd +++ b/man/decreate_boolmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/simulation.R \name{decreate_boolmodel} \alias{decreate_boolmodel} diff --git a/man/df_to_bm.Rd b/man/df_to_bm.Rd index 04923a2..58e4305 100644 --- a/man/df_to_bm.Rd +++ b/man/df_to_bm.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{df_to_bm} \alias{df_to_bm} diff --git a/man/emodel1.Rd b/man/emodel1.Rd index b51d416..f51cca0 100644 --- a/man/emodel1.Rd +++ b/man/emodel1.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{emodel1} diff --git a/man/emodel2.Rd b/man/emodel2.Rd index 7c26d86..0d3f435 100644 --- a/man/emodel2.Rd +++ b/man/emodel2.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{emodel2} diff --git a/man/emodel3.Rd b/man/emodel3.Rd index a68d0ff..ad434d4 100644 --- a/man/emodel3.Rd +++ b/man/emodel3.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{emodel3} diff --git a/man/eval_bool.Rd b/man/eval_bool.Rd index d38c7ea..1ef8ba4 100644 --- a/man/eval_bool.Rd +++ b/man/eval_bool.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/simulation.R \name{eval_bool} \alias{eval_bool} diff --git a/man/extract_term.Rd b/man/extract_term.Rd index 5e372ba..ae8b98c 100644 --- a/man/extract_term.Rd +++ b/man/extract_term.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{extract_term} \alias{extract_term} diff --git a/man/filter_dflist.Rd b/man/filter_dflist.Rd index aa85f85..6264b88 100644 --- a/man/filter_dflist.Rd +++ b/man/filter_dflist.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{filter_dflist} \alias{filter_dflist} diff --git a/man/gen_one_rmodel.Rd b/man/gen_one_rmodel.Rd index 5cdbfea..32cb837 100644 --- a/man/gen_one_rmodel.Rd +++ b/man/gen_one_rmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rand_model.R \name{gen_one_rmodel} \alias{gen_one_rmodel} diff --git a/man/gen_singlerule.Rd b/man/gen_singlerule.Rd index 2968654..f3b021a 100644 --- a/man/gen_singlerule.Rd +++ b/man/gen_singlerule.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rand_model.R \name{gen_singlerule} \alias{gen_singlerule} diff --git a/man/gen_two_rmodel.Rd b/man/gen_two_rmodel.Rd index 9ccfb22..567c245 100644 --- a/man/gen_two_rmodel.Rd +++ b/man/gen_two_rmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rand_model.R \name{gen_two_rmodel} \alias{gen_two_rmodel} @@ -21,7 +21,7 @@ gen_two_rmodel(var, steps, mvar = length(var), and_bool = F, \item{self_loop}{logical. Indicates whether to allow self_loop. Default to F.} } \description{ -This function generates a random Boolean model, then get another random Boolean model that is a specified number of steps apart by adding and/or removing genes. +This function generates a random Boolean model, then get another random Boolean model that is a specified number of steps apart by adding and/or removing genes. Returns a list of two S4 BoolModel objects. } \details{ diff --git a/man/gen_two_rmodel_dag.Rd b/man/gen_two_rmodel_dag.Rd index d62f2e1..6e1b31b 100644 --- a/man/gen_two_rmodel_dag.Rd +++ b/man/gen_two_rmodel_dag.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rand_model.R \name{gen_two_rmodel_dag} \alias{gen_two_rmodel_dag} @@ -19,7 +19,7 @@ gen_two_rmodel_dag(var, steps, mvar = length(var), in_amat = NULL, \item{acyclic}{logical. Whether to restrict the model to being acyclic or not. Defaults to TRUE.} } \description{ -This function generates a random DAG Boolean model, then get another random DAG Boolean model that is a specified number of steps apart by adding and/or removing genes. +This function generates a random DAG Boolean model, then get another random DAG Boolean model that is a specified number of steps apart by adding and/or removing genes. Difficult to generate completely directed graph with a specified number of steps apart. } diff --git a/man/get_encodings.Rd b/man/get_encodings.Rd index f09f216..9a02613 100644 --- a/man/get_encodings.Rd +++ b/man/get_encodings.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/compression.R \name{get_encodings} \alias{get_encodings} diff --git a/man/grow_bmodel.Rd b/man/grow_bmodel.Rd index 2599274..f1d41ac 100644 --- a/man/grow_bmodel.Rd +++ b/man/grow_bmodel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/model_modification.R \name{grow_bmodel} \alias{grow_bmodel} diff --git a/man/initialise_data.Rd b/man/initialise_data.Rd index b0e3ceb..871034e 100644 --- a/man/initialise_data.Rd +++ b/man/initialise_data.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initialisation.R \name{initialise_data} \alias{initialise_data} diff --git a/man/initialise_model.Rd b/man/initialise_model.Rd index f4bd251..3a1a799 100644 --- a/man/initialise_model.Rd +++ b/man/initialise_model.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initialisation.R \name{initialise_model} \alias{initialise_model} diff --git a/man/initialise_raw_data.Rd b/man/initialise_raw_data.Rd index df5b019..3aacc58 100644 --- a/man/initialise_raw_data.Rd +++ b/man/initialise_raw_data.Rd @@ -1,15 +1,15 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initialisation.R \name{initialise_raw_data} \alias{initialise_raw_data} \title{Initialise raw data} \usage{ -initialise_raw_data(x, data_type = "qpcr", uni_thre = 0.2, scale = T) +initialise_raw_data(x, max_expr = "high", uni_thre = 0.2, scale = T) } \arguments{ \item{x}{matrix. Numeric data of gene expression.} -\item{data_type}{character. Specify data types: qpcr, rnaseq.} +\item{max_expr}{character. Specify whether max expression value is the lowest (as in qPCR), or the highest (as in RNAseq and microarray). Option: 'low', 'high'. Default to 'high'.} \item{uni_thre}{numerical. Speficy threshold for unimodality test. Default to 0.2.} diff --git a/man/krum_bmodel.Rd b/man/krum_bmodel.Rd index 579807b..78a5d67 100644 --- a/man/krum_bmodel.Rd +++ b/man/krum_bmodel.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{krum_bmodel} \alias{krum_bmodel} \title{Myeloid Boolean Model from Krumsiek et al.} -\format{A data frame with 11 rows and 2 columns. +\format{A data frame with 11 rows and 2 columns. Rows: each row consists of 1 gene and its associated Boolean rule. Column 1: target gene diff --git a/man/krum_istate.Rd b/man/krum_istate.Rd index dcd8f2d..a97c9a9 100644 --- a/man/krum_istate.Rd +++ b/man/krum_istate.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{krum_istate} \alias{krum_istate} \title{Initial state from Krumsiek et al.} -\format{A data frame with 1 row and 11 columns. +\format{A data frame with 1 row and 11 columns. Rows: each row consists of 1 set of Boolean state. Columns: each column is for 1 gene/variable.} diff --git a/man/m_score.Rd b/man/m_score.Rd index 010bd58..21f2ed8 100644 --- a/man/m_score.Rd +++ b/man/m_score.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/score_calculation.R \name{m_score} \alias{m_score} diff --git a/man/man_dist.Rd b/man/man_dist.Rd index f2be266..1e5be3a 100644 --- a/man/man_dist.Rd +++ b/man/man_dist.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/score_calculation.R \name{man_dist} \alias{man_dist} diff --git a/man/match_term.Rd b/man/match_term.Rd index 4751b42..b3b3573 100644 --- a/man/match_term.Rd +++ b/man/match_term.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{match_term} \alias{match_term} diff --git a/man/minmod_internal.Rd b/man/minmod_internal.Rd index 4eb7b88..e1750da 100644 --- a/man/minmod_internal.Rd +++ b/man/minmod_internal.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/model_modification.R \name{minmod_internal} \alias{minmod_internal} diff --git a/man/minmod_model.Rd b/man/minmod_model.Rd index 13ea234..e7c5b86 100644 --- a/man/minmod_model.Rd +++ b/man/minmod_model.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/model_modification.R \name{minmod_model} \alias{minmod_model} diff --git a/man/model_consensus.Rd b/man/model_consensus.Rd index e77c7c1..739614f 100644 --- a/man/model_consensus.Rd +++ b/man/model_consensus.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/search.R \name{model_consensus} \alias{model_consensus} diff --git a/man/model_dist.Rd b/man/model_dist.Rd index f785b0f..e915fb2 100644 --- a/man/model_dist.Rd +++ b/man/model_dist.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{model_dist} \alias{model_dist} diff --git a/man/model_setdiff.Rd b/man/model_setdiff.Rd index 2fa5646..e895aab 100644 --- a/man/model_setdiff.Rd +++ b/man/model_setdiff.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{model_setdiff} \alias{model_setdiff} diff --git a/man/model_train.Rd b/man/model_train.Rd index 6511567..490637c 100644 --- a/man/model_train.Rd +++ b/man/model_train.Rd @@ -1,15 +1,18 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/search.R \name{model_train} \alias{model_train} \title{Training Model} \usage{ -model_train(edata, bmodel = NULL, istate = NULL, max_varperrule = 6, - and_bool = T, self_loop = F, con_thre = 0.3, tol = 1e-06, - verbose = F, detailed_output = F) +model_train(cdata, ddata = NULL, bmodel = NULL, istate = NULL, + max_varperrule = 6, preprocess = T, max_expr = "high", and_bool = T, + self_loop = F, con_thre = 0.3, tol = 1e-06, verbose = F, + detailed_output = F) } \arguments{ -\item{edata}{data frame of expression data. Should have state(row) x gene(column).} +\item{cdata}{data frame of expression data. Should have state(row) x gene(column).} + +\item{ddata}{discretised data frame of expression data. Must supply when preprocess=F. Obtain from initialise_raw_data(). Defaults to NULL.} \item{bmodel}{Boolean model in data frame. If NULL, use a random Boolean model. Defaults to NULL.} @@ -17,6 +20,10 @@ model_train(edata, bmodel = NULL, istate = NULL, max_varperrule = 6, \item{max_varperrule}{integer. Maximum number of terms per rule (combining both act and inh rule). Note that this number must be higher than number of genes. Defaults to 6.} +\item{preprocess}{logical. Whether to preprocess expression data. Default to T.} + +\item{max_expr}{character. Only use when preprocess==T. Specify whether max expression value is the lowest (as in qPCR), or the highest (as in RNAseq and microarray). Option: 'low', 'high'. Default to 'high'.} + \item{and_bool}{logical. Whether to consider AND terms. IF bmodel is not NULL, defaults to whether AND interaction is included in bmodel. If bmodel is NULL, then defaults to TRUE.} \item{self_loop}{logical. Whether to allow self_loop in random starting model. Only used if is.null(bmodel). Default to F.} diff --git a/man/outgenysis_model.Rd b/man/outgenysis_model.Rd index 8c0a9cc..4f5179a 100644 --- a/man/outgenysis_model.Rd +++ b/man/outgenysis_model.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/output_format.R \name{outgenysis_model} \alias{outgenysis_model} diff --git a/man/outgraph_model.Rd b/man/outgraph_model.Rd index c3edd9d..436dcac 100644 --- a/man/outgraph_model.Rd +++ b/man/outgraph_model.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/output_format.R \name{outgraph_model} \alias{outgraph_model} diff --git a/man/outstate_graph.Rd b/man/outstate_graph.Rd index 0ec40fe..86cf208 100644 --- a/man/outstate_graph.Rd +++ b/man/outstate_graph.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/output_format.R \name{outstate_graph} \alias{outstate_graph} diff --git a/man/plotBM.Rd b/man/plotBM.Rd index 8e7d538..d03c437 100644 --- a/man/plotBM.Rd +++ b/man/plotBM.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{plotBM} \alias{plotBM} diff --git a/man/printBM.Rd b/man/printBM.Rd index 2d8b0ad..8f8115b 100644 --- a/man/printBM.Rd +++ b/man/printBM.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{printBM} \alias{printBM} diff --git a/man/rcpp_simulate.Rd b/man/rcpp_simulate.Rd index 4805cf5..c4db52f 100644 --- a/man/rcpp_simulate.Rd +++ b/man/rcpp_simulate.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R \name{rcpp_simulate} \alias{rcpp_simulate} diff --git a/man/rcpp_validate.Rd b/man/rcpp_validate.Rd index 6cd1548..0aaeff8 100644 --- a/man/rcpp_validate.Rd +++ b/man/rcpp_validate.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R \name{rcpp_validate} \alias{rcpp_validate} diff --git a/man/simulate_model.Rd b/man/simulate_model.Rd index 952490f..2b9650c 100644 --- a/man/simulate_model.Rd +++ b/man/simulate_model.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/simulation.R \name{simulate_model} \alias{simulate_model} diff --git a/man/unique_raw_data.Rd b/man/unique_raw_data.Rd index 0cd4620..79207b1 100644 --- a/man/unique_raw_data.Rd +++ b/man/unique_raw_data.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initialisation.R \name{unique_raw_data} \alias{unique_raw_data} diff --git a/man/validate_adjmat.Rd b/man/validate_adjmat.Rd index 159e7b4..0e17ed1 100644 --- a/man/validate_adjmat.Rd +++ b/man/validate_adjmat.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/score_calculation.R \name{validate_adjmat} \alias{validate_adjmat} diff --git a/man/vcat.Rd b/man/vcat.Rd index 2274b5d..b33a080 100644 --- a/man/vcat.Rd +++ b/man/vcat.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{vcat} \alias{vcat} diff --git a/man/which.random.min.Rd b/man/which.random.min.Rd index 1ca082e..2f75a73 100644 --- a/man/which.random.min.Rd +++ b/man/which.random.min.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/general.R \name{which.random.min} \alias{which.random.min} diff --git a/man/wilson_raw_data.Rd b/man/wilson_raw_data.Rd index f3e2fa5..ec91f7b 100644 --- a/man/wilson_raw_data.Rd +++ b/man/wilson_raw_data.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{wilson_raw_data} \alias{wilson_raw_data} \title{Raw single cell qRT-PCR expression data from Wilson et al.} -\format{A data frame with 1626 rows and 44 columns. +\format{A data frame with 1626 rows and 44 columns. Rows: each row consists of raw expression values from 1 cell. Columns: each column is for 1 gene/variable.} diff --git a/man/wilson_raw_rnaseq.Rd b/man/wilson_raw_rnaseq.Rd index 3c94553..c582799 100644 --- a/man/wilson_raw_rnaseq.Rd +++ b/man/wilson_raw_rnaseq.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_desc.R \docType{data} \name{wilson_raw_rnaseq} \alias{wilson_raw_rnaseq} \title{Raw single cell RNAseq expression data from Wilson et al.} -\format{A data frame with 96 rows and 38498 columns. +\format{A data frame with 96 rows and 38498 columns. Rows: each row consists of raw expression values from 1 cell. Columns: each column is for 1 gene/variable.} diff --git a/man/writeBM.Rd b/man/writeBM.Rd index 1fef686..5536855 100644 --- a/man/writeBM.Rd +++ b/man/writeBM.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods.R \name{writeBM} \alias{writeBM} diff --git a/src/BoolTraineR.dll b/src/BoolTraineR.dll deleted file mode 100644 index 27ff8e1..0000000 Binary files a/src/BoolTraineR.dll and /dev/null differ diff --git a/src/RcppExports.o b/src/RcppExports.o deleted file mode 100644 index 91c4697..0000000 Binary files a/src/RcppExports.o and /dev/null differ diff --git a/src/score_calculation.o b/src/score_calculation.o deleted file mode 100644 index ff2f0b1..0000000 Binary files a/src/score_calculation.o and /dev/null differ diff --git a/src/simulation.o b/src/simulation.o deleted file mode 100644 index 06fd1bd..0000000 Binary files a/src/simulation.o and /dev/null differ diff --git a/vignettes/booltrainer.Rmd b/vignettes/booltrainer.Rmd index 30bbe4f..b982c81 100644 --- a/vignettes/booltrainer.Rmd +++ b/vignettes/booltrainer.Rmd @@ -41,10 +41,11 @@ Note that the examples presented in this vignette are different from the results install.packages('BoolTraineR') ``` -Or from Github for the latest version. +Or from Github for the latest version. To install from Gitbub, you will require the `devtools` package. ```{r, eval = FALSE} -install_github("cheeyeelim/booltrainer") +install.packages('devtools') +devtools::install_github("cheeyeelim/booltrainer") ``` Also install `doParallel` package if you intend to use parallel processing. @@ -58,18 +59,19 @@ A matrix with genes on the columns, and cells on the row. The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation. -Use `initialise_raw_data` to convert expression data into a suitable format for model inference. +Use `initialise_raw_data` to convert expression data into a suitable format for model inference. It is recommended to use `initialise_raw_data` before subsetting the expression data for preferred cell types. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} data(wilson_raw_data) round(wilson_raw_data[1:5,1:5], 4) - -edata = initialise_raw_data(wilson_raw_data) ``` ```{r, echo=FALSE} data(wilson_raw_data) knitr::kable(round(wilson_raw_data[1:5,1:5], 4)) ``` +```{r, eval=FALSE} +edata = initialise_raw_data(wilson_raw_data, max_expr='low') #max_expr='low' because this is qPCR data. +``` 2. Initial Boolean model. A data frame with two columns, targets and update functions. @@ -81,13 +83,14 @@ Use `initialise_model` to convert the input Boolean model into a BoolModel objec ```{r, eval=FALSE} data(krum_bmodel) head(krum_bmodel) - -bmodel = initialise_model(krum_bmodel) ``` ```{r, echo=FALSE} data(krum_bmodel) knitr::kable(head(krum_bmodel)) ``` +```{r, eval=FALSE} +bmodel = initialise_model(krum_bmodel) +``` 3. Initial state. @@ -153,18 +156,22 @@ library(BoolTraineR) #(2) Load data. data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] #(4) Filter genes. gene_ind = c('fli1', 'gata1', 'gata2', 'gfi1', 'scl', 'sfpi1') #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] #(5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, max_varperrule=4, verbose=T) #(6) Visualise the Boolean model generated. plotBM(final_model) @@ -174,7 +181,7 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. -```{r} +```{r, tidy=TRUE} set.seed(0) #use to ensure reproducibility. remove in actual use. #(1) Setup paths and environment. @@ -190,24 +197,30 @@ library(BoolTraineR) Only the expression data is needed for inferring a Boolean model without an initial model. -To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. +To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -```{r, eval=FALSE} +`initialise_raw_data` is used to preprocess the data. + +```{r, eval=FALSE, tidy=TRUE} #(2) Load data. data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. +Once data is loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. -```{r, tidy=TRUE , eval=FALSE} +```{r, tidy=TRUE , eval=FALSE, tidy=TRUE} #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] #(4) Filter genes. gene_ind = c('fli1', 'gata1', 'gata2', 'gfi1', 'scl', 'sfpi1') #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] ``` ### Run model training @@ -218,9 +231,9 @@ In this example, `model_train` takes a few seconds to be completed on a single c You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} #(5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, max_varperrule=4, verbose=T) #(6) Visualise the Boolean model generated. plotBM(final_model) @@ -258,14 +271,17 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] #(4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, bmodel=bmodel, istate=istate, max_varperrule=4, verbose=T) #(5) Visualise the Boolean model generated. plotBM(final_model) @@ -275,7 +291,7 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. -```{r} +```{r, tidy=TRUE} set.seed(0) #use to ensure reproducibility. remove in actual use. #(1) Setup paths and environment. @@ -293,9 +309,10 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} +#(2) Load data. #(2) Load data. data(krum_bmodel) #load a data frame of Boolean model. data(krum_istate) #load a data frame of initial state. @@ -303,28 +320,31 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. -```{r, tidy=TRUE , eval=FALSE} +```{r, tidy=TRUE , eval=FALSE, tidy=TRUE} #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] ``` ### Run model training To reconstruct a Boolean model from an expression data, run `model_train`. -In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. +In this example, `model_train` takes one or two minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} #(4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, bmodel=bmodel, istate=istate, max_varperrule=4, verbose=T) #(5) Visualise the Boolean model generated. plotBM(final_model) @@ -364,28 +384,30 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] #(4) Adding extra genes to the initial Boolean model. #extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. -print(extra_genes) #to view available genes to be added. +#print(extra_genes) #to view available genes to be added. add_gene = 'ldb1' #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) -#(5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +#(5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl('cmp', rownames(cdata)), add_gene]) tmp_istate = matrix(round(tmp_istate), nrow=1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) #(6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, bmodel=grown_bmodel, istate=grown_istate, verbose=T) #(7) Visualise the Boolean model generated. plotBM(final_model) @@ -395,7 +417,7 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. -```{r} +```{r, tidy=TRUE} set.seed(0) #use to ensure reproducibility. remove in actual use. #(1) Setup paths and environment. @@ -413,9 +435,9 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} #(2) Load data. data(krum_bmodel) #load a data frame of Boolean model. data(krum_istate) #load a data frame of initial state. @@ -423,24 +445,28 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = 'low') +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. ```{r, tidy=TRUE , eval=FALSE} #(3) Filter cell types. -cell_ind = grepl('cmp', rownames(edata)) | grepl('gmp', rownames(edata)) | grepl('mep', rownames(edata)) #select cells to be included. -edata = edata[cell_ind,] +cell_ind = grepl('cmp', rownames(cdata)) | grepl('gmp', rownames(cdata)) | grepl('mep', rownames(cdata)) +fcdata = cdata[cell_ind,] #select only relevant cells. +fddata = ddata[cell_ind,] ``` ### Add extra genes to the initial Boolean model Extra genes can be added to the initial model using `grow_bmodel`. The function will add extra genes into the initial model with empty update functions. -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} #(4) Adding extra genes to the initial Boolean model. -#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. +#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) +#print(extra_genes) #to view available genes to be added. add_gene = 'ldb1' #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) ``` @@ -449,10 +475,9 @@ grown_bmodel = grow_bmodel(add_gene, bmodel) Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs. -```{r, eval=FALSE} -#(5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +```{r, eval=FALSE, tidy=TRUE} +#(5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl('cmp', rownames(cdata)), add_gene]) tmp_istate = matrix(round(tmp_istate), nrow=1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) @@ -469,9 +494,9 @@ You will receive a BoolModel object at the end of the model training process. Th *Note that this example takes a long time to run. The use of parallel processing is recommended.* -```{r, eval=FALSE} +```{r, eval=FALSE, tidy=TRUE} #(6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) +final_model = model_train(cdata=fcdata, ddata=fddata, bmodel=grown_bmodel, istate=grown_istate, verbose=T) #(7) Visualise the Boolean model generated. plotBM(final_model) diff --git a/vignettes/booltrainer.html b/vignettes/booltrainer.html index aad23b7..f77238a 100644 --- a/vignettes/booltrainer.html +++ b/vignettes/booltrainer.html @@ -10,7 +10,7 @@ - + Using BoolTraineR to reconstruct asynchronous Boolean models @@ -18,41 +18,23 @@ - + @@ -72,7 +54,7 @@
@@ -107,6 +89,14 @@

2015-11-29

+

1 Brief introduction

BoolTraineR is a model learning algorithm for reconstructing and training asynchronous Boolean models using single-cell expression data. Refer to the paper for more details on the concepts behind the algorithm. This vignette serves as a tutorial to demonstrate example workflows that can be adapted to individual cases experienced by users.

@@ -116,9 +106,10 @@

2015-11-29

2 Installation

BoolTraineR can be installed from CRAN.

-
install.packages('BoolTraineR')
-

Or from Github for the latest version.

-
install_github("cheeyeelim/booltrainer")
+
install.packages('BoolTraineR')
+

Or from Github for the latest version. To install from Gitbub, you will require the devtools package.

+
install.packages('devtools')
+devtools::install_github("cheeyeelim/booltrainer")

Also install doParallel package if you intend to use parallel processing.

@@ -128,11 +119,9 @@

2015-11-29

  • Expression data. A matrix with genes on the columns, and cells on the row.
  • The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation.

    -

    Use initialise_raw_data to convert expression data into a suitable format for model inference.

    -
    data(wilson_raw_data)
    -round(wilson_raw_data[1:5,1:5], 4)
    -
    -edata = initialise_raw_data(wilson_raw_data)
    +

    Use initialise_raw_data to convert expression data into a suitable format for model inference. It is recommended to use initialise_raw_data before subsetting the expression data for preferred cell types.

    +
    data(wilson_raw_data)
    +round(wilson_raw_data[1:5, 1:5], 4)
    @@ -187,15 +176,14 @@

    2015-11-29

    +
    edata = initialise_raw_data(wilson_raw_data, max_expr='low') #max_expr='low' because this is qPCR data.
    1. Initial Boolean model. A data frame with two columns, targets and update functions.

    Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples)

    Use initialise_model to convert the input Boolean model into a BoolModel object.

    -
    data(krum_bmodel)
    -head(krum_bmodel)
    -
    -bmodel = initialise_model(krum_bmodel)
    +
    data(krum_bmodel)
    +head(krum_bmodel)
    @@ -230,13 +218,14 @@

    2015-11-29

    +
    bmodel = initialise_model(krum_bmodel)
    1. Initial state.

    A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s.

    Note that all the genes that are present in the initial Boolean model must also be present here.

    -
    data(krum_istate)
    -head(krum_istate)
    +
    data(krum_istate)
    +head(krum_istate)
    @@ -306,7 +295,7 @@

    2015-11-29

    6.1.1 Full workflow

    Full workflow is included here for easy referencing. Each step is discussed in further details below.

    -
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
     # (1) Setup paths and environment.
     library(BoolTraineR)
    @@ -317,64 +306,74 @@ 

    2015-11-29

    # (2) Load data. data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Filter genes. gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] # (5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, + verbose = T) # (6) Visualise the Boolean model generated. -plotBM(final_model)
    +plotBM(final_model)

    6.1.2 Initial setup

    The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

    -
    set.seed(0) #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
    -#(1) Setup paths and environment.
    +# (1) Setup paths and environment.
     library(BoolTraineR)
     
    -#If intending to use parallel processing, uncomment the following lines. 
    -#library(doParallel)
    -#num_core = 4 #specify the number of cores to be used.
    -#doParallel::registerDoParallel(cores=num_core)
    +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core)

    6.1.3 Data preparation

    Only the expression data is needed for inferring a Boolean model without an initial model.

    To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

    -
    #(2) Load data.
    -data(wilson_raw_data) #load a data frame of expression data.
    -edata = wilson_raw_data
    -

    Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete.

    -
    # (3) Filter cell types.
    -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
    -    rownames(edata))  #select cells to be included.
    -edata = edata[cell_ind, ]
    +

    initialise_raw_data is used to preprocess the data.

    +
    # (2) Load data.
    +data(wilson_raw_data)  #load a data frame of expression data.
    +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low")
    +cdata = tmp_data[[1]]  #continuous data
    +ddata = tmp_data[[2]]  #discretised data
    +

    Once data is loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete.

    +
    # (3) Filter cell types.
    +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", 
    +    rownames(cdata))
    +fcdata = cdata[cell_ind, ]  #select only relevant cells.
    +fddata = ddata[cell_ind, ]
     
     # (4) Filter genes.
     gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1")  #select genes to be included.
    -edata = edata[, gene_ind]
    +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind]

    6.1.4 Run model training

    To reconstruct a Boolean model from an expression data, run model_train.

    In this example, model_train takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

    You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

    -
    #(5) Inferring Boolean model.
    -final_model = model_train(edata, max_varperrule=4, verbose=T)
    +
    # (5) Inferring Boolean model.
    +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, 
    +    verbose = T)
     
    -#(6) Visualise the Boolean model generated.
    -plotBM(final_model)
    -

    +# (6) Visualise the Boolean model generated. +plotBM(final_model)
    +

    @@ -384,7 +383,7 @@

    2015-11-29

    6.2.1 Full workflow

    Full workflow is included here for easy referencing. Each step is discussed in further details below.

    -
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
     # (1) Setup paths and environment.
     library(BoolTraineR)
    @@ -400,62 +399,69 @@ 

    2015-11-29

    bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, + max_varperrule = 4, verbose = T) # (5) Visualise the Boolean model generated. -plotBM(final_model)
    +plotBM(final_model)

    6.2.2 Initial setup

    The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

    -
    set.seed(0) #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
    -#(1) Setup paths and environment.
    +# (1) Setup paths and environment.
     library(BoolTraineR)
     
    -#If intending to use parallel processing, uncomment the following lines. 
    -#library(doParallel)
    -#num_core = 4 #specify the number of cores to be used.
    -#doParallel::registerDoParallel(cores=num_core)
    +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core)

    6.2.3 Data preparation

    3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state.

    To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

    -

    initialise_model converts the data frame containing the Boolean model into a BoolModel object.

    -
    #(2) Load data.
    -data(krum_bmodel) #load a data frame of Boolean model.
    -data(krum_istate) #load a data frame of initial state.
    -data(wilson_raw_data) #load a data frame of expression data.
    +

    initialise_model converts the data frame containing the Boolean model into a BoolModel object. initialise_raw_data is used to preprocess the data.

    +
    # (2) Load data. (2) Load data.
    +data(krum_bmodel)  #load a data frame of Boolean model.
    +data(krum_istate)  #load a data frame of initial state.
    +data(wilson_raw_data)  #load a data frame of expression data.
     
     bmodel = initialise_model(krum_bmodel)
     istate = krum_istate
    -edata = wilson_raw_data
    -

    Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

    -
    # (3) Filter cell types.
    -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
    -    rownames(edata))  #select cells to be included.
    -edata = edata[cell_ind, ]
    +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data
    +

    Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

    +
    # (3) Filter cell types.
    +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", 
    +    rownames(cdata))
    +fcdata = cdata[cell_ind, ]  #select only relevant cells.
    +fddata = ddata[cell_ind, ]

    6.2.4 Run model training

    To reconstruct a Boolean model from an expression data, run model_train.

    -

    In this example, model_train takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

    +

    In this example, model_train takes one or two minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

    You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

    -
    #(4) Inferring Boolean model.
    -final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T)
    +
    # (4) Inferring Boolean model.
    +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, 
    +    max_varperrule = 4, verbose = T)
     
    -#(5) Visualise the Boolean model generated.
    -plotBM(final_model)
    -

    +# (5) Visualise the Boolean model generated. +plotBM(final_model)
    +

    @@ -466,7 +472,7 @@

    2015-11-29

    6.3.1 Full workflow

    Full workflow is included here for easy referencing. Each step is discussed in further details below.

    Note that this example takes a few minutes to run on a single core. The use of parallel processing is recommended.

    -
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
     # (1) Setup paths and environment.
     library(BoolTraineR)
    @@ -482,85 +488,88 @@ 

    2015-11-29

    bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Adding extra genes to the initial Boolean model. extra_genes = # setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes -# to be added. -print(extra_genes) #to view available genes to be added. +# to be added. print(extra_genes) #to view available genes to be added. add_gene = "ldb1" #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) -# (5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl("cmp", rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +# (5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene]) tmp_istate = matrix(round(tmp_istate), nrow = 1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) # (6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule = 4, - verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, + istate = grown_istate, verbose = T) # (7) Visualise the Boolean model generated. -plotBM(final_model)
    +plotBM(final_model)

    6.3.2 Initial setup

    The first step is to load the BoolTraineR package. If you are intending to use parallel processing, you will also need to load the doParallel package. Then specify how many cores you intend to use using registerDoParallel from the doParallel package.

    -
    set.seed(0) #use to ensure reproducibility. remove in actual use.
    +
    set.seed(0)  #use to ensure reproducibility. remove in actual use.
     
    -#(1) Setup paths and environment.
    +# (1) Setup paths and environment.
     library(BoolTraineR)
     
    -#If intending to use parallel processing, uncomment the following lines. 
    -#library(doParallel)
    -#num_core = 4 #specify the number of cores to be used.
    -#doParallel::registerDoParallel(cores=num_core)
    +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core)

    6.3.3 Data preparation

    3 pieces of data are needed to infer a Boolean model with an initial model: an expression data, an initial Boolean model and an initial state.

    To load the data into R, use read.table or read.csv. In this example, we are using the example data included with the package, so we are accessing it by using data.

    -

    initialise_model converts the data frame containing the Boolean model into a BoolModel object.

    -
    #(2) Load data.
    -data(krum_bmodel) #load a data frame of Boolean model.
    -data(krum_istate) #load a data frame of initial state.
    -data(wilson_raw_data) #load a data frame of expression data.
    +

    initialise_model converts the data frame containing the Boolean model into a BoolModel object. initialise_raw_data is used to preprocess the data.

    +
    # (2) Load data.
    +data(krum_bmodel)  #load a data frame of Boolean model.
    +data(krum_istate)  #load a data frame of initial state.
    +data(wilson_raw_data)  #load a data frame of expression data.
     
     bmodel = initialise_model(krum_bmodel)
     istate = krum_istate
    -edata = wilson_raw_data
    -

    Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

    -
    # (3) Filter cell types.
    -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", 
    -    rownames(edata))  #select cells to be included.
    -edata = edata[cell_ind, ]
    +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data
    +

    Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically.

    +
    # (3) Filter cell types.
    +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", 
    +    rownames(cdata))
    +fcdata = cdata[cell_ind, ]  #select only relevant cells.
    +fddata = ddata[cell_ind, ]

    6.3.4 Add extra genes to the initial Boolean model

    Extra genes can be added to the initial model using grow_bmodel. The function will add extra genes into the initial model with empty update functions.

    -
    #(4) Adding extra genes to the initial Boolean model.
    -#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added.
    -add_gene = 'ldb1' #genes to be added: ldb1
    -grown_bmodel = grow_bmodel(add_gene, bmodel)
    +
    # (4) Adding extra genes to the initial Boolean model. extra_genes =
    +# setdiff(colnames(wilson_raw_data), bmodel@target) print(extra_genes) #to
    +# view available genes to be added.
    +add_gene = "ldb1"  #genes to be added: ldb1
    +grown_bmodel = grow_bmodel(add_gene, bmodel)

    6.3.5 Estimate initial state for the extra genes

    Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs.

    -
    #(5) Estimating initial state for the extra genes.
    -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data.
    -tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs.
    -tmp_istate = matrix(round(tmp_istate), nrow=1)
    +
    # (5) Estimating initial state for the extra genes. (estimating from CMPs)
    +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene])
    +tmp_istate = matrix(round(tmp_istate), nrow = 1)
     colnames(tmp_istate) = add_gene
     grown_istate = cbind(istate, tmp_istate)
    -grown_istate = initialise_data(grown_istate)
    +grown_istate = initialise_data(grown_istate)

    6.3.6 Run model training

    @@ -568,12 +577,13 @@

    2015-11-29

    In this example, model_train takes a few minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above.

    You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using plotBM, which is based on igraph package. For easier manipulation, output the Boolean model using outgraph_model and display it with Cytoscape or Gephi.

    Note that this example takes a long time to run. The use of parallel processing is recommended.

    -
    #(6) Inferring Boolean model.
    -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T)
    +
    # (6) Inferring Boolean model.
    +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, 
    +    istate = grown_istate, verbose = T)
     
    -#(7) Visualise the Boolean model generated.
    -plotBM(final_model)
    -

    +# (7) Visualise the Boolean model generated. +plotBM(final_model)
    +

    diff --git a/vignettes/booltrainer.md b/vignettes/booltrainer.md index 532c74c..b2133c6 100644 --- a/vignettes/booltrainer.md +++ b/vignettes/booltrainer.md @@ -1,27 +1,35 @@ -- [Brief introduction](#brief-introduction) -- [Installation](#installation) -- [Input data format](#input-data-format) -- [Output format](#output-format) -- [Useful functions in BoolTraineR](#useful-functions-in-booltrainer) -- [Example workflows](#example-workflows) - - [Inferring model without an initial model](#inferring-model-without-an-initial-model) - - [Full workflow](#full-workflow) - - [Initial setup](#initial-setup) - - [Data preparation](#data-preparation) - - [Run model training](#run-model-training) - - [Inferring model with an initial model](#inferring-model-with-an-initial-model) - - [Full workflow](#full-workflow-1) - - [Initial setup](#initial-setup-1) - - [Data preparation](#data-preparation-1) - - [Run model training](#run-model-training-1) - - [Extending model with more genes](#extending-model-with-more-genes) - - [Full workflow](#full-workflow-2) - - [Initial setup](#initial-setup-2) - - [Data preparation](#data-preparation-2) - - [Add extra genes to the initial Boolean model](#add-extra-genes-to-the-initial-boolean-model) - - [Estimate initial state for the extra genes](#estimate-initial-state-for-the-extra-genes) - - [Run model training](#run-model-training-2) - +- Brief introduction +- Installation +- Input data format +- Output format +- Useful functions in BoolTraineR +- Example workflows + - Inferring model without an initial model + - Full workflow + - Initial setup + - Data preparation + - Run model training + - Inferring model with an initial model + - Full workflow + - Initial setup + - Data preparation + - Run model training + - Extending model with more genes + - Full workflow + - Initial setup + - Data preparation + - Add extra genes to the initial Boolean model + - Estimate initial state for the extra genes + - Run model training + + Brief introduction ================== @@ -40,10 +48,11 @@ Installation install.packages('BoolTraineR') ``` -Or from Github for the latest version. +Or from Github for the latest version. To install from Gitbub, you will require the `devtools` package. ``` r -install_github("cheeyeelim/booltrainer") +install.packages('devtools') +devtools::install_github("cheeyeelim/booltrainer") ``` Also install `doParallel` package if you intend to use parallel processing. @@ -57,13 +66,11 @@ Depending on the analysis, only 3 types of data will ever be needed. The format The expression data should be preprocessed as in any standard sequencing data processing pipelines, which includes quality control filtering and normalisation. -Use `initialise_raw_data` to convert expression data into a suitable format for model inference. +Use `initialise_raw_data` to convert expression data into a suitable format for model inference. It is recommended to use `initialise_raw_data` before subsetting the expression data for preferred cell types. ``` r data(wilson_raw_data) -round(wilson_raw_data[1:5,1:5], 4) - -edata = initialise_raw_data(wilson_raw_data) +round(wilson_raw_data[1:5, 1:5], 4) ``` | | bptf| cbfa2t3h| csf1r| dnmt3a| eif2b1| @@ -74,6 +81,10 @@ edata = initialise_raw_data(wilson_raw_data) | lmpp\_007 | 0.5419| 1.8631| 10.8468| 0.1757| 1.0873| | lmpp\_008 | 0.9209| 2.6637| 2.8549| 2.1965| 2.3663| +``` r +edata = initialise_raw_data(wilson_raw_data, max_expr='low') #max_expr='low' because this is qPCR data. +``` + 1. Initial Boolean model. A data frame with two columns, targets and update functions. Note that if an update function contains both activation and inhibition genes, they must be expressed with a separate clause containing only activation genes, and a separate clause containing only inhibition genes. (See the update functions of Gata1 and Gata2 for examples) @@ -83,8 +94,6 @@ Use `initialise_model` to convert the input Boolean model into a BoolModel objec ``` r data(krum_bmodel) head(krum_bmodel) - -bmodel = initialise_model(krum_bmodel) ``` | targets | factors | @@ -96,6 +105,10 @@ bmodel = initialise_model(krum_bmodel) | fli1 | gata1 & ! eklf | | scl | gata1 & ! sfpi1 | +``` r +bmodel = initialise_model(krum_bmodel) +``` + 1. Initial state. A single row data frame with genes as the columns. The expression state of each gene must be in binarised form, i.e. 0s and 1s. @@ -163,19 +176,24 @@ library(BoolTraineR) # (2) Load data. data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Filter genes. gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] # (5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, + verbose = T) # (6) Visualise the Boolean model generated. plotBM(final_model) @@ -186,15 +204,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -203,23 +220,29 @@ Only the expression data is needed for inferring a Boolean model without an init To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. +`initialise_raw_data` is used to preprocess the data. + ``` r -#(2) Load data. -data(wilson_raw_data) #load a data frame of expression data. -edata = wilson_raw_data +# (2) Load data. +data(wilson_raw_data) #load a data frame of expression data. +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. +Once data is loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Filter genes. gene_ind = c("fli1", "gata1", "gata2", "gfi1", "scl", "sfpi1") #select genes to be included. -edata = edata[, gene_ind] +fcdata = fcdata[, gene_ind] +fddata = fddata[, gene_ind] ``` ### Run model training @@ -231,14 +254,15 @@ In this example, `model_train` takes a few seconds to be completed on a single c You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise quickly using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. ``` r -#(5) Inferring Boolean model. -final_model = model_train(edata, max_varperrule=4, verbose=T) +# (5) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, max_varperrule = 4, + verbose = T) -#(6) Visualise the Boolean model generated. +# (6) Visualise the Boolean model generated. plotBM(final_model) ``` -![](booltrainer_files/figure-markdown_github/unnamed-chunk-15-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-17-1.png) Inferring model with an initial model ------------------------------------- @@ -268,15 +292,19 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule = 4, verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, + max_varperrule = 4, verbose = T) # (5) Visualise the Boolean model generated. plotBM(final_model) @@ -287,15 +315,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -304,45 +331,49 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. ``` r -#(2) Load data. -data(krum_bmodel) #load a data frame of Boolean model. -data(krum_istate) #load a data frame of initial state. -data(wilson_raw_data) #load a data frame of expression data. +# (2) Load data. (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] ``` ### Run model training To reconstruct a Boolean model from an expression data, run `model_train`. -In this example, `model_train` takes a few seconds to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. +In this example, `model_train` takes one or two minutes to be completed on a single core. If this steps take a very long time to complete, do consider using the parallel processing option as described above. You will receive a BoolModel object at the end of the model training process. The BoolModel object can be visualise using `plotBM`, which is based on `igraph` package. For easier manipulation, output the Boolean model using `outgraph_model` and display it with Cytoscape or Gephi. ``` r -#(4) Inferring Boolean model. -final_model = model_train(edata, bmodel, istate, max_varperrule=4, verbose=T) +# (4) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = bmodel, istate = istate, + max_varperrule = 4, verbose = T) -#(5) Visualise the Boolean model generated. +# (5) Visualise the Boolean model generated. plotBM(final_model) ``` -![](booltrainer_files/figure-markdown_github/unnamed-chunk-21-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-23-1.png) Extending model with more genes ------------------------------- @@ -374,31 +405,32 @@ data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] # (4) Adding extra genes to the initial Boolean model. extra_genes = # setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes -# to be added. -print(extra_genes) #to view available genes to be added. +# to be added. print(extra_genes) #to view available genes to be added. add_gene = "ldb1" #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) -# (5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl("cmp", rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. +# (5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene]) tmp_istate = matrix(round(tmp_istate), nrow = 1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) # (6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule = 4, - verbose = T) +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, + istate = grown_istate, verbose = T) # (7) Visualise the Boolean model generated. plotBM(final_model) @@ -409,15 +441,14 @@ plotBM(final_model) The first step is to load the `BoolTraineR` package. If you are intending to use parallel processing, you will also need to load the `doParallel` package. Then specify how many cores you intend to use using `registerDoParallel` from the `doParallel` package. ``` r -set.seed(0) #use to ensure reproducibility. remove in actual use. +set.seed(0) #use to ensure reproducibility. remove in actual use. -#(1) Setup paths and environment. +# (1) Setup paths and environment. library(BoolTraineR) -#If intending to use parallel processing, uncomment the following lines. -#library(doParallel) -#num_core = 4 #specify the number of cores to be used. -#doParallel::registerDoParallel(cores=num_core) +# If intending to use parallel processing, uncomment the following lines. +# library(doParallel) num_core = 4 #specify the number of cores to be used. +# doParallel::registerDoParallel(cores=num_core) ``` ### Data preparation @@ -426,26 +457,29 @@ library(BoolTraineR) To load the data into R, use `read.table` or `read.csv`. In this example, we are using the example data included with the package, so we are accessing it by using `data`. -`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. +`initialise_model` converts the data frame containing the Boolean model into a BoolModel object. `initialise_raw_data` is used to preprocess the data. ``` r -#(2) Load data. -data(krum_bmodel) #load a data frame of Boolean model. -data(krum_istate) #load a data frame of initial state. -data(wilson_raw_data) #load a data frame of expression data. +# (2) Load data. +data(krum_bmodel) #load a data frame of Boolean model. +data(krum_istate) #load a data frame of initial state. +data(wilson_raw_data) #load a data frame of expression data. bmodel = initialise_model(krum_bmodel) istate = krum_istate -edata = wilson_raw_data +tmp_data = initialise_raw_data(wilson_raw_data, max_expr = "low") +cdata = tmp_data[[1]] #continuous data +ddata = tmp_data[[2]] #discretised data ``` -Once data is loaded, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. +Once data are loaded and preprocessed, filter the cell types or genes to be included in the analysis if needed. It is advisable to reduce the number of genes to be included if the computation takes too long to complete. In this example, genes are not filtered as all genes that are present in both expression data and Boolean model are used automatically. ``` r # (3) Filter cell types. -cell_ind = grepl("cmp", rownames(edata)) | grepl("gmp", rownames(edata)) | grepl("mep", - rownames(edata)) #select cells to be included. -edata = edata[cell_ind, ] +cell_ind = grepl("cmp", rownames(cdata)) | grepl("gmp", rownames(cdata)) | grepl("mep", + rownames(cdata)) +fcdata = cdata[cell_ind, ] #select only relevant cells. +fddata = ddata[cell_ind, ] ``` ### Add extra genes to the initial Boolean model @@ -453,9 +487,10 @@ edata = edata[cell_ind, ] Extra genes can be added to the initial model using `grow_bmodel`. The function will add extra genes into the initial model with empty update functions. ``` r -#(4) Adding extra genes to the initial Boolean model. -#extra_genes = setdiff(colnames(wilson_raw_data), bmodel@target) #to view available genes to be added. -add_gene = 'ldb1' #genes to be added: ldb1 +# (4) Adding extra genes to the initial Boolean model. extra_genes = +# setdiff(colnames(wilson_raw_data), bmodel@target) print(extra_genes) #to +# view available genes to be added. +add_gene = "ldb1" #genes to be added: ldb1 grown_bmodel = grow_bmodel(add_gene, bmodel) ``` @@ -464,10 +499,9 @@ grown_bmodel = grow_bmodel(add_gene, bmodel) Initial state needs to be modify to include the initial expression of the extra genes. The initial state of the extra genes can be set manually, or it can be estimated from the data if the data contain multiple cell types with known relationships. In this example, CMPs are known to be at developmental upstream of erythro-myeloid differentiation, therefore initial state can be estimated by taking the average expression of the extra genes in CMPs. ``` r -#(5) Estimating initial state for the extra genes. -tmp_data = initialise_raw_data(wilson_raw_data)[[1]] #preprocess data. -tmp_istate = mean(tmp_data[grepl('cmp', rownames(tmp_data)), add_gene]) #estimating initial state from CMPs. -tmp_istate = matrix(round(tmp_istate), nrow=1) +# (5) Estimating initial state for the extra genes. (estimating from CMPs) +tmp_istate = mean(cdata[grepl("cmp", rownames(cdata)), add_gene]) +tmp_istate = matrix(round(tmp_istate), nrow = 1) colnames(tmp_istate) = add_gene grown_istate = cbind(istate, tmp_istate) grown_istate = initialise_data(grown_istate) @@ -484,11 +518,12 @@ You will receive a BoolModel object at the end of the model training process. Th *Note that this example takes a long time to run. The use of parallel processing is recommended.* ``` r -#(6) Inferring Boolean model. -final_model = model_train(edata, grown_bmodel, grown_istate, max_varperrule=4, verbose=T) +# (6) Inferring Boolean model. +final_model = model_train(cdata = fcdata, ddata = fddata, bmodel = grown_bmodel, + istate = grown_istate, verbose = T) -#(7) Visualise the Boolean model generated. +# (7) Visualise the Boolean model generated. plotBM(final_model) ``` -![](booltrainer_files/figure-markdown_github/unnamed-chunk-29-1.png) +![](vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-31-1.png) diff --git a/vignettes/booltrainer.pdf b/vignettes/booltrainer.pdf index 3e317d7..4fadf43 100644 Binary files a/vignettes/booltrainer.pdf and b/vignettes/booltrainer.pdf differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-17-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-17-1.png new file mode 100644 index 0000000..35b3cda Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-17-1.png differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-23-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-23-1.png new file mode 100644 index 0000000..35b3cda Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-23-1.png differ diff --git a/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-31-1.png b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-31-1.png new file mode 100644 index 0000000..35b3cda Binary files /dev/null and b/vignettes/booltrainer_files/figure-markdown_github/unnamed-chunk-31-1.png differ