Merge pull request #29 from corehunter/feature/seed

allow to set seed
corehunter · Jun 22, 2017 · c87b7a3 · c87b7a3
2 parents 81cc5d8 + abcb372
commit c87b7a3
Show file tree

Hide file tree

Showing 20 changed files with 186 additions and 67 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: corehunter
 Title: Multi-Purpose Core Subset Selection
-Version: 3.1.0
+Version: 3.1.0.9000
 Date: 2017-01-27
 Authors@R: c(person("Herman", "De Beukelaer", email = "herman.debeukelaer@gmail.com", role = c("aut", "cre")),
              person("Guy", "Davenport", email = "daveneti@gmail.com", role = "aut"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -23,5 +23,6 @@ export(setRange)
 import(naturalsort)
 import(rJava)
 importFrom(methods,is)
+importFrom(stats,runif)
 importFrom(utils,read.delim)
 importFrom(utils,write.csv)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,11 @@
 Core Hunter 3
 =============
 
+Version 3.1.0.9000 (dev)
+------------------------
+
+ - Using `set.seed` prior to executing Core Hunter now yields reproducible results.
+
 Version 3.1.0 (27/01/2017)
 --------------------------
 

diff --git a/R/data.R b/R/data.R
@@ -19,6 +19,9 @@
 #'         Selection Index to Real and Simulated Data",
 #'         \url{http://hdl.handle.net/11529/10199} V10
 #'
+#' @examples
+#' exampleData()
+#'
 #' @return Core Hunter data of class \code{chdata}
 #' @export
 exampleData <- function(){
@@ -704,7 +707,7 @@ print.chgeno <- function(x, include.size = TRUE, ...){
 #'   to some or all individuals.
 #'
 #' @param types Variable types (optional).
-#'   Vector of characters of length one or two.
+#'   Vector of characters, each of length one or two.
 #'   Ignored when reading from file.
 #'
 #'   The first letter indicates the scale type and should be one of \code{N} (nominal),
@@ -1070,15 +1073,20 @@ print.chpheno <- function(x, include.size = TRUE, ...){
 #'
 #' @importFrom utils read.delim
 #' @export
-read.autodelim <- function(file, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE,
-                           strip.white = TRUE, quote = "'\"", ...){
+read.autodelim <- function(file, quote = "'\"",
+                           row.names = 1,
+                           na.strings = "",
+                           check.names = FALSE,
+                           strip.white = TRUE,
+                           stringsAsFactors = FALSE,
+                           ...){
+  # choose the separator from the (case-insensitive) file extension;
+  # extensions other than csv/txt match no branch, leaving sep NULL
   sep <- switch(tolower(tools::file_ext(file)),
                 "csv" = ",",
                 "txt" = "\t")
-  read.delim(file, sep = sep,
-             row.names = row.names, check.names = check.names,
-             stringsAsFactors = stringsAsFactors, strip.white = strip.white,
-             quote = quote, ...)
+  # na.strings is spelled out in full so the call does not depend on
+  # partial argument matching once forwarded through read.delim's dots
+  read.delim(file, sep = sep, quote = quote,
+             row.names = row.names, na.strings = na.strings, check.names = check.names,
+             strip.white = strip.white, stringsAsFactors = stringsAsFactors,
+             ...)
 }
 
 # ----------------- #

diff --git a/R/execution.R b/R/execution.R
@@ -13,6 +13,16 @@
 #' is being minimized, the roles of upper and lower bound are interchanged, and the
 #' Pareto maximum is used instead.
 #'
+#' Because Core Hunter uses stochastic algorithms, repeated runs may produce different
+#' results. To eliminate randomness, you may set a random number generation seed using
+#' \code{\link{set.seed}} prior to executing Core Hunter. Note however that Core Hunter
+#' uses runtime-based stop conditions, meaning that it is not entirely guaranteed that
+#' the same final selection will be obtained when using the same seed, since runtimes
+#' may be influenced by external factors such as the current CPU workload. Therefore,
+#' the number of executed steps may vary across runs, which may affect the returned
+#' solution. When aiming for reproducible results, it is thus also important to allow
+#' sufficient execution time to ensure convergence of the optimization algorithm.
+#'
 #' @param data Core Hunter data (\code{chdata}) containing genotypes,
 #'   phenotypes and/or a precomputed distance matrix. Can also be an
 #'   object of class \code{chdist}, \code{chgeno} or \code{chpheno}
@@ -21,16 +31,16 @@
 #'   If no objectives are specified Core Hunter maximizes a weighted
 #'   index including the default entry-to-nearest-entry distance
 #'   (\code{EN}) for each available data type.
-#'   For genotyes, the Modified Roger's distance (\code{MR}) is
+#'   For genotypes, the Modified Roger's distance (\code{MR}) is
 #'   used. For phenotypes, Gower's distance (\code{GD}) is applied.
 #' @param size Desired core subset size (numeric). If larger than one the value
 #'   is used as the absolute core size after rounding. Else it is used as the
 #'   sampling rate and multiplied with the dataset size to determine the size of
 #'   the core. The default sampling rate is 0.2.
 #' @param mode Execution mode (\code{default} or \code{fast}). In default mode,
 #'   the normalization searches terminate when no improvement is found for ten
-#'   seconds. In fast mode, searches terminated as soon as no improvement is
-#'   made for two seconds. Stop conditions can be overriden with arguments
+#'   seconds. In fast mode, searches terminate as soon as no improvement is
+#'   made for two seconds. These stop conditions can be overridden using arguments
 #'   \code{time} and \code{impr.time}.
 #' @param time Absolute runtime limit in seconds. Not used by default. If used
 #'   it should be a strictly positive value and is rounded to the nearest integer.
@@ -80,7 +90,10 @@ getNormalizationRanges <- function(data, obj, size = 0.2, mode = c("default", "f
 
   # run Core Hunter normalization
   api <- ch.api()
-  ranges <- .jevalArray(api$getNormalizationRanges(j.args, mode, time, impr.time), simplify = TRUE)
+  ranges <- .jevalArray(
+    api$getNormalizationRanges(j.args, mode, time, impr.time, genSeed()),
+    simplify = TRUE
+  )
   obj.ids <- sapply(obj, function(o){
     id <- o$type
     if(!is.null(o$meas)){
@@ -96,8 +109,20 @@ getNormalizationRanges <- function(data, obj, size = 0.2, mode = c("default", "f
 
 }
 
+#' Sample a core collection.
+#'
 #' Sample a core collection from the given data.
 #'
+#' Because Core Hunter uses stochastic algorithms, repeated runs may produce different
+#' results. To eliminate randomness, you may set a random number generation seed using
+#' \code{\link{set.seed}} prior to executing Core Hunter. Note however that Core Hunter
+#' uses runtime-based stop conditions, meaning that it is not entirely guaranteed that
+#' the same final selection will be obtained when using the same seed, since runtimes
+#' may be influenced by external factors such as the current CPU workload. Therefore,
+#' the number of executed steps may vary across runs, which may affect the returned
+#' solution. When aiming for reproducible results, it is thus also important to allow
+#' sufficient execution time to ensure convergence of the optimization algorithm.
+#'
 #' @param data Core Hunter data (\code{chdata}) containing genotypes,
 #'   phenotypes and/or a precomputed distance matrix. Typically the
 #'   data is obtained with \code{\link{coreHunterData}}. Can also be
@@ -213,7 +238,7 @@ sampleCore <- function(data, obj, size = 0.2, mode = c("default", "fast"), norma
 
   # run Core Hunter
   api <- ch.api()
-  sel <- api$sampleCore(j.args, mode, time, impr.time, !verbose)
+  sel <- api$sampleCore(j.args, mode, time, impr.time, genSeed(), !verbose)
   if(indices){
     sel <- toRIndices(sel)
   } else {
@@ -323,6 +348,11 @@ createArguments <- function(data, obj, size, normalize){
 
 }
 
+#' @importFrom stats runif
+genSeed <- function(){
+  # a single uniform draw between 0 and 2^31 - 1, rounded up to a whole number;
+  # the draw comes from R's RNG stream, so it follows any seed set via set.seed
+  seed <- ceiling(runif(1, min = 0, max = 2^31 - 1))
+  # wrap the value with rJava's .jlong before it is passed to the Java API
+  .jlong(seed)
+}
+
 # ---------- #
 # OBJECTIVES #
 # ---------- #

diff --git a/inst/java/corehunter-3.1.0.jar → inst/java/corehunter-3.2.0-SNAPSHOT.jar b/inst/java/corehunter-3.1.0.jar → inst/java/corehunter-3.2.0-SNAPSHOT.jar
diff --git a/man/exampleData.Rd b/man/exampleData.Rd
diff --git a/man/getNormalizationRanges.Rd b/man/getNormalizationRanges.Rd
diff --git a/man/phenotypes.Rd b/man/phenotypes.Rd
diff --git a/man/read.autodelim.Rd b/man/read.autodelim.Rd
diff --git a/man/sampleCore.Rd b/man/sampleCore.Rd
diff --git a/tests/testthat/data/distances-small.txt b/tests/testthat/data/distances-small.txt
@@ -1,6 +1,6 @@
-ID  	NAME	Alice	Dave	"Bob-1"	Bob-2	Carol
-Alice	    	0.0 	0.2 	0.4 	0.6 	0.8
-'Dave'	    	0.2 	0.0 	0.2 	0.4 	0.6
-Bob-1	Bob 	0.4 	0.2 	0.0 	0.1 	0.4
-Bob-2	"Bob" 	0.6 	0.4 	0.1 	0.0 	0.2
-Carol	    	0.8 	0.6 	0.4 	0.2 	0.0
+ID    	NAME	Alice	Dave	"Bob"	"Bob'"	Carol
+Alice 	    	0.0 	0.2 	0.4  	0.6   	0.8
+'Dave'	    	0.2 	0.0 	0.2  	0.4   	0.6
+Bob   	Bob 	0.4 	0.2 	0.0  	0.1   	0.4
+"Bob'"	"Bob"	0.6 	0.4 	0.1  	0.0   	0.2
+Carol 	    	0.8 	0.6 	0.4  	0.2   	0.0
diff --git a/tests/testthat/data/genotypes-bi-small.csv b/tests/testthat/data/genotypes-bi-small.csv
@@ -1,6 +1,6 @@
-ID     , NAME   , "mk1", mk2, mk3, mk4
-Alice  , Alice  , 1    , 0  , 2  , 1  
-Dave   ,        , 2    , 0  , 2  , 0  
-"Bob-1", 'Bob'  , 1    , 0  ,    , 0  
-Bob-2  , Bob    , 1    , 0  , 1  , 1  
-'Carol', "Carol", 1    , 0  ,    , 0  
+ID     , NAME   , "mk1", "mk,2", "mk'3", mk4
+Alice  , Alice  , 1    , 0     , 2     , 1  
+Dave   ,        , 2    , 0     , 2     , 0  
+"Bob"  , 'Bob'  , 1    , 0     ,       , 0  
+"Bob'" , Bob    , 1    , 0     , 1     , 1  
+'Carol', "Carol", 1    , 0     ,       , 0  
diff --git a/tests/testthat/data/genotypes-freq-small.csv b/tests/testthat/data/genotypes-freq-small.csv
@@ -1,7 +1,7 @@
-ID     , NAME   , mk1  , mk1    , 'mk1', mk2  , mk2  , mk3  , mk3  , "mk3" , mk4  , mk4  , mk4  , mk4 
-ALLELE ,        , mk1-1, "mk1-2", mk1-3, mk2-1, mk2-2,      , mk3-2, ""    , mk4-1, mk4-2, mk4-3, 'mk4-4'
-Alice  , Alice  ,      ,        ,      , 0.50 , 0.50 , 0.00 , 0.50 , 0.50  , 0.00 , 0.00 , 0.50 , 0.50 
-Dave   ,        , 1.00 , 0.00   , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50  , 1.00 , 0.00 , 0.00 , 0.00 
-Bob-1  , Bob    , 0.60 , 0.00   , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50  , 0.25 , 0.25 , 0.25 , 0.25 
-"Bob-2", Bob    ,      ,        ,      , 1.00 , 0.00 ,      ,      ,       , 0.00 , 0.00 , 1.00 , 0.00 
-Carol  , Carol  , 0.33 , 0.33   , 0.33 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50  , 0.50 , 0.00 , 0.50 , 0.00 
+ID     , NAME   , mk1  , mk1    , 'mk1', "mk,2", "mk,2", "mk'3", "mk'3", "mk'3", mk4  , mk4  , mk4  , mk4 
+ALLELE ,        , mk1-1, "mk1-2", mk1-3, mk2-1 , mk2-2 ,       , mk3-2 , ""    , mk4-1, mk4-2, mk4-3, 'mk4-4'
+Alice  , Alice  ,      ,        ,      , 0.50  , 0.50  , 0.00  , 0.50  , 0.50  , 0.00 , 0.00 , 0.50 , 0.50 
+Dave   ,        , 1.00 , 0.00   , 0.00 , 0.50  , 0.50  , 0.00  , 0.50  , 0.50  , 1.00 , 0.00 , 0.00 , 0.00 
+Bob    , Bob    , 0.60 , 0.00   , 0.40 , 0.50  , 0.50  , 0.00  , 0.50  , 0.50  , 0.25 , 0.25 , 0.25 , 0.25 
+"Bob'" , Bob    ,      ,        ,      , 1.00  , 0.00  ,       ,       ,       , 0.00 , 0.00 , 1.00 , 0.00 
+Carol  , Carol  , 0.33 , 0.33   , 0.33 , 0.50  , 0.50  , 0.00  , 0.50  , 0.50  , 0.50 , 0.00 , 0.50 , 0.00 
diff --git a/tests/testthat/data/genotypes-small.csv b/tests/testthat/data/genotypes-small.csv
@@ -1,6 +1,6 @@
-ID     , NAME , 'mk1-1', mk1-2, mk2-1, mk2-2, mk3 , mk3-2, mk4-1, "mk4-22"
-Alice  ,      , 1      , 3    , "B"  , B    , a1  , a1
-Dave   ,      , 2      , 2    , 'C'  , A    , a1  , a2   , +    , -
-Bob-1  , Bob  , 1      , 2    , 'D'  , D    , a2  , a2   , +    , +
-Bob-2  , "Bob", 2      , 3    , "B"  , B    , 'a2', "a1" , +    , -
-'Carol',      , 1      , 1    ,      ,      , a1  , a1   , -    , -
+ID     , NAME , 'mk1-1', mk1-2, "mk,2-1", "mk,2-2", "mk'3", "mk'3-2", mk4-1, "mk4-22"
+Alice  ,      , 1      , 3    , "B"     , B       , a1    , a1
+Dave   ,      , 2      , 2    , 'C'     , A       , a1    , a2      , +    , -
+Bob    , Bob  , 1      , 2    , 'D'     , D       , a2    , a2      , +    , +
+"Bob'" , "Bob", 2      , 3    , "B"     , B       , 'a2'  , "a1"    , +    , -
+'Carol',      , 1      , 1    ,         ,         , a1    , a1      , -    , -
diff --git a/tests/testthat/data/phenotypes-no-types.csv b/tests/testthat/data/phenotypes-no-types.csv
@@ -1,6 +1,6 @@
  ID     , NAME , trait 1, trait 2, "trait 3", trait 4, 'trait 5'
  "Alice",      , A      , x      , 4        , 1.4    , false
  Dave   ,      , B      , b      , 5        , 0.5    , true
- Bob-1  , 'Bob', A      , a      , 6        , 0.5    , true
- Bob-2  , Bob  , C      , c      , 9        , 0.5    , false
+ Bob    , 'Bob', A      , a      , 6        , 0.5    , true
+ "Bob'" , Bob  , C      , c      , 9        , 0.5    , false
  Carol  ,      , B      , c      , 1        , 1.3    , true
diff --git a/tests/testthat/data/phenotypes-small.csv b/tests/testthat/data/phenotypes-small.csv
@@ -4,6 +4,6 @@
  MAX    ,      ,          ,        , 10     , 2.0    ,
  Alice  ,      , "A"      , 'x'    , 4      , 1.4    , false
  "Dave" ,      , B        , b      , 5      , 0.5    , true
- Bob-1  , Bob  , A        , a      , 6      , 0.5    , true
- Bob-2  , "Bob", C        , "c"    , 9      , 0.5    , false
+ Bob    , Bob  , A        , a      , 6      , 0.5    , true
+ "Bob'" , "Bob", C        , "c"    , 9      , 0.5    , false
  'Carol',      , 'B'      , c      , 1      , 1.3    , true