version 0.1.0

cran · Oct 31, 2023 · 93853d5 · 93853d5
commit 93853d5
Show file tree

Hide file tree

Showing 14 changed files with 676 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,25 @@
+Package: portion
+Type: Package
+Title: Extracting a Data Portion
+Version: 0.1.0
+Authors@R: c(
+    person("Lennart", "Oelschl\u00e4ger", 
+           email = "oelschlaeger.lennart@gmail.com", 
+           role = c("aut", "cre"))
+           )
+Description: Provides a simple method to extract portions of a vector, matrix, or data.frame. 
+    The relative portion size and the way the portion is selected can be chosen.
+License: GPL (>= 3)
+Encoding: UTF-8
+RoxygenNote: 7.2.3
+Suggests: testthat (>= 3.0.0)
+Config/testthat/edition: 3
+Imports: stats
+URL: https://github.com/loelschlaeger/portion
+BugReports: https://github.com/loelschlaeger/portion/issues
+NeedsCompilation: no
+Packaged: 2023-10-30 18:52:13 UTC; loelschlaeger
+Author: Lennart Oelschläger [aut, cre]
+Maintainer: Lennart Oelschläger <oelschlaeger.lennart@gmail.com>
+Repository: CRAN
+Date/Publication: 2023-10-31 16:40:03 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,13 @@
+38b4d2555b18020ef907fa8a7fc918e9 *DESCRIPTION
+8ee185833d37ff704a13f8d20db43cc3 *NAMESPACE
+f0282165a6800e7e304710ecd96e9302 *NEWS.md
+0cfec0fc1cd395fdb5bef2ecf076e847 *R/portion-package.R
+9c7cd4c2b29ce4e053233c9ffb5a74b3 *R/portion.R
+70b02dfbb1ad04e8bc51dbd2646791d1 *README.md
+fe7cd3617e0d384316e0f6869009a2a5 *man/build_cluster.Rd
+055ba12fa0d80170da00cec745d2c903 *man/cluster_indices.Rd
+dd3cc9b8e5380dfa75b6246aa49a3939 *man/figures/logo.png
+d3dfb258a558ebe20ddfa9a4c1141e56 *man/portion-package.Rd
+2eaca7daecc2a29e587c8d503d4758d0 *man/portion.Rd
+065cd7a05778c2f0138b6c02517f2502 *tests/testthat.R
+1811466860511e9e8f3d15a19ba25eb3 *tests/testthat/test-portion.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(portion,data.frame)
+S3method(portion,list)
+S3method(portion,matrix)
+S3method(portion,numeric)
+export(portion)
+importFrom(stats,kmeans)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# portion 0.1.0
+
+* Initial CRAN release.
diff --git a/R/portion-package.R b/R/portion-package.R
@@ -0,0 +1,8 @@
+#' @aliases portion-package
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+#' @importFrom stats kmeans
+## usethis namespace: end
+NULL
diff --git a/R/portion.R b/R/portion.R
@@ -0,0 +1,206 @@
+#' Extracting a data portion
+#'
+#' @description
+#' extract a portion of data saved as a \code{vector}, \code{matrix},
+#' \code{data.frame}, or \code{list}
+#'
+#' @param x
+#' an object to be portioned
+#'
+#' @param proportion
+#' a \code{numeric} between 0 and 1, the relative portion size
+#'
+#' @param how
+#' a \code{character}, specifying the portion method, one of:
+#' - \code{"random"} (default), portion at random
+#' - \code{"first"}, portion to the first elements
+#' - \code{"last"}, portion to the last elements
+#' - \code{"similar"}, portion to similar elements based on clustering
+#' - \code{"dissimilar"}, portion to dissimilar elements based on clustering
+#'
+#' @param centers
+#' (only relevant if \code{how} is \code{"similar} or \code{"dissimilar)})
+#' an \code{integer} (default is \code{2}), passed on to
+#' \code{\link[stats]{kmeans}}
+#'
+#' @param byrow
+#' \code{TRUE} to portion row-wise (default) or \code{FALSE} to portion
+#' column-wise
+#'
+#' @param ignore
+#' (only relevant if \code{how} is \code{"similar} or \code{"dissimilar)})
+#' an \code{integer} vector of row indices (or column indices if
+#' \code{byrow = FALSE}) to ignore during clustering
+#'
+#' @param ...
+#' further arguments to be passed to or from other methods
+#'
+#' @return
+#' the portioned input \code{x} with the (row, column) indices used
+#' added as attributes \code{"indices"}
+#'
+#' @export
+#'
+#' @examples
+#' # can portion vectors, matrices, data.frames, and lists of such types
+#' portion(
+#'   list(
+#'     1:10,
+#'     matrix(LETTERS[1:12], nrow = 3, ncol = 4),
+#'     data.frame(a = 1:6, b = -6:-1)
+#'   ),
+#'   proportion = 0.5,
+#'   how = "first"
+#' )
+#'
+#' # can portion similar elements
+#' portion(c(rep(1, 5), rep(2, 5)), proportion = 0.5, how = "similar")
+
+portion <- function(x, proportion, how, centers = 2, ...) {
+  if (missing(proportion)) {
+    stop("please specify 'proportion'")
+  }
+  stopifnot(
+    "please set 'proportion' to a numeric between 0 and 1" =
+      is.numeric(proportion) && length(proportion) == 1 && proportion <= 1 &&
+      proportion >= 0
+  )
+  if (missing(how)) {
+    stop("please specify 'how'")
+  }
+  how <- match.arg(how, c("random", "first", "last", "similar", "dissimilar"))
+  UseMethod("portion")
+}
+
+#' @export
+#' @rdname portion
+
+portion.numeric <- function(x, proportion, how, centers = 2, ...) {
+  n <- length(x)
+  m <- ceiling(n * proportion)
+  if (how == "random") {
+    ind <- sort(sample.int(n, m))
+  } else if (how == "first") {
+    ind <- seq_len(m)
+  } else if (how == "last") {
+    ind <- sort(rev(seq_len(n))[1:m])
+  } else if (how %in% c("similar", "dissimilar")) {
+    cluster <- build_cluster(x, centers)
+    ind <- cluster_indices(cluster, m, similar = how == "similar")
+  } else {
+    stop("please use a valid method for 'how'")
+  }
+  structure(x[ind], "indices" = ind)
+}
+
+#' @export
+#' @rdname portion
+
+portion.matrix <- function(
+    x, proportion, how, centers = 2, byrow = TRUE, ignore = integer(), ...
+) {
+  if (!byrow) x <- t(x)
+  n <- nrow(x)
+  m <- ceiling(n * proportion)
+  if (how == "random") {
+    ind <- sort(sample.int(n, m))
+  } else if (how == "first") {
+    ind <- seq_len(m)
+  } else if (how == "last") {
+    ind <- sort(rev(seq_len(n))[1:m])
+  } else if (how %in% c("similar", "dissimilar")) {
+    if (length(ignore) > 0) {
+      x_select <- x[-ignore, , drop = FALSE]
+    } else {
+      x_select <- x
+    }
+    cluster <- build_cluster(x_select, centers)
+    ind <- cluster_indices(cluster, m, similar = how == "similar")
+  } else {
+    stop("please use a valid method for 'how'")
+  }
+  x <- x[ind, , drop = FALSE]
+  if (!byrow) x <- t(x)
+  structure(x, "indices" = ind)
+}
+
+#' @export
+#' @rdname portion
+
+portion.data.frame <- function(
+    x, proportion, how, centers = 2, byrow = TRUE, ignore = integer(), ...
+) {
+  if (length(ignore) > 0) {
+    if (byrow) {
+      x_select <- x[-ignore, , drop = FALSE]
+    } else {
+      x_select <- x[, -ignore, drop = FALSE]
+    }
+  } else {
+    x_select <- x
+  }
+  x_portion <- portion(
+    as.matrix(x), proportion = proportion, how = how, centers = centers,
+    byrow = byrow, ignore = integer()
+  )
+  ind <- attr(x_portion, "indices")
+  if (byrow) {
+    structure(x[ind, ], "indices" = ind)
+  } else {
+    structure(x[, ind], "indices" = ind)
+  }
+}
+
+#' @export
+#' @rdname portion
+
+portion.list <- function(x, proportion, how, centers = 2, ...) {
+  lapply(x, portion, proportion = proportion, how = how, centers = centers, ...)
+}
+
+#' build clusters
+#' @keywords internal
+#' @return
+#' a \code{vector} of indices, indicating the allocated class
+
+build_cluster <- function(x, centers) {
+  stopifnot("'x' must be numeric" = is.numeric(x))
+  stopifnot("'centers' must be a single integer" = length(centers) == 1 &&
+              is.numeric(centers) && centers == as.integer(centers))
+  stats::kmeans(x, centers = centers)$cluster
+}
+
+#' choose cluster indices
+#' @keywords internal
+#' @param cluster
+#' a \code{vector} of indices, indicating the allocated class
+#' @param m
+#' the number of required indices
+#' @param similar
+#' either \code{TRUE} for similar classes, or \code{FALSE} for dissimilar
+#' @return
+#' a subset of \code{ind} of length \code{m}
+
+cluster_indices <- function(cluster, m, similar = TRUE) {
+  centers <- length(unique(cluster))
+  ind <- integer(0)
+  if (similar) {
+    i <- 1
+    while (length(ind) < m && i <= centers) {
+      ind_i <- which(cluster == i)
+      ind <- c(ind, ind_i[seq_len(min(m - length(ind), length(ind_i)))])
+      i <- i + 1
+    }
+  } else {
+    ind_cluster <- split(seq_along(cluster), cluster)
+    i <- 0
+    while (length(ind) < m) {
+      i_mod <- i %% centers + 1
+      i <- i + 1
+      if (length(ind_cluster[[i_mod]]) == 0) next
+      ind <- c(ind, ind_cluster[[i_mod]][1])
+      ind_cluster[[i_mod]] <- ind_cluster[[i_mod]][-1]
+    }
+  }
+  sort(ind)
+}
diff --git a/README.md b/README.md
@@ -0,0 +1,113 @@
+
+<!-- README.md is generated from README.Rmd. Please edit that file -->
+
+# Extracting a Data Portion <img src="man/figures/logo.png" align="right" height="139" alt="" />
+
+<!-- badges: start -->
+
+[![R-CMD-check](https://github.com/loelschlaeger/portion/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/loelschlaeger/portion/actions/workflows/R-CMD-check.yaml)
+[![CRAN
+status](https://www.r-pkg.org/badges/version/portion)](https://CRAN.R-project.org/package=portion)
+[![Codecov test
+coverage](https://codecov.io/gh/loelschlaeger/portion/branch/master/graph/badge.svg)](https://app.codecov.io/gh/loelschlaeger/portion?branch=master)
+<!-- badges: end -->
+
+`{portion}` is a small `R` package that helps to extract a data portion:
+
+1.  works for `vector`, `matrix`, `data.frame`, and `list` objects
+
+2.  the relative portion size can be selected
+
+3.  allows to select first, last, random, similar or dissimilar data
+    points
+
+4.  can portion either row- or column-wise
+
+## Installation
+
+``` r
+install.packages("portion")
+```
+
+And the development version from [GitHub](https://github.com/) with:
+
+``` r
+# install.packages("devtools")
+devtools::install_github("loelschlaeger/portion")
+```
+
+## Example
+
+Can portion a `vector`:
+
+``` r
+portion(c(1:5, 51:55), proportion = 0.5, how = "similar")
+#> [1] 1 2 3 4 5
+#> attr(,"indices")
+#> [1] 1 2 3 4 5
+portion(1:10, proportion = 0.4, how = "dissimilar", centers = 4)
+#> [1] 1 3 5 8
+#> attr(,"indices")
+#> [1] 1 3 5 8
+```
+
+Can portion a `matrix`:
+
+``` r
+portion(matrix(LETTERS[1:24], nrow = 4), proportion = 0.5, how = "first")
+#>      [,1] [,2] [,3] [,4] [,5] [,6]
+#> [1,] "A"  "E"  "I"  "M"  "Q"  "U" 
+#> [2,] "B"  "F"  "J"  "N"  "R"  "V" 
+#> attr(,"indices")
+#> [1] 1 2
+portion(matrix(LETTERS[1:24], nrow = 4), proportion = 0.5, how = "first", byrow = FALSE)
+#>      [,1] [,2] [,3]
+#> [1,] "A"  "E"  "I" 
+#> [2,] "B"  "F"  "J" 
+#> [3,] "C"  "G"  "K" 
+#> [4,] "D"  "H"  "L" 
+#> attr(,"indices")
+#> [1] 1 2 3
+```
+
+Can portion a `data.frame`:
+
+``` r
+portion(as.data.frame(diag(8)), proportion = 0.3, how = "random")
+#>   V1 V2 V3 V4 V5 V6 V7 V8
+#> 3  0  0  1  0  0  0  0  0
+#> 4  0  0  0  1  0  0  0  0
+#> 5  0  0  0  0  1  0  0  0
+portion(as.data.frame(diag(8)), proportion = 0.3, how = "random", byrow = FALSE)
+#>   V2 V4 V8
+#> 1  0  0  0
+#> 2  1  0  0
+#> 3  0  0  0
+#> 4  0  1  0
+#> 5  0  0  0
+#> 6  0  0  0
+#> 7  0  0  0
+#> 8  0  0  1
+```
+
+Can work on a `list`:
+
+``` r
+portion(list(1:5, diag(3), data.frame(1:3, 2:4)), proportion = 0.5, how = "last")
+#> [[1]]
+#> [1] 3 4 5
+#> attr(,"indices")
+#> [1] 3 4 5
+#> 
+#> [[2]]
+#>      [,1] [,2] [,3]
+#> [1,]    0    1    0
+#> [2,]    0    0    1
+#> attr(,"indices")
+#> [1] 2 3
+#> 
+#> [[3]]
+#>   X1.3 X2.4
+#> 2    2    3
+#> 3    3    4
+```
diff --git a/man/build_cluster.Rd b/man/build_cluster.Rd