Skip to content

Commit

Permalink
version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
loelschlaeger authored and cran-robot committed Oct 31, 2023
0 parents commit 93853d5
Show file tree
Hide file tree
Showing 14 changed files with 676 additions and 0 deletions.
25 changes: 25 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Package: portion
Type: Package
Title: Extracting a Data Portion
Version: 0.1.0
Authors@R: c(
person("Lennart", "Oelschl\u00e4ger",
email = "oelschlaeger.lennart@gmail.com",
role = c("aut", "cre"))
)
Description: Provides a simple method to extract portions of a vector, matrix, or data.frame.
The relative portion size and the way the portion is selected can be chosen.
License: GPL (>= 3)
Encoding: UTF-8
RoxygenNote: 7.2.3
Suggests: testthat (>= 3.0.0)
Config/testthat/edition: 3
Imports: stats
URL: https://github.com/loelschlaeger/portion
BugReports: https://github.com/loelschlaeger/portion/issues
NeedsCompilation: no
Packaged: 2023-10-30 18:52:13 UTC; loelschlaeger
Author: Lennart Oelschläger [aut, cre]
Maintainer: Lennart Oelschläger <oelschlaeger.lennart@gmail.com>
Repository: CRAN
Date/Publication: 2023-10-31 16:40:03 UTC
13 changes: 13 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
38b4d2555b18020ef907fa8a7fc918e9 *DESCRIPTION
8ee185833d37ff704a13f8d20db43cc3 *NAMESPACE
f0282165a6800e7e304710ecd96e9302 *NEWS.md
0cfec0fc1cd395fdb5bef2ecf076e847 *R/portion-package.R
9c7cd4c2b29ce4e053233c9ffb5a74b3 *R/portion.R
70b02dfbb1ad04e8bc51dbd2646791d1 *README.md
fe7cd3617e0d384316e0f6869009a2a5 *man/build_cluster.Rd
055ba12fa0d80170da00cec745d2c903 *man/cluster_indices.Rd
dd3cc9b8e5380dfa75b6246aa49a3939 *man/figures/logo.png
d3dfb258a558ebe20ddfa9a4c1141e56 *man/portion-package.Rd
2eaca7daecc2a29e587c8d503d4758d0 *man/portion.Rd
065cd7a05778c2f0138b6c02517f2502 *tests/testthat.R
1811466860511e9e8f3d15a19ba25eb3 *tests/testthat/test-portion.R
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Generated by roxygen2: do not edit by hand

S3method(portion,data.frame)
S3method(portion,list)
S3method(portion,matrix)
S3method(portion,numeric)
export(portion)
importFrom(stats,kmeans)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# portion 0.1.0

* Initial CRAN release.
8 changes: 8 additions & 0 deletions R/portion-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#' @aliases portion-package
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
#' @importFrom stats kmeans
## usethis namespace: end
NULL
206 changes: 206 additions & 0 deletions R/portion.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#' Extracting a data portion
#'
#' @description
#' extract a portion of data saved as a \code{vector}, \code{matrix},
#' \code{data.frame}, or \code{list}
#'
#' @param x
#' an object to be portioned
#'
#' @param proportion
#' a \code{numeric} between 0 and 1, the relative portion size
#'
#' @param how
#' a \code{character}, specifying the portion method, one of:
#' - \code{"random"} (default), portion at random
#' - \code{"first"}, portion to the first elements
#' - \code{"last"}, portion to the last elements
#' - \code{"similar"}, portion to similar elements based on clustering
#' - \code{"dissimilar"}, portion to dissimilar elements based on clustering
#'
#' @param centers
#' (only relevant if \code{how} is \code{"similar} or \code{"dissimilar)})
#' an \code{integer} (default is \code{2}), passed on to
#' \code{\link[stats]{kmeans}}
#'
#' @param byrow
#' \code{TRUE} to portion row-wise (default) or \code{FALSE} to portion
#' column-wise
#'
#' @param ignore
#' (only relevant if \code{how} is \code{"similar} or \code{"dissimilar)})
#' an \code{integer} vector of row indices (or column indices if
#' \code{byrow = FALSE}) to ignore during clustering
#'
#' @param ...
#' further arguments to be passed to or from other methods
#'
#' @return
#' the portioned input \code{x} with the (row, column) indices used
#' added as attributes \code{"indices"}
#'
#' @export
#'
#' @examples
#' # can portion vectors, matrices, data.frames, and lists of such types
#' portion(
#' list(
#' 1:10,
#' matrix(LETTERS[1:12], nrow = 3, ncol = 4),
#' data.frame(a = 1:6, b = -6:-1)
#' ),
#' proportion = 0.5,
#' how = "first"
#' )
#'
#' # can portion similar elements
#' portion(c(rep(1, 5), rep(2, 5)), proportion = 0.5, how = "similar")

portion <- function(x, proportion, how, centers = 2, ...) {
if (missing(proportion)) {
stop("please specify 'proportion'")
}
stopifnot(
"please set 'proportion' to a numeric between 0 and 1" =
is.numeric(proportion) && length(proportion) == 1 && proportion <= 1 &&
proportion >= 0
)
if (missing(how)) {
stop("please specify 'how'")
}
how <- match.arg(how, c("random", "first", "last", "similar", "dissimilar"))
UseMethod("portion")
}

#' @export
#' @rdname portion

portion.numeric <- function(x, proportion, how, centers = 2, ...) {
n <- length(x)
m <- ceiling(n * proportion)
if (how == "random") {
ind <- sort(sample.int(n, m))
} else if (how == "first") {
ind <- seq_len(m)
} else if (how == "last") {
ind <- sort(rev(seq_len(n))[1:m])
} else if (how %in% c("similar", "dissimilar")) {
cluster <- build_cluster(x, centers)
ind <- cluster_indices(cluster, m, similar = how == "similar")
} else {
stop("please use a valid method for 'how'")
}
structure(x[ind], "indices" = ind)
}

#' @export
#' @rdname portion

portion.matrix <- function(
x, proportion, how, centers = 2, byrow = TRUE, ignore = integer(), ...
) {
if (!byrow) x <- t(x)
n <- nrow(x)
m <- ceiling(n * proportion)
if (how == "random") {
ind <- sort(sample.int(n, m))
} else if (how == "first") {
ind <- seq_len(m)
} else if (how == "last") {
ind <- sort(rev(seq_len(n))[1:m])
} else if (how %in% c("similar", "dissimilar")) {
if (length(ignore) > 0) {
x_select <- x[-ignore, , drop = FALSE]
} else {
x_select <- x
}
cluster <- build_cluster(x_select, centers)
ind <- cluster_indices(cluster, m, similar = how == "similar")
} else {
stop("please use a valid method for 'how'")
}
x <- x[ind, , drop = FALSE]
if (!byrow) x <- t(x)
structure(x, "indices" = ind)
}

#' @export
#' @rdname portion

portion.data.frame <- function(
x, proportion, how, centers = 2, byrow = TRUE, ignore = integer(), ...
) {
if (length(ignore) > 0) {
if (byrow) {
x_select <- x[-ignore, , drop = FALSE]
} else {
x_select <- x[, -ignore, drop = FALSE]
}
} else {
x_select <- x
}
x_portion <- portion(
as.matrix(x), proportion = proportion, how = how, centers = centers,
byrow = byrow, ignore = integer()
)
ind <- attr(x_portion, "indices")
if (byrow) {
structure(x[ind, ], "indices" = ind)
} else {
structure(x[, ind], "indices" = ind)
}
}

#' @export
#' @rdname portion

portion.list <- function(x, proportion, how, centers = 2, ...) {
lapply(x, portion, proportion = proportion, how = how, centers = centers, ...)
}

#' build clusters
#' @keywords internal
#' @return
#' a \code{vector} of indices, indicating the allocated class

build_cluster <- function(x, centers) {
stopifnot("'x' must be numeric" = is.numeric(x))
stopifnot("'centers' must be a single integer" = length(centers) == 1 &&
is.numeric(centers) && centers == as.integer(centers))
stats::kmeans(x, centers = centers)$cluster
}

#' choose cluster indices
#' @keywords internal
#' @param cluster
#' a \code{vector} of indices, indicating the allocated class
#' @param m
#' the number of required indices
#' @param similar
#' either \code{TRUE} for similar classes, or \code{FALSE} for dissimilar
#' @return
#' a subset of \code{ind} of length \code{m}

cluster_indices <- function(cluster, m, similar = TRUE) {
centers <- length(unique(cluster))
ind <- integer(0)
if (similar) {
i <- 1
while (length(ind) < m && i <= centers) {
ind_i <- which(cluster == i)
ind <- c(ind, ind_i[seq_len(min(m - length(ind), length(ind_i)))])
i <- i + 1
}
} else {
ind_cluster <- split(seq_along(cluster), cluster)
i <- 0
while (length(ind) < m) {
i_mod <- i %% centers + 1
i <- i + 1
if (length(ind_cluster[[i_mod]]) == 0) next
ind <- c(ind, ind_cluster[[i_mod]][1])
ind_cluster[[i_mod]] <- ind_cluster[[i_mod]][-1]
}
}
sort(ind)
}
113 changes: 113 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# Extracting a Data Portion <img src="man/figures/logo.png" align="right" height="139" alt="" />

<!-- badges: start -->

[![R-CMD-check](https://github.com/loelschlaeger/portion/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/loelschlaeger/portion/actions/workflows/R-CMD-check.yaml)
[![CRAN
status](https://www.r-pkg.org/badges/version/portion)](https://CRAN.R-project.org/package=portion)
[![Codecov test
coverage](https://codecov.io/gh/loelschlaeger/portion/branch/master/graph/badge.svg)](https://app.codecov.io/gh/loelschlaeger/portion?branch=master)
<!-- badges: end -->

`{portion}` is a small `R` package that helps to extract a data portion:

1. works for `vector`, `matrix`, `data.frame`, and `list` objects

2. the relative portion size can be selected

3. allows to select first, last, random, similar or dissimilar data
points

4. can portion either row- or column-wise

## Installation

``` r
install.packages("portion")
```

And the development version from [GitHub](https://github.com/) with:

``` r
# install.packages("devtools")
devtools::install_github("loelschlaeger/portion")
```

## Example

Can portion a `vector`:

``` r
portion(c(1:5, 51:55), proportion = 0.5, how = "similar")
#> [1] 1 2 3 4 5
#> attr(,"indices")
#> [1] 1 2 3 4 5
portion(1:10, proportion = 0.4, how = "dissimilar", centers = 4)
#> [1] 1 3 5 8
#> attr(,"indices")
#> [1] 1 3 5 8
```

Can portion a `matrix`:

``` r
portion(matrix(LETTERS[1:24], nrow = 4), proportion = 0.5, how = "first")
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] "A" "E" "I" "M" "Q" "U"
#> [2,] "B" "F" "J" "N" "R" "V"
#> attr(,"indices")
#> [1] 1 2
portion(matrix(LETTERS[1:24], nrow = 4), proportion = 0.5, how = "first", byrow = FALSE)
#> [,1] [,2] [,3]
#> [1,] "A" "E" "I"
#> [2,] "B" "F" "J"
#> [3,] "C" "G" "K"
#> [4,] "D" "H" "L"
#> attr(,"indices")
#> [1] 1 2 3
```

Can portion a `data.frame`:

``` r
portion(as.data.frame(diag(8)), proportion = 0.3, how = "random")
#> V1 V2 V3 V4 V5 V6 V7 V8
#> 3 0 0 1 0 0 0 0 0
#> 4 0 0 0 1 0 0 0 0
#> 5 0 0 0 0 1 0 0 0
portion(as.data.frame(diag(8)), proportion = 0.3, how = "random", byrow = FALSE)
#> V2 V4 V8
#> 1 0 0 0
#> 2 1 0 0
#> 3 0 0 0
#> 4 0 1 0
#> 5 0 0 0
#> 6 0 0 0
#> 7 0 0 0
#> 8 0 0 1
```

Can work on a `list`:

``` r
portion(list(1:5, diag(3), data.frame(1:3, 2:4)), proportion = 0.5, how = "last")
#> [[1]]
#> [1] 3 4 5
#> attr(,"indices")
#> [1] 3 4 5
#>
#> [[2]]
#> [,1] [,2] [,3]
#> [1,] 0 1 0
#> [2,] 0 0 1
#> attr(,"indices")
#> [1] 2 3
#>
#> [[3]]
#> X1.3 X2.4
#> 2 2 3
#> 3 3 4
```
15 changes: 15 additions & 0 deletions man/build_cluster.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 93853d5

Please sign in to comment.