version 1.0.0

cran · Dec 5, 2023 · 8c665b1 · 8c665b1
commit 8c665b1
Show file tree

Hide file tree

Showing 51 changed files with 2,275 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,57 @@
+Package: profrep
+Title: Profile Repeatability
+Version: 1.0.0
+Authors@R: c(
+  person("Ursula K.", "Beattie",
+         email="ursula.beattie@tufts.edu", 
+         role=c("cre", "aut", "cph"),
+         comment=c(ORCID="0000-0002-7131-3712")
+        ),
+  person("David", "Harris",
+         email="drharris115@gmail.com",
+         role=c("aut", "cph") 
+        ),
+  person("L. Michael", "Romero",
+         email="michael.romero@tufts.edu",
+         role=c("aut", "cph"),
+         comment=c(ORCID="0000-0002-8854-8884")
+        ),
+  person("J. Michael", "Reed",
+         email="michael.reed@tufts.edu",
+         role=c("aut", "cph"),
+         comment=c(ORCID="0000-0002-3571-2652")
+        ),
+  person("Zachary R.", "Weaver",
+         email="zrweaver447@gmail.com",
+         role=c("aut", "cph"),
+         comment=c(ORCID="0000-0001-6314-0690")
+         )
+  )
+Description: Calculates profile repeatability for replicate stress response 
+  curves, or similar time-series data. Profile repeatability is an individual 
+  repeatability metric that uses the variances at each timepoint, the maximum 
+  variance, the number of crossings (lines that cross over each other), and 
+  the number of replicates to compute the repeatability score. 
+  For more information see Reed et al. (2019) <doi:10.1016/j.ygcen.2018.09.015>. 
+Depends: R (>= 2.10)
+License: MIT + file LICENSE
+URL: https://ubeattie.github.io/profrep/
+BugReports: https://github.com/ubeattie/profrep/issues
+Encoding: UTF-8
+RoxygenNote: 7.2.3
+Suggests: knitr, rmarkdown, testthat (>= 3.0.0)
+Config/testthat/edition: 3
+Imports: stats
+LazyData: true
+VignetteBuilder: knitr
+NeedsCompilation: no
+Packaged: 2023-12-01 20:50:46 UTC; cleanaccount
+Author: Ursula K. Beattie [cre, aut, cph]
+    (<https://orcid.org/0000-0002-7131-3712>),
+  David Harris [aut, cph],
+  L. Michael Romero [aut, cph] (<https://orcid.org/0000-0002-8854-8884>),
+  J. Michael Reed [aut, cph] (<https://orcid.org/0000-0002-3571-2652>),
+  Zachary R. Weaver [aut, cph] (<https://orcid.org/0000-0001-6314-0690>)
+Maintainer: Ursula K. Beattie <ursula.beattie@tufts.edu>
+Repository: CRAN
+Date/Publication: 2023-12-04 16:40:08 UTC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2023
+COPYRIGHT HOLDER: profrep authors
diff --git a/MD5 b/MD5
@@ -0,0 +1,50 @@
+e6298f479f277bbe4522ec63cc3de286 *DESCRIPTION
+c673bd4b5854835da51ddf9b6885ebc2 *LICENSE
+34572eac501bc5100ad23dc98830af0c *NAMESPACE
+8e51344e6eb12ecb2394afd845c8b3b2 *NEWS.md
+1ca78358f97042fdb8f491ba0fc57861 *R/calculate_crossovers.R
+576119294ad4eb059317f3cd504bae99 *R/clean_data.R
+192656bb75e3666f3ad4367424fefcba *R/data.R
+3acfb0709fbe2bef428ef8040ad7aeaf *R/do_ordering.R
+2e8033940e5f5e800d851fcce734f318 *R/find_next_good_datapoint.R
+0b203e9d2c4881782960fa67b703a6d1 *R/get_vars.R
+8acd46b20b3211d4845d85cb9589f9da *R/math_helpers.R
+43bd67968b1c1a40cda56c7469e1f026 *R/profrep.R
+52147f37fce73f735b29e5de35f67693 *R/retrieve_good_data.R
+b647f1598704d670b6be1c17cb4c8c56 *R/score_dfs.R
+f7ddb49946273e29ecad6bbd1f4c4026 *R/score_individual_df.R
+ecceae992e494864bbeda8ac08163299 *README.md
+2213ddbc2f31d22f2d576c97aacc63ea *build/vignette.rds
+6e92e02fe5e0c6bad0cadfd134d1838f *data/example_two_point_data.rda
+0a29398ccd3773964931980954f69f4d *data/sparrow_repeatability_three_point.rda
+bfe85f57e0bf875f0b0f40e40faef7ea *data/synthetic_data_four_point.rda
+c16ea34db695b640bdfb46fcc766728e *inst/CITATION
+8f483c5d47aff30c5f4045c5678da77a *inst/doc/profrep.R
+096f0937466d622974f36edf21650d4d *inst/doc/profrep.Rmd
+fa542f5e23c2ac5c724dce2031627267 *inst/doc/profrep.html
+9d75fe6f389eccc0c05a90062104afc5 *man/calculate_crossovers.Rd
+d63166952e259db0a2d8b701545b2481 *man/clean_data.Rd
+e858adbe1e78c77d56c706d5b0deff8d *man/do_ordering.Rd
+25bca58a5b0a6ac4bf34e858cc05dba4 *man/example_two_point_data.Rd
+32cb276b6dcaf4f2afe9e06868a42cfa *man/figures/README-pressure-1.png
+b27988176fca5a36829bb73a5442e914 *man/find_next_good_datapoint.Rd
+82d548899046cedd9ac1c8cc8fd85512 *man/get_vars.Rd
+4006b9244131e17d8ef06b742e2341be *man/profrep.Rd
+c4489fbc9057aae1a890031d5b208ddf *man/retrieve_good_data.Rd
+5b5335651cdbd2cafa97487fb47b4b7e *man/score_dfs.Rd
+d5f65da02a92cbd56f352a3225a490a9 *man/score_individual_df.Rd
+e272951c8e2d5206662a25cd78744881 *man/sigmoid.Rd
+3970882116ae02715df5ccbbcc4f1051 *man/sparrow_repeatability_three_point.Rd
+4b0ee5042ea9bce950f1dc828cab52d7 *man/synthetic_data_four_point.Rd
+c8bdad69e0dcb2358b5765e3203a4bc6 *tests/testthat.R
+f3a39141a83bd329b1f3e36de27a36eb *tests/testthat/test-calculate_crossovers.R
+29d8c55a3b60c64b2ab023326503c1e9 *tests/testthat/test-clean_data.R
+5448bbec379e9f4a4679412378d6992e *tests/testthat/test-do_ordering.R
+770e8726889cb9c67fdba310e5da362d *tests/testthat/test-find_next_good_datapoint.R
+6b4b9b9111dcf49fa85b8ca19b9d79db *tests/testthat/test-get_vars.R
+ec9f3de3019bb3919d55266853a8880f *tests/testthat/test-math_helpers.R
+976396153e4653c0701196ff42840011 *tests/testthat/test-profrep.R
+e05c02d8b4d0e4cf29872a2fb038bed8 *tests/testthat/test-retrieve_good_data.R
+5236d851dcc82c65b2730f76d0fc129c *tests/testthat/test-score_dfs.R
+40bf69d9d539f62e04eca95337a0f942 *tests/testthat/test-score_individual_df.R
+096f0937466d622974f36edf21650d4d *vignettes/profrep.Rmd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,14 @@
+# Generated by roxygen2: do not edit by hand
+
+export(calculate_crossovers)
+export(clean_data)
+export(do_ordering)
+export(find_next_good_datapoint)
+export(get_vars)
+export(profrep)
+export(retrieve_good_data)
+export(score_dfs)
+export(score_individual_df)
+export(sigmoid)
+importFrom(stats,na.omit)
+importFrom(utils,tail)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,5 @@
+# profrep 1.0.0
+
+# profrep 1.0.0
+
+* Initial CRAN submission.
diff --git a/R/calculate_crossovers.R b/R/calculate_crossovers.R
@@ -0,0 +1,59 @@
+#' @title Calculate the Number of Crossovers
+#' 
+#' @details
+#' This function calculates the number of crossovers in a dataset by comparing
+#' the values of replicates across multiple trials. It assumes that missing
+#' values (NAs) have been interpolated using the `clean_data` function.
+#'
+#' @param individual_df A data frame containing the individual dataset.
+#' @param n_trials The total number of trials in the dataset (the number of rows)
+#' @param n_replicates The total number of replicates in each trial (the number of columns - 2)
+#'
+#' @return The number of crossovers detected in the dataset.
+#'
+#' @seealso \code{\link{clean_data}} for information on data cleaning.
+#'
+#' @examples
+#' data <- matrix(
+#'    c(
+#'      1, 60, 1, 2, 3, 4, 5,   # No NA values
+#'      1, 90, 9, NA, 4, NA, 2,  # NA Values in row
+#'      1, 120, 3, 6, NA, NA, 9  # Consecutive NA values
+#'     ),
+#'     nrow = 3,
+#'     byrow=TRUE
+#' )
+#' n_trials <- nrow(data)
+#' n_replicates <- ncol(data) - 2
+#' crossovers <- calculate_crossovers(data, n_trials, n_replicates)
+#' cat("Number of crossovers:", crossovers, "\n")
+#'
+#' @export
+calculate_crossovers <- function(individual_df, n_trials, n_replicates) {
+  indicators <- c()
+  clean_df <- clean_data(data=individual_df, n_trials=n_trials, n_replicates=n_replicates)
+
+  for (t in 1:n_trials) {
+    for (i in 1:(n_replicates - 1)) {
+      val <- clean_df[t, i + 2]
+      for (j in (i + 1):n_replicates) {
+        nxt <- -1
+        if (val >= clean_df[t, j + 2]) {
+          nxt <- 1
+        }
+        indicators <- c(indicators, nxt)
+      }
+    }
+  }
+  n_pairs = n_replicates * (n_replicates - 1) / 2
+  M <- matrix(indicators, n_pairs, n_trials)
+  n_crossings <- 0
+  for (t in 1:(n_trials-1)) {
+    for (i in 1:n_pairs) {
+      if (M[i,t] * M[i, t+1] == -1) {
+        n_crossings <- n_crossings + 1
+      }
+    }
+  }
+  return(n_crossings)
+}
diff --git a/R/clean_data.R b/R/clean_data.R
@@ -0,0 +1,60 @@
+#' @title Clean Data by Interpolating Missing Values
+#'
+#' @details
+#' This function cleans a dataset by interpolating missing values in the replicate
+#' columns of each row using neighboring values. If the data frame ends in null values
+#' (the last columns are nulls), it will extrapolate from the last value. If the
+#' first value is null, it will loop around and pull from the last replicate to
+#' perform the interpolation between the last replicate and the second replicate.
+#'
+#' @param data A data frame containing the dataset to be cleaned.
+#' @param n_trials The total number of rows in the dataset.
+#' @param n_replicates The total number of replicate columns in each row.
+#'
+#' @return A cleaned data frame with missing values interpolated.
+#'
+#' @seealso \code{\link{find_next_good_datapoint}} for details on the interpolation process.
+#'
+#' @examples
+#' my_data <- matrix(
+#'    c(
+#'      1, 60, 1, 2, 3, 4, 5,   # No NA values
+#'      1, 90, 9, NA, 4, NA, 2,  # NA Values in row
+#'      1, 120, 3, 6, NA, NA, 9  # Consecutive NA values
+#'     ),
+#'     nrow = 3,
+#'     byrow=TRUE
+#'  )
+#' cleaned_data <- clean_data(my_data, n_trials = 3, n_replicates = 5)
+#' print(my_data)
+#' print(cleaned_data)
+#' 
+#' @importFrom stats na.omit
+#' @importFrom utils tail
+#'
+#' @export
+clean_data <- function(data, n_trials, n_replicates) {
+  missing_set <- c()
+
+  for (t in 1:n_trials) { # loops through rows
+    trial_row <- data[t, 3:ncol(data)] # get the row of replicate data only
+
+    for (i_rep in 1:n_replicates) { # loops through replicate columns
+      # replicate columns are really index 3 to the end of the data frame
+      if (is.na(trial_row[i_rep])) {
+        n_missing <- length(missing_set) + 1 # increase number missing 
+        missing_set <- c(missing_set, t*(n_replicates -1) + i_rep) # add to missing set
+
+        interp_val <- find_next_good_datapoint(trial_row, i_rep, n_replicates)
+        # If the first value is null, need to pull the last replicate value.
+        if (i_rep == 1) {first_val = utils::tail(stats::na.omit(unlist(trial_row[1,])), 1)}
+        else {first_val = trial_row[i_rep - 1]}
+
+        replacement_val <- first_val + interp_val / 2 
+        trial_row[i_rep] <- replacement_val
+      }
+    }
+    data[t, 3:ncol(data)] <- trial_row
+  }
+  return(data)
+}
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,56 @@
+#' Example Data: Two Point Data
+#' 
+#' An example of data that one would perform profile repeatability on. 
+#' Consists of 9 individual animals, with corticosterone data taken at 2 timepoints (n_trials = 2), baseline (time = 3) and stress-induced (time = 30).
+#' Then, there are 28 replicate columns.
+#' 
+#' @format ## `example_two_point_data`
+#' A dataframe with 10 rows and 30 columns:
+#' \describe{
+#'    \item{Animal}{The animal name/unique identifier}
+#'    \item{Time}{The time of the measurement, in days.}
+#'    \item{SD.DR}{The name of the replicate column.}
+#' }
+#' 
+#' @source This data was extracted from Romero & Rich 2007 (Comp Biochem. Physiol. Part A Mol. Integr. Physiol. 147, 562-568. https://doi.org/10.1016/j.cbpa.2007.02.004)
+"example_two_point_data"
+
+#' Example Data: Sparrow Repeatability (3 Point Data)
+#' 
+#' An example of data that one would perform profile repeatability on.
+#' Consists of 12 individual animals, with corticosterone data taken at 3 times (n_trials = 3), baseline (time = 0) and two stress-induced (time = 15 and 30).
+#' Then, there are 10 replicate columns. This example also shows what happens
+#' when there are null data records for some individuals.
+#' 
+#' @format ## `sparrow_repeatability_three_point`
+#' A dataframe with 36 rows and 12 columns:
+#' \describe{
+#'    \item{Animal}{The animal name/unique identifier}
+#'    \item{TIME}{The time of the measurement, in days}
+#'    \item{LD.500}{The name of the replicate column}
+#' }
+#' 
+#' @source This data was extracted from Rich & Romero 2001 (J. Comp. Physiol. Part B Biochem. Syst. Environ. Physiol. 171, 543-647. https://doi.org/10.1007/s003600100204)
+"sparrow_repeatability_three_point"
+
+#' Example Data: Synthetic 4-Point Data
+#' 
+#' An example of data that one would perform profile repeatability on.
+#' The data is synthetic data created for testing purposes and is designed to span a range of perceived repeatability scores.
+#' Consists of 11 individual animals, with data taken at 4 times (n_trials = 4), baseline (time = 0) and three stress-induced (time = 15, 30, and 45).
+#' Then, there are four replicate columns. Replicate column names refer to 
+#' sample tests performed on the animal.
+#' 
+#' @format ## `synthetic_data_four_point`
+#' A dataframe with 44 rows and 6 columns:
+#' \describe{
+#'    \item{Animal}{The animal name/unique identifier}
+#'    \item{TIME}{The time of the measurement (unit not important)}
+#'    \item{A}{The (unimportant) name of a replicate column.}
+#'    \item{B}{The (unimportant) name of a replicate column.}
+#'    \item{C}{The (unimportant) name of a replicate column.}
+#'    \item{D}{The (unimportant) name of a replicate column.}
+#' }
+#' 
+#' @source Data created for testing purposes by Reed et al., 2019 (Gen. Comp. Endocrinol. 270, 1-9. https://doi.org/10.1016/j.ygcen.2018.09.015)
+"synthetic_data_four_point"
diff --git a/R/do_ordering.R b/R/do_ordering.R
@@ -0,0 +1,74 @@
+#' @title Score and Order Data
+#' 
+#' @details
+#' Performs the ordering of input data by scoring each individual data frame.
+#' 
+#' The main function of the package, this will send each individuals data out 
+#' for scoring. Then, when all scores are computed, it will order the result
+#' data frame by score and assign a rank.
+#' 
+#' Ranks are assigned with ties allowed - if N individuals have a tie, their rank
+#' is averaged. For example, if the max score is 1, and two individuals have 
+#' that score, their rank is 1.5
+#'
+#' @param n_trials The number of rows an individual sample will have.
+#' @param id_list The list of unique individual or sample names
+#' @param df_list The list of data frames per unique individual
+#' @param n_replicates The number of replicates in the study.
+#' @param verbose A boolean parameter the defaults to FALSE. Determines whether messages are printed.
+#' @param sort A boolean parameter that defaults to TRUE. If TRUE, sorts the returned data frame by score. If FALSE, returns the data in the individual order it was provided in
+#' 
+#' @returns Returns a data frame of the results, in the following form:
+#' 
+#'          - Column 1: "individual" - the unique identifier of an individual or sample
+#'          - Column 2: "n_crossings" - the calculated number of crossings.
+#'          - Column 3: "max_variance" - the maximum of the variances of the replicate measurements at a single time for the individual or sample.
+#'          - Column 4: "ave_variance" - the average of the variances of the replicate measurements at a single time for the individual or sample.
+#'          - Column 5: "base_score" - the original, unnormalized profile repeatability score. Smaller numbers rank higher.
+#'          - Column 6: "final_score" - the base score, normalized by the sigmoid function. Constrained to be between 0 and 1. Scores closer to 1 rank higher.
+#'          - Column 7: "rank" - the calculated ranking of the individual or sample, against all other individuals or samples in the data set.
+#' 
+#' @examples
+#' df <- data.frame(
+#'     col_a = c('A', 'A', 'B', 'B'),
+#'     col_b = c(5, 15, 5, 15),
+#'     col_c = c(5, 10, 1, 2),
+#'     col_d = c(10, 15, 3, 4)
+#'   )
+#' id_list <- unique(df[, 1])
+#' individuals <- list()
+#' for (i in 1:length(id_list)) {
+#'   individuals[[i]] <- df[df[, 1] == id_list[i], ]
+#' }
+#' ret_df <- do_ordering(n_trials=2, id_list=id_list, df_list=individuals, n_replicates=2)
+#' print(ret_df)
+#' 
+#' @export
+do_ordering <- function(n_trials, id_list, df_list, n_replicates, verbose=FALSE, sort=TRUE) {
+  # Generate Scores
+  if (verbose) {message("Scoring each set of data per individual.")}
+  scores_df <- score_dfs(
+    id_list=id_list, 
+    df_list=df_list, 
+    n_replicates=n_replicates, 
+    n_trials=n_trials,
+    verbose=verbose
+  )
+
+  # Order Scores and Individuals
+  if (sort) {
+    if (verbose) {message("Ordering by score.")}
+    ordered_df <- scores_df[order(scores_df$final_score, decreasing=TRUE), ]
+  }
+  else {ordered_df <- scores_df}
+
+  # need the minus to rank the highest score the highest
+  # average will make ties be the same decimal value
+  ordered_df$rank <- rank(-ordered_df$final_score, ties.method="average")
+
+  # Resets the index
+  rownames(ordered_df) <- NULL
+
+  return(ordered_df)
+}
+