Skip to content

Commit

Permalink
Merge pull request #24 from ecohealthalliance/fix/correct_data_types
Browse files Browse the repository at this point in the history
Fix/correct data types
  • Loading branch information
collinschwantes committed May 29, 2024
2 parents 23810e4 + 8d5b6af commit cff3dc4
Show file tree
Hide file tree
Showing 9 changed files with 158 additions and 2 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ohcleandat
Type: Package
Title: One Health Data Cleaning and Quality Checking Package
Version: 0.2.2
Version: 0.2.3
Authors@R: c(
person("Collin", "Schwantes", email = "schwantes@ecohealthalliance.org", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")),
person("Johana", "Teigen", email = "teigen@ecohealthalliance.org", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")),
Expand Down Expand Up @@ -44,3 +44,5 @@ Remotes:
fcampelo/rdrop2,
ropensci/ruODK
URL: https://ecohealthalliance.github.io/ohcleandat/
Depends:
R (>= 2.10)
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export(get_dropbox_val_logs)
export(get_odk_form_schema)
export(get_odk_responses)
export(get_species_letter)
export(guess_col_type)
export(id_checker)
export(make_report_urls)
export(othertext_lookup)
Expand Down
18 changes: 18 additions & 0 deletions R/class_to_col_type.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' Class to Column Type lookup table
#'
#' A table that links R classes to `readr` column types.
#' Created from the csv file of the same name in `inst/`:
#'
#' ```r
#' class_to_col_type <- read.csv(file = "inst/class_to_col_type.csv")
#' usethis::use_data(class_to_col_type, overwrite = TRUE)
#' ```
#'
#' @format ## `class_to_col_type`
#' A data frame with 9 rows and 3 columns:
#' \describe{
#' \item{col_type}{Type of column as described in `readr`}
#' \item{col_class}{Class of R object that matches that column type}
#' \item{col_abv}{Abbreviation for that column type from `readr`}
#' }
#' @seealso [readr::cols()]
"class_to_col_type"
4 changes: 3 additions & 1 deletion R/correct_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ correct_data <- function(validation_log, data, primary_key){
}

# preserve col types. The log deals only in character types so need to fix later
col_types <- paste(purrr::map_chr(data, ~stringr::str_sub(class(.), 1, 1)), collapse = "")
## need to map out readr col types to different classes

col_types <- paste(guess_col_type(data), collapse = "")

# transform to character for imputation
dat_chr <- data |>
Expand Down
59 changes: 59 additions & 0 deletions R/guess_col_type.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#' Guess the column type
#'
#' Uses the class of each column to pick the matching `readr` column type
#' abbreviation from the `class_to_col_type` lookup table.
#'
#' Every class of a column is looked up in the table. Classes without an entry
#' are ignored as long as at least one class of the column matches. If no class
#' matches, a warning is raised and `default_col_abv` is used. If the classes
#' map to more than one distinct abbreviation, a warning is raised and the
#' abbreviation of the first matching class is used.
#'
#' @param data data.frame. Data whose column types you would like to guess.
#' @param default_col_abv string. Column type abbreviation from [readr::cols()]
#'   used for columns whose class is not in the lookup table.
#'   Use "g" to guess the column type.
#'
#' @return Character vector of column type abbreviations, one per column of
#'   `data`.
#' @export
#'
#' @examples
#' # I() keeps the list as a genuine list column; without it data.frame()
#' # would expand list("hello") into an ordinary character column.
#' data <- data.frame(time = Sys.time(),
#'                    char = "hello", num = 1, log = TRUE,
#'                    date = Sys.Date(), list_col = I(list("hello")))
#'
#' # list_col has no entry in the lookup table, so it falls back to the
#' # default abbreviation (with a warning)
#' guess_col_type(data)
#'
#' ## change default value of default column abbreviation
#'
#' guess_col_type(data, default_col_abv = "g")
#'
guess_col_type <- function(data, default_col_abv = "c") {

  # Fail early with a clear message; otherwise a bad default only surfaces as
  # an obscure map_chr() error when an unsupported column is encountered.
  if (!is.character(default_col_abv) ||
      length(default_col_abv) != 1L ||
      is.na(default_col_abv)) {
    rlang::abort("`default_col_abv` must be a single, non-missing string")
  }

  # class_to_col_type is created from class_to_col_type.csv in the inst folder
  lookup_class <- as.character(ohcleandat::class_to_col_type$col_class)
  lookup_abv <- as.character(ohcleandat::class_to_col_type$col_abv)

  ## get col type abbreviations
  purrr::map_chr(data, function(x) {
    col_classes <- class(x)

    # one abbreviation per class of the column, NA where the class is unknown
    matched_abv <- lookup_abv[match(col_classes, lookup_class)]

    # keep the known abbreviations, de-duplicated in order of appearance
    # (e.g. POSIXct and POSIXt both map to "T")
    col_abv <- unique(matched_abv[!is.na(matched_abv)])

    if (length(col_abv) == 0L) {
      class_for_msg <- paste(col_classes, collapse = ", ")
      msg <- sprintf(
        "column type %s might not be supported, defaulting to %s",
        class_for_msg, default_col_abv
      )
      rlang::warn(msg)
      return(default_col_abv)
    }

    if (length(col_abv) > 1L) {
      # built with paste() so the message carries no stray newline/indentation
      rlang::warn(paste(
        "column has multiple classes of distinct types,",
        "using first class as column type"
      ))
      col_abv <- col_abv[1L]
    }

    col_abv
  })

}
Binary file added data/class_to_col_type.rda
Binary file not shown.
10 changes: 10 additions & 0 deletions inst/class_to_col_type.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
col_type,col_class,col_abv
col_datetime,POSIXct,T
col_datetime,POSIXt,T
col_character,character,c
col_number,numeric,n
col_logical,logical,l
col_date,Date,D
col_factor,factor,f
col_double,double,d
col_integer,integer,i
31 changes: 31 additions & 0 deletions man/class_to_col_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions man/guess_col_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cff3dc4

Please sign in to comment.