Merge pull request #71 from darwin-eu/develop

v1.2
darwin-eu · May 4, 2023 · 990a669 · 990a669
2 parents 0691a0f + 7e659f7
commit 990a669
Show file tree

Hide file tree

Showing 18 changed files with 703 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: CodelistGenerator
 Title: Generate Code Lists for the OMOP Common Data Model
-Version: 1.1.0
+Version: 1.2.0
 Authors@R: 
     person("Edward", "Burn", , "edward.burn@ndorms.ox.ac.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9286-1128"))
@@ -19,7 +19,8 @@ Imports:
     glue (>= 1.5.0),
     stringr (>= 1.4.0),
     tidyselect (>= 1.2.0),
-    tidyr  (>= 1.2.0)
+    tidyr (>= 1.2.0),
+    cli (>= 3.1.0)
 Suggests: 
     arrow (>= 9.0.0),
     covr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,11 +2,13 @@
 
 export("%>%")
 export(compareCodelists)
+export(getATCCodes)
 export(getCandidateCodes)
 export(getConceptClassId)
 export(getDescendants)
 export(getDomains)
 export(getDoseForm)
+export(getDrugIngredientCodes)
 export(getMappings)
 export(getVocabVersion)
 export(getVocabularies)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# CodelistGenerator 1.2.0
+* Added functions getATCCodes and getDrugIngredientCodes. 
+
 # CodelistGenerator 1.1.0
 * Added exactMatch and includeSequela options to getCandidateCodes function.
 

diff --git a/R/compareCodelists.R b/R/compareCodelists.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #

diff --git a/R/drugCodes.R b/R/drugCodes.R
@@ -0,0 +1,267 @@
+# Copyright 2023 DARWIN EU®
+#
+# This file is part of IncidencePrevalence
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Get descendant codes for ATC levels
+#'
+#' @param cdm cdm_reference via CDMConnector
+#' @param level ATC level. Can be one or more of "ATC 1st", "ATC 2nd",
+#' "ATC 3rd", "ATC 4th", and "ATC 5th"
+#' @param name ATC name of interest. For example, c("Dermatologicals",
+#' "Nervous System"), would result in a list of length two with the descendant
+#' concepts for these two particular ATC groups.
+#' @param doseForm Only descendants codes with the specified dose form
+#' will be returned. If NULL, descendant codes will be returned regardless
+#' of dose form.
+#'
+#' @return A named list, with each element containing the descendant
+#' concepts for a particular ATC group
+#' @export
+#'
+#' @examples
+#' cdm <- mockVocabRef()
+#' getATCCodes(cdm = cdm, level = "ATC 1st")
+#' DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+getATCCodes <- function(cdm,
+                        level = c("ATC 1st"),
+                        name = NULL,
+                        doseForm = NULL) {
+  errorMessage <- checkmate::makeAssertCollection()
+  checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
+  levelCheck <- all(level %in%
+    c(
+      "ATC 1st",
+      "ATC 2nd",
+      "ATC 3rd",
+      "ATC 4th",
+      "ATC 5th"
+    ))
+  if (!isTRUE(levelCheck)) {
+    errorMessage$push(
+      "- level can only be from: ATC 1st, ATC 2nd, ATC 3rd, ATC 4th, ATC 5th"
+    )
+  }
+  checkmate::assertTRUE(levelCheck, add = errorMessage)
+  checkmate::assertVector(name,
+    add = errorMessage,
+    null.ok = TRUE
+  )
+  checkmate::reportAssertions(collection = errorMessage)
+
+  atc_groups <- cdm$concept %>%
+    dplyr::filter(.data$vocabulary_id == "ATC") %>%
+    dplyr::filter(.data$concept_class_id %in% .env$level) %>%
+    dplyr::collect()
+
+  if (!is.null(name)) {
+    atc_groups <- atc_groups %>%
+      dplyr::filter(tolower(.data$concept_name) %in% tolower(.env$name))
+  }
+
+  errorMessage <- checkmate::makeAssertCollection()
+  atcCheck <- nrow(atc_groups) > 0
+  if (!isTRUE(atcCheck)) {
+    errorMessage$push(
+      "- No matching ATC codes found"
+    )
+  }
+  checkmate::assertTRUE(atcCheck, add = errorMessage)
+  checkmate::reportAssertions(collection = errorMessage)
+
+
+  atc_descendants <- getDescendants(
+    cdm = cdm,
+    conceptId = atc_groups$concept_id,
+    withAncestor = TRUE,
+    doseForm = doseForm
+  )
+  if (nrow(atc_descendants) > 0) {
+    atc_descendants <- atc_descendants %>%
+      dplyr::select(c("concept_id", "ancestor_concept_id")) %>%
+      # split different ancestors into multiple cols
+      tidyr::separate_wider_delim(
+        cols = "ancestor_concept_id",
+        delim = ";",
+        names_sep = "",
+        too_few = "align_start"
+      )
+
+    atc_descendants <- atc_descendants %>%
+      # one row per concept + ancestor
+      tidyr::pivot_longer(!"concept_id",
+        names_to = NULL,
+        values_to = "ancestor_concept_id",
+        values_drop_na = TRUE
+      )
+
+    # to list
+    # one item per concept
+    atc_descendants <- split(
+      x = atc_descendants,
+      f = as.factor(atc_descendants$ancestor_concept_id),
+      drop = TRUE
+    )
+
+    # for each item in the list - pull out concepts and name
+    for (i in seq_along(atc_descendants)) {
+      workingLevel <- atc_groups %>%
+        dplyr::filter(.data$concept_id == names(atc_descendants)[i]) %>%
+        dplyr::pull("concept_class_id")
+      workingName <- atc_groups %>%
+        dplyr::filter(.data$concept_id == names(atc_descendants)[i]) %>%
+        dplyr::pull("concept_name")
+
+      atc_descendants[[i]] <- atc_descendants[[i]] %>%
+        dplyr::select("concept_id") %>%
+        dplyr::distinct() %>%
+        dplyr::pull()
+
+      names(atc_descendants)[i] <- paste0(
+        workingLevel, ": ", workingName,
+        " (", names(atc_descendants)[i], ")"
+      )
+    }
+  }
+  return(atc_descendants)
+}
+
+#' Get descendant codes for drug ingredients
+#'
+#' @param cdm cdm_reference via CDMConnector
+#' @param name Names of ingredients of interest. For example, c("acetaminophen",
+#' "codeine"), would result in a list of length two with the descendant
+#' concepts for these two particular drug ingredients.
+#' @param doseForm Only descendants codes with the specified dose form
+#' will be returned. If NULL, descendant codes will be returned regardless
+#' of dose form.
+#'
+#' @return A named list, with each item containing descendant concepts of
+#' an ingredient
+#' @export
+#'
+#' @examples
+#'cdm <- mockVocabRef()
+#'getDrugIngredientCodes(cdm = cdm, name = "Adalimumab")
+#'DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+getDrugIngredientCodes <- function(cdm, name = NULL, doseForm = NULL) {
+  errorMessage <- checkmate::makeAssertCollection()
+  checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
+  checkmate::assertVector(name,
+    add = errorMessage,
+    null.ok = TRUE
+  )
+  checkmate::reportAssertions(collection = errorMessage)
+
+  ingredientConcepts <- cdm$concept %>%
+    dplyr::filter(.data$standard_concept == "S") %>%
+    dplyr::filter(.data$concept_class_id == "Ingredient") %>%
+    dplyr::select("concept_id", "concept_name") %>%
+    dplyr::collect()
+
+  if (!is.null(name)) {
+    ingredientConcepts <- ingredientConcepts %>%
+      dplyr::filter(tolower(.data$concept_name) %in% tolower(.env$name))
+  }
+
+  errorMessage <- checkmate::makeAssertCollection()
+  ingredientCheck <- nrow(ingredientConcepts) > 0
+  if (!isTRUE(ingredientCheck)) {
+    errorMessage$push(
+      "- No matching Ingredient codes found"
+    )
+  }
+  checkmate::assertTRUE(ingredientCheck, add = errorMessage)
+  checkmate::reportAssertions(collection = errorMessage)
+
+  # to avoid potential memory problems will batch
+  if (nrow(ingredientConcepts) > 0) {
+    ingredientCodes <- fetchBatchedDrugIngredientCodes(cdm,
+      codes = ingredientConcepts$concept_id,
+      batchSize = 500,
+      doseForm = doseForm
+    )
+  }
+
+      ingredientCodes <- ingredientCodes %>%
+      dplyr::select(c("concept_id", "ancestor_concept_id")) %>%
+      # split different ancestors into multiple cols
+      tidyr::separate_wider_delim(
+        cols = "ancestor_concept_id",
+        delim = ";",
+        names_sep = "",
+        too_few = "align_start"
+      )
+
+    ingredientCodes <- ingredientCodes %>%
+      # one row per concept + ancestor
+      tidyr::pivot_longer(!"concept_id",
+        names_to = NULL,
+        values_to = "ancestor_concept_id",
+        values_drop_na = TRUE
+      )
+
+    # to list
+    # one item per concept
+    ingredientCodes <- split(
+      x = ingredientCodes,
+      f = as.factor(ingredientCodes$ancestor_concept_id),
+      drop = TRUE
+    )
+
+    # for each item in the list - pull out concepts and name
+    for (i in seq_along(ingredientCodes)) {
+      workingName <- ingredientConcepts %>%
+        dplyr::filter(.data$concept_id == names(ingredientCodes)[[i]]) %>%
+        dplyr::pull("concept_name")
+
+      ingredientCodes[[i]] <- ingredientCodes[[i]] %>%
+        dplyr::select("concept_id") %>%
+        dplyr::distinct() %>%
+        dplyr::pull()
+
+      names(ingredientCodes)[[i]] <- paste0(
+        "Ingredient", ": ", workingName,
+        " (", names(ingredientCodes)[[i]], ")"
+      )
+    }
+    return(ingredientCodes)
+}
+
+fetchBatchedDrugIngredientCodes <- function(cdm, codes, batchSize, doseForm) {
+  codeBatches <- split(
+    codes,
+    ceiling(seq_along(codes) / batchSize)
+  )
+
+  descendants <- vector("list", length(codeBatches))
+
+  cli::cli_progress_bar(
+    total = length(descendants),
+    format = " -- getting descendants {cli::pb_bar} {cli::pb_current} of {cli::pb_total} ingredient groups"
+  )
+  for (i in seq_along(descendants)) {
+    cli::cli_progress_update()
+    descendants[[i]] <- getDescendants(
+      cdm = cdm,
+      conceptId = codeBatches[[i]],
+      withAncestor = TRUE,
+      doseForm = doseForm
+    )
+  }
+  cli::cli_progress_done()
+  descendants <- dplyr::bind_rows(descendants)
+
+  return(descendants)
+}
diff --git a/R/getCandidateCodes.R b/R/getCandidateCodes.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -77,7 +77,7 @@
 #'   keywords = "osteoarthritis"
 #'  )
 #' DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
-getCandidateCodes <- function(cdm = NULL,
+getCandidateCodes <- function(cdm,
                               keywords,
                               exclude = NULL,
                               domains = "Condition",
@@ -162,13 +162,18 @@ getCandidateCodes <- function(cdm = NULL,
   checkmate::assert_numeric(maxDistanceCost, add = errorMessage)
   checkmate::assert_logical(verbose, add = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
-  # in addition, now will check we have the required tables
+
   errorMessage <- checkmate::makeAssertCollection()
-  checkTableExists(cdm, "concept", errorMessage)
-  checkTableExists(cdm, "concept_relationship", errorMessage)
-  checkTableExists(cdm, "concept_ancestor", errorMessage)
-  checkTableExists(cdm, "concept_synonym", errorMessage)
-  checkTableExists(cdm, "vocabulary", errorMessage)
+  assertTablesExist(cdm, tableName = c("concept",
+                                       "concept_relationship",
+                                       "concept_ancestor",
+                                       "concept_synonym",
+                                       "vocabulary"),
+                    messageStore = errorMessage)
+  if ("drug" %in% tolower(domains)) {
+    assertTablesExist(cdm, tableName = c("drug_strength"),
+                      messageStore = errorMessage)
+  }
   checkmate::reportAssertions(collection = errorMessage)
 
   errorMessage <- checkmate::makeAssertCollection()

diff --git a/R/getMappings.R b/R/getMappings.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -44,18 +44,20 @@ getMappings <- function(candidateCodelist,
                            "RxNorm", "RxNorm Extension",
                            "SNOMED"
                          )) {
+
   errorMessage <- checkmate::makeAssertCollection()
   checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
   checkmate::assertVector(nonStandardVocabularies, add = errorMessage)
   checkmate::assertDataFrame(candidateCodelist, add = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
-  # in addition, now will check we have the required tables
+
   errorMessage <- checkmate::makeAssertCollection()
-  checkTableExists(cdm, "concept", errorMessage)
-  checkTableExists(cdm, "concept_relationship", errorMessage)
-  checkTableExists(cdm, "concept_ancestor", errorMessage)
-  checkTableExists(cdm, "concept_synonym", errorMessage)
-  checkTableExists(cdm, "vocabulary", errorMessage)
+  assertTablesExist(cdm, tableName = c("concept",
+                                       "concept_relationship",
+                                       "concept_ancestor",
+                                       "concept_synonym",
+                                       "vocabulary"),
+                    messageStore = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
 
   conceptDb <- cdm$concept