From 7e659f76b9322511ee1f63939685b98d43d754a0 Mon Sep 17 00:00:00 2001
From: edward-burn <9583964+edward-burn@users.noreply.github.com>
Date: Wed, 3 May 2023 17:13:46 +0100
Subject: [PATCH] v1.2

---
 DESCRIPTION                             |   5 +-
 NAMESPACE                               |   2 +
 NEWS.md                                 |   3 +
 R/compareCodelists.R                    |   2 +-
 R/drugCodes.R                           | 267 ++++++++++++++++++++++++
 R/getCandidateCodes.R                   |  21 +-
 R/getMappings.R                         |  16 +-
 R/{utils.R => inputValidation.R}        |  15 +-
 R/mockVocabRef.R                        |  69 +++++-
 R/runSearch.R                           |   2 +-
 R/vocabUtilities.R                      | 157 ++++++++++++--
 man/getATCCodes.Rd                      |  34 +++
 man/getCandidateCodes.Rd                |   2 +-
 man/getDescendants.Rd                   |   9 +-
 man/getDrugIngredientCodes.Rd           |  31 +++
 tests/testthat/test-drugCodes.R         |  78 +++++++
 tests/testthat/test-getCandidateCodes.R |   2 +-
 tests/testthat/test-vocabUtilities.R    |  45 +++-
 18 files changed, 703 insertions(+), 57 deletions(-)
 create mode 100644 R/drugCodes.R
 rename R/{utils.R => inputValidation.R} (70%)
 create mode 100644 man/getATCCodes.Rd
 create mode 100644 man/getDrugIngredientCodes.Rd
 create mode 100644 tests/testthat/test-drugCodes.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 9625f70..b244fc6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: CodelistGenerator
 Title: Generate Code Lists for the OMOP Common Data Model
-Version: 1.1.0
+Version: 1.2.0
 Authors@R: 
     person("Edward", "Burn", , "edward.burn@ndorms.ox.ac.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9286-1128"))
@@ -19,7 +19,8 @@ Imports:
     glue (>= 1.5.0),
     stringr (>= 1.4.0),
     tidyselect (>= 1.2.0),
-    tidyr  (>= 1.2.0)
+    tidyr (>= 1.2.0),
+    cli (>= 3.1.0)
 Suggests: 
     arrow (>= 9.0.0),
     covr,
diff --git a/NAMESPACE b/NAMESPACE
index 5d7586a..a191138 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,11 +2,13 @@
 
 export("%>%")
 export(compareCodelists)
+export(getATCCodes)
 export(getCandidateCodes)
 export(getConceptClassId)
 export(getDescendants)
 export(getDomains)
 export(getDoseForm)
+export(getDrugIngredientCodes)
 export(getMappings)
 export(getVocabVersion)
 export(getVocabularies)
diff --git a/NEWS.md b/NEWS.md
index d80d199..d047090 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,6 @@
+# CodelistGenerator 1.2.0
+* Added functions getATCCodes and getDrugIngredientCodes. 
+
 # CodelistGenerator 1.1.0
 * Added exactMatch and includeSequela options to getCandidateCodes function.
 
diff --git a/R/compareCodelists.R b/R/compareCodelists.R
index 79aacdb..46c3d6a 100644
--- a/R/compareCodelists.R
+++ b/R/compareCodelists.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
diff --git a/R/drugCodes.R b/R/drugCodes.R
new file mode 100644
index 0000000..6b179a3
--- /dev/null
+++ b/R/drugCodes.R
@@ -0,0 +1,267 @@
+# Copyright 2023 DARWIN EU®
+#
+# This file is part of IncidencePrevalence
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Get descendant codes for ATC levels
+#'
+#' @param cdm cdm_reference via CDMConnector
+#' @param level ATC level. Can be one or more of "ATC 1st", "ATC 2nd",
+#' "ATC 3rd", "ATC 4th", and "ATC 5th"
+#' @param name ATC name of interest. For example, c("Dermatologicals",
+#' "Nervous System"), would result in a list of length two with the descendant
+#' concepts for these two particular ATC groups.
+#' @param doseForm Only descendants codes with the specified dose form
+#' will be returned. If NULL, descendant codes will be returned regardless
+#' of dose form.
+#'
+#' @return A named list, with each element containing the descendant
+#' concepts for a particular ATC group
+#' @export
+#'
+#' @examples
+#' cdm <- mockVocabRef()
+#' getATCCodes(cdm = cdm, level = "ATC 1st")
+#' DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+getATCCodes <- function(cdm,
+                        level = c("ATC 1st"),
+                        name = NULL,
+                        doseForm = NULL) {
+  errorMessage <- checkmate::makeAssertCollection()
+  checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
+  levelCheck <- all(level %in%
+    c(
+      "ATC 1st",
+      "ATC 2nd",
+      "ATC 3rd",
+      "ATC 4th",
+      "ATC 5th"
+    ))
+  if (!isTRUE(levelCheck)) {
+    errorMessage$push(
+      "- level can only be from: ATC 1st, ATC 2nd, ATC 3rd, ATC 4th, ATC 5th"
+    )
+  }
+  checkmate::assertTRUE(levelCheck, add = errorMessage)
+  checkmate::assertVector(name,
+    add = errorMessage,
+    null.ok = TRUE
+  )
+  checkmate::reportAssertions(collection = errorMessage)
+
+  atc_groups <- cdm$concept %>%
+    dplyr::filter(.data$vocabulary_id == "ATC") %>%
+    dplyr::filter(.data$concept_class_id %in% .env$level) %>%
+    dplyr::collect()
+
+  if (!is.null(name)) {
+    atc_groups <- atc_groups %>%
+      dplyr::filter(tolower(.data$concept_name) %in% tolower(.env$name))
+  }
+
+  errorMessage <- checkmate::makeAssertCollection()
+  atcCheck <- nrow(atc_groups) > 0
+  if (!isTRUE(atcCheck)) {
+    errorMessage$push(
+      "- No matching ATC codes found"
+    )
+  }
+  checkmate::assertTRUE(atcCheck, add = errorMessage)
+  checkmate::reportAssertions(collection = errorMessage)
+
+
+  atc_descendants <- getDescendants(
+    cdm = cdm,
+    conceptId = atc_groups$concept_id,
+    withAncestor = TRUE,
+    doseForm = doseForm
+  )
+  if (nrow(atc_descendants) > 0) {
+    atc_descendants <- atc_descendants %>%
+      dplyr::select(c("concept_id", "ancestor_concept_id")) %>%
+      # split different ancestors into multiple cols
+      tidyr::separate_wider_delim(
+        cols = "ancestor_concept_id",
+        delim = ";",
+        names_sep = "",
+        too_few = "align_start"
+      )
+
+    atc_descendants <- atc_descendants %>%
+      # one row per concept + ancestor
+      tidyr::pivot_longer(!"concept_id",
+        names_to = NULL,
+        values_to = "ancestor_concept_id",
+        values_drop_na = TRUE
+      )
+
+    # to list
+    # one item per concept
+    atc_descendants <- split(
+      x = atc_descendants,
+      f = as.factor(atc_descendants$ancestor_concept_id),
+      drop = TRUE
+    )
+
+    # for each item in the list - pull out concepts and name
+    for (i in seq_along(atc_descendants)) {
+      workingLevel <- atc_groups %>%
+        dplyr::filter(.data$concept_id == names(atc_descendants)[i]) %>%
+        dplyr::pull("concept_class_id")
+      workingName <- atc_groups %>%
+        dplyr::filter(.data$concept_id == names(atc_descendants)[i]) %>%
+        dplyr::pull("concept_name")
+
+      atc_descendants[[i]] <- atc_descendants[[i]] %>%
+        dplyr::select("concept_id") %>%
+        dplyr::distinct() %>%
+        dplyr::pull()
+
+      names(atc_descendants)[i] <- paste0(
+        workingLevel, ": ", workingName,
+        " (", names(atc_descendants)[i], ")"
+      )
+    }
+  }
+  return(atc_descendants)
+}
+
+#' Get descendant codes for drug ingredients
+#'
+#' @param cdm cdm_reference via CDMConnector
+#' @param name Names of ingredients of interest. For example, c("acetaminophen",
+#' "codeine"), would result in a list of length two with the descendant
+#' concepts for these two particular drug ingredients.
+#' @param doseForm Only descendants codes with the specified dose form
+#' will be returned. If NULL, descendant codes will be returned regardless
+#' of dose form.
+#'
+#' @return A named list, with each item containing descendant concepts of
+#' an ingredient
+#' @export
+#'
+#' @examples
+#'cdm <- mockVocabRef()
+#'getDrugIngredientCodes(cdm = cdm, name = "Adalimumab")
+#'DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+getDrugIngredientCodes <- function(cdm, name = NULL, doseForm = NULL) {
+  errorMessage <- checkmate::makeAssertCollection()
+  checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
+  checkmate::assertVector(name,
+    add = errorMessage,
+    null.ok = TRUE
+  )
+  checkmate::reportAssertions(collection = errorMessage)
+
+  ingredientConcepts <- cdm$concept %>%
+    dplyr::filter(.data$standard_concept == "S") %>%
+    dplyr::filter(.data$concept_class_id == "Ingredient") %>%
+    dplyr::select("concept_id", "concept_name") %>%
+    dplyr::collect()
+
+  if (!is.null(name)) {
+    ingredientConcepts <- ingredientConcepts %>%
+      dplyr::filter(tolower(.data$concept_name) %in% tolower(.env$name))
+  }
+
+  errorMessage <- checkmate::makeAssertCollection()
+  ingredientCheck <- nrow(ingredientConcepts) > 0
+  if (!isTRUE(ingredientCheck)) {
+    errorMessage$push(
+      "- No matching Ingredient codes found"
+    )
+  }
+  checkmate::assertTRUE(ingredientCheck, add = errorMessage)
+  checkmate::reportAssertions(collection = errorMessage)
+
+  # to avoid potential memory problems will batch
+  if (nrow(ingredientConcepts) > 0) {
+    ingredientCodes <- fetchBatchedDrugIngredientCodes(cdm,
+      codes = ingredientConcepts$concept_id,
+      batchSize = 500,
+      doseForm = doseForm
+    )
+  }
+
+      ingredientCodes <- ingredientCodes %>%
+      dplyr::select(c("concept_id", "ancestor_concept_id")) %>%
+      # split different ancestors into multiple cols
+      tidyr::separate_wider_delim(
+        cols = "ancestor_concept_id",
+        delim = ";",
+        names_sep = "",
+        too_few = "align_start"
+      )
+
+    ingredientCodes <- ingredientCodes %>%
+      # one row per concept + ancestor
+      tidyr::pivot_longer(!"concept_id",
+        names_to = NULL,
+        values_to = "ancestor_concept_id",
+        values_drop_na = TRUE
+      )
+
+    # to list
+    # one item per concept
+    ingredientCodes <- split(
+      x = ingredientCodes,
+      f = as.factor(ingredientCodes$ancestor_concept_id),
+      drop = TRUE
+    )
+
+    # for each item in the list - pull out concepts and name
+    for (i in seq_along(ingredientCodes)) {
+      workingName <- ingredientConcepts %>%
+        dplyr::filter(.data$concept_id == names(ingredientCodes)[[i]]) %>%
+        dplyr::pull("concept_name")
+
+      ingredientCodes[[i]] <- ingredientCodes[[i]] %>%
+        dplyr::select("concept_id") %>%
+        dplyr::distinct() %>%
+        dplyr::pull()
+
+      names(ingredientCodes)[[i]] <- paste0(
+        "Ingredient", ": ", workingName,
+        " (", names(ingredientCodes)[[i]], ")"
+      )
+    }
+    return(ingredientCodes)
+}
+
+fetchBatchedDrugIngredientCodes <- function(cdm, codes, batchSize, doseForm) {
+  codeBatches <- split(
+    codes,
+    ceiling(seq_along(codes) / batchSize)
+  )
+
+  descendants <- vector("list", length(codeBatches))
+
+  cli::cli_progress_bar(
+    total = length(descendants),
+    format = " -- getting descendants {cli::pb_bar} {cli::pb_current} of {cli::pb_total} ingredient groups"
+  )
+  for (i in seq_along(descendants)) {
+    cli::cli_progress_update()
+    descendants[[i]] <- getDescendants(
+      cdm = cdm,
+      conceptId = codeBatches[[i]],
+      withAncestor = TRUE,
+      doseForm = doseForm
+    )
+  }
+  cli::cli_progress_done()
+  descendants <- dplyr::bind_rows(descendants)
+
+  return(descendants)
+}
diff --git a/R/getCandidateCodes.R b/R/getCandidateCodes.R
index 0252ef1..313b02b 100644
--- a/R/getCandidateCodes.R
+++ b/R/getCandidateCodes.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -77,7 +77,7 @@
 #'   keywords = "osteoarthritis"
 #'  )
 #' DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
-getCandidateCodes <- function(cdm = NULL,
+getCandidateCodes <- function(cdm,
                               keywords,
                               exclude = NULL,
                               domains = "Condition",
@@ -162,13 +162,18 @@ getCandidateCodes <- function(cdm = NULL,
   checkmate::assert_numeric(maxDistanceCost, add = errorMessage)
   checkmate::assert_logical(verbose, add = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
-  # in addition, now will check we have the required tables
+
   errorMessage <- checkmate::makeAssertCollection()
-  checkTableExists(cdm, "concept", errorMessage)
-  checkTableExists(cdm, "concept_relationship", errorMessage)
-  checkTableExists(cdm, "concept_ancestor", errorMessage)
-  checkTableExists(cdm, "concept_synonym", errorMessage)
-  checkTableExists(cdm, "vocabulary", errorMessage)
+  assertTablesExist(cdm, tableName = c("concept",
+                                       "concept_relationship",
+                                       "concept_ancestor",
+                                       "concept_synonym",
+                                       "vocabulary"),
+                    messageStore = errorMessage)
+  if ("drug" %in% tolower(domains)) {
+    assertTablesExist(cdm, tableName = c("drug_strength"),
+                      messageStore = errorMessage)
+  }
   checkmate::reportAssertions(collection = errorMessage)
 
   errorMessage <- checkmate::makeAssertCollection()
diff --git a/R/getMappings.R b/R/getMappings.R
index 6bd5c3b..4c11506 100644
--- a/R/getMappings.R
+++ b/R/getMappings.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -44,18 +44,20 @@ getMappings <- function(candidateCodelist,
                            "RxNorm", "RxNorm Extension",
                            "SNOMED"
                          )) {
+
   errorMessage <- checkmate::makeAssertCollection()
   checkDbType(cdm = cdm, type = "cdm_reference", messageStore = errorMessage)
   checkmate::assertVector(nonStandardVocabularies, add = errorMessage)
   checkmate::assertDataFrame(candidateCodelist, add = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
-  # in addition, now will check we have the required tables
+
   errorMessage <- checkmate::makeAssertCollection()
-  checkTableExists(cdm, "concept", errorMessage)
-  checkTableExists(cdm, "concept_relationship", errorMessage)
-  checkTableExists(cdm, "concept_ancestor", errorMessage)
-  checkTableExists(cdm, "concept_synonym", errorMessage)
-  checkTableExists(cdm, "vocabulary", errorMessage)
+  assertTablesExist(cdm, tableName = c("concept",
+                                       "concept_relationship",
+                                       "concept_ancestor",
+                                       "concept_synonym",
+                                       "vocabulary"),
+                    messageStore = errorMessage)
   checkmate::reportAssertions(collection = errorMessage)
 
   conceptDb <- cdm$concept
diff --git a/R/utils.R b/R/inputValidation.R
similarity index 70%
rename from R/utils.R
rename to R/inputValidation.R
index e2e0814..3b6cf50 100644
--- a/R/utils.R
+++ b/R/inputValidation.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -23,12 +23,15 @@ checkDbType <- function(cdm, type = "cdm_reference", messageStore) {
   }
 }
 
-checkTableExists <- function(cdm, tableName, messageStore) {
-  tableExists <- inherits(cdm[[tableName]], c("tbl_dbi",
-                                               "ArrowObject", "ArrowTabular",
-                                               "tbl",  "data.frame"))
+assertTablesExist <- function(cdm, tableName, messageStore) {
+
+  for(i in seq_along(tableName)){
+  tableExists <- inherits(cdm[[tableName[[i]]]], c("tbl_dbi",
+                                              "ArrowObject", "ArrowTabular",
+                                              "tbl",  "data.frame"))
   checkmate::assertTRUE(tableExists, add = messageStore)
   if (!isTRUE(tableExists)) {
-    messageStore$push(glue::glue("- {tableName} is not found in the cdm reference"))
+    messageStore$push(glue::glue("- {tableName[[i]]} is not found in the cdm reference"))
+  }
   }
 }
diff --git a/R/mockVocabRef.R b/R/mockVocabRef.R
index 7324280..730089a 100644
--- a/R/mockVocabRef.R
+++ b/R/mockVocabRef.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -33,7 +33,7 @@ mockVocabRef <- function(backend = "database") {
 
   # tables
   concept <- data.frame(
-    concept_id = 1:11,
+    concept_id = 1:14,
     concept_name = c(
       "Musculoskeletal disorder",
       "Osteoarthrosis",
@@ -45,27 +45,43 @@ mockVocabRef <- function(backend = "database") {
       "Knee osteoarthritis",
       "H/O osteoarthritis",
       "Adalimumab",
-      "Injection"
+      "Injection",
+      "ALIMENTARY TRACT AND METABOLISM",
+      "Descendant drug",
+      "Injectable"
     ),
-    domain_id = c(rep("Condition", 8), "Observation", "Drug", "Drug"),
+    domain_id = c(rep("Condition", 8), "Observation",rep("Drug", 5)),
     vocabulary_id = c(
       rep("SNOMED", 6),
       rep("Read", 2),
-      "LOINC", "RxNorm", "OMOP"
+      "LOINC", "RxNorm", "OMOP",
+      "ATC",
+      "RxNorm", "OMOP"
     ),
     standard_concept = c(
       rep("S", 6),
       rep(NA, 2),
-      "S", "S", NA
+      "S", "S", NA,
+      NA, "S", NA
     ),
     concept_class_id = c(
       rep("Clinical Finding", 6),
       rep("Diagnosis", 2),
-      "Observation", "Ingredient", "Dose Form"
+      "Observation", "Ingredient", "Dose Form",
+      "ATC 1st", "Drug", "Dose Form"
     ),
-    concept_code = "1234"
+    concept_code = "1234",
+    valid_start_date = NA,
+    valid_end_date = NA,
+    invalid_reason = NA
   )
   conceptAncestor <- dplyr::bind_rows(
+    data.frame(
+      ancestor_concept_id = 1L,
+      descendant_concept_id = 1L,
+      min_levels_of_separation = 1,
+      max_levels_of_separation = 1
+    ),
     data.frame(
       ancestor_concept_id = 1L,
       descendant_concept_id = 2L,
@@ -101,6 +117,30 @@ mockVocabRef <- function(backend = "database") {
       descendant_concept_id = 5L,
       min_levels_of_separation = 1,
       max_levels_of_separation = 1
+    ),
+    data.frame(
+      ancestor_concept_id = 10L,
+      descendant_concept_id = 10L,
+      min_levels_of_separation = 1,
+      max_levels_of_separation = 1
+    ),
+    data.frame(
+      ancestor_concept_id = 10L,
+      descendant_concept_id = 13L,
+      min_levels_of_separation = 1,
+      max_levels_of_separation = 1
+    ),
+    data.frame(
+      ancestor_concept_id = 12L,
+      descendant_concept_id = 12L,
+      min_levels_of_separation = 1,
+      max_levels_of_separation = 1
+    ),
+    data.frame(
+      ancestor_concept_id = 12L,
+      descendant_concept_id = 13L,
+      min_levels_of_separation = 1,
+      max_levels_of_separation = 1
     )
   )
   conceptSynonym <- dplyr::bind_rows(
@@ -112,7 +152,8 @@ mockVocabRef <- function(backend = "database") {
       concept_id = 3L,
       concept_synonym_name = "Osteoarthrosis"
     )
-  )
+  )%>%
+    dplyr::mutate(language_concept_id  = NA)
   conceptRelationship <- dplyr::bind_rows(
     data.frame(
       concept_id_1 = 2L,
@@ -133,8 +174,16 @@ mockVocabRef <- function(backend = "database") {
       concept_id_1 = 3L,
       concept_id_2 = 6L,
       relationship_id = "Due to of"
+    ),
+    data.frame(
+      concept_id_1 = 13L,
+      concept_id_2 = 14L,
+      relationship_id = "RxNorm has dose form"
     )
-  )
+  ) %>%
+    dplyr::mutate(valid_start_date = NA,
+                  valid_end_date = NA,
+                  invalid_reason = NA)
   vocabulary <- dplyr::bind_rows(
     data.frame(
       vocabulary_id = "SNOMED",
diff --git a/R/runSearch.R b/R/runSearch.R
index 833a783..031f374 100644
--- a/R/runSearch.R
+++ b/R/runSearch.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
diff --git a/R/vocabUtilities.R b/R/vocabUtilities.R
index 10c33aa..7149f81 100644
--- a/R/vocabUtilities.R
+++ b/R/vocabUtilities.R
@@ -1,4 +1,4 @@
-# Copyright 2022 DARWIN EU®
+# Copyright 2023 DARWIN EU®
 #
 # This file is part of IncidencePrevalence
 #
@@ -287,6 +287,11 @@ getDoseForm <- function(cdm) {
 #'
 #' @param cdm cdm_reference via CDMConnector
 #' @param conceptId concpet_id to search
+#' @param withAncestor If TRUE, return column with ancestor. In case of multiple
+#' ancestors, concepts will be separated by ";"
+#' @param doseForm Only descendants codes with the specified drug dose form
+#' will be returned. If NULL, descendant codes will be returned regardless
+#' of dose form.
 #'
 #' @return The descendants of a given concept id
 #' @export
@@ -295,11 +300,16 @@ getDoseForm <- function(cdm) {
 #' cdm <- mockVocabRef()
 #' getDescendants(cdm = cdm, conceptId = 1)
 #' DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
-getDescendants <- function(cdm, conceptId) {
+
+getDescendants <- function(cdm,
+                           conceptId,
+                           withAncestor = FALSE,
+                           doseForm = NULL) {
+
   errorMessage <- checkmate::makeAssertCollection()
   cdmInheritsCheck <- inherits(cdm, "cdm_reference")
   checkmate::assertTRUE(cdmInheritsCheck,
-    add = errorMessage
+                        add = errorMessage
   )
   if (!isTRUE(cdmInheritsCheck)) {
     errorMessage$push(
@@ -307,28 +317,145 @@ getDescendants <- function(cdm, conceptId) {
     )
   }
   checkmate::assert_numeric(conceptId,
-    add = errorMessage
+                            add = errorMessage
   )
   checkmate::reportAssertions(collection = errorMessage)
 
+if(isFALSE(withAncestor)){
+  descendants <- getDescendantsOnly(cdm, conceptId, doseForm)}
+
+  if(isTRUE(withAncestor)){
+    descendants <- getDescendantsAndAncestor(cdm, conceptId, doseForm)}
+
+  return(descendants)
+}
+
+getDescendantsOnly <- function(cdm, conceptId, doseForm) {
   descendants <- cdm$concept_ancestor %>%
     dplyr::filter(.data$ancestor_concept_id %in% .env$conceptId) %>%
     dplyr::select("descendant_concept_id") %>%
     dplyr::distinct() %>%
     dplyr::rename("concept_id" = "descendant_concept_id") %>%
     dplyr::left_join(cdm$concept,
-      by = "concept_id"
+                     by = "concept_id")
+
+  if(!is.null(doseForm)){
+    descendantDoseForms <- getPresentDoseForms(cdm, concepts = descendants)
+  }
+
+  descendants <- descendants  %>%
+    dplyr::collect()
+
+  if(!is.null(doseForm)){
+    descendants <-  filterOnDoseForm(concepts = descendants,
+                       conceptDoseForms = descendantDoseForms,
+                     doseForm = doseForm)
+  }
+
+  # nb conceptId will also be a descendant of itself (if we don't specify dose)
+  return(descendants)
+}
+
+getDescendantsAndAncestor <- function(cdm, conceptId, doseForm) {
+
+  descendants <- cdm$concept_ancestor %>%
+    dplyr::inner_join(dplyr::tibble(ancestor_concept_id = as.integer(conceptId)),
+                      by = "ancestor_concept_id",
+                      copy = TRUE) %>%
+    dplyr::rename("concept_id" = "descendant_concept_id") %>%
+    dplyr::left_join(cdm$concept,
+                     by = "concept_id") %>%
+    dplyr::mutate(name = paste0("concept_", .data$ancestor_concept_id))
+
+  if(!is.null(doseForm)){
+    descendantDoseForms <- getPresentDoseForms(cdm, concepts = descendants)
+  }
+
+  descendants <- descendants %>%
+    dplyr::collect()
+
+    if(nrow(descendants)>0){
+descendants <- descendants %>%
+        tidyr::pivot_wider(names_from = "name",
+                           values_from = "ancestor_concept_id")
+
+  # one row per concept, with ancestor (of which there may be multiple)
+  working_cols <- stringr::str_subset(string = colnames(descendants),
+                                      pattern = paste(c(colnames(cdm$concept),
+                                                        colnames(cdm$concept_ancestor)),
+                                                      collapse = "|"),
+                                      negate = TRUE)
+
+descendants <- descendants %>%
+     tidyr::unite(col="ancestor_concept_id",
+                  working_cols, sep=";")
+# quicker to replace NAs afterwards rather than inside unite
+# (especially when there are many columns)
+descendants$ancestor_concept_id <- stringr::str_replace_all(
+  string = descendants$ancestor_concept_id,
+  pattern = ";NA|NA;",
+  replacement = ""
+)
+    }
+
+  if(!is.null(doseForm)){
+    descendants <-  filterOnDoseForm(concepts = descendants,
+                                     conceptDoseForms = descendantDoseForms,
+                                     doseForm = doseForm)
+  }
+
+  # nb conceptId will also be a descendant of itself
+  return(descendants)
+
+}
+
+getPresentDoseForms <- function(cdm, concepts){
+
+  presentDoseForms <- concepts %>%
+    dplyr::left_join(
+      cdm$concept_relationship %>%
+        dplyr::filter(.data$relationship_id == "RxNorm has dose form") %>%
+        dplyr::select("concept_id_1", "concept_id_2") %>%
+        dplyr::rename("concept_id" = "concept_id_2") %>%
+        dplyr::distinct() %>%
+        dplyr::left_join(cdm$concept, by = "concept_id") %>%
+        dplyr::select("concept_id_1", "concept_name") %>%
+        dplyr::rename("concept_id"="concept_id_1",
+                      "dose_form"="concept_name")  ,
+      by ="concept_id"
     ) %>%
+    dplyr::select("concept_id", "dose_form") %>%
     dplyr::collect()
-  # return concept_id used along with descendants
-  all <- dplyr::bind_rows(
-    cdm$concept %>%
-      dplyr::filter(.data$concept_id %in% .env$conceptId) %>%
-      dplyr::collect(),
-    descendants
-  ) %>%
-    dplyr::distinct() %>%
-    dplyr::arrange("concept_id")
 
-  return(all)
+  presentDoseForms <- presentDoseForms %>%
+    dplyr::group_by(.data$concept_id) %>%
+    dplyr::mutate(seq = dplyr::row_number()) %>%
+    tidyr::pivot_wider(
+      names_from = "seq",
+      values_from = "dose_form"
+    )
+  presentDoseForms <- presentDoseForms %>%
+    tidyr::unite(
+      col = "dose_form", 2:ncol(presentDoseForms), sep = "; ",
+      na.rm = TRUE
+    )
+  return(presentDoseForms)
+
+}
+
+filterOnDoseForm <- function(concepts, conceptDoseForms, doseForm){
+  concepts <- concepts %>%
+    dplyr::inner_join(
+      conceptDoseForms %>%
+        dplyr::filter(stringr::str_detect(
+          string = tolower(.data$dose_form),
+          pattern = paste(tolower(.env$doseForm),
+                          collapse = "|"
+          )
+        )) %>%
+        dplyr::select("concept_id"),
+      by = "concept_id")
+
+  return(concepts)
+
 }
diff --git a/man/getATCCodes.Rd b/man/getATCCodes.Rd
new file mode 100644
index 0000000..5ca9981
--- /dev/null
+++ b/man/getATCCodes.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/drugCodes.R
+\name{getATCCodes}
+\alias{getATCCodes}
+\title{Get descendant codes for ATC levels}
+\usage{
+getATCCodes(cdm, level = c("ATC 1st"), name = NULL, doseForm = NULL)
+}
+\arguments{
+\item{cdm}{cdm_reference via CDMConnector}
+
+\item{level}{ATC level. Can be one or more of "ATC 1st", "ATC 2nd",
+"ATC 3rd", "ATC 4th", and "ATC 5th"}
+
+\item{name}{ATC name of interest. For example, c("Dermatologicals",
+"Nervous System"), would result in a list of length two with the descendant
+concepts for these two particular ATC groups.}
+
+\item{doseForm}{Only descendants codes with the specified dose form
+will be returned. If NULL, descendant codes will be returned regardless
+of dose form.}
+}
+\value{
+A named list, with each element containing the descendant
+concepts for a particular ATC group
+}
+\description{
+Get descendant codes for ATC levels
+}
+\examples{
+cdm <- mockVocabRef()
+getATCCodes(cdm = cdm, level = "ATC 1st")
+DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+}
diff --git a/man/getCandidateCodes.Rd b/man/getCandidateCodes.Rd
index 30e6124..e74344b 100644
--- a/man/getCandidateCodes.Rd
+++ b/man/getCandidateCodes.Rd
@@ -5,7 +5,7 @@
 \title{Generate candidate codelist for the OMOP CDM}
 \usage{
 getCandidateCodes(
-  cdm = NULL,
+  cdm,
   keywords,
   exclude = NULL,
   domains = "Condition",
diff --git a/man/getDescendants.Rd b/man/getDescendants.Rd
index ab5a470..516bc3c 100644
--- a/man/getDescendants.Rd
+++ b/man/getDescendants.Rd
@@ -4,12 +4,19 @@
 \alias{getDescendants}
 \title{getDescendants}
 \usage{
-getDescendants(cdm, conceptId)
+getDescendants(cdm, conceptId, withAncestor = FALSE, doseForm = NULL)
 }
 \arguments{
 \item{cdm}{cdm_reference via CDMConnector}
 
 \item{conceptId}{concpet_id to search}
+
+\item{withAncestor}{If TRUE, return column with ancestor. In case of multiple
+ancestors, concepts will be separated by ";"}
+
+\item{doseForm}{Only descendants codes with the specified drug dose form
+will be returned. If NULL, descendant codes will be returned regardless
+of dose form.}
 }
 \value{
 The descendants of a given concept id
diff --git a/man/getDrugIngredientCodes.Rd b/man/getDrugIngredientCodes.Rd
new file mode 100644
index 0000000..ef1f2dc
--- /dev/null
+++ b/man/getDrugIngredientCodes.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/drugCodes.R
+\name{getDrugIngredientCodes}
+\alias{getDrugIngredientCodes}
+\title{Get descendant codes for drug ingredients}
+\usage{
+getDrugIngredientCodes(cdm, name = NULL, doseForm = NULL)
+}
+\arguments{
+\item{cdm}{cdm_reference via CDMConnector}
+
+\item{name}{Names of ingredients of interest. For example, c("acetaminophen",
+"codeine"), would result in a list of length two with the descendant
+concepts for these two particular drug ingredients.}
+
+\item{doseForm}{Only descendants codes with the specified dose form
+will be returned. If NULL, descendant codes will be returned regardless
+of dose form.}
+}
+\value{
+A named list, with each item containing descendant concepts of
+an ingredient
+}
+\description{
+Get descendant codes for drug ingredients
+}
+\examples{
+cdm <- mockVocabRef()
+getDrugIngredientCodes(cdm = cdm, name = "Adalimumab")
+DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+}
diff --git a/tests/testthat/test-drugCodes.R b/tests/testthat/test-drugCodes.R
new file mode 100644
index 0000000..8c9eca6
--- /dev/null
+++ b/tests/testthat/test-drugCodes.R
@@ -0,0 +1,78 @@
+test_that("getATCCodes working", {
+
+  backends <- c("database", "arrow", "data_frame")
+  for (i in seq_along(backends)) {
+  cdm <- mockVocabRef(backend = backends[i])
+  atcCodes <- getATCCodes(cdm, level = "ATC 1st")
+  expect_true(all(atcCodes[[1]] == c(12,13)))
+
+  atcCodes2 <- getATCCodes(cdm, level = "ATC 1st",
+                           name = "ALIMENTARY TRACT AND METABOLISM")
+  expect_true(all(atcCodes2[[1]] == c(12,13)))
+
+  if (backends[[i]] == "database") {
+    DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+  }
+
+  }
+
+})
+
+test_that("getATCCodes expected errors", {
+
+  backends <- c("database", "arrow", "data_frame")
+  for (i in seq_along(backends)) {
+    cdm <- mockVocabRef(backend = backends[i])
+    expect_error(getATCCodes(cdm, level = "Not an ATC level"))
+    expect_error(getATCCodes(cdm, level = "ATC 1st",
+                             name = "Not an ATC name"))
+
+    if (backends[[i]] == "database") {
+      DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+    }
+  }
+
+})
+
+test_that("getDrugIngredientCodes working", {
+
+  backends <- c("database", "arrow", "data_frame")
+  for (i in seq_along(backends)) {
+    cdm <- mockVocabRef(backend = backends[i])
+    ing_codes <- getDrugIngredientCodes(cdm)
+    expect_true(all(ing_codes[[1]] == c(10,13)))
+
+    ing_codes2 <- getDrugIngredientCodes(cdm, name = "Adalimumab")
+    expect_true(all(ing_codes2[[1]] == c(10,13)))
+
+    ing_codes3 <- getDrugIngredientCodes(cdm,
+                                         name = "Adalimumab",
+                                         doseForm = "injectable")
+    expect_true(all(ing_codes3[[1]] == c(13)))
+
+    ing_codes4 <- getDrugIngredientCodes(cdm,
+                                         name = "Adalimumab",
+                                         doseForm = "injection")
+    expect_true(all(ing_codes4[[1]] == c(10)))
+
+    if (backends[[i]] == "database") {
+      DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+    }
+
+  }
+})
+
+test_that("getDrugIngredientCodes expected errors", {
+
+  backends <- c("database", "arrow", "data_frame")
+  for (i in seq_along(backends)) {
+    cdm <- mockVocabRef(backend = backends[i])
+    expect_error(getDrugIngredientCodes(cdm, name = "Not an Ingredient"))
+
+    if (backends[[i]] == "database") {
+      DBI::dbDisconnect(attr(cdm, "dbcon"), shutdown = TRUE)
+    }
+  }
+
+})
+
diff --git a/tests/testthat/test-getCandidateCodes.R b/tests/testthat/test-getCandidateCodes.R
index 94b2216..c43b329 100644
--- a/tests/testthat/test-getCandidateCodes.R
+++ b/tests/testthat/test-getCandidateCodes.R
@@ -224,7 +224,7 @@ test_that("tests with mock db", {
       domains = "Drug",
       searchInSynonyms = TRUE,
       searchViaSynonyms = TRUE,
-      includeDescendants = TRUE,
+      includeDescendants = FALSE,
       includeAncestor = TRUE
     )
     expect_true(all(c(
diff --git a/tests/testthat/test-vocabUtilities.R b/tests/testthat/test-vocabUtilities.R
index 10bdf51..101b847 100644
--- a/tests/testthat/test-vocabUtilities.R
+++ b/tests/testthat/test-vocabUtilities.R
@@ -26,15 +26,52 @@ test_that("tests with mock db", {
     )
     expect_true(is.character(conceptClasses))
 
-    descendants <- getDescendants(
+    descendants1 <- getDescendants(
       cdm = cdm,
-      conceptId = 1
+      conceptId = 1,
+      withAncestor = FALSE
     )
-    expect_true(all(descendants$concept_id == c(1, 2, 3, 4, 5)))
+    expect_true(all(descendants1$concept_id == c(1, 2, 3, 4, 5)))
+    expect_true("concept_name" %in% colnames(descendants1))
+
+    descendants2 <- getDescendants(
+      cdm = cdm,
+      conceptId = 1,
+      withAncestor = TRUE
+    )
+    expect_true(all(descendants2$concept_id == c(1, 2, 3, 4, 5)))
+    expect_true("ancestor_concept_id" %in% colnames(descendants2))
+    expect_true(all(descendants2$ancestor_concept_id == 1))
+
+    descendants3 <- getDescendants(
+      cdm = cdm,
+      conceptId = 10,
+      withAncestor = FALSE,
+      doseForm = c("Injection", "Injectable")
+    )
+    expect_true(all(descendants3$concept_id == c(10, 13)))
+
+    descendants4 <- getDescendants(
+      cdm = cdm,
+      conceptId = 10,
+      withAncestor = TRUE,
+      doseForm = c("Injection", "Injectable")
+    )
+    expect_true(all(descendants4$concept_id == c(10, 13)))
+
+
+    descendants5 <- getDescendants(
+      cdm = cdm,
+      conceptId = 10,
+      withAncestor = TRUE,
+      doseForm = c("Injectable")
+    )
+    expect_true(all(descendants5$concept_id == c(13)))
+
 
 
     doseForms <- getDoseForm(cdm = cdm)
-    expect_true(doseForms == "Injection")
+    expect_true(all(doseForms == c("Injection", "Injectable")))
 
     # expected errors
     expect_error(getVocabVersion(cdm = "a"))