# “dx extract_dataset” in R
<hr/>
***As-Is Software Disclaimer***

The content in this repository is delivered “As-Is”. Notwithstanding anything to the contrary, DNAnexus will have no warranty, support, liability or other obligations with respect to Materials provided hereunder.

<hr/>

This notebook demonstrates usage of the dx command `extract_dataset` for:
* Retrieval of Apollo-stored data, as referenced within entities and fields of a Dataset or Cohort object on the platform
* Retrieval of the underlying data dictionary files used to generate a Dataset object on the platform

<a href="https://github.com/dnanexus/OpenBio/blob/master/LICENSE.md">MIT License</a> applies to this notebook.

## Preparing your environment
### Launch spec:

* App name: JupyterLab with Python, R, Stata, ML ()
* Kernel: R
* Instance type: mem1_ssd1_v2_x2
* Cost: < $0.2
* Runtime: ~10 min
* Data description: Input for this notebook is a v3.0 Dataset or Cohort object ID

### Install dxpy
extract_dataset requires dxpy version >= 0.329.0. However, a more recent version of dxpy from PyPI may already be installed, making the "pip" install below unnecessary. If you run the command from your local environment (i.e. off of the DNAnexus platform), you may also need to install pandas, for example: `pip3 install -U dxpy[pandas]`

In [None]:
# Report which dxpy release (if any) is currently installed
pip_show_cmd <- "pip3 show dxpy"
system(pip_show_cmd, intern = TRUE)

# Install/upgrade to a pinned dxpy release that satisfies the minimum version
pip_install_cmd <- "pip3 install -U dxpy==0.363.0"
system(pip_install_cmd)

### Install readr (part of the tidyverse) for data processing

In [None]:
# Install readr, which supplies the read_csv()/write_csv() calls used below
install.packages(pkgs = "readr")

### Import packages

In [None]:
library(dplyr)
library(readr)
library(stringr)

### 1. Assign environment variables

In [None]:
# NOTE: the Dataset referenced here is private and only illustrates the expected
# input. Supply a record-id that is valid and that you are permitted to access.

# Project that holds the dataset
pid <- "project-G5BzYk80kP5bvbXy5J7PQZ36"
# Record ID of the dataset itself
rid <- "record-GJ3Y7jQ0VKyy592yPxB4yG7Y"
# Fully qualified "project-id:record-id" reference handed to dx
dataset <- paste0(pid, ":", rid)

### 2. Call “dx extract_dataset” using a supplied dataset

In [None]:
# -ddd asks dx for the dataset dictionary files; they are written as
# comma-delimited CSVs into the working directory.
dictionary_flags <- "-ddd --delimiter ','"
cmd <- paste0("dx extract_dataset ", dataset, " ", dictionary_flags)
system(cmd, intern = TRUE)

#### Preview data in the three dictionary (*.csv) files

In [None]:
# Find the codings dictionary in the working directory, load it, and preview it
codings_file <- system(command = "ls *.codings.csv", intern = TRUE)
codings_df <- codings_file %>%
  read_csv(show_col_types = FALSE)
codings_df %>% head()

In [None]:
# Find the data dictionary in the working directory, load it, and preview it
data_dict_file <- system(command = "ls *.data_dictionary.csv", intern = TRUE)
data_dict_df <- data_dict_file %>%
  read_csv(show_col_types = FALSE)
data_dict_df %>% head()

In [None]:
# Find the entity dictionary in the working directory, load it, and preview it
entity_dict_file <- system(command = "ls *.entity_dictionary.csv", intern = TRUE)
entity_dict_df <- entity_dict_file %>%
  read_csv(show_col_types = FALSE)
entity_dict_df %>% head()

### 3. Parse returned metadata and extract entity/field names

In [None]:
# Build the "entity.field" identifier for every row of the data dictionary.
# paste() is vectorised, so no rowwise() grouping is required; omitting it
# also keeps data_dict_df an ordinary (ungrouped) tibble for the filters and
# joins performed on it later.
data_dict_df <- data_dict_df %>%
  mutate(ent_field = paste(entity, name, sep = "."))

# Comma-separated list of all entity.field names, in dictionary order
field_list_str <- paste(data_dict_df$ent_field, collapse = ",")

### 4. Use extracted entity and field names as input to the called function, “dx extract_dataset” and extract data

In [None]:
# Extract every listed entity.field and write the result to extracted_data.csv
cmd <- paste0(
  "dx extract_dataset ", dataset,
  "  --fields ", field_list_str,
  " -o extracted_data.csv"
)
system(cmd)

#### Preview data in the retrieved data file

In [None]:
# Load the extracted data and show its first rows
extracted_path <- "extracted_data.csv"
data_df <- extracted_path %>%
  read_csv(show_col_types = FALSE)
data_df %>% head()

#### Alternatively, save the extracted entity and field names into a file and supply it using the "--fields-file" option

In [None]:
# Turn the comma-separated list into one entity.field per line and save it
field_list_break_line <- chartr(",", "\n", field_list_str)
fields_path <- "entity_field_input_file.txt"
write(field_list_break_line, file = fields_path)

# Same extraction as before, but the field names are read from the file
output_path <- "extracted_data_entity_field_input_file.csv"
cmd <- paste0(
  "dx extract_dataset ", dataset,
  " --fields-file ", fields_path,
  " -o ", output_path
)
system(cmd, intern = TRUE)

# Load and preview the result; this replaces the earlier data_df
data_df <- output_path %>%
  read_csv(show_col_types = FALSE)
data_df %>% head()

### 5. Replace any coded column values of extracted data with the coded meaning

Create a subset of the codings tibble and join with the data dictionary tibble to get only coded columns

In [None]:
# Dictionary columns needed to describe each coded field
dict_cols <- data_dict_df %>%
  select(
    entity, name, ent_field,
    coding_name, is_multi_select, is_sparse_coding
  )

# Keep codings that have a coding name and attach the fields that use them
named_codings <- codings_df %>%
  filter(!is.na(coding_name))
coded_col_df <- inner_join(named_codings, dict_cols, by = "coding_name")
head(coded_col_df)

In [None]:
# Get a list of unique coded ent_field
coded_cols <- coded_col_df %>%
  distinct(ent_field) %>%
  pull(ent_field)
head(coded_cols)

In [None]:
# Function for decoding single coded values
decode_single <- function(single_code, coded_col_df, curr_col) {
    single_code <- ifelse(
      typeof(single_code) != "character",
      ifelse(
          as.character(single_code) %in% (coded_col_df %>% filter(ent_field == curr_col) %>% pull(code)),
          as.character(single_code),
          paste(as.character(single_code), ".0", sep="")
      ),
        single_code
  )
  ifelse(single_code %in% (coded_col_df %>% filter(ent_field == curr_col) %>% pull(code)),
    toString(coded_col_df %>% filter(
      ent_field == curr_col,
      code == single_code
    ) %>% pull(meaning)), 
         single_code
  )
}

# Function for decoding multi_select coded values
# Decode a multi-select value, i.e. a string holding an array of codes such
# as '["1","2"]'.
#
# Arguments:
#   multi_code   - the array string taken from a multi-select column
#   coded_col_df - codings table passed through to decode_single()
#   curr_col     - the "entity.field" name of the column being decoded
#
# Returns a string in the same array notation with each code replaced by
# the result of decode_single(). An empty array is returned as "[]".
decode_multi_select <- function(multi_code, coded_col_df, curr_col) {
  # Remove the brackets and quotes, leaving a bare comma-separated list
  stripped <- str_replace_all(multi_code, '\\[|\\]|\\"', "")
  # Trim each element so that "1, 2" is looked up as "1" and "2"
  codes <- trimws(unlist(strsplit(stripped, ",")))

  if (length(codes) == 0) {
    # Nothing was selected; keep the array empty instead of emitting '[""]'
    return("[]")
  }

  meanings <- vapply(
    codes,
    function(one_code) {
      as.character(decode_single(one_code, coded_col_df, curr_col))
    },
    character(1),
    USE.NAMES = FALSE
  )
  paste0('["', paste(meanings, collapse = '","'), '"]')
}

# Decoding function
# Decode one cell of a coded column.
#
# Arguments:
#   code     - a single (length-1) cell value
#   curr_col - the "entity.field" name of the column the value comes from
#
# Reads the coded_col_df table from the calling environment. Returns NA for
# a missing value; otherwise the decoded value from decode_multi_select()
# when the column's is_multi_select entry is non-missing, or from
# decode_single() when it is missing.
decode_fun <- function(code, curr_col) {
  if (is.na(code)) {
    return(NA)
  }

  # Look the column's metadata up once. Any non-missing is_multi_select
  # value marks the column as multi-select.
  multi_flag <- coded_col_df %>%
    filter(ent_field == curr_col) %>%
    distinct(is_multi_select) %>%
    pull(is_multi_select)

  if (any(!is.na(multi_flag))) {
    decode_multi_select(code, coded_col_df, curr_col)
  } else {
    decode_single(code, coded_col_df, curr_col)
  }
}

In [None]:
# decode_fun() handles one value at a time, so apply it row by row to every
# coded column, then drop the rowwise grouping so that decoded_data is an
# ordinary tibble for anything done with it afterwards.
decoded_data <- data_df %>%
  rowwise() %>%
  mutate(across(
    all_of(coded_cols),
    ~ decode_fun(.x, cur_column())
  )) %>%
  ungroup()
head(decoded_data)

### 6. Drop sparsely coded values

In [None]:
# Function for dropping single sparsely coded values
# Drop a single sparsely coded value.
#
# Arguments:
#   single_code  - a single (length-1) value taken from a sparsely coded column
#   coded_col_df - codings table with columns ent_field and code
#   curr_col     - the "entity.field" name of the column being processed
#
# Returns NA when the value is one of the column's codes; otherwise returns
# the value unchanged, as a character string.
decode_single_sparse <- function(single_code, coded_col_df, curr_col) {
  # which() drops rows whose ent_field is NA
  col_codes <- coded_col_df$code[which(coded_col_df$ent_field == curr_col)]

  raw_value <- as.character(single_code)
  candidates <- raw_value
  if (!is.character(single_code)) {
    # A non-character value is also tried with a ".0" suffix, so that a code
    # stored as "-1.0" still matches the number -1.
    candidates <- c(raw_value, paste0(raw_value, ".0"))
  }

  if (any(candidates %in% col_codes)) {
    NA
  } else {
    # A genuine data value: return it as-is. The ".0" suffix is only a
    # lookup aid and must not be appended to real measurements.
    raw_value
  }
}

# Function for dropping multi select sparsely coded values
# Drop sparsely coded values inside a multi-select value, i.e. a string
# holding an array such as '["-1","12"]'.
#
# Arguments:
#   multi_code   - the array string taken from a multi-select column
#   coded_col_df - codings table passed through to decode_single_sparse()
#   curr_col     - the "entity.field" name of the column being processed
#
# Returns a string in the same array notation with each element replaced by
# the result of decode_single_sparse(). An empty array is returned as "[]".
decode_multi_sparse <- function(multi_code, coded_col_df, curr_col) {
  # Remove the brackets and quotes, leaving a bare comma-separated list
  stripped <- str_replace_all(multi_code, '\\[|\\]|\\"', "")
  # Trim each element so that "1, 2" is looked up as "1" and "2"
  codes <- trimws(unlist(strsplit(stripped, ",")))

  if (length(codes) == 0) {
    # Nothing was selected; keep the array empty instead of emitting '[""]'
    return("[]")
  }

  # NOTE(review): a dropped element comes back as NA and is written into the
  # array as the text "NA". Confirm whether such elements should instead be
  # removed from the array.
  kept <- vapply(
    codes,
    function(one_code) {
      as.character(decode_single_sparse(one_code, coded_col_df, curr_col))
    },
    character(1),
    USE.NAMES = FALSE
  )
  paste0('["', paste(kept, collapse = '","'), '"]')
}

# Function for dropping sparsely coded values
# Drop the sparsely coded value (if any) in one cell of a coded column.
#
# Arguments:
#   code     - a single (length-1) cell value
#   curr_col - the "entity.field" name of the column the value comes from
#
# Reads the coded_col_df table from the calling environment. Returns NA for
# a missing value. For a column whose is_sparse_coding entry is missing the
# value is returned as a string; otherwise it is passed to
# decode_multi_sparse() when is_multi_select is non-missing, or to
# decode_single_sparse() when it is missing.
decode_fun_sparse <- function(code, curr_col) {
  if (is.na(code)) {
    return(NA)
  }

  # Look the column's metadata up once instead of once per flag
  col_meta <- coded_col_df %>%
    filter(ent_field == curr_col)

  # Any non-missing flag value counts as "set"
  is_sparse <- any(!is.na(col_meta$is_sparse_coding))
  if (!is_sparse) {
    return(toString(code))
  }

  is_multi <- any(!is.na(col_meta$is_multi_select))
  if (is_multi) {
    decode_multi_sparse(code, coded_col_df, curr_col)
  } else {
    decode_single_sparse(code, coded_col_df, curr_col)
  }
}

In [None]:
# decode_fun_sparse() handles one value at a time, so apply it row by row to
# every coded column, then drop the rowwise grouping so that
# decoded_data_sparse is an ordinary tibble for anything done afterwards.
decoded_data_sparse <- data_df %>%
  rowwise() %>%
  mutate(across(
    all_of(coded_cols),
    ~ decode_fun_sparse(.x, cur_column())
  )) %>%
  ungroup()
head(decoded_data_sparse)

### 7. Replace the column titles (field names) of extracted data with the field titles

In [None]:
# Column names of the extracted data, i.e. the entity.field names
col_list <- names(data_df)
col_list %>% head()

In [None]:
# Title of every extracted column, looked up in the data dictionary by its
# entity.field name. A column that is absent from the dictionary gets NA
# rather than stopping the cell.
# Assumes each ent_field occurs once in the dictionary; the first match is used.
col_titles <- data_dict_df$title[match(col_list, data_dict_df$ent_field)]

# Each title once, in the order the columns appear
title_list <- unique(col_titles)
# Titles that are used by more than one column (one entry per repeat)
duplicate_titles <- col_titles[duplicated(col_titles)]

In [None]:
# Choose the display name for one extracted column.
#
# Arguments:
#   col - the column's "entity.field" name
#
# Reads data_dict_df and duplicate_titles from the calling environment.
# Returns the field title from the data dictionary. When that title is shared
# with another column, or no single usable title is found, the column name is
# returned with "." replaced by "-" so that the result is always one
# non-missing name.
rename_func <- function(col) {
  field_title <- data_dict_df %>%
    filter(ent_field == col) %>%
    pull(title)
  fallback_name <- str_replace_all(col, "\\.", "-")

  # || short-circuits, so the later checks only run on a length-1 title
  needs_fallback <- length(field_title) != 1 ||
    is.na(field_title) ||
    field_title %in% duplicate_titles

  if (needs_fallback) {
    fallback_name
  } else {
    field_title
  }
}

In [None]:
# Work out the new name of every column, then apply them all in one step
new_col_names <- unlist(lapply(col_list, rename_func))
data_df_rename <- data_df %>%
  rename_with(~new_col_names)
data_df_rename %>% head()

### 8. Export the output and upload to the project

In [None]:
# Output file name -> table to write into it
exports <- list(
  "extracted_data_with_code_meanings.csv" = decoded_data,
  "extracted_data_with_sparse_code_drop.csv" = decoded_data_sparse,
  "extracted_data_with_updated_titles.csv" = data_df_rename
)
for (export_path in names(exports)) {
  write_csv(exports[[export_path]], file = export_path)
}

In [None]:
# Upload every CSV file in the working directory to the project
upload_cmd <- "dx upload *.csv"
system(upload_cmd)