08 Extract table 4.1 codes

Extract participants and codes for table 4.1 (ulcerative colitis [UC] and Crohn's disease)

In [1]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "WGS table 4.1 codes" for domain "condition" and was generated for All of Us Controlled Tier Dataset v5
#
# What the SQL below does:
#   * Takes rows from `condition_occurrence` whose `condition_concept_id` is a
#     selectable standard concept at or below one of the seven listed
#     `cb_criteria` concepts, OR whose `condition_source_concept_id` is a
#     selectable non-standard concept at or below one of the listed source
#     concepts. Descendants are found by matching the ancestor's criteria id
#     inside the dotted `cb_criteria.path`.
#   * Keeps only persons with `has_whole_genome_variant = 1` in
#     `cb_search_person`.
#   * LEFT JOINs `concept` (and `visit_occurrence`) to attach readable names
#     for the standard concept, condition type, visit, source concept and
#     condition status.
# NOTE(review): per the notebook title the concept ids are presumably the
#   UC / Crohn's disease codes of table 4.1 -- confirm against the workbench
#   dataset definition before editing the lists.
#
# The query is one literal, so it is assigned directly; the former
# `paste(..., sep="")` wrapper around a single string was a no-op.
dataset_82374778_condition_sql <- "
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_concept_id,
        c_standard_concept.concept_name as standard_concept_name,
        c_standard_concept.concept_code as standard_concept_code,
        c_standard_concept.vocabulary_id as standard_vocabulary,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_end_datetime,
        c_occurrence.condition_type_concept_id,
        c_type.concept_name as condition_type_concept_name,
        c_occurrence.stop_reason,
        c_occurrence.visit_occurrence_id,
        visit.concept_name as visit_occurrence_concept_name,
        c_occurrence.condition_source_value,
        c_occurrence.condition_source_concept_id,
        c_source_concept.concept_name as source_concept_name,
        c_source_concept.concept_code as source_concept_code,
        c_source_concept.vocabulary_id as source_vocabulary,
        c_occurrence.condition_status_source_value,
        c_occurrence.condition_status_concept_id,
        c_status.concept_name as condition_status_concept_name 
    FROM
        ( SELECT
            * 
        FROM
            `condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id IN  (
                    SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (
                            select
                                cast(cr.id as string) as id 
                            FROM
                                `cb_criteria` cr 
                            WHERE
                                concept_id IN (
                                    194077, 194684, 4242392, 195585, 201606, 4342656, 201773
                                ) 
                                AND full_text LIKE '%_rank1]%'
                        ) a 
                            ON (
                                c.path LIKE CONCAT('%.',
                            a.id,
                            '.%') 
                            OR c.path LIKE CONCAT('%.',
                            a.id) 
                            OR c.path LIKE CONCAT(a.id,
                            '.%') 
                            OR c.path = a.id) 
                        WHERE
                            is_standard = 1 
                            AND is_selectable = 1
                        ) 
                        OR  condition_source_concept_id IN  (
                            SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (
                                    select
                                        cast(cr.id as string) as id 
                                    FROM
                                        `cb_criteria` cr 
                                    WHERE
                                        concept_id IN (
                                            45538532, 45586718, 1569632, 45586723, 45567307, 45576989, 44820945, 45601176, 45596326, 45581901, 44829046, 44819799, 45572212, 45586717, 45533598, 45596332, 45538535, 44831313, 44820946, 45596325, 45557657, 45533605, 45552931, 45567310, 45601175, 45567309, 45552934, 45605939, 45586719, 45543317, 45567306, 45538530, 45572213, 44827872, 44837172, 45576987, 45562489, 45552932, 45601172, 44822028, 45533606, 45576988, 45596327, 45601174, 45567311, 45543315, 45596331, 45548161, 44837173, 45605940, 45601177, 45576986, 45591593, 45533600, 1569610, 45562492, 44830157, 45562491, 45586722, 45562494, 1569619, 45576990, 45601178, 1569620, 45572211, 44833647, 45596333, 45562493, 45548160, 45586716, 45596328, 44825522, 44822029, 45543316, 45596330, 44830158, 45538534, 45533601, 45552935, 45581902, 45548159, 45557654, 45552933, 45586721, 45538531, 45567308, 45533602, 45557653, 45572214, 45543318
                                        ) 
                                        AND full_text LIKE '%_rank1]%'
                                ) a 
                                    ON (
                                        c.path LIKE CONCAT('%.',
                                    a.id,
                                    '.%') 
                                    OR c.path LIKE CONCAT('%.',
                                    a.id) 
                                    OR c.path LIKE CONCAT(a.id,
                                    '.%') 
                                    OR c.path = a.id) 
                                WHERE
                                    is_standard = 0 
                                    AND is_selectable = 1
                                )
                        )  
                        AND (
                            c_occurrence.PERSON_ID IN (
                                SELECT
                                    distinct person_id  
                                FROM
                                    `cb_search_person` cb_search_person  
                                WHERE
                                    cb_search_person.person_id IN (
                                        SELECT
                                            person_id 
                                        FROM
                                            `cb_search_person` p 
                                        WHERE
                                            has_whole_genome_variant = 1 
                                    ) 
                                )
                        )
                    ) c_occurrence 
                LEFT JOIN
                    `concept` c_standard_concept 
                        ON c_occurrence.condition_concept_id = c_standard_concept.concept_id 
                LEFT JOIN
                    `concept` c_type 
                        ON c_occurrence.condition_type_concept_id = c_type.concept_id 
                LEFT JOIN
                    `visit_occurrence` v 
                        ON c_occurrence.visit_occurrence_id = v.visit_occurrence_id 
                LEFT JOIN
                    `concept` visit 
                        ON v.visit_concept_id = visit.concept_id 
                LEFT JOIN
                    `concept` c_source_concept 
                        ON c_occurrence.condition_source_concept_id = c_source_concept.concept_id 
                LEFT JOIN
                    `concept` c_status 
                        ON c_occurrence.condition_status_concept_id = c_status.concept_id"

# Build the Cloud Storage wildcard path that the BigQuery export is written to.
# NOTE: The date component means that exporting again on the same day
#       overwrites that day's copy, while an export on a different day lands in
#       a new location, so historical copies are kept as the dataset definition
#       changes.
condition_82374778_path <- do.call(
  file.path,
  list(
    Sys.getenv("WORKSPACE_BUCKET"),
    "bq_exports",
    Sys.getenv("OWNER_EMAIL"),
    strftime(lubridate::now(), "%Y%m%d"),  # Remove this element if you want the export to always overwrite.
    "condition_82374778",
    "condition_82374778_*.csv"
  )
)

# Tell the user where the export lives so the path can be reused later.
message(
  str_glue(
    "The data will be written to {condition_82374778_path}. Use this path when reading ",
    "the data into your notebooks in the future."
  )
)

# Run the query against the dataset named by WORKSPACE_CDR (billed to
# GOOGLE_PROJECT) and export the result to Cloud Storage as CSV files.
# NOTE: `bq_table_save` only has to be run once. Afterwards the data can be
#       read straight from the CSVs in Cloud Storage.
bq_table_save(
  bq_dataset_query(
    Sys.getenv("WORKSPACE_CDR"),
    dataset_82374778_condition_sql,
    billing = Sys.getenv("GOOGLE_PROJECT")
  ),
  condition_82374778_path,
  destination_format = "CSV"
)



“running command 'timedatectl' had status 1”
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.7
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

The data will be written to gs://fc-secure-329f909f-3a71-416a-9c18-95db1c1f801d/bq_exports/earosenthal@preprod.researchallofus.org/20220526/condition_82374778/condition_82374778_*.csv. Use this path when reading the data into your notebooks in the future.



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {condition_82374778_path}` to copy these files
#       to the Jupyter disk.
#
# Lists the CSV shards matching `export_path` with `gsutil ls`, streams each
# shard through `gsutil cat` into `read_csv()`, and row-binds the shards.
#
# Args:
#   export_path: Cloud Storage path or wildcard pattern of the exported CSVs.
#   col_types:   Optional readr column specification applied to every shard.
#                When NULL (the default) the types guessed for the first shard
#                are reused for all later shards so that every shard has the
#                same column types. Pass an explicit specification if a column
#                may be empty in the first shard but populated later, because
#                values that do not fit the reused type become NA (readr warns
#                about such parsing problems).
#
# Returns:
#   One data frame holding the rows of all shards (an empty one when the
#   listing contains no paths).
#
# Stops with the gsutil diagnostics when `gsutil ls` exits with a non-zero
# status.
read_bq_export_from_workspace_bucket <- function(export_path, col_types = NULL) {
  # Send stderr to a scratch file: capturing it together with stdout would mix
  # gsutil diagnostics into the list of paths that is read below.
  stderr_file <- tempfile("gsutil_ls_stderr_")
  on.exit(unlink(stderr_file), add = TRUE)

  # system2() warns on a non-zero exit; the status is checked explicitly below.
  csv_paths <- suppressWarnings(
    system2(
      "gsutil",
      args = c("ls", shQuote(export_path)),  # quoted so gsutil, not the shell, expands the wildcard
      stdout = TRUE,
      stderr = stderr_file
    )
  )

  exit_status <- attr(csv_paths, "status")
  if (!is.null(exit_status) && exit_status != 0) {
    diagnostics <- if (file.exists(stderr_file)) {
      readLines(stderr_file, warn = FALSE)
    } else {
      character(0)
    }
    stop(
      "`gsutil ls ", export_path, "` failed with exit status ", exit_status, ":\n",
      paste(diagnostics, collapse = "\n"),
      call. = FALSE
    )
  }
  csv_paths <- csv_paths[nzchar(csv_paths)]

  # A plain loop is used so the specification learned from the first shard is
  # visible to later iterations; assigning it inside a callback passed to
  # `map()` would only create a variable local to that callback.
  chunks <- vector("list", length(csv_paths))
  for (i in seq_along(csv_paths)) {
    csv <- csv_paths[[i]]
    message(str_glue("Loading {csv}."))
    chunk <- read_csv(
      pipe(str_glue("gsutil cat {shQuote(csv)}")),
      col_types = col_types,
      show_col_types = FALSE
    )
    if (is.null(col_types)) {
      col_types <- spec(chunk)
    }
    chunks[[i]] <- chunk
  }

  bind_rows(chunks)
}
# Load every exported CSV for the condition dataset into a single data frame.
dataset_82374778_condition_df <- read_bq_export_from_workspace_bucket(
  export_path = condition_82374778_path
)

# Show the number of rows and columns that were loaded.
dim(dataset_82374778_condition_df)

# Preview the first five rows.
head(dataset_82374778_condition_df, n = 5)