In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "DBPSBP_srWGS_dataset" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
#
# The SQL selects demographic columns from `person` and resolves each *_concept_id to its
# concept_name through LEFT JOINs on `concept`.
# The WHERE clause keeps persons from `cb_search_person` that
#   * have has_whole_genome_variant = 1, AND
#   * appear in `cb_search_all_events` with either a source concept (is_standard = 0) in
#     (903118, 903115) or a standard concept (is_standard = 1) found in `cb_criteria` under
#     the entries for concepts 3018586, 3034703, 4152194, 3004249, 3012888, 4154790.
# Table names are unqualified; the dataset is supplied when the string is passed to
# bq_dataset_query() together with WORKSPACE_CDR further down in this cell.
# NOTE(review): 903118/903115 presumably are the systolic/diastolic physical-measurement
#               source concepts - confirm against the concept table.
dataset_84117830_person_sql <- paste("
    SELECT
        person.person_id,
        person.gender_concept_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        person.race_concept_id,
        p_race_concept.concept_name as race,
        person.ethnicity_concept_id,
        p_ethnicity_concept.concept_name as ethnicity,
        person.sex_at_birth_concept_id,
        p_sex_at_birth_concept.concept_name as sex_at_birth,
        person.self_reported_category_concept_id,
        p_self_reported_category_concept.concept_name as self_reported_category 
    FROM
        `person` person 
    LEFT JOIN
        `concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id 
    LEFT JOIN
        `concept` p_self_reported_category_concept 
            ON person.self_reported_category_concept_id = p_self_reported_category_concept.concept_id  
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) 
            AND cb_search_person.person_id IN (SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `cb_search_all_events` 
                WHERE
                    (concept_id IN (903118, 903115) 
                    AND is_standard = 0 )) criteria 
            UNION
            DISTINCT SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `cb_search_all_events` 
                WHERE
                    (concept_id IN(SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `cb_criteria` c 
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id       
                        FROM
                            `cb_criteria` cr       
                        WHERE
                            concept_id IN (3018586, 3034703, 4152194, 3004249, 3012888, 4154790)       
                            AND full_text LIKE '%_rank1]%'      ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1) 
                    AND is_standard = 1 )) criteria ) )", sep="")

# Build the Cloud Storage wildcard path that receives the BigQuery export.
# NOTE: The date component means that re-exporting on the same day overwrites that day's
#       copy, while exports made on other days land in their own folder, so older copies
#       survive changes to the dataset definition.
person_84117830_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "person_84117830",
  "person_84117830_*.csv"
)
message(
  str_glue(
    'The data will be written to {person_84117830_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result as CSV shards.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
person_84117830_query_tbl <- bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_84117830_person_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")
)
bq_table_save(
  person_84117830_query_tbl,
  person_84117830_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {person_84117830_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one through
# `gsutil cat` into `read_csv()` and row-binds the resulting chunks.
#
# @param export_path Cloud Storage path (may contain a wildcard) of the exported CSV files.
# @return One data frame holding the rows of all matching CSV files.
# Stops with an error when the listing command fails or matches no object.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    gender = col_character(),
    race = col_character(),
    ethnicity = col_character(),
    sex_at_birth = col_character(),
    self_reported_category = col_character()
  )

  # Keep stderr out of the captured listing: with `stderr = TRUE` any gsutil warning or
  # error text becomes an element of the result and is then treated as a CSV path.
  stderr_file <- tempfile()
  on.exit(unlink(stderr_file), add = TRUE)
  # system2() warns on a non-zero exit status; the status is checked explicitly below.
  csv_paths <- suppressWarnings(
    system2("gsutil", args = c("ls", shQuote(export_path)),
            stdout = TRUE, stderr = stderr_file)
  )
  status <- attr(csv_paths, "status")
  if (!is.null(status) && status != 0) {
    gsutil_output <- if (file.exists(stderr_file)) {
      readLines(stderr_file, warn = FALSE)
    } else {
      character(0)
    }
    stop("`gsutil ls ", export_path, "` failed with status ", status, ":\n",
         paste(gsutil_output, collapse = "\n"), call. = FALSE)
  }
  csv_paths <- csv_paths[nzchar(csv_paths)]
  if (length(csv_paths) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path because pipe() hands the command to a shell.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
# Load the exported person table and show its dimensions and first five rows.
dataset_84117830_person_df <- read_bq_export_from_workspace_bucket(
  export_path = person_84117830_path
)

dim(dataset_84117830_person_df)

head(dataset_84117830_person_df, n = 5)

In [None]:
# Column-wise summary of the person table.
summary(object = dataset_84117830_person_df)

In [None]:
# Count how many person_id values repeat an earlier row (TRUE) versus not (FALSE).
table(duplicated(dataset_84117830_person_df[["person_id"]]))

In [None]:
# Encode sex at birth as an indicator: "Female" -> 0, "Male" -> 1, anything else
# (including missing values) stays NA.
dataset_84117830_person_df$sexM <- NA
dataset_84117830_person_df$sexM[
  which(dataset_84117830_person_df$sex_at_birth == "Female")
] <- 0
dataset_84117830_person_df$sexM[
  which(dataset_84117830_person_df$sex_at_birth == "Male")
] <- 1

In [None]:
# Write person_id, the sex indicator and date of birth as a tab-separated file.
write.table(
  x = dataset_84117830_person_df[c("person_id", "sexM", "date_of_birth")],
  file = "DBPSBP_srWGS_person.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  na = "NA"
)

In [None]:
library(tidyverse)
library(bigrquery)

# This query represents dataset "DBPSBP_srWGS_dataset" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# The SQL selects rows from `measurement` and resolves the concept, type, operator, value,
# unit, visit and source concept ids to names through LEFT JOINs on `concept`
# (the visit concept is reached via `visit_occurrence`).
# Rows are kept when
#   * measurement_concept_id is a standard concept found in `cb_criteria` under the entries
#     for 3004249, 3012888, 3018586, 3034703, 3038553, 4152194, 4154790, OR
#     measurement_source_concept_id is a source concept found under 903115, 903118; AND
#   * the person belongs to the cohort sub-select (has_whole_genome_variant = 1 plus at
#     least one matching event in `cb_search_all_events`).
# Note that the row filter lists concept 3038553, which the cohort sub-select does not.
# Table names are unqualified; the dataset is supplied when the string is passed to
# bq_dataset_query() together with WORKSPACE_CDR further down in this cell.
dataset_84117830_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (3004249, 3012888, 3018586, 3034703, 3038553, 4152194, 4154790)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1) 
                OR  measurement_source_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (903115, 903118)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 0 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        person_id 
                    FROM
                        `cb_search_person` p 
                    WHERE
                        has_whole_genome_variant = 1 ) 
                    AND cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN (903118, 903115) 
                            AND is_standard = 0 )) criteria 
                    UNION
                    DISTINCT SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (3018586, 3034703, 4152194, 3004249, 3012888, 4154790)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 1 
                                AND is_selectable = 1) 
                            AND is_standard = 1 )) criteria ) )
                )
            ) measurement 
        LEFT JOIN
            `concept` m_standard_concept 
                ON measurement.measurement_concept_id = m_standard_concept.concept_id 
        LEFT JOIN
            `concept` m_type 
                ON measurement.measurement_type_concept_id = m_type.concept_id 
        LEFT JOIN
            `concept` m_operator 
                ON measurement.operator_concept_id = m_operator.concept_id 
        LEFT JOIN
            `concept` m_value 
                ON measurement.value_as_concept_id = m_value.concept_id 
        LEFT JOIN
            `concept` m_unit 
                ON measurement.unit_concept_id = m_unit.concept_id 
        LEFT JOIn
            `visit_occurrence` v 
                ON measurement.visit_occurrence_id = v.visit_occurrence_id 
        LEFT JOIN
            `concept` m_visit 
                ON v.visit_concept_id = m_visit.concept_id 
        LEFT JOIN
            `concept` m_source_concept 
                ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage wildcard path that receives the BigQuery export.
# NOTE: The date component means that re-exporting on the same day overwrites that day's
#       copy, while exports made on other days land in their own folder, so older copies
#       survive changes to the dataset definition.
measurement_84117830_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  strftime(lubridate::now(), "%Y%m%d"),  # Comment out this line if you want the export to always overwrite.
  "measurement_84117830",
  "measurement_84117830_*.csv"
)
message(
  str_glue(
    'The data will be written to {measurement_84117830_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Run the query against the workspace CDR and export the result as CSV shards.
# NOTE: `bq_table_save` only has to run once; afterwards the CSVs can be read
#       straight from Cloud Storage.
measurement_84117830_query_tbl <- bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_84117830_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT")
)
bq_table_save(
  measurement_84117830_query_tbl,
  measurement_84117830_path,
  destination_format = "CSV"
)



In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_84117830_path}` to copy these files
#       to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one through
# `gsutil cat` into `read_csv()` and row-binds the resulting chunks. This definition
# replaces the earlier one of the same name and uses the measurement column types.
#
# @param export_path Cloud Storage path (may contain a wildcard) of the exported CSV files.
# @return One data frame holding the rows of all matching CSV files.
# Stops with an error when the listing command fails or matches no object.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    standard_concept_name = col_character(),
    standard_concept_code = col_character(),
    standard_vocabulary = col_character(),
    measurement_type_concept_name = col_character(),
    operator_concept_name = col_character(),
    value_as_concept_name = col_character(),
    unit_concept_name = col_character(),
    visit_occurrence_concept_name = col_character(),
    measurement_source_value = col_character(),
    source_concept_name = col_character(),
    source_concept_code = col_character(),
    source_vocabulary = col_character(),
    unit_source_value = col_character(),
    value_source_value = col_character()
  )

  # Keep stderr out of the captured listing: with `stderr = TRUE` any gsutil warning or
  # error text becomes an element of the result and is then treated as a CSV path.
  stderr_file <- tempfile()
  on.exit(unlink(stderr_file), add = TRUE)
  # system2() warns on a non-zero exit status; the status is checked explicitly below.
  csv_paths <- suppressWarnings(
    system2("gsutil", args = c("ls", shQuote(export_path)),
            stdout = TRUE, stderr = stderr_file)
  )
  status <- attr(csv_paths, "status")
  if (!is.null(status) && status != 0) {
    gsutil_output <- if (file.exists(stderr_file)) {
      readLines(stderr_file, warn = FALSE)
    } else {
      character(0)
    }
    stop("`gsutil ls ", export_path, "` failed with status ", status, ":\n",
         paste(gsutil_output, collapse = "\n"), call. = FALSE)
  }
  csv_paths <- csv_paths[nzchar(csv_paths)]
  if (length(csv_paths) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      # Quote the object path because pipe() hands the command to a shell.
      read_csv(pipe(str_glue('gsutil cat {shQuote(csv)}')),
               col_types = col_types, show_col_types = FALSE)
    })
  )
}
# Load the exported measurement table and show its dimensions and first five rows.
dataset_84117830_measurement_df <- read_bq_export_from_workspace_bucket(
  export_path = measurement_84117830_path
)

dim(dataset_84117830_measurement_df)

head(dataset_84117830_measurement_df, n = 5)

In [None]:
# Row counts per standard concept name, in ascending order.
sort(table(dataset_84117830_measurement_df[["standard_concept_name"]]))

In [None]:
# Row counts per source unit string, in ascending order.
sort(table(dataset_84117830_measurement_df[["unit_source_value"]]))

In [None]:
# Systolic blood pressure observations: keep the four systolic concepts, reduce the
# timestamp to a calendar date, drop incomplete rows and keep values within [20, 300].
# NOTE: the diastolic cell assigns `obs` as well, so whichever cell runs last wins.
sbp_concept_names <- c(
  "Sitting systolic blood pressure",
  "Computed systolic blood pressure, mean of 2nd and 3rd measures",
  "Systolic blood pressure--sitting",
  "Systolic blood pressure"
)
obs <- dataset_84117830_measurement_df %>%
  filter(standard_concept_name %in% sbp_concept_names) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(measurement_datetime = as.Date(measurement_datetime)) %>%
  na.omit() %>%
  filter(value_as_number >= 20, value_as_number <= 300)

In [None]:
# Diastolic blood pressure observations: keep the four diastolic concepts, reduce the
# timestamp to a calendar date, drop incomplete rows and keep values within [20, 300].
# NOTE: the systolic cell assigns `obs` as well, so whichever cell runs last wins.
dbp_concept_names <- c(
  "Sitting diastolic blood pressure",
  "Computed diastolic blood pressure, mean of 2nd and 3rd measures",
  "Diastolic blood pressure--sitting",
  "Diastolic blood pressure"
)
obs <- dataset_84117830_measurement_df %>%
  filter(standard_concept_name %in% dbp_concept_names) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  mutate(measurement_datetime = as.Date(measurement_datetime)) %>%
  na.omit() %>%
  filter(value_as_number >= 20, value_as_number <= 300)

In [None]:
# Plot the 0.1% ... 99.9% quantiles of the values, then keep only the observations that
# lie between the lowest and the highest of those quantiles.
x <- quantile(obs$value_as_number, probs = seq(0.001, 0.999, by = 0.001))
plot(x)

# which() discards NA comparisons, so rows with a missing value are removed as well.
obs <- obs[which(obs$value_as_number >= min(x) & obs$value_as_number <= max(x)), ]

In [None]:
# Save the trimmed observations; switch to the commented call when `obs` holds SBP data.
# write.table(x = obs, file = "DBPSBP_srWGS.SBP.txt", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(
  x = obs,
  file = "DBPSBP_srWGS.DBP.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [None]:
# Reload the SBP observations as a plain data.frame.
# NOTE(review): this reads a .gz file, while the write step produces an uncompressed .txt -
#               presumably the file is compressed outside this notebook; confirm.
obs <- data.table::fread("DBPSBP_srWGS.SBP.txt.gz", data.table = FALSE)

In [None]:
# Peek at the first rows of the observations.
head(obs, n = 6L)

In [None]:
# Stack the per-class script-series tables, tagging each row with the class it came from.
# NOTE(review): "C02" ... "C09" look like ATC drug-class codes - confirm.
data <- do.call(
  rbind,
  lapply(
    c("C02", "C03", "C07", "C08", "C09"),
    function(drug_class) {
      series <- readRDS(paste0("scriptseries.", drug_class, ".rds"))
      series$drug <- drug_class
      series
    }
  )
)

In [None]:
# Keep only the measurements that do not fall inside any script series of the same person.
#
# Fix: the left join leaves `start`/`stop` as NA for persons without any script series.
# The comparison was then NA, `sum(NA) > 0` was NA, and filter() dropped the row, so every
# measurement of a person without script series vanished from the "no drug" set. Rows that
# found no series are now counted as "not inside a series". A series with only one of
# start/stop missing still yields NA and is dropped, exactly as before.
obs <- obs %>%
  left_join(data, by = "person_id", relationship = "many-to-many") %>%
  mutate(
    measurementinscriptseries = if_else(
      is.na(start) & is.na(stop),
      FALSE,
      measurement_datetime >= start & measurement_datetime <= stop
    )
  ) %>%
  # Collapse the join fan-out back to one row per distinct measurement.
  group_by(person_id, measurement_datetime, value_as_number) %>%
  summarize(measurementinscriptseriesany = any(measurementinscriptseries), .groups = "drop") %>%
  filter(!measurementinscriptseriesany) %>%
  select(person_id, measurement_datetime, value_as_number)

In [None]:
# Save the drug-free observations; switch to the commented call when `obs` holds SBP data.
# write.table(x = obs, file = "DBPSBP_srWGS.SBP.nodrug.txt", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(
  x = obs,
  file = "DBPSBP_srWGS.DBP.nodrug.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [None]:
# Copy the ancestry prediction table from the controlled-tier bucket into the working directory.
system(
  "gsutil -u $GOOGLE_PROJECT cp gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv .",
  intern = TRUE
)

In [None]:
# Load the ancestry predictions and show their dimensions and first rows.
Anc <- read_tsv(file = "ancestry_preds.tsv")
dim(Anc)
head(Anc, n = 6L)

In [None]:
# Restrict the observations to persons whose predicted ancestry is "eur".
x <- Anc$ancestry_pred[match(obs$person_id, Anc$research_id)]
# Fix: `x` is NA for persons missing from the ancestry table, and `obs[x == "eur", ]`
# turns every NA index into an all-NA row. `%in%` never returns NA, so those persons
# are dropped instead.
obs <- obs[x %in% "eur", ]

In [None]:
# Save the ancestry-filtered observations; switch to the commented call when `obs` holds SBP data.
# write.table(x = obs, file = "DBPSBP_srWGS.SBP.nodrug.eur.txt", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(
  x = obs,
  file = "DBPSBP_srWGS.DBP.nodrug.eur.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [None]:
# Reload the filtered SBP and DBP observations as plain data.frames.
obsSBP <- data.table::fread("DBPSBP_srWGS.SBP.nodrug.eur.txt.gz", data.table = FALSE)
obsDBP <- data.table::fread("DBPSBP_srWGS.DBP.nodrug.eur.txt.gz", data.table = FALSE)

In [None]:
# Label each table with its trait so both can be stacked and told apart later.
obsSBP[["trait"]] <- "SBP"
obsDBP[["trait"]] <- "DBP"

In [None]:
# Column-wise summary of the DBP observations.
summary(object = obsDBP)

In [None]:
# Average each trait per person and date, then combine the two traits into
# (SBP + 2 * DBP) / 3. Person-dates lacking either trait end up NA and are dropped.
bp_daily_means <- rbind(obsSBP, obsDBP) %>%
  group_by(person_id, measurement_datetime, trait) %>%
  summarize(m = mean(value_as_number), .groups = "drop_last")

obs <- bp_daily_means %>%
  pivot_wider(names_from = "trait", values_from = "m") %>%
  mutate(value_as_number = (SBP + 2 * DBP) / 3) %>%
  select(person_id, measurement_datetime, value_as_number) %>%
  ungroup() %>%
  na.omit()

BP variation project

In [None]:
# One-parameter Box-Cox transform.
#
# @param x Numeric vector to transform.
# @param l Scalar power parameter; 0 selects the logarithm.
# @return log(x) when l is 0, otherwise (x^l - 1) / l.
myboxcox1 <- function(x, l = 1) {
  if (l != 0) {
    return((x^l - 1) / l)
  }
  log(x)
}

In [None]:
# Peek at the first rows of the observations.
head(obs, n = 6L)

In [None]:
# Author's recorded results - minimum n at which the t statistic peaks:
# t max at
# SBP.nodrug.eur; MAP.nodrug.eur
# myboxcox1(value, 0):    x=12; x=12
# myboxcox1(value, -0.8): x=3; x=3
# myboxcox1(value, -1):    x=3; x=4
#
# Per-person count, mean and SD of the transformed values (power parameter -1), followed
# by the mean-vs-SD correlation test for each minimum number of observations from 3 to 12.
data <- obs %>%
  mutate(value_as_number = myboxcox1(value_as_number, -1)) %>%
  group_by(person_id) %>%
  summarize(
    n = n(),
    m = mean(value_as_number),
    s = sd(value_as_number),
    .groups = "drop"
  )
for (min_n in 3:12) {
  print(min_n)
  # `foo` is kept as the name because cor.test() prints its argument expressions.
  foo <- filter(data, n >= min_n)
  print(cor.test(foo$m, foo$s))
}

In [None]:
# Author's recorded results - power parameter at which the t statistic is closest to 0:
# t near 0 at
# SBP.nodrug.eur; MAP.nodrug.eur
# filter(n>=4): x=-0.5; x=0
# filter(n>=5): x=-0.5; x=0
# filter(n>=7): x=-0.6; x=0
#
# For every power parameter from -1 to 1 in steps of 0.1, recompute the per-person mean
# and SD of the transformed values (persons with at least 7 observations) and test their
# correlation.
for (lambda in seq(-1, 1, by = 0.1)) {
  print(lambda)
  data <- obs %>%
    mutate(value_as_number = myboxcox1(value_as_number, lambda)) %>%
    group_by(person_id) %>%
    summarize(
      n = n(),
      m = mean(value_as_number),
      s = sd(value_as_number),
      .groups = "drop"
    ) %>%
    filter(n >= 7)
  print(cor.test(data$m, data$s))
}

In [None]:
# Per-person mean (m) and SD (s) of the transformed values (power parameter 0) for
# persons with at least 4 observations.
data <- obs %>%
  mutate(value_as_number = myboxcox1(value_as_number, 0)) %>%
  group_by(person_id) %>%
  summarize(
    n = n(),
    m = mean(value_as_number),
    s = sd(value_as_number),
    .groups = "drop"
  ) %>%
  filter(n >= 4)

# Phenotype table with columns FID, IID, m, s, where FID and IID both hold person_id,
# sorted by person_id.
output <- data %>%
  dplyr::select(FID = person_id, m, s) %>%
  arrange(FID) %>%
  mutate(IID = FID, .after = FID)

# Space-separated output; the commented file name is the SBP variant of this export.
write.table(
  x = output,
#  file = "aou_SBPnodrugmyboxcox-05nge4_QT.txt",
  file = "aou_MAPnodrugmyboxcox0nge4_QT.txt",
  sep = " ",
  quote = FALSE,
  row.names = FALSE,
  na = "NA"
)

In [None]:
# Median BMI per person, after removing values outside the 1st-99th percentile range
# computed over all non-missing BMI rows.
bmi_values <- dataset_84117830_measurement_df %>%
  filter(standard_concept_name == "Body mass index (BMI) [Ratio]") %>%
  select(person_id, value_as_number) %>%
  na.omit()

bmi_bounds <- quantile(bmi_values$value_as_number, probs = c(0.01, 0.99))

output <- bmi_values %>%
  filter(
    value_as_number >= bmi_bounds[[1]],
    value_as_number <= bmi_bounds[[2]]
  ) %>%
  group_by(person_id) %>%
  summarize(BMI = median(value_as_number))
output

In [None]:
# Write the per-person BMI table as a tab-separated file.
write.table(
  x = output,
  file = "DBPSBP_srWGS_BMI.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  na = "NA"
)