In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Base directory that every input and output path in this notebook is built from
root_path <- "/mnt/d/fogg_lab_OHSU_samples"

In [3]:
# Load the four filtered biolibrary tables (cervix / uterus-endometrium,
# tumor / normal). Each table is read first and then has its `type` column dropped.
cerv_tumor_df <- read_csv(paste0(root_path, "/filtered_biolibrary/cervix_tumor.csv"))
cerv_tumor_df <- dplyr::select(cerv_tumor_df, -type)

ut_endo_tumor_df <- read_csv(paste0(root_path, "/filtered_biolibrary/ut_endo_tumor.csv"))
ut_endo_tumor_df <- dplyr::select(ut_endo_tumor_df, -type)

cerv_normal_df <- read_csv(paste0(root_path, "/filtered_biolibrary/cervix_normal.csv"))
cerv_normal_df <- dplyr::select(cerv_normal_df, -type)

ut_endo_normal_df <- read_csv(paste0(root_path, "/filtered_biolibrary/ut_endo_normal.csv"))
ut_endo_normal_df <- dplyr::select(ut_endo_normal_df, -type)

Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)


In [4]:
# Clinical diagnoses kept for each cohort
cesc_tumor_types <- c("Squamous cell carcinoma", "Adenocarcinoma", "Carcinoma in situ")
ucec_tumor_types <- c("Adenocarcinoma", "Carcinoma", "Endometrioid carcinoma")
ucs_tumor_types <- "Carcinosarcoma"

# CESC: drawn from the cervix tumor table
cesc_tumor_df <- dplyr::filter(cerv_tumor_df, clinicaldiag_description %in% cesc_tumor_types)

# UCEC: drawn from the uterus/endometrium tumor table
ucec_tumor_df <- dplyr::filter(ut_endo_tumor_df, clinicaldiag_description %in% ucec_tumor_types)

# UCS: drawn from the same uterus/endometrium tumor table
ucs_tumor_df <- dplyr::filter(ut_endo_tumor_df, clinicaldiag_description %in% ucs_tumor_types)

# Helper functions

In [5]:
# Get sample weights by group
#
# Returns one sampling weight per row of `df`, inversely proportional to the
# number of rows that share the row's value in `group_col`. Every group therefore
# carries the same total weight, and the weights sum to 1.
#
# Arguments:
#   df        - data frame (or tibble) whose rows will be sampled
#   group_col - name (single string) of the column in `df` defining the groups
#
# Value:
#   Numeric vector of length nrow(df), in the row order of `df`.
#
# Implemented with base R only, so it does not depend on which packages are
# attached, does not emit the `summarise()` regrouping message, and is unaffected
# by `df` already containing columns called `n`, `weight` or `std_weight`.
get_balanced_sample_weights <- function(df, group_col) {
    stopifnot(
        is.data.frame(df),
        is.character(group_col),
        length(group_col) == 1,
        group_col %in% names(df)
    )
    group_values <- df[[group_col]]
    group_levels <- unique(group_values)
    # Position of each row's group among the distinct values; match() pairs NA
    # with NA, so rows with a missing group value form a group of their own
    group_idx <- match(group_values, group_levels)
    group_sizes <- tabulate(group_idx, nbins = length(group_levels))
    # Down-weight groups by frequency
    weight <- 1 / (group_sizes[group_idx] * nrow(df))
    # Standardise so the weights sum to 1
    weight / sum(weight)
}

# CESC

## Tumor

In [6]:
set.seed(12345)
# Per-row weights for the squamous cell carcinoma rows, balanced by collection year
cesc_scc_year_weights <- get_balanced_sample_weights(
    dplyr::filter(cesc_tumor_df, clinicaldiag_description == "Squamous cell carcinoma"),
    group_col = "collection_year"
)
# Assemble the CESC tumor selection from its three diagnoses
cesc_tumor_samp_df <- bind_rows(
    # Adenocarcinoma: not enough to sample, take all
    dplyr::filter(cesc_tumor_df, clinicaldiag_description == "Adenocarcinoma"),
    # Carcinoma in situ: not enough to sample, take all
    dplyr::filter(cesc_tumor_df, clinicaldiag_description == "Carcinoma in situ"),
    # Squamous cell carcinoma: take 10, balanced by year
    dplyr::sample_n(
        dplyr::filter(cesc_tumor_df, clinicaldiag_description == "Squamous cell carcinoma"),
        10,
        weight = cesc_scc_year_weights
    )
)
# cesc_tumor_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


## Normal

In [7]:
# Tumor and normal selections must not contain paired data:
#   - drop subjects already present in the CESC tumor selection
#   - drop samples described as normal adjacent to tumor
# Both conditions must hold for a row to be kept. Rows whose diagnosis is NA are
# dropped as well, because the `!=` comparison yields NA for them.
filt_cerv_normal_df <- dplyr::filter(
    cerv_normal_df,
    !(subject_id %in% cesc_tumor_samp_df$subject_id),
    clinicaldiag_description != "Normal Adjacent to Tumor"
)

In [8]:
set.seed(12345)
# Per-row weights that give every collection year the same total sampling weight
cesc_normal_weights <- get_balanced_sample_weights(filt_cerv_normal_df, group_col = "collection_year")
# Take 10, balanced by year.
# The argument has to be spelled `weight`: sample_n() has no `weights` parameter,
# so that spelling is absorbed by `...` and the draw silently becomes uniform.
# NOTE(review): with the weights now applied, the rows drawn for this seed can
# differ from a previous run that used `weights =`.
cesc_normal_samp_df <- filt_cerv_normal_df %>%
    dplyr::sample_n(10, weight = cesc_normal_weights)
# cesc_normal_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# UCEC

## Tumor

In [9]:
set.seed(12345)
# Per-row weights for the endometrioid carcinoma rows, balanced by collection year
ucec_ec_year_weights <- get_balanced_sample_weights(
    dplyr::filter(ucec_tumor_df, clinicaldiag_description == "Endometrioid carcinoma"),
    group_col = "collection_year"
)
# Assemble the UCEC tumor selection from its three diagnoses
ucec_tumor_samp_df <- bind_rows(
    # Adenocarcinoma: not enough to sample, take all
    dplyr::filter(ucec_tumor_df, clinicaldiag_description == "Adenocarcinoma"),
    # Carcinoma: not enough to sample, take all
    dplyr::filter(ucec_tumor_df, clinicaldiag_description == "Carcinoma"),
    # Endometrioid carcinoma: take 10, balanced by year
    dplyr::sample_n(
        dplyr::filter(ucec_tumor_df, clinicaldiag_description == "Endometrioid carcinoma"),
        10,
        weight = ucec_ec_year_weights
    )
)
# ucec_tumor_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# UCS

## Tumor

In [10]:
# Not enough to sample, take all
# (the UCS selection is the carcinosarcoma-filtered tumor table, unchanged)
ucs_tumor_samp_df <- ucs_tumor_df

# UCEC & UCS

## Normal

In [11]:
# Uterus/endometrium normals shared by UCEC and UCS: keep a row only if its
# subject appears in neither tumor selection and it is not described as normal
# adjacent to tumor. Rows whose diagnosis is NA are dropped as well, because the
# `!=` comparison yields NA for them.
filt_u_normal_df <- dplyr::filter(
    ut_endo_normal_df,
    !(subject_id %in% ucec_tumor_samp_df$subject_id | subject_id %in% ucs_tumor_samp_df$subject_id),
    clinicaldiag_description != "Normal Adjacent to Tumor"
)

In [12]:
set.seed(12345)
# Per-row weights that give every collection year the same total sampling weight
u_normal_weights <- get_balanced_sample_weights(filt_u_normal_df, group_col = "collection_year")
# Take 10, balanced by year.
# The argument has to be spelled `weight`: sample_n() has no `weights` parameter,
# so that spelling is absorbed by `...` and the draw silently becomes uniform.
# NOTE(review): with the weights now applied, the rows drawn for this seed can
# differ from a previous run that used `weights =`.
u_normal_samp_df <- filt_u_normal_df %>%
    dplyr::sample_n(10, weight = u_normal_weights)
# u_normal_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# Check distributions

In [13]:
# Rounded mean collection year of each tumor selection (one printed value per line)
round(mean(cesc_tumor_samp_df[["collection_year"]]))
round(mean(ucec_tumor_samp_df[["collection_year"]]))
round(mean(ucs_tumor_samp_df[["collection_year"]]))

In [14]:
# Rounded mean collection year of each normal selection (one printed value per line)
round(mean(cesc_normal_samp_df[["collection_year"]]))
round(mean(u_normal_samp_df[["collection_year"]]))

# Write out selections

In [15]:
# Write each tumor selection to its own CSV file under <root_path>/to_purchase/
write_csv(cesc_tumor_samp_df, paste0(root_path, "/to_purchase/cesc_tumor.csv"))
write_csv(ucec_tumor_samp_df, paste0(root_path, "/to_purchase/ucec_tumor.csv"))
write_csv(ucs_tumor_samp_df, paste0(root_path, "/to_purchase/ucs_tumor.csv"))

In [16]:
# Write each normal selection to its own CSV file under <root_path>/to_purchase/
# (u_normal.csv holds the uterus/endometrium normals shared by UCEC and UCS)
write_csv(cesc_normal_samp_df, paste0(root_path, "/to_purchase/cesc_normal.csv"))
write_csv(u_normal_samp_df, paste0(root_path, "/to_purchase/u_normal.csv"))