In [44]:
library(tidyverse)

In [45]:
# Base directory for this notebook: inputs are read from its
# filtered_biolibrary/ subfolder and selections are written to to_purchase/.
root_path <- "/mnt/d/fogg_lab_OHSU_samples"

In [46]:
# Load the four filtered biolibrary tables (cervix and ut_endo, tumor and
# normal). The `type` column is dropped from each table right after reading.
cerv_tumor_df <- dplyr::select(
    read_csv(paste0(root_path, "/filtered_biolibrary/cervix_tumor.csv")),
    -type
)
ut_endo_tumor_df <- dplyr::select(
    read_csv(paste0(root_path, "/filtered_biolibrary/ut_endo_tumor.csv")),
    -type
)
cerv_normal_df <- dplyr::select(
    read_csv(paste0(root_path, "/filtered_biolibrary/cervix_normal.csv")),
    -type
)
ut_endo_normal_df <- dplyr::select(
    read_csv(paste0(root_path, "/filtered_biolibrary/ut_endo_normal.csv")),
    -type
)

Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)
Parsed with column specification:
cols(
  subject_id = col_character(),
  collection_year = col_double(),
  clinicaldiag_description = col_character(),
  tissue = col_character(),
  paraffin_block = col_double(),
  type = col_character()
)


In [47]:
# Restrict each tumor table to the diagnoses kept for its cohort; rows are
# matched on clinicaldiag_description.

# CESC: taken from the cervix tumor table
cesc_tumor_types <- c("Squamous cell carcinoma", "Adenocarcinoma", "Carcinoma in situ")
cesc_tumor_df <- dplyr::filter(
    cerv_tumor_df,
    clinicaldiag_description %in% cesc_tumor_types
)

# UCEC: taken from the ut_endo tumor table
ucec_tumor_types <- c("Adenocarcinoma", "Carcinoma", "Endometrioid carcinoma")
ucec_tumor_df <- dplyr::filter(
    ut_endo_tumor_df,
    clinicaldiag_description %in% ucec_tumor_types
)

# UCS: also taken from the ut_endo tumor table
ucs_tumor_types <- "Carcinosarcoma"
ucs_tumor_df <- dplyr::filter(
    ut_endo_tumor_df,
    clinicaldiag_description %in% ucs_tumor_types
)

# Helper functions

In [48]:
# Get sample weights by group
#
# Computes one sampling weight per row of `df` such that every distinct value
# of `group_col` carries the same total weight; within a group that weight is
# split evenly, so rows from frequent groups are down-weighted.
#
# Args:
#   df:        data frame (or tibble) whose rows are to be weighted.
#   group_col: name of the grouping column, given as a single string.
#
# Returns:
#   Numeric vector with one entry per row of `df`, in row order, summing to 1
#   (length 0 when `df` has no rows). Missing group values form a group of
#   their own.
#
# Implemented on plain vectors rather than via a join, so columns of `df` that
# happen to be called `n` or `weight` cannot collide with the intermediates.
get_balanced_sample_weights <- function(df, group_col) {
    stopifnot(
        is.data.frame(df),
        is.character(group_col),
        length(group_col) == 1L,
        group_col %in% names(df)
    )
    groups <- df[[group_col]]
    # Index of each row's group among the distinct values; match() pairs NA
    # with NA, so rows with a missing group are counted together
    group_idx <- match(groups, unique(groups))
    # Size of the group each row belongs to
    group_n <- tabulate(group_idx)[group_idx]
    # Down-weight groups by frequency. The nrow(df) factor cancels in the
    # normalisation below; it is kept so the floating-point arithmetic (and
    # therefore any seeded draw using these weights) matches earlier runs
    weight <- 1 / (group_n * nrow(df))
    weight / sum(weight)
}

# CESC

## Tumor

In [49]:
set.seed(12345)
# Squamous cell carcinoma is the only CESC diagnosis that gets subsampled.
# Build that subset once: the weights are positional (one per row), so they
# must be computed from exactly the frame that sample_n() draws from.
cesc_scc_df <- cesc_tumor_df %>%
    dplyr::filter(clinicaldiag_description == "Squamous cell carcinoma")
cesc_scc_year_weights <- get_balanced_sample_weights(
    cesc_scc_df,
    group_col = "collection_year"
)
cesc_tumor_samp_df <- bind_rows(
    # Not enough to sample, take all
    cesc_tumor_df %>% dplyr::filter(clinicaldiag_description == "Adenocarcinoma"),
    # Not enough to sample, take all
    cesc_tumor_df %>% dplyr::filter(clinicaldiag_description == "Carcinoma in situ"),
    # Take 10, balanced by year
    cesc_scc_df %>%
        dplyr::sample_n(10, weight = cesc_scc_year_weights)
)
# cesc_tumor_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


## Normal

In [50]:
# Tumor & Normal should not have any paired data: drop every normal sample
# whose subject already appears in the CESC tumor selection
filt_cerv_normal_df <- dplyr::filter(
    cerv_normal_df,
    !(subject_id %in% cesc_tumor_samp_df$subject_id)
)

In [51]:
set.seed(12345)
# One weight per row of filt_cerv_normal_df, balancing collection years
cesc_normal_weights <- get_balanced_sample_weights(filt_cerv_normal_df, group_col = "collection_year")
# Take 10, balanced by year.
# The argument has to be spelled `weight`: sample_n() has no `weights`
# parameter, so that spelling is swallowed by `...` and silently ignored,
# which turns the draw into a plain uniform sample.
cesc_normal_samp_df <- filt_cerv_normal_df %>%
    dplyr::sample_n(10, weight = cesc_normal_weights)
# cesc_normal_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# UCEC

## Tumor

In [52]:
set.seed(12345)
# Endometrioid carcinoma is the only UCEC diagnosis that gets subsampled.
# Build that subset once: the weights are positional (one per row), so they
# must be computed from exactly the frame that sample_n() draws from.
ucec_ec_df <- ucec_tumor_df %>%
    dplyr::filter(clinicaldiag_description == "Endometrioid carcinoma")
ucec_ec_year_weights <- get_balanced_sample_weights(
    ucec_ec_df,
    group_col = "collection_year"
)
ucec_tumor_samp_df <- bind_rows(
    # Not enough to sample, take all
    ucec_tumor_df %>% dplyr::filter(clinicaldiag_description == "Adenocarcinoma"),
    # Not enough to sample, take all
    ucec_tumor_df %>% dplyr::filter(clinicaldiag_description == "Carcinoma"),
    # Take 10, balanced by year
    ucec_ec_df %>%
        dplyr::sample_n(10, weight = ucec_ec_year_weights)
)
# ucec_tumor_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# UCS

## Tumor

In [53]:
# Not enough to sample, take all: the UCS selection is the complete
# Carcinosarcoma subset, with no subsampling step
ucs_tumor_samp_df <- ucs_tumor_df

# UCEC & UCS

## Normal

In [54]:
# The ut_endo normal pool is shared by UCEC and UCS, so drop every normal
# sample whose subject appears in either of the two tumor selections
filt_u_normal_df <- dplyr::filter(
    ut_endo_normal_df,
    !(subject_id %in% c(
        ucec_tumor_samp_df$subject_id,
        ucs_tumor_samp_df$subject_id
    ))
)

In [55]:
set.seed(12345)
# One weight per row of filt_u_normal_df, balancing collection years
u_normal_weights <- get_balanced_sample_weights(filt_u_normal_df, group_col = "collection_year")
# Take 10, balanced by year.
# The argument has to be spelled `weight`: sample_n() has no `weights`
# parameter, so that spelling is swallowed by `...` and silently ignored,
# which turns the draw into a plain uniform sample.
u_normal_samp_df <- filt_u_normal_df %>%
    dplyr::sample_n(10, weight = u_normal_weights)
# u_normal_samp_df

`summarise()` ungrouping output (override with `.groups` argument)


# Check distributions

In [56]:
# Mean collection year (rounded) of each tumor selection: CESC, UCEC, UCS
round(mean(cesc_tumor_samp_df[["collection_year"]]))
round(mean(ucec_tumor_samp_df[["collection_year"]]))
round(mean(ucs_tumor_samp_df[["collection_year"]]))

In [57]:
# Mean collection year (rounded) of each normal selection: cervix, ut_endo
round(mean(cesc_normal_samp_df[["collection_year"]]))
round(mean(u_normal_samp_df[["collection_year"]]))

# Write out selections

In [58]:
# Write the tumor selections, one CSV per cohort, under <root_path>/to_purchase/
write_csv(cesc_tumor_samp_df, paste0(root_path, "/to_purchase/cesc_tumor.csv"))
write_csv(ucec_tumor_samp_df, paste0(root_path, "/to_purchase/ucec_tumor.csv"))
write_csv(ucs_tumor_samp_df, paste0(root_path, "/to_purchase/ucs_tumor.csv"))

In [59]:
# Write the normal selections under <root_path>/to_purchase/; u_normal.csv is
# the ut_endo normal set drawn after excluding UCEC and UCS tumor subjects
write_csv(cesc_normal_samp_df, paste0(root_path, "/to_purchase/cesc_normal.csv"))
write_csv(u_normal_samp_df, paste0(root_path, "/to_purchase/u_normal.csv"))