Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Envest/calculate and plot delta kappa #72

Merged
merged 7 commits into from
Oct 15, 2021
85 changes: 70 additions & 15 deletions 3-plot_category_kappa.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ option_list <- list(
optparse::make_option("--null_model",
action = "store_true",
default = FALSE,
help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)")
help = "Use null model as baseline for plotting delta kappa")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
Expand All @@ -29,9 +29,7 @@ source(here::here("util", "color_blind_friendly_palette.R"))
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- ifelse(null_model,
str_c(cancer_type, predictor, "null", sep = "_"),
str_c(cancer_type, predictor, sep = "_"))
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
Expand All @@ -43,28 +41,82 @@ array.files <- lf[grepl(paste0(file_identifier,
"_train_3_models_array_kappa_"), lf)]
seq.files <- lf[grepl(paste0(file_identifier,
"_train_3_models_seq_kappa_"), lf)]
if (null_model) {
null_array.files <- lf[grepl(paste0(file_identifier,
"_null_train_3_models_array_kappa_"), lf)]
null_seq.files <- lf[grepl(paste0(file_identifier,
"_null_train_3_models_seq_kappa_"), lf)]

# check that we have ordered pairs of regular and null files for array and seq
array_seeds <- stringr::str_sub(array.files, -8, -5)
null_array_seeds <- stringr::str_sub(null_array.files, -8, -5)
seq_seeds <- stringr::str_sub(seq.files, -8, -5)
null_seq_seeds <- stringr::str_sub(null_seq.files, -8, -5)
Comment on lines +50 to +54
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm... my instinct here is to suggest that we use other strings that we expect the seeds to be between to detect the seeds, but that seems potentially just as brittle to file name changes as the indexing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Not sure what the best solution is. One mitigation of this is we have previously forced the seed to be a four digit number, so as long as it's the last thing to come before .tsv, it will be captured here.

if (!(all(array_seeds == null_array_seeds) &
all(seq_seeds == null_seq_seeds))) {
stop("Array or seq seeds do not match in delta kappa plotting script.")
}

}

# define output files
plot.file.lead <- paste0(file_identifier, "_train_3_models_kappa_")
summary.df.filename <- file.path(res.dir,
plot.file.lead <- ifelse(null_model,
paste0(file_identifier, "_train_3_models_delta_kappa_"),
paste0(file_identifier, "_train_3_models_kappa_"))
summary.df.filename <- ifelse(null_model,
file.path(res.dir,
paste0(file_identifier,
"_train_3_models_delta_kappa_summary_table.tsv")),
file.path(res.dir,
paste0(file_identifier,
"_train_3_models_summary_table.tsv"))
"_train_3_models_kappa_summary_table.tsv")))

#### read in data --------------------------------------------------------------

# read in the tables that contain the kappa statistics for predictions on test
# data
array.list <- list() # initialize list that will hold all array tables
seq.list <- list() # initialize list that will hold all the RNA-seq tables
for (i in 1:length(array.files)) {
array.list[[i]] <- fread(array.files[i], data.table = F)
seq.list[[i]] <- fread(seq.files[i], data.table = F)
for (file_index in 1:length(array.files)) {
array.list[[file_index]] <- fread(array.files[file_index], data.table = F)
seq.list[[file_index]] <- fread(seq.files[file_index], data.table = F)
}

if (null_model) {
null_array.list <- list() # initialize list that will hold null array tables
null_seq.list <- list() # initialize list that will hold null RNA-seq tables
for (null_file_index in 1:length(null_array.files)) {
null_array.list[[null_file_index]] <- fread(null_array.files[null_file_index], data.table = F)
null_seq.list[[null_file_index]] <- fread(null_seq.files[null_file_index], data.table = F)
}

# calculate delta kappa values
delta_kappa_array.list <- list() # list for delta kappa array values
delta_kappa_seq.list <- list() # list for delta kappa seq values
for (pair_index in 1:length(array.files)) {
delta_kappa_array.list[[pair_index]] <- array.list[[pair_index]] %>%
left_join(null_array.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method")) %>%
mutate(delta_kappa = kappa.x - kappa.y) %>% # regular kappa - null kappa
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would use suffix here so that this can be a little more explicit:

Suggested change
left_join(null_array.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method")) %>%
mutate(delta_kappa = kappa.x - kappa.y) %>% # regular kappa - null kappa
left_join(null_array.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method"),
suffix = c(".true", ".null")) %>%
mutate(delta_kappa = kappa.true - kappa.null) %>% # regular kappa - null kappa

If you make this change here, don't forget to apply it to the seq step below too!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh I like that!

select(delta_kappa, perc.seq, classifier, norm.method)
delta_kappa_seq.list[[pair_index]] <- seq.list[[pair_index]] %>%
left_join(null_seq.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method")) %>%
mutate(delta_kappa = kappa.x - kappa.y) %>% # regular kappa - null kappa
select(delta_kappa, perc.seq, classifier, norm.method)
}

}

# combine all tables from each platform into a data.frame
array.df <- data.table::rbindlist(array.list)
seq.df <- data.table::rbindlist(seq.list)
rm(list = c("array.list", "seq.list"))
# cannot use ifelse() because return value must be same dim as conditional test
if (null_model) {
array.df <- data.table::rbindlist(delta_kappa_array.list)
seq.df <- data.table::rbindlist(delta_kappa_seq.list)
} else {
array.df <- data.table::rbindlist(array.list)
seq.df <- data.table::rbindlist(seq.list)
}

#### plot test set results -----------------------------------------------------
# bind all kappa stats together
Expand Down Expand Up @@ -92,7 +144,7 @@ test.df$Classifier <- as.factor(test.df$Classifier)
cls.methods <- unique(test.df$Classifier)
for (cls in cls.methods) {
plot.nm <- file.path(plot.dir,
paste0(plot.file.lead, # includes file_identifier
paste0(plot.file.lead, # includes file_identifier and delta/not delta
stringr::str_replace_all(cls,
pattern = " ",
replacement = "_"),
Expand All @@ -108,6 +160,9 @@ for (cls in cls.methods) {
position = position_dodge(0.7), size = 1) +
ggtitle(paste0(cancer_type, ": ", cls)) +
xlab("% RNA-seq samples") +
ylab(ifelse(null_model,
"Delta Kappa",
"Kappa")) +
theme_bw() +
scale_colour_manual(values = cbPalette[2:3]) +
theme(text = element_text(size = 18)) +
Expand All @@ -123,4 +178,4 @@ summary.df <- test.df %>%
SD = sd(Kappa)) %>%
dplyr::ungroup()
readr::write_tsv(summary.df,
summary.df.filename)
summary.df.filename) # delta or not delta in file name