-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Envest/calculate and plot delta kappa #72
Changes from 6 commits
72f1f5f
a040064
20c2b74
91a4e1c
9864307
27ff0c8
fb6265f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -14,7 +14,7 @@ option_list <- list( | |||||||||||||||
optparse::make_option("--null_model", | ||||||||||||||||
action = "store_true", | ||||||||||||||||
default = FALSE, | ||||||||||||||||
help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)") | ||||||||||||||||
help = "Use null model as baseline for plotting delta kappa") | ||||||||||||||||
) | ||||||||||||||||
|
||||||||||||||||
opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) | ||||||||||||||||
|
@@ -29,9 +29,7 @@ source(here::here("util", "color_blind_friendly_palette.R")) | |||||||||||||||
cancer_type <- opt$cancer_type | ||||||||||||||||
predictor <- opt$predictor | ||||||||||||||||
null_model <- opt$null_model | ||||||||||||||||
file_identifier <- ifelse(null_model, | ||||||||||||||||
str_c(cancer_type, predictor, "null", sep = "_"), | ||||||||||||||||
str_c(cancer_type, predictor, sep = "_")) | ||||||||||||||||
file_identifier <- str_c(cancer_type, predictor, sep = "_") | ||||||||||||||||
|
||||||||||||||||
# define directories | ||||||||||||||||
plot.dir <- here::here("plots") | ||||||||||||||||
|
@@ -43,28 +41,82 @@ array.files <- lf[grepl(paste0(file_identifier, | |||||||||||||||
"_train_3_models_array_kappa_"), lf)] | ||||||||||||||||
seq.files <- lf[grepl(paste0(file_identifier, | ||||||||||||||||
"_train_3_models_seq_kappa_"), lf)] | ||||||||||||||||
if (null_model) { | ||||||||||||||||
null_array.files <- lf[grepl(paste0(file_identifier, | ||||||||||||||||
"_null_train_3_models_array_kappa_"), lf)] | ||||||||||||||||
null_seq.files <- lf[grepl(paste0(file_identifier, | ||||||||||||||||
"_null_train_3_models_seq_kappa_"), lf)] | ||||||||||||||||
|
||||||||||||||||
# check that we have ordered pairs of regular and null files for array and seq | ||||||||||||||||
array_seeds <- stringr::str_sub(array.files, -8, -5) | ||||||||||||||||
null_array_seeds <- stringr::str_sub(null_array.files, -8, -5) | ||||||||||||||||
seq_seeds <- stringr::str_sub(seq.files, -8, -5) | ||||||||||||||||
null_seq_seeds <- stringr::str_sub(null_seq.files, -8, -5) | ||||||||||||||||
if (!(all(array_seeds == null_array_seeds) & | ||||||||||||||||
all(seq_seeds == null_seq_seeds))) { | ||||||||||||||||
stop("Array or seq seeds do not match in delta kappa plotting script.") | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
# define output files | ||||||||||||||||
plot.file.lead <- paste0(file_identifier, "_train_3_models_kappa_") | ||||||||||||||||
summary.df.filename <- file.path(res.dir, | ||||||||||||||||
plot.file.lead <- ifelse(null_model, | ||||||||||||||||
paste0(file_identifier, "_train_3_models_delta_kappa_"), | ||||||||||||||||
paste0(file_identifier, "_train_3_models_kappa_")) | ||||||||||||||||
summary.df.filename <- ifelse(null_model, | ||||||||||||||||
file.path(res.dir, | ||||||||||||||||
paste0(file_identifier, | ||||||||||||||||
"_train_3_models_delta_kappa_summary_table.tsv")), | ||||||||||||||||
file.path(res.dir, | ||||||||||||||||
paste0(file_identifier, | ||||||||||||||||
"_train_3_models_summary_table.tsv")) | ||||||||||||||||
"_train_3_models_kappa_summary_table.tsv"))) | ||||||||||||||||
|
||||||||||||||||
#### read in data -------------------------------------------------------------- | ||||||||||||||||
|
||||||||||||||||
# read in the tables that contain the kappa statistics for predictions on test | ||||||||||||||||
# data | ||||||||||||||||
array.list <- list() # initialize list that will hold all array tables | ||||||||||||||||
seq.list <- list() # initialize list that will hold all the RNA-seq tables | ||||||||||||||||
for (i in 1:length(array.files)) { | ||||||||||||||||
array.list[[i]] <- fread(array.files[i], data.table = F) | ||||||||||||||||
seq.list[[i]] <- fread(seq.files[i], data.table = F) | ||||||||||||||||
for (file_index in 1:length(array.files)) { | ||||||||||||||||
array.list[[file_index]] <- fread(array.files[file_index], data.table = F) | ||||||||||||||||
seq.list[[file_index]] <- fread(seq.files[file_index], data.table = F) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
if (null_model) { | ||||||||||||||||
null_array.list <- list() # initialize list that will hold null array tables | ||||||||||||||||
null_seq.list <- list() # initialize list that will hold null RNA-seq tables | ||||||||||||||||
for (null_file_index in 1:length(null_array.files)) { | ||||||||||||||||
null_array.list[[null_file_index]] <- fread(null_array.files[null_file_index], data.table = F) | ||||||||||||||||
null_seq.list[[null_file_index]] <- fread(null_seq.files[null_file_index], data.table = F) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
# calculate delta kappa values | ||||||||||||||||
delta_kappa_array.list <- list() # list for delta kappa array values | ||||||||||||||||
delta_kappa_seq.list <- list() # list for delta kappa seq values | ||||||||||||||||
for (pair_index in 1:length(array.files)) { | ||||||||||||||||
delta_kappa_array.list[[pair_index]] <- array.list[[pair_index]] %>% | ||||||||||||||||
left_join(null_array.list[[pair_index]], | ||||||||||||||||
by = c("perc.seq", "classifier", "norm.method")) %>% | ||||||||||||||||
mutate(delta_kappa = kappa.x - kappa.y) %>% # regular kappa - null kappa | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would use
Suggested change
If you make this change here, don't forget to apply it to the seq step below too! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oooh I like that! |
||||||||||||||||
select(delta_kappa, perc.seq, classifier, norm.method) | ||||||||||||||||
delta_kappa_seq.list[[pair_index]] <- seq.list[[pair_index]] %>% | ||||||||||||||||
left_join(null_seq.list[[pair_index]], | ||||||||||||||||
by = c("perc.seq", "classifier", "norm.method")) %>% | ||||||||||||||||
mutate(delta_kappa = kappa.x - kappa.y) %>% # regular kappa - null kappa | ||||||||||||||||
select(delta_kappa, perc.seq, classifier, norm.method) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
# combine all tables from each platform into a data.frame | ||||||||||||||||
array.df <- data.table::rbindlist(array.list) | ||||||||||||||||
seq.df <- data.table::rbindlist(seq.list) | ||||||||||||||||
rm(list = c("array.list", "seq.list")) | ||||||||||||||||
# cannot use ifelse() because return value must be same dim as conditional test | ||||||||||||||||
if (null_model) { | ||||||||||||||||
array.df <- data.table::rbindlist(delta_kappa_array.list) | ||||||||||||||||
seq.df <- data.table::rbindlist(delta_kappa_seq.list) | ||||||||||||||||
} else { | ||||||||||||||||
array.df <- data.table::rbindlist(array.list) | ||||||||||||||||
seq.df <- data.table::rbindlist(seq.list) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
#### plot test set results ----------------------------------------------------- | ||||||||||||||||
# bind all kappa stats together | ||||||||||||||||
|
@@ -92,7 +144,7 @@ test.df$Classifier <- as.factor(test.df$Classifier) | |||||||||||||||
cls.methods <- unique(test.df$Classifier) | ||||||||||||||||
for (cls in cls.methods) { | ||||||||||||||||
plot.nm <- file.path(plot.dir, | ||||||||||||||||
paste0(plot.file.lead, # includes file_identifier | ||||||||||||||||
paste0(plot.file.lead, # includes file_identifier and delta/not delta | ||||||||||||||||
stringr::str_replace_all(cls, | ||||||||||||||||
pattern = " ", | ||||||||||||||||
replacement = "_"), | ||||||||||||||||
|
@@ -108,6 +160,9 @@ for (cls in cls.methods) { | |||||||||||||||
position = position_dodge(0.7), size = 1) + | ||||||||||||||||
ggtitle(paste0(cancer_type, ": ", cls)) + | ||||||||||||||||
xlab("% RNA-seq samples") + | ||||||||||||||||
ylab(ifelse(null_model, | ||||||||||||||||
"Delta Kappa", | ||||||||||||||||
"Kappa")) + | ||||||||||||||||
theme_bw() + | ||||||||||||||||
scale_colour_manual(values = cbPalette[2:3]) + | ||||||||||||||||
theme(text = element_text(size = 18)) + | ||||||||||||||||
|
@@ -123,4 +178,4 @@ summary.df <- test.df %>% | |||||||||||||||
SD = sd(Kappa)) %>% | ||||||||||||||||
dplyr::ungroup() | ||||||||||||||||
readr::write_tsv(summary.df, | ||||||||||||||||
summary.df.filename) | ||||||||||||||||
summary.df.filename) # delta or not delta in file name |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm... my instinct here is to suggest that we use other strings that we expect the seeds to be between to detect the seeds, but that seems potentially just as brittle to file name changes as the indexing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. Not sure what the best solution is. One mitigation of this is we have previously forced the seed to be a four digit number, so as long as it's the last thing to come before
.tsv
, it will be captured here.