# normalize_assay_data.R
#' Helper functions to normalize assay data into log2 space.
#'
#' This is defined for the assay_types defined within this package. If you are
#' writing a package to handle new types of data, you need to define a
#' `normalize_assay_matrix.ASSAY_TYPE` function. This is experimental.
#'
#' @export
#' @param x A matrix of raw/unnormalized assay data retrieved from
#'   within the `fetch_assay_data()` itself.
#' @param features a feature descriptor data.frame that includes the
#'   feature_id's of the rows in `x`, as well as the assay name/type they
#'   were pulled from. We assert that all features come from the same assay
#'   type, and the rows here match 1:1 the rows in `x`.
#' @param samples a sample descriptor for the columns in `x`. Rows here
#'   should match columns in `x` 1:1.
#' @param batch,main parameters sent to [remove_batch_effect()] after assay
#'   data has been materialized.
#' @param log,prior.count forwarded to the assay-specific
#'   `normalize_assay_matrix.*` method, which (for count-like data) passes
#'   them along to [edgeR::cpm()].
#' @param verbose currently unused in this function; accepted so callers can
#'   pass it uniformly alongside the other normalization arguments.
#' @param ... passed through to the `normalize_assay_matrix.*` method and to
#'   [remove_batch_effect()].
#' @param .fds forwarded untouched to the `normalize_assay_matrix.*` method.
#'   NOTE(review): presumably the FacileDataSet the data came from — confirm.
#' @return a matrix of (log2-like) normalized assay data with the same
#'   dimensions as `x` (raw `x` if no normalization method exists for the
#'   assay type).
normalize_assay_data <- function(x, features, samples, batch = NULL,
                                 log = TRUE, prior.count = 0.1,
                                 main = NULL, verbose = FALSE, ...,
                                 .fds = NULL) {
  stopifnot(
    nrow(x) == nrow(features),               # x and features are concordant
    all(rownames(x) == features$feature_id), # x and features are concordant
    ncol(x) == nrow(samples),                # x and samples are concordant
    all(colnames(x) == samples$samid),       # x and samples are concordant
    is.character(features$assay_type),       # only working with 1 assay_type
    length(unique(features$assay_type)) == 1L)

  # Manual dispatch on assay_type: resolve the `normalize_assay_matrix.<type>`
  # method by name. `try(..., silent = TRUE)` returns a "try-error" object
  # (not a function) when no such method exists, so `is.function()` below
  # cleanly distinguishes the two outcomes.
  atype <- features$assay_type[1L]
  norm.fn.name <- paste0("normalize_assay_matrix.", atype)
  normfn <- try(getFunction(norm.fn.name), silent = TRUE)
  if (is.function(normfn)) {
    out <- normfn(x, features, samples, log = log, prior.count = prior.count,
                  ..., .fds = .fds)
  } else {
    warning("No normalization function defined for assay type (", atype, "), ",
            "returning raw data", immediate. = TRUE, call. = FALSE)
    out <- x
  }

  # These assay types store data that is already in log2-like space, so the
  # result is logged regardless of what the caller asked for; record that so
  # the batch-correction guard below does not refuse to run.
  if (atype %in% c("lognorm", "qpcrct", "qpcrdct")) {
    log <- TRUE
  }

  # Now batch correct data if desired (and if explicitly log2-like)
  if (test_character(batch, min.len = 1L)) {
    if (!log) stop("Do not know how to batch correct un-logged data")
    out <- remove_batch_effect(out, samples, batch, main, ...)
  }
  out
}
# Assay-specific normalization methods -----------------------------------------
#' Normalize count-based (rnaseq-like) assay data via edgeR's CPM.
#'
#' Library sizes and normalization factors are taken from the `samples`
#' descriptor: the database-provided `libsize`/`normfactor` columns are the
#' defaults, overridden by DGEList-style `lib.size`/`norm.factors` columns
#' when the caller appended them.
#'
#' @noRd
#' @importFrom edgeR cpm
normalize_assay_matrix.rnaseq <- function(x, features, samples,
                                          log = TRUE, prior.count = 0.1,
                                          verbose = FALSE, ...,
                                          .update_norm_factors = FALSE,
                                          .fds = NULL) {
  # libsize and normfactor are currently extracted from the database
  # but this will change when we support sample_assay covariates
  assert_numeric(samples[["libsize"]])
  assert_numeric(samples[["normfactor"]])

  # the default libsize and normfactor can be overridden by the user if they
  # passed in a samples descriptor with the DGEList$samples `lib.size` and
  # `norm.factors` columns appended to it. If not, then we use the default
  # values to calculate the effective library size for the samples
  if (test_numeric(samples[["lib.size"]])) {
    lsize <- samples[["lib.size"]]
  } else {
    lsize <- samples[["libsize"]]
  }
  if (test_numeric(samples[["norm.factors"]])) {
    nf <- samples[["norm.factors"]]
  } else {
    nf <- samples[["normfactor"]]
  }

  # Optionally recompute TMM-style norm factors from the matrix itself
  # instead of trusting the stored/overridden values.
  if (isTRUE(.update_norm_factors)) {
    nf <- edgeR::calcNormFactors(x, lib.size = lsize, ...)
  }

  # Effective library size = lib.size * norm.factors; name the arguments
  # explicitly rather than relying on positional matching into edgeR.
  edgeR::cpm(x, lib.size = lsize * nf, log = log, prior.count = prior.count)
}
#' `pseudobulk` counts reuse the rnaseq normalization (log2 CPM via
#' `normalize_assay_matrix.rnaseq`), as they are count data too.
#' @noRd
normalize_assay_matrix.pseudobulk <- normalize_assay_matrix.rnaseq
#' `isoseq` counts reuse the rnaseq normalization (log2 CPM via
#' `normalize_assay_matrix.rnaseq`), as they are count data too.
#' @noRd
normalize_assay_matrix.isoseq <- normalize_assay_matrix.rnaseq
#' Normalize `tpm` assay data: someone processed their data with salmon or
#' kallisto and wanted to store tpm. Normalizing this is just
#' `log2(val + prior.count)`; when `log = FALSE` the matrix is returned as-is.
#' @noRd
normalize_assay_matrix.tpm <- function(x, features, samples,
                                       log = TRUE, prior.count = 0.1,
                                       verbose = FALSE, ..., .fds = NULL) {
  if (!log) {
    return(x)
  }
  # guard against a degenerate prior that would send zeros to -Inf
  assert_number(prior.count, lower = 0.001)
  log2(x + prior.count)
}
#' `lognorm` assay data already normalized, nothing more to do
#'
#' Identity method: the stored matrix is assumed to already be in log2-like
#' space, so it is returned untouched. (`normalize_assay_data()` also forces
#' `log = TRUE` for this assay type before any batch correction.)
#' @noRd
normalize_assay_matrix.lognorm <- function(x, features, samples, ...,
                                           .fds = NULL) {
  x
}
#' no internal normalization for `qpcrct` assay data yet, needs to be
#' normalized externally and saved here -- essentially like `lognorm` data.
#' Identity method that warns (a note-to-maintainers: this assay type should
#' be folded into `lognorm`) and returns the matrix untouched.
#' @noRd
normalize_assay_matrix.qpcrct <- function(x, features, samples, ...,
                                          .fds = NULL) {
  warning("Axe this assay type and just set normalized data as lognorm type")
  x
}
#' no internal normalization for `qpcrdct` assay data yet, needs to be
#' normalized externally and saved here -- essentially like `lognorm` data.
#' Identity method that warns (a note-to-maintainers: this assay type should
#' be folded into `lognorm`) and returns the matrix untouched.
#' @noRd
normalize_assay_matrix.qpcrdct <- function(x, features, samples, ...,
                                           .fds = NULL) {
  warning("Axe this assay type and just set normalized data as lognorm type")
  x
}
#' `raw` assay data: we have no idea how to normalize, so this is an identity
#' passthrough — callers get the matrix back unchanged (and still un-logged).
#' @noRd
normalize_assay_matrix.raw <- function(x, features, samples, ..., .fds = NULL) {
  x
}