/
LateConservationTest.R
321 lines (260 loc) · 15.5 KB
/
LateConservationTest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#' @title Perform Reductive Late Conservation Test
#' @description The \emph{Reductive Late Conservation Test} aims to statistically evaluate the
#' existence of a monotonically decreasing phylotranscriptomic pattern based on \code{\link{TAI}} or \code{\link{TDI}} computations.
#' The corresponding p-value quantifies the probability that a given TAI or TDI pattern (or any phylotranscriptomics pattern)
#' does not follow an late conservation like pattern. A p-value < 0.05 indicates that the corresponding phylotranscriptomics pattern does
#' indeed follow an late conservation (high-high-low) shape.
#' @param ExpressionSet a standard PhyloExpressionSet or DivergenceExpressionSet object.
#' @param modules a list storing three elements: early, mid, and late. Each element expects a numeric
#' vector specifying the developmental stages or experiments that correspond to each module.
#' For example, \code{module} = list(early = 1:2, mid = 3:5, late = 6:7) divides a dataset
#' storing seven developmental stages into 3 modules.
#' @param permutations a numeric value specifying the number of permutations to be performed for the \code{ReductiveHourglassTest}.
#' @param lillie.test a boolean value specifying whether the Lilliefors Kolmogorov-Smirnov Test shall be performed to quantify the goodness of fit.
#' @param plotHistogram a boolean value specifying whether a \emph{Lillifor's Kolmogorov-Smirnov-Test}
#' shall be performed to test the goodness of fit of the approximated distribution, as well as additional plots quantifying the significance
#' of the observed phylotranscriptomic pattern.
#' @param runs specify the number of runs to be performed for goodness of fit computations, in case \code{plotHistogram} = \code{TRUE}.
#' In most cases \code{runs} = 100 is a reasonable choice. Default is \code{runs} = 10 (because it takes less computation time for demonstration purposes).
#' @param parallel performing \code{runs} in parallel (takes all cores of your multicore machine).
#' @param gof.warning a logical value indicating whether non significant goodness of fit results should be printed as warning. Default is \code{gof.warning = FALSE}.
#' @param custom.perm.matrix a custom \code{\link{bootMatrix}} (permutation matrix) to perform the underlying test statistic. Default is \code{custom.perm.matrix = NULL}.
#' @details The \emph{reductive late conservation test} is a permutation test based on the following test statistic.
#'
#' (1) A set of developmental stages is partitioned into three modules - early, mid, and late - based on prior biological knowledge.
#'
#' (2) The mean \code{\link{TAI}} or \code{\link{TDI}} value for each of the three modules T_early, T_mid, and T_late are computed.
#'
#' (3) The two differences D1 = T_early - T_late and D2 = T_mid - T_late are calculated.
#'
#' (4) The minimum D_min of D1 and D2 is computed as final test statistic of the reductive hourglass test.
#'
#'
#' In order to determine the statistical significance of an observed minimum difference D_min
#' the following permutation test was performed. Based on the \code{\link{bootMatrix}} D_min
#' is calculated from each of the permuted \code{\link{TAI}} or \code{\link{TDI}} profiles,
#' approximated by a Gaussian distribution with method of moments estimated parameters returned by \code{\link[fitdistrplus]{fitdist}},
#' and the corresponding p-value is computed by \code{\link{pnorm}} given the estimated parameters of the Gaussian distribution.
#' The \emph{goodness of fit} for the random vector \emph{D_min} is statistically quantified by an Lilliefors (Kolmogorov-Smirnov) test
#' for normality.
#'
#'
#' In case the parameter \emph{plotHistogram = TRUE}, a multi-plot is generated showing:
#'
#' (1) A Cullen and Frey skewness-kurtosis plot generated by \code{\link[fitdistrplus]{descdist}}.
#' This plot illustrates which distributions seem plausible to fit the resulting permutation vector D_min.
#' In the case of the \emph{reductive late conservation test} a normal distribution seemed plausible.
#'
#' (2) A histogram of D_min combined with the density plot is plotted. D_min is then fitted by a normal distribution.
#' The corresponding parameters are estimated by \emph{moment matching estimation} using the \code{\link[fitdistrplus]{fitdist}} function.
#'
#' (3) A plot showing the p-values for N independent runs to verify that a specific p-value is biased by a specific permutation order.
#'
#' (4) A barplot showing the number of cases in which the underlying goodness of fit (returned by Lilliefors (Kolmogorov-Smirnov) test
#' for normality) has shown to be significant (\code{TRUE}) or not significant (\code{FALSE}).
#' This allows to quantify the permutation bias and their implications on the goodness of fit.
#' @return a list object containing the list elements:
#'
#' \code{p.value} : the p-value quantifying the statistical significance (low-high-high pattern) of the given phylotranscriptomics pattern.
#'
#' \code{std.dev} : the standard deviation of the N sampled phylotranscriptomics patterns for each developmental stage S.
#'
#' \code{lillie.test} : a boolean value specifying whether the \emph{Lillifors KS-Test} returned a p-value > 0.05,
#' which indicates that fitting the permuted scores with a normal distribution seems plausible.
#'
#' @references
#'
#' Drost HG et al. (2015) Mol Biol Evol. 32 (5): 1221-1231 doi:10.1093/molbev/msv012
#'
#' Quint M et al. (2012). \emph{A transcriptomic hourglass in plant embryogenesis}. Nature (490): 98-101.
#'
#' Piasecka B, Lichocki P, Moretti S, et al. (2013) \emph{The hourglass and the early conservation models co-existing
#' patterns of developmental constraints in vertebrates}. PLoS Genet. 9(4): e1003476.
#'
#' @author Hajk-Georg Drost and Jaruwatana Sodai Lotharukpong
#' @seealso \code{\link{lcScore}}, \code{\link{bootMatrix}}, \code{\link{FlatLineTest}},\code{\link{ReductiveHourglassTest}}, \code{\link{ReverseHourglassTest}}, \code{\link{PlotSignature}}
#' @examples
#'
#' data(PhyloExpressionSetExample)
#'
#' # perform the late conservation test for a PhyloExpressionSet
#' # here the prior biological knowledge is that stages 1-2 correspond to module 1 = early,
#' # stages 3-5 to module 2 = mid (phylotypic module), and stages 6-7 correspond to
#' # module 3 = late
#' LateConservationTest(PhyloExpressionSetExample,
#' modules = list(early = 1:2, mid = 3:5, late = 6:7),
#' permutations = 1000)
#'
#'
#' # use your own permutation matrix based on which p-values (LateConservationTest)
#' # shall be computed
#' custom_perm_matrix <- bootMatrix(PhyloExpressionSetExample,100)
#'
#' LateConservationTest(PhyloExpressionSetExample,
#' modules = list(early = 1:2, mid = 3:5, late = 6:7),
#' custom.perm.matrix = custom_perm_matrix)
#'
#' @import foreach
#' @export
LateConservationTest <- function(ExpressionSet,
modules = NULL,
permutations = 1000,
lillie.test = FALSE,
plotHistogram = FALSE,
runs = 10,
parallel = FALSE,
gof.warning = FALSE,
custom.perm.matrix = NULL){
is.ExpressionSet(ExpressionSet)
if(is.null(modules))
stop("Please specify the three modules: early, mid, and late using the argument 'module = list(early = ..., mid = ..., late = ...)'.", call. = FALSE)
if(any(table(unlist(modules)) > 1))
stop("Intersecting modules are not defined for the LateConservationTest.", call. = FALSE)
if(length(modules) != 3)
stop("Please specify three modules: early, mid, and late to perform the LateConservationTest.", call. = FALSE)
if(length(unlist(modules)) != (dim(ExpressionSet)[2] - 2))
stop("The number of stages classified into the three modules does not match the total number of stages stored in the given ExpressionSet.", call. = FALSE)
nCols <- dim(ExpressionSet)[2]
score_vector <- vector(mode = "numeric",length = permutations)
resMatrix <- matrix(NA_real_, permutations,(nCols-2))
real_age <- vector(mode = "numeric",length = nCols-2)
real_age <- cpp_TAI(as.matrix(dplyr::select(ExpressionSet, 3:ncol(ExpressionSet))),as.vector(unlist(dplyr::select(ExpressionSet, 1))))
# compute the real late conservation of the observed phylotranscriptomics pattern
# lcScore = late conservation score
real_ltv <- lcScore(real_age,early = modules[[1]],mid = modules[[2]],late = modules[[3]],profile.warn=T)
options(warn=1)
### compute the bootstrap matrix
if (is.null(custom.perm.matrix)){
resMatrix <- cpp_bootMatrix(as.matrix(dplyr::select(ExpressionSet, 3:ncol(ExpressionSet))),as.vector(unlist(dplyr::select(ExpressionSet, 1))),as.numeric(permutations))
}
else if (!is.null(custom.perm.matrix)){
resMatrix <- custom.perm.matrix
}
### compute the global phylotranscriptomics destruction scores for each sampled age vector
score_vector <- apply(resMatrix, 1 ,lcScore,early = modules[[1]],
mid = modules[[2]],late = modules[[3]])
# parameter estimators using MASS::fitdistr
param <- fitdistrplus::fitdist(score_vector,"norm", method = "mme")
mu <- param$estimate[1]
sigma <- param$estimate[2]
if(plotHistogram){
if(runs < 1)
stop("You need at least one run...")
if(lillie.test)
graphics::par(mfrow = c(2,2))
if(!lillie.test)
graphics::par(mfrow = c(1,3))
fitdistrplus::descdist(score_vector, boot = permutations)
# plot histogram of scores
normDensity <- function(x){
return(stats::dnorm(x,mu,sigma))
}
graphics::curve( expr = normDensity,
xlim = c(min(score_vector,real_ltv),max(score_vector,real_ltv)),
col = "steelblue",
lwd = 5,
xlab = "Scores",
ylab = "Frequency" )
graphics::hist( x = score_vector,
prob = TRUE,
add = TRUE,
breaks = permutations / (0.01 * permutations) )
graphics::rug(score_vector)
# plot a red line at the position where we can find the real lt value
graphics::abline(v = real_ltv, lwd = 5, col = "darkred")
#legend("topleft", legend = "A", bty = "n")
p.vals_vec <- vector(mode = "numeric", length = runs)
lillie_vec <- vector(mode = "logical", length = runs)
lct <- vector(mode = "list", length = 3)
#cat("\n")
if(parallel){
### Parallellizing the sampling process using the 'doParallel' and 'parallel' package
### register all given cores for parallelization
par_cores <- parallel::makeForkCluster(parallel::detectCores())
doParallel::registerDoParallel(par_cores)
# perform the sampling process in parallel
parallel_results <- foreach::foreach(i = 1:runs,
.combine = "rbind",
.errorhandling = "stop") %dopar% {
data.frame(LateConservationTest( ExpressionSet = ExpressionSet,
permutations = permutations,
lillie.test = TRUE,
plotHistogram = FALSE,
modules = modules )[c(1,3)])
}
parallel::stopCluster(par_cores)
colnames(parallel_results) <- c("p.value","lillie.test")
p.vals_vec <- parallel_results[ ,"p.value"]
lillie_vec <- parallel_results[ ,"lillie.test"]
}
if(!parallel){
# sequential computations of p-values
# if(runs >= 10){
# # initializing the progress bar
# #progressBar <- txtProgressBar(min = 1,max = runs,style = 3)
#
# }
for(i in 1:runs){
if(lillie.test){
lct <- LateConservationTest( ExpressionSet = ExpressionSet,
permutations = permutations,
lillie.test = TRUE,
plotHistogram = FALSE,
modules = list(early = modules[[1]],mid = modules[[2]],late = modules[[3]]),
runs = NULL )
}
if(!lillie.test){
lct <- LateConservationTest( ExpressionSet = ExpressionSet,
permutations = permutations,
lillie.test = FALSE,
plotHistogram = FALSE,
modules = list(early = modules[[1]],mid = modules[[2]],late = modules[[3]]),
runs = NULL )
}
p.vals_vec[i] <- lct$p.value
if(lillie.test)
lillie_vec[i] <- lct$lillie.test
# if(runs >= 10){
# # printing out the progress
# setTxtProgressBar(progressBar,i)
# }
}
}
graphics::plot( x = p.vals_vec,
type = "l" ,
lwd = 6,
ylim = c(0,1),
col = "darkblue",
xlab = "Runs",
ylab = "p-value" )
graphics::abline(h = 0.05, lty = 2, lwd = 3)
if(lillie.test){
tbl <- table(factor(lillie_vec, levels = c("FALSE","TRUE")))
graphics::barplot( height = tbl/sum(tbl) ,
beside = TRUE,
names.arg = c("FALSE", "TRUE"),
ylab = "relative frequency",
main = paste0("runs = ",runs) )
}
}
#if(real_ltv >= 0)
pval <- stats::pnorm(real_ltv,mean = mu,sd = sigma,lower.tail = FALSE)
#if(real_ltv < 0)
#pval <- pnorm(real_ltv,mean=mu,sd=sigma,lower.tail=TRUE)
### computing the standard deviation of the sampled TAI values for each stage separately
sd_vals <- vector(mode = "numeric",length = nCols-2)
sd_vals <- apply(resMatrix,2,stats::sd)
if(lillie.test){
# perform Lilliefors K-S-Test
lillie_p.val <- nortest::lillie.test(score_vector)$p.value
# does the Lilliefors test pass the criterion
lillie_bool <- (lillie_p.val > 0.05)
if(gof.warning & (lillie_p.val < 0.05) & (!plotHistogram)){
warning("Lilliefors (Kolmogorov-Smirnov) test for normality did not pass the p > 0.05 criterion!")
}
}
if(lillie.test)
return(list(p.value = pval,std.dev = sd_vals,lillie.test = lillie_bool))
if(!lillie.test)
return(list(p.value = pval,std.dev = sd_vals,lillie.test = NA))
}