/
phylter.R
266 lines (249 loc) · 11.6 KB
/
phylter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#' Filter phylogenomics datasets
#'
#' Detection and filtering out of outliers in a list of trees
#' or a list of distance matrices.
#'
#' @param X A list of phylogenetic trees (phylo object) or a list
#' of distance matrices. Trees can have different number of leaves and matrices
#' can have different dimensions. If this is the case, missing values are imputed.
#' @param bvalue If X is a list of trees, nodes with a support below 'bvalue' will be collapsed
#' prior to the outlier detection.
#' @param distance If X is a list of trees, type of distance used to compute the
#' pairwise matrices for each tree. Can be "patristic" (sum of branch lengths separating tips, the default)
#' or nodal (number of nodes separating tips). The "nodal" option should only be used if all species
#' are present in all genes.
#' @param k Strength of outlier detection. The higher this value the less outliers
#' detected (see details).
#' @param k2 Same as k for complete gene outlier detection. To preserve complete genes from
#' being discarded, k2 can be increased. By default, k2 = k. (see above)
#' @param Norm Should the matrices be normalized prior to the complete analysis and how. If "median" (the default), matrices are divided by their median, if
#' "mean" they are divided by their mean, if "none", no normalization if performed. Normalizing ensures that fast-evolving
#' (and slow-evolving) genes are not treated as outliers. Normalization by median is a better choice as it is less sensitive to outlier values.
#' @param Norm.cutoff Value of the median (if \code{Norm="median"}) or the mean (if
#' \code{Norm="mean"}) of phylogenetic distance matrices below which genes are simply discarded from the analysis. This
#' prevents dividing by 0, and allows getting rid of genes that contain mostly branches
#' of length 0 and are therefore uninformative anyway. Discarded genes, if any, are listed in
#' the output (\code{out$DiscardedGenes}).
#' @param gene.names List of gene names used to rename elements in X. If NULL (the default), 0
#' elements are named 1,2,..,length(X).
#' @param test.island If TRUE (the default), only the highest value in
#' an 'island' of outliers is considered an outlier. This prevents non-outliers hitchhiked by outliers
#' to be considered outliers themselves.
#' @param verbose If TRUE (the default), messages are written during the filtering process to get information
#' of what is happening
#' @param stop.criteria The optimisation stops when the gain in concordance between matrices between round \code{n} and round \code{n+1} is smaller
#' than this value. Default to 1e-5.
#' @param InitialOnly Logical. If TRUE, only the Initial state of the data is computed.
#' @param normalizeby Should the gene x species matrix be normalized prior to outlier detection, and how.
#' @param parallel Should the computations be parallelized when possible? Default to TRUE. Note that the number of threads
#' cannot be set by the user when `parallel=TRUE`. It uses all available cores on the machine.
#' @return A list of class 'phylter' with the 'Initial' (before filtering) and 'Final' (after filtering) states,
#' or a list of class 'phylterinitial' only, if InitialOnly=TRUE. The function also returns the list of DiscardedGenes, if any.
#' @examples
#' data(carnivora)
#'
#' # using default paramaters
#' res <- phylter(carnivora, parallel = FALSE) # perform the phylter analysis
#' res # brief summary of the analysis
#' res$DiscardedGenes # list of genes discarded prior to the analysis
#' res$Initial # See all elements prior to the analysis
#' res$Final # See all elements at the end of the analysis
#' res$Final$Outliers # Print all outliers detected
#'
#' \donttest{
#' # Change the call to phylter to use nodal distances, instead of patristic:
#' res <- phylter(carnivora, distance = "nodal")
#' }
#'
#' @importFrom utils tail combn
#' @importFrom stats hclust as.dist median
#' @importFrom graphics plot
#' @export
phylter<-function(X, bvalue=0, distance="patristic", k=3, k2=k, Norm="median", Norm.cutoff=1e-3, gene.names=NULL, test.island=TRUE, verbose=TRUE, stop.criteria=1e-5, InitialOnly=FALSE, normalizeby="row", parallel=TRUE) {
ReplaceValueWithCompromise<-function(allmat, what, compro, lambda) {
for (i in 1:length(allmat)) {
whatsp<-what[what[,1]==i,2]
if (length(whatsp)>0) {
allmat[[i]][whatsp,]<-compro[whatsp,]*lambda[i]
allmat[[i]][,whatsp]<-compro[,whatsp]*lambda[i]
}
}
return(allmat)
}
ReoderBackTheCells<-function(DetectedCells, OrderUsed) {
DetectedCells$cells[,2]<-OrderUsed[DetectedCells$cells[,2]]
if (!is.null(DetectedCells$outsp)) DetectedCells$outsp<-OrderUsed[DetectedCells$outsp]
return(DetectedCells)
}
FindNewCells<-function(DetectedCells, KnownCells) {
NewCells<-NULL
if (is.null(KnownCells)&!is.null(DetectedCells)) NewCells<-DetectedCells
if (!is.null(KnownCells)&is.null(DetectedCells)) NewCells<-NULL
if (!is.null(KnownCells)&!is.null(DetectedCells)) {
#we paste the DetectedCells at the end of KnownCells, reusing KnownCells to save memory
KnownCells<-rbind(KnownCells, DetectedCells)
test<-tail(duplicated(KnownCells), nrow(DetectedCells))
NewCells<-DetectedCells[test==FALSE,,drop=FALSE]
if (nrow(NewCells)==0) NewCells<-NULL
}
return(NewCells)
}
CompareBeforeAfter<-function(InitialMatrices, AllOutliers, sp.order, which="all") {
if (is.null(AllOutliers)) Out<-NULL
else {
if (which=="all") {
cell.exists<-apply(AllOutliers, 1, function(x, sp) is.element(sp.order[x[2]],colnames(InitialMatrices[[x[1]]])), sp=sp.order)
AllOutliers<-AllOutliers[cell.exists,,drop = FALSE]
Out<-cbind(names(InitialMatrices[AllOutliers[,1]]), sp.order[AllOutliers[,2]])
}
if (which=="complete") {
Out<-NULL
speciesintables<-table(unlist(lapply(InitialMatrices, rownames)))
speciesinoutliers<-table(AllOutliers[,2])
Out$ComplOutSP<-names(which(speciesinoutliers==speciesintables[match(names(speciesinoutliers), names(speciesintables))]))
genesintables<-unlist(lapply(InitialMatrices, ncol))
genesinoutliers<-table(AllOutliers[,1])
Out$ComplOutGN<-names(which(genesinoutliers==genesintables[match(names(genesinoutliers), names(genesintables))]))
}
}
return(Out)
}
#NEW
X.clean<-PreparePhylterData(X, bvalue, distance, Norm, Norm.cutoff, gene.names, verbose)
Xsave<-X.clean$Xsave
matrices<-X.clean$matrices
discardedgenes<-X.clean$discardedgenes #list pof discarded genes
discardedmatrix<-X.clean$discardedmatrix #matrix of discarded cells (same format as Outliers at the end)
#END NEW
RES<-DistatisFast(matrices, parallel=parallel)
WR<-Dist2WR(RES)
Initial<-NULL
Initial$mat.data<-Xsave
Initial$WR<-WR
Initial$RV<-RES$RVmat
Initial$weights<-RES$alpha
Initial$compromise<-RES$compromise
Initial$F<-RES$F
##New
Initial$matrices<-matrices
Initial$PartialF<-RES$PartialF
if (InitialOnly) {
class(Initial)<-c("phylterinitial", "list")
return(Initial)
}
maxWR<-max(WR) ##for plotting purpose
if (verbose) cat(paste("Initial score: ",round(RES$quality, digits=ceiling(abs(log10(stop.criteria)))), "", sep=""))
VAL<-RES$quality #First Quality value
CELLSREMOVED<-NULL
continue<-TRUE
lastLoop<-FALSE #when switching to TRUE, we do the last loop where we search for complete gene outliers.
##We integrate this in the loop of cell outliers to keep possible the fact of not removing complete outliers detectected if they do not improve the quality of the compromise (very unlikely??)
##OPTIMIZATION
while(continue) {
# cluster rows to be able to detect 'islands'
OrderWRrow<-hclust(as.dist(RES$compromise))$order
# reorder rows according to row clustering
WR.reordered<-WR[OrderWRrow,]
if (!lastLoop) {
# detect cell outliers
CellsToRemove<-detect.outliers(WR.reordered, k=k,test.island=test.island, normalizeby=normalizeby)
# reorder to previous order
CellsToRemove<-ReoderBackTheCells(CellsToRemove, OrderWRrow)
}
else {
CellsToRemove<-detect.outliers.array(RES$alpha, nrow(RES$compromise), k=k2)
}
NewCellsToRemove<-FindNewCells(CellsToRemove$cells, CELLSREMOVED)
if (verbose & !is.null(nrow(NewCellsToRemove))) {
cat(paste("\n ",nrow(NewCellsToRemove)," new cells to remove "))
}
if (!is.null(NewCellsToRemove)) {
CELLSREMOVED.new<-rbind(CELLSREMOVED, NewCellsToRemove)
matrices.new<-ReplaceValueWithCompromise(matrices, CELLSREMOVED.new, RES$compromise, RES$lambda)
RES.new<-DistatisFast(matrices.new, parallel=parallel)
VAL.new<-c(VAL, RES.new$quality)
if (verbose) cat(paste("-> New score: ",round(RES.new$quality, digits=ceiling(abs(log10(stop.criteria)))), sep=""))
# if (verbose) plot(VAL, type="o")
gain<-VAL.new[length(VAL.new)]-VAL.new[length(VAL.new)-1]
# if (gain<0) {
if (gain<stop.criteria) {
if (verbose) cat(" -> NO")
if (verbose) cat("\n => Gain too small")
if (lastLoop) {
continue<-FALSE #c'est vraiment la fin
if (verbose) cat(" -> Stopping optimization.")
}
else {
if (verbose) cat (" -> Checking for complete gene outliers")
lastLoop<-TRUE ##
}
}
else { #on continue.
if (verbose) cat(" -> OK")
RES<-RES.new
matrices<-matrices.new
CELLSREMOVED<-CELLSREMOVED.new
VAL<-VAL.new
WR<-Dist2WR(RES)
#si lastLoop était vrai, il redevient faux car on se demande si on peut réaméliorer maintenant améliorer.
if (lastLoop) lastLoop<-FALSE
}
}
else {
if (verbose) cat ("\n => No more outliers detected")
if (lastLoop) {
cat (" -> STOPPING OPTIMIZATION")
break ##usefull?
continue<-FALSE #c'est vraiment la fin
}
else {
if (verbose) cat (" -> Checking for complete gene outliers")
lastLoop<-TRUE
}
}
}
# Prepare return object
Final<-NULL
Final$WR<-WR
Final$RV<-RES$RVmat
Final$weights<-RES$alpha
Final$compromise<-RES$compromise
Final$F<-RES$F
Final$PartialF<-RES$PartialF
Final$species.order<-colnames(RES$compromise)
Final$AllOptiScores<-VAL
# ### NEW - 10/03/2022
# ### WE MUST MODIFY CELLSREMOVED TO REINTEGRATE THE GENES
# ### THAT WERE REMOVED BECAUSE OF MEDIAN or MEAN=0 ISSUE. INDEED,
# ### if gene n is removed, gene n+1 becomes n, n+2 becomles n+1, etc.
# ### We need to reorganise these cells to put the correct identifiers.
# ### we do that as follows:
# print(CELLSREMOVED)
# if (length(ZeroLengthMatrices)>0) {
# for (i in 1:length(ZeroLengthMatrices)) {
# CELLSREMOVED[CELLSREMOVED[,1]>=ZeroLengthMatrices[i],1]<-CELLSREMOVED[CELLSREMOVED[,1]>=ZeroLengthMatrices[i],1]+1
# }
# #and then we add as outliers these genes that had been removed
# #and all there species
# CELLSREMOVED<-rbind(CELLSREMOVED, cbind(rep(ZeroLengthMatrices, each=nrow(WR)),rep(1:nrow(WR),length(ZeroLengthMatrices))))
# }
# ### END OF THIS NEW PART
Final$CELLSREMOVED<-CELLSREMOVED
#MAYBE DO NOT RETURN THIS!?
Final$Outliers<-CompareBeforeAfter(Xsave, CELLSREMOVED, Final$species.order, which="all")
Final$CompleteOutliers<-CompareBeforeAfter(Xsave, Final$Outliers, Final$species.order, which="complete")
#store the way the function was called
##New
Final$matrices<-matrices
Final$Discarded<-discardedmatrix #organized like Outliers, but for genes discarded at the very beginning
call<-list(call=match.call(), bvalue=bvalue, distance=distance, k=k, k2=k2, Norm=Norm, Norm.cutoff=Norm.cutoff, gene.names=gene.names, test.island=test.island, verbose=verbose, stop.criteria=stop.criteria, InitialOnly=InitialOnly, normalizeby=normalizeby)
Result<-list(Initial=Initial, Final=Final, DiscardedGenes=discardedgenes, call=call)
class(Result)<-c("phylter", "list")
class(Result$Initial)<-c("phylterinitial", "list")
class(Result$Final)<-c("phylterfinal", "list")
if (verbose) {
cat("\n--------\n")
print(summary(Result))
}
return(Result)
}