/
Nested_resampling_random_forest.R
263 lines (206 loc) · 10.3 KB
/
Nested_resampling_random_forest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
## Written by Grethe Hystad, ghystad@pnw.edu
## Reviewed by Anirudh Prabhu, aprabhu@carnegiescience.edu
## Data consist of 134 samples of organic molecular mixtures analyzed by pyrolysis Gas chromatography-Mass Spectrometry (pyr-GC-MS).
## Each data set has the scan values in the rows and the Mass to Charge ratio (m/z) values in the columns.
## The data values are the raw intensities as measured by pyr-GC-MS.
## Each sample has either an abiotic (A) or biotic (B) origin.
## Purpose is to build classification rules to predict whether the samples are biotic or abiotic.
## Purpose is also to determine the features, (m/z) and (scan) values, that discriminate between the biotic and the abiotic groups.
## Random Forest is used as the machine learning method.
## The data are preprocessed before we apply random forest.
## We apply nested resampling to obtain the generalized prediction performance
library(dplyr) # for dataframe computation
library(MALDIquant) # for chemometrics processing
library(caret) # for machine learning
library(mlr3) # for machine learning
library("mlr3verse") # for machine learning
library("mlr3learners")# for machine learning
library("mlr3tuning") # for machine learning
library("data.table") # for rbindlist
library("ggplot2") # for plots
### Data preparation
# NOTE(review): setwd() makes the script machine-specific; consider passing the
# data directory as a parameter instead.
setwd("C:/Users/ghystad/Desktop/Data2022")
# Origin label for each raw sample: "A" = abiotic, "B" = biotic;
# "C" samples are excluded from the analysis below.
species <- c(rep("A",5),"B", "A","A","B","B","A","C", rep("B",5),rep("C",4),rep("B",4),"A",rep("B",11),"A","A","B","A","A","B","B","A",rep("B",3),"A","B",rep("A",6),rep("B",3),"A","A","B","B",rep("A",4),"B",rep("A",5),"B",rep("A",10),rep("B",4),rep("A",5),rep("B",3),rep("A",3),rep("B",4),rep("A",8),"B","A","A","B","C","C","A",rep("B",3),"A","B", "B", "A","B",rep("A",3),"B","A", rep("A",7),"B","A","B")
species2 <- as.numeric(as.factor(species)) # "A" -> 1, "B" -> 2, "C" -> 3
# Indices of the "C" samples to drop. Compare the labels directly rather than
# the original `species2 == "3"`, which relied on implicit numeric-to-character
# coercion (equivalent result, but fragile).
ind <- which(species == "C")
species_n <- species2[-ind]
#Reading in the data is based on the code found in: "How to import multiple .csv files simultaneously in R and create a data frame" by Dahna, A., datascience+, 03/09/2019
#https://datascienceplus.com/how-to-import-multiple-csv-files-simultaneously-in-r-and-create-a-data-frame/
fnames <- list.files()
fnames2 <- fnames[-ind] # drop the files belonging to "C" samples
z1 <- lapply(fnames2, read.delim) # read in all of the 134 data sets
z <- z1
NN <- 700                # largest m/z value
mass <- seq(50, NN, 1)   # m/z grid
MM <- 3240               # number of scans retained per sample
N <- length(fnames2)     # number of samples
M <- list()
for (i in seq_len(N)) {
  colnames(z[[i]]) <- "mass"
  # Each raw file is read as a single comma-separated column; split it into
  # its individual fields and convert to numeric.
  z[[i]] <- data.frame(do.call("rbind", strsplit(as.character(z[[i]]$mass), ",", fixed = TRUE)))
  z[[i]] <- data.frame(lapply(z[[i]], as.numeric))
  colnames(z[[i]]) <- c("scan", as.character(seq(50, NN, 1)))
  z[[i]] <- z[[i]] %>% slice(1:MM) # keep only the first MM scans
  M[[i]] <- z[[i]]
}
#saveRDS(M,"C:/Users/ghystad/Documents/Templeton_grant_research/GC_MS_R_files/Templeton_R_files/M.RDS")
#M=readRDS("C:/Users/ghystad/Documents/Templeton_grant_research/GC_MS_R_files/Templeton_R_files/M.RDS")
################
# Class labels for the retained samples, as a factor with levels A (abiotic)
# and B (biotic).
y <- species_n
y <- factor(y, labels = c("A", "B"))
######################################################################
#Preprocessing
#Detect the significant peaks as local max above four times the signal to noise ratio
#number of m/z values to use
MZ=151
#Create Chromatograms for each sample and each m/z value inside each sample
# For each sample i: build one "mass spectrum" object per m/z column (the
# chromatogram over scan numbers), then transform -> smooth -> de-baseline ->
# normalize -> detect peaks. The result per sample is a MM x MZ matrix of
# peak intensities (zero where no peak was detected).
Sample.list=list()
for (i in 1:N){
S=list()
for(j in 1:MZ){
# Column j+1 of M[[i]] is the intensity trace for the j-th m/z value;
# the "mass" axis here is actually the scan index 1..MM.
S[[j]] = createMassSpectrum(mass=seq(1,MM,1), intensity=unlist(M[[i]][,j+1]),
metaData=list(name="Chrom"))
}
chrom = transformIntensity(S,method="sqrt") #stabilize the variance
chrom = smoothIntensity(chrom, method="MovingAverage",halfWindowSize=5)#Smooth the data
chrom = removeBaseline(chrom, method="SNIP") #Remove the baseline
# Put the processed chromatograms back into a dataframe
Processed.chrom.list=list()
for (k in 1:MZ){
Processed.chrom.list[[k]] = as.numeric(intensity(chrom[[k]]))
}
Processed.Mass_dataframe = as.data.frame(do.call(rbind, Processed.chrom.list))
# Global min/max over the whole sample (all m/z traces), so normalization
# preserves relative intensities between m/z values within the sample.
Ma=max(Processed.Mass_dataframe)
Mi=min(Processed.Mass_dataframe)
#Normalize across sample
Processed.Mass_dataframe=t((Processed.Mass_dataframe-Mi)/(Ma-Mi))
Processed.Mass_dataframe=as.data.frame(Processed.Mass_dataframe)
# Rewrap the normalized chromatograms so detectPeaks can be applied.
S2=list()
for(t in 1:MZ){
S2[[t]] = createMassSpectrum(mass=seq(1,MM,1), intensity=Processed.Mass_dataframe[,t],
metaData=list(name="Chrom_normalized"))
}
# Peaks = local maxima above 4x the MAD-estimated noise level.
peaks = detectPeaks(S2, method="MAD", halfWindowSize=20, SNR=4)
# Scatter detected peak intensities back onto the full 1..MM scan axis;
# non-peak positions stay zero.
peak.list=list()
for (tt in 1:MZ){
v=numeric(MM)
scan.number=mass(peaks[[tt]])
v[scan.number] = intensity(peaks[[tt]])
peak.list[[tt]] = v
}
# Rows = scan numbers (R1..RMM), columns = m/z values (M50..M200).
Processed.peaks = t(as.data.frame(do.call(rbind, peak.list)))
row.names(Processed.peaks)=c(paste0("R", 1:MM))
colnames(Processed.peaks)=c(paste0("M", 50:(MZ+50-1)))
Sample.list[[i]] = Processed.peaks
}
# Flatten each sample's (scan x m/z) peak matrix into one long feature vector.
mass.scan.list <- list()
for (i in seq_len(N)) {
  mass.scan.list[[i]] <- as.numeric(unlist(Sample.list[[i]])) # scan and mass spectrum for each sample
}
#Sample vs mass/scan numbers
data.mass.scan = do.call(rbind, mass.scan.list) #Put the samples into a dataframe
# Column names are "scan;mz" pairs. Use the MM constant defined above instead
# of the previously hard-coded literal 3240, so the two stay in sync.
Bin <- as.character(seq(1, MM, 1))
MS <- as.character(seq(50, (MZ + 50 - 1), 1))
colnames(data.mass.scan) = paste(outer(Bin, MS, paste, sep = ';'))
#Next we use hierarchical clustering with a distance of 20 to group the scan numbers for each m/z value.
#The method is based on the paper: "Sample classification from protein mass spectrometry by peak probability contrasts"
#by Tibshirani, R., et al. (2004), Bioinformatics, 20(17):3034-44, doi:10.1093/bioinformatics/bth357
ml=50:(MZ+50-1)
mll=length(50:(MZ+50-1))
# Binary indicator matrix: 1 where a peak was detected, 0 otherwise.
data.mass.scan_new=as.data.frame(ifelse(data.mass.scan>0,1,0))
# Suffix/name helpers used to select all columns belonging to one m/z value.
MZ_name=c(paste0(";",50:(MZ+50-1)))
MZ_name2=c(paste0(50:(MZ+50-1)))
MZ_name3=c(paste0(".",50:(MZ+50-1)))
L_original=list() # Scan numbers for each cluster per m/z value selected
LL_dataframe=list() #The max intensities for the ith m/z value for each bin of scan numbers and each sample
for (i in 1:MZ){
data.mass.scan_new2 = data.mass.scan_new %>% select(ends_with(MZ_name[i])) #Select columns that ends with m/z=i
scan_name=sub(basename(colnames(data.mass.scan_new2)),pattern = MZ_name[i], replacement = "" , fixed = TRUE)
colnames(data.mass.scan_new2) = scan_name #Name the columns with the scan numbers
f=function(x){
y=sum(x)
}
N_elements=as.numeric(apply(as.matrix(data.mass.scan_new2),2,f)) #Count the number of elements for each scan number across the samples
# Each scan number is repeated once per sample in which a peak occurred, so
# clustering is weighted by how often a peak appears at that scan position.
vec=as.numeric(rep(colnames(data.mass.scan_new2), times=N_elements)) #Repeat the nonzero scan numbers with its frequency
hc=hclust(dist(vec),method="complete") # Hierarchical clustering on the scan numbers
clusters=cutree(hc,h=20) # Cluster with a distance of 20
# Raw (non-binarized) intensities for this m/z value, used to extract the
# maximum intensity within each cluster of scan numbers.
dataframeN=as.data.frame(data.mass.scan) %>% select(ends_with(c(paste0(MZ_name[i]))))
L.original2=list() #Scan numbers for each cluster for the ith m/z value selected
L_dataframe=list() #The max intensity for the jth cluster in the ith m/z value for each sample
for (j in 1:length(unique(clusters))){
indd=which(clusters==unique(clusters)[j])
L.original2[[j]]=as.data.frame(unique(vec[indd]))
colnames(L.original2[[j]])=MZ_name2[i]
Lt=vec[indd]
L_mean=round(mean(Lt)) #Mean number of scan number in each cluster
dataframeN2=dataframeN[,unique(Lt)]
# With a single scan number there is nothing to take a max over; use the
# column as-is to avoid apply() on a vector.
va=if(length(unique(Lt))>1){
apply(dataframeN2,1,max) #Find the max intensity over the scan numbers in one cluster for each sample
}else {dataframeN2}
DD=data.frame(matrix(ncol = 1, nrow = N)) #The max intensity for the jth cluster in the ith m/z value for each sample
# Column is named "meanScan;mz" so the feature keeps both coordinates.
colnames(DD)= paste(outer(as.character(L_mean),MZ_name2[i], paste, sep = ';'))
DD[,1]=va
L_dataframe[[j]]=DD
}
L_original[[i]]=L.original2
LL_dataframe[[i]]=do.call(cbind, L_dataframe) #The max intensities for the ith m/z value for each bin of scan numbers and each sample
}
# Samples as rows; columns are the clustered (scan;m/z) features.
data.mass.scan33 <- do.call(cbind, LL_dataframe)
names2 <- colnames(data.mass.scan33)
# Ratio of nonzero feature values for each observation, added as an extra feature.
lg <- function(x) {
  length(which(x > 0)) / (dim(data.mass.scan33)[2])
}
Perc_Nnonzero <- apply(data.mass.scan33, 1, lg)
data.mass.scan33 <- data.mass.scan33 %>% mutate(Perc_Nnonzero)
colnames(data.mass.scan33) <- make.names(colnames(data.mass.scan33))
#Removing variables with near zero variance, nearZeroVar, and
#correlated predictors, findCorrelation, are based on the book
#"Applied Predictive Modeling" by Kuhn, M. and Johnson, K., Springer, New York, 2013
near.zero.variance <- nearZeroVar(data.mass.scan33)
# Guard: `df[, -integer(0)]` drops EVERY column, so only subset when
# nearZeroVar actually flagged something.
if (length(near.zero.variance) > 0) {
  data.mass.scan33 <- data.mass.scan33[, -near.zero.variance]
}
dim(data.mass.scan33) #new dimension
#Omit correlated predictors from the samples
corr.ma <- cor(data.mass.scan33)
Corr.h <- findCorrelation(corr.ma, 0.85)
# Same empty-index guard as above.
if (length(Corr.h) > 0) {
  data.mass.scan33 <- data.mass.scan33[, -Corr.h]
}
dim(data.mass.scan33) #new dimension
#Contains all samples with the selected features and corresponding y-values (A or B)
training_transformed <- data.frame(data.mass.scan33, y = y)
########################################################
#Machine learning
#The machine learning R-code is based on the mlr3 library and the book https://mlr3book.mlr-org.com/, #Flexible and Robust Machine Learning Using mlr3 in R#
#by Lang, M. et al
#Create a task
task <- as_task_classif(training_transformed, target = "y", positive = "B")
#Using stratified random sampling for each resampling
task$col_roles$stratum <- "y"
###############################
#Ranger (random forest) with nested cross-validation:
#10-fold inner CV tunes hyperparameters, 5-fold outer CV estimates
#generalization performance.
set.seed(99)
inner_loop <- rsmp("cv", folds = 10) # inner loop for nested CV
outer_loop <- rsmp("cv", folds = 5)
outer_loop$instantiate(task)
#Create the learner; impurity importance is stored for later feature ranking.
learner <- lrn("classif.ranger", id = "rf", predict_type = "prob", importance = "impurity")
graph <- po("removeconstants") %>>% learner
plot(graph)
graph_learner <- as_learner(graph)
graph_learner$param_set$ids()
#rf tuning space.
#Bound mtry by the actual number of features: ranger errors when mtry
#exceeds the number of available predictors, so a fixed upper bound of 2000
#is invalid whenever feature filtering leaves fewer than 2000 columns.
#NOTE(review): po("removeconstants") may drop further columns inside a fold;
#confirm the bound is still safe for your data.
mtry_upper <- min(2000, length(task$feature_names))
graph_learner$param_set$values$rf.mtry <- to_tune(p_int(1, mtry_upper))
graph_learner$param_set$values$rf.num.trees <- to_tune(p_int(1, 8000))
graph_learner$id <- "graph_learner"
set.seed(99)
future::plan("multisession")
#Random search (20 evaluations) maximizing AUC on the inner folds.
at <- AutoTuner$new(
  learner = graph_learner,
  resampling = inner_loop,
  measure = msr("classif.auc"),
  terminator = trm("evals", n_evals = 20),
  tuner = tnr("random_search"),
  store_models = TRUE)
rr <- resample(task = task, learner = at, resampling = outer_loop, store_models = TRUE)
extract_inner_tuning_results(rr)
#Unbiased prediction performance (testing error)
rr$aggregate()
rr$aggregate(msr("classif.auc"))