/
create_DataFile.R
424 lines (354 loc) · 16.6 KB
/
create_DataFile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#'@title Prepare input data for subsequent BayLum Analysis
#'
#'@description The function pre-processes input data from BIN/BINX file, XSYG files
#'or [RLum.Analysis-class] objects for `'BayLum'`. The parameters for
#'the modelling are controlled by a to be supplied YAML configuration file
#'(please read package vignette).
#'
#'@details
#'The function uses a single configuration file based on the YAML format and operates
#'in two modes:
#'
#'(1) The YAML file contains the path to the files and the function
#'attempts to import them. In such a case, all files must be thoroughly prepared
#'(e.g., strictly follow the SAR protocol etc.).
#'
#'(2) Alternatively, the YAML file contains no file paths but the data
#'were imported and processed before `create_DataFile()` was called (recommended).
#'Then the function is searching for objects with the sample name in the global environment.
#'Example: `samp1 <- Luminescence::read_BIN2R(...)` with `samp1` the sample name as specified
#'in the YAML file.
#'
#'For more details, please see the package vignette.
#'
#'@param config_file [character] (**required**): path to YAML configuration file; alternatively
#'the config_file can be a [list] similar to the R representation of the imported YAML file.
#'This enables on-the fly modifications
#'
#'@param verbose [logical] (*with default*): enable/disable terminal feedback
#'
#'@section Function version: 0.1.0
#'
#'@author Sebastian Kreutzer, Institute of Geography, Ruprecht-Karl University of Heidelberg (Germany), in parts based on code by Claire Christophe
#'
#'@returns Returns a [list] that can be processed by the modelling functions of 'BayLum'
#'
#' \itemize{
#' \item \bold{LT} (one list per sample); each list contains all L/T values for the corresponding sample;
#' \item \bold{sLT} (one list per sample); each list contains all uncertainties on L/T values for the corresponding sample;
#' \item \bold{ITimes} (one list per sample); each list contains irradiation time values for the corresponding sample;
#' \item \bold{dLab}, a matrix containing in line \code{i}, the laboratory dose rate and its variance for sample \code{i};
#' \item \bold{ddot_env}, a matrix containing in line \code{i}, the environmental dose rate and its variance (excluding the common error terms) for sample \code{i};
#' \item \bold{regDose} (one list per sample); each list contains all regenerated doses;
#' \item \bold{J}, a vector giving, for each BIN file, the number of aliquots selected for the analysis;
#' \item \bold{K}, a vector giving, for each BIN file, the number of regenerative doses in the SAR protocol;
#' \item \bold{Nb_measurement}, a vector giving, for each BIN file, the number of measurements;
#' \item \bold{SampleNames}, a character vector with the sample names;
#' \item \bold{Nb_sample}, the number of samples in the dataset
#' }
#'
#'@seealso [write_YAMLConfigFile], [yaml::read_yaml], [Luminescence::read_BIN2R], [Luminescence::read_XSYG2R], [Luminescence::subset_SingleGrainData]
#'
#'@examples
#'##set path to YAML file
#' yaml_file <- system.file("extdata/example.yml", package = "BayLum")
#'
## BIN/BINX files
#' samp1_file <- system.file("extdata/samp1/bin.bin", package = "BayLum")
#' samp2_file <- system.file("extdata/samp2/bin.bin", package = "BayLum")
#'
#' ## import BIN files
#' samp1 <- Luminescence::read_BIN2R(samp1_file, verbose = FALSE) |>
#' subset(POSITION == 2 & GRAIN == 32)
#' samp2 <- Luminescence::read_BIN2R(samp2_file, verbose = FALSE) |>
#' subset(POSITION == 2 & GRAIN == 32)
#'
#' ## create file
#' create_DataFile(yaml_file)
#'
#'@md
#'@export
create_DataFile <- function(
config_file,
verbose = TRUE
){
# Import data -------------------------------------------------------------
## This function has two modes:
## (1) if no file names are provided, the function treats sample names as
## objects names that should be available on in the working environment
## (2) if file names are present, the function tries to import those files
## However, little treatment is done and the files should be correct
if (verbose) cli::cli_rule("BayLum - [create_DataFile()]")
## read YAML file
if (verbose) cli::cli_progress_step("Load YAML configuration file ... ", spinner = TRUE)
## try list
if(inherits(config_file, "list"))
config <- config_file
else
config <- yaml::read_yaml(file = config_file, readLines.warn = FALSE)
## make sure config is always a list
if(!inherits(config[[1]], "list"))
config <- list(config)
## check structure and names of the YAML file
for(i in seq_along(config)) {
## top level
t <- names(config[[i]]) %in% c("sample", "files","settings")
if(!all(t))
stop(
paste0(
"[create_DataFile()] Missing or wrong paramter in YAML file on top level!\n\t-> YAML entry: ", i, "\n\t-> Parameter: ",
paste(names(config[[i]])[!t], collapse = ", ")),
call. = FALSE)
## settings level
t <- names(config[[i]]$settings) %in% c("dose_points", "dose_source", "dose_env", "rules")
if(!all(t))
stop(
paste0(
"[create_DataFile()] Missing or wrong paramter in YAML file on settings level!\n\t-> YAML entry: ", i, "\n\t-> Parameter: ",
paste(names(config[[i]]$settings)[!t], collapse = ", ")),
call. = FALSE)
## rules level
t <- names(config[[i]]$settings$rules) %in% c(
"beginSignal", "endSignal", "beginBackground", "endBackground",
"beginTest", "endTest", "beginTestBackground", "endTestBackground",
"inflatePercent", "nbOfLastCycleToRemove")
if(!all(t))
stop(
paste0(
"[create_DataFile()] Missing or wrong paramter in YAML file on rules level!\n\t-> YAML entry: ", i, "\n\t-> Parameter: ",
paste(names(config[[i]]$settings$rules)[!t], collapse = ", ")),
call. = FALSE)
}
## determine number of samples
n_samples <- length(config)
## fix sample names ... better safe than a weird error messages
## this way the sample names are safe to use (more or less)
if (verbose) cli::cli_progress_step("Sanitize sample names ... ", spinner = TRUE)
for (i in seq_len(n_samples)) {
## replace all whitespace with "_"
config[[i]]$sample <- gsub(
x = config[[i]]$sample,
pattern = " ",
replacement = "_",
fixed = TRUE)
## replace all odd characters with nothing
regmatches(
x = config[[i]]$sample,
regexpr(pattern = "[a-zA-Z0-9\\_\\-]+", text = config[[i]]$sample, perl = TRUE),
invert = TRUE) <- ""
}
## make list of objects; depending on the entry case
if (verbose) cli::cli_progress_step("Generate object list ... ", spinner = TRUE)
object_list <- lapply(seq_len(n_samples), function(x) {
## use objects -------
if (is.null(config[[x]]$files)) {
## assign sample name if it exists in the working environment
if (inherits(try(out <- get(x = config[[x]]$sample), silent = TRUE), "try-error"))
stop(
paste0("[create_DataFile()] <", config[[x]]$sample,
"> is not a valid object in the working environment!"),
call. = FALSE)
} else {
## import files as specified -------
out <- lapply(suppressWarnings(normalizePath(config[[x]]$files)), function(x) {
if (!file.exists(x))
stop(paste0("[create_DataFile()] file <", x, "> does not exist!"), call. = FALSE)
## select function based on file
switch(
tolower(rev(strsplit(basename(x), split = ".", fixed = TRUE)[[1]])[1]),
bin = Luminescence::read_BIN2R(x, fastForward = TRUE, verbose = FALSE, zero_data.rm = TRUE),
xsyg = Luminescence::read_XSYG2R(x, fastForward = TRUE, verbose = FALSE),
stop("[create_DateFile()] Unsupported input file. Supported are BIN/BINX and XSYG files!", call. = FALSE)
) |> Luminescence::get_RLum(recordType = c("OSL", "IRSL"), drop = FALSE)
}) |> unlist()
}
## account for the output object nature, which might be a double list (or not)
if (!inherits(out, "list"))
out <- list(out)
## check the nature of the files
## if Risoe.BINfileData convert automatically
if (any(vapply(out, function(x) inherits(x, "Risoe.BINfileData"), logical(1)))) {
if (verbose) cli::cli_alert_warning(paste0("... ", config[[x]]$sample,": Risoe.BINfileData instead of RLum.Analysis detected ... "))
if (verbose) cli::cli_progress_step(paste0("... ", config[[x]]$sample,": Convert Risoe.BINfileData to RLum.Analysis ... ", spinner = TRUE))
out <- lapply(out, function(x) {
if (inherits(x,"Risoe.BINfileData"))
Luminescence::Risoe.BINfileData2RLum.Analysis(x, txtProgressBar = FALSE)
else
x
}) |> unlist(recursive = FALSE)
}
## now everything must be of type RLum.Analysis ... crash if not
if ((any(vapply(out, function(x) !inherits(x, "RLum.Analysis"), logical(1)))))
stop("[create_DataFile()] Wrong object type. All input data must be of type RLum.Analysis!]",
call. = FALSE)
## sometimes datasets are empty due to previous selections
## by the user ... remove such datasets
if (any(vapply(out, length, numeric(1)) == 0)) {
warning("[create_DataFile()] Empty RLum.Analysis records found and removed ... ",
call. = FALSE,
immediate. = FALSE)
out[sapply(out, length) == 0] <- NULL
}
## remove dose points according to the config
if(!is.null(config[[x]]$settings$rules$nbOfLastCycleToRemove) &&
config[[x]]$settings$rules$nbOfLastCycleToRemove > 0) {
## how many records?
rm <- config[[x]]$settings$rules$nbOfLastCycleToRemove * 2
## remove the records
if(verbose) cli::cli_progress_step(paste0(
"... ", config[[x]]$sample, ": Remove ", rm, " dose points ... "),
spinner = TRUE)
out <- lapply(out, function(o) {
o@records <- o@records[1:(length(o@records) - rm)]
return(o)
})
}
return(out)
})
# Additional checks -------------------------------------------------------
if (verbose) cli::cli_progress_step("Run additional checks ... ", spinner = TRUE)
## sample names must be unique
chk_samples <- vapply(config, function(x) x$sample, character(1))
if (any(duplicated(chk_samples)))
stop(
paste0("[create_DataFile()] Sample names must be unique, found the follwing duplicated entries: ",
paste(chk_samples[duplicated(chk_samples)], collapse = ", ")),
call. = FALSE)
## check that all curves have the same length
for (i in seq_along(object_list)) {
tmp_n_channels <-
vapply(unlist(Luminescence::get_RLum(object_list[[i]])), function(x) nrow(x@data), numeric(1))
if (length(unique(tmp_n_channels)) > 1)
stop(paste0(
"[create_DataFile()] All curves within one sample must have the same number of channels/data points!\n",
"\t-> Please check sample <", config[[i]]$sample, "> were I found the following number of channels: ",
paste(unique(tmp_n_channels), collapse = ", ")),
call. = FALSE)
## check integration limits against the number of channels
if (max(unlist(config[[i]]$settings$rules[1:4])) > tmp_n_channels[1])
stop(paste0("[create_DataFile()] Check your integration limit settings for sample <",
config[[i]]$sample, ">. Your curves do not have more than ",
tmp_n_channels[1], " channels!"),
call. = FALSE)
}
# Extract information -----------------------------------------------------
## we have to mimic the original format by Claire otherwise everything
## else subsequent needs to be modified
if (verbose) cli::cli_progress_step("Extract information and calculate values ... ", spinner = TRUE)
## K -------
## get number of regeneration points for all datasets
K <- vapply(seq_along(object_list), function(x){
## get the number of all records in each dataset
tmp_K <- vapply(object_list[[x]], length, numeric(1))
## remove the specified number of last points
tmp_K <- tmp_K
## three integrity checks
## (1) less than four? (two records)
if (any(tmp_K < 4))
stop("[create_DataFile()] All records must have at least two dose points!", call. = FALSE)
## (2) all must be of same length
if (length(unique(tmp_K)) > 1)
stop("[create_DataFile()] For one sample all records must have the same number of curves!",
call. = FALSE)
## (3) they must be a multiple of two
if (any(tmp_K %% 2 > 0))
stop("[create_DataFile()] Curves must be multiple of two!", call. = FALSE)
tmp_K[1]/2
}, numeric(1))
## Nb_measurement --------------
## from K we can derive the number of curves (which should be similar for each sample)
Nb_measurement <- K * 2
## J ---------
## number of aliquots for each sample
J <- vapply(object_list, length, numeric(1))
## dLab --------
## the source dose rate of the lab
dLab <- vapply(config, function(x) unlist(x$settings$dose_source), numeric(2))
## ddot_env -------
## the environmental dose rate
ddot_env <- vapply(config, function(x) unlist(x$settings$dose_env), numeric(2))
## LT and sLT --------
## calculate the LT and sLT values
LxTx_list <- lapply(seq_along(object_list), function(x){
## we use Claire's approach otherwise the risk of mistakes is too high
rule <- as.numeric(config[[x]]$settings$rules)
## calculate multiple of background signal
prop_Lx <- length(c(rule[3]:rule[4]))/length(c(rule[1]:rule[2]))
prop_Tx <- length(c(rule[7]:rule[8]))/length(c(rule[5]:rule[6]))
## calculate the Lx related value
## Signal Lx integral
Lx <- t(vapply(object_list[[x]], function(y) {
vapply(seq(1,length(y),2), function(z) sum(y@records[[z]]@data[rule[1]:rule[2],2]), numeric(1))
}, numeric(K[x])))
## Background Lx integral
Bx <- t(vapply(object_list[[x]], function(y) {
vapply(seq(1,length(y),2), function(z) sum(y@records[[z]]@data[rule[3]:rule[4],2])/prop_Lx, numeric(1))
}, numeric(K[x])))
## net Lx
net_Lx <- Lx - Bx
## calculate the Tx related value
## Tx signal
Tx <- t(vapply(object_list[[x]], function(y) {
vapply(seq(2,length(y),2), function(z) sum(y@records[[z]]@data[rule[5]:rule[6],2]), numeric(1))
}, numeric(K[x])))
## Tx background
BTx <- t(vapply(object_list[[x]], function(y) {
vapply(seq(2,length(y),2), function(z) sum(y@records[[z]]@data[rule[7]:rule[8],2])/prop_Tx, numeric(1))
}, numeric(K[x])))
## net Tx
net_Tx <- Tx - BTx
## calculate Lx/Tx
LxTx <- net_Lx/net_Tx
# calculate error of LxTx which will become sLT
LxTx_err <- LxTx[,1:ncol(LxTx)] * sqrt(
((Lx * (1 + rule[9]^2 * Lx) + Bx * ( 1 + rule[9]^2 * Bx)) /
(Lx - Bx)^2) + ((Tx * (1 + rule[9]^2 * Tx) + BTx * (1 + rule[9]^2 * BTx))/(Tx - BTx)^2))
## returned combined list
return(list(LT = LxTx, sLT = LxTx_err))
}) |> unlist(recursive = FALSE)
## ITimes -------
ITimes <- lapply(seq_along(object_list), function(x) {
if(!is.null(config[[x]]$settings$dose_points)) {
IRR_TIME <- config[[x]]$settings$dose_points
## trigger warning if the settings were wrong
if(length(IRR_TIME) < K[x]-1)
stop(paste0(
"[create_DataFile()] Not enough regeneration dose points specified.
\t -> Please check in config file '$settings$dose_points' in sample <" ,config[[x]]$sample, ">!"),
call. = FALSE)
## the IRR_TIME is silently shortened to K otherwise it
## becomes to complicated if the users mix up settings
IRR_TIME <- IRR_TIME[seq_len(K[x]-1)]
} else {
## extract irradiation times
IRR_TIME <- Luminescence::merge_RLum(
Luminescence::extract_IrradiationTimes(object_list[[x]]))$irr.times$IRR_TIME
## remove natural (0 dose)
IRR_TIME <- IRR_TIME[c(1, seq(3,length(IRR_TIME),2))]
IRR_TIME <- IRR_TIME[-seq(1,length(IRR_TIME), K[x])]
}
## create matrix with irradiation
matrix(IRR_TIME, nrow = J[x], ncol = K[x]-1, byrow = TRUE)
})
## regDose -------
regDose <- lapply(seq_along(ITimes), function(x) ITimes[[x]] * dLab[1,x])
# Construct final list ----------------------------------------------------
if (verbose) cli::cli_progress_step("Combine list ... ", spinner = TRUE)
l <- list(
LT = LxTx_list[names(LxTx_list) == "LT"],
sLT = LxTx_list[names(LxTx_list) == "sLT"],
ITimes = ITimes,
dLab = dLab,
ddot_env = ddot_env,
regDose = regDose,
J = J,
K = K - 1,
Nb_measurement = Nb_measurement,
SampleNames = chk_samples,
Nb_sample = n_samples
)
## add attribute originator
attr(l, "originator") <- "create_DataFile"
## return
return(l)
}