-
Notifications
You must be signed in to change notification settings - Fork 11
/
gdm.formatsitepair.R
759 lines (719 loc) · 35.6 KB
/
gdm.formatsitepair.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
#' @title Combines Biological and Environmental Data to Produce a GDM-formatted
#' Site-Pair Table
#'
#' @description This function takes input biological data and environmental,
#' geographic, and other predictor data and builds a site-pair table required
#' for fitting a Generalized Dissimilarity Model using the \code{\link[gdm]{gdm}}
#' function. NOTE: x-y coordinates of sites MUST be present in either the
#' biological or the environmental data. Site coordinates ideally should be in a
#' projected coordinate system (i.e., not longitude-latitude) to ensure proper
#' calculation of geographic distances. \cr
#'
#' The input biological data can be in one of the following four formats. Note
#' that the general term "species" is used, but any classification of biological
#' entities (e.g. functional types, haplotypes, etc) can be used as long as an
#' appropriate distance metric is also supplied (see "dist" argument): \cr
#'
#' \enumerate{
#' \item site-by-species matrix \cr
#'
#' \item x, y, species list \cr
#'
#' \item site-by-site biological distance (dissimilarity) matrix \cr
#'
#' \item an existing site-pair table (see Details) \cr
#' }
#'
#' Predictor data can be provided in three formats: \cr
#'
#' \enumerate{
#' \item a site-by-predictor matrix with a column for each predictor variable
#' and a row for each site \cr
#'
#' \item a terra object SpatRaster, with one raster for each predictor variable \cr
#'
#' \item one or more site-by-site distance matrices using the "distPreds"
#' argument (see below).
#' }
#'
#' @usage
#' formatsitepair(bioData, bioFormat, dist="bray", abundance=FALSE, siteColumn=NULL,
#' XColumn, YColumn, sppColumn=NULL, abundColumn=NULL, sppFilter=0, predData,
#' distPreds=NULL, weightType="equal", custWeights=NULL, sampleSites=1, verbose=FALSE)
#'
#' @param bioData The input biological (the response variable) data table, in
#' one of the four formats defined above (see Details).
#'
#' @param bioFormat An integer code specifying the format of bioData. Acceptable
#' values are 1, 2, 3, or 4 (see Details).
#'
#' @param dist Default = "bray". A character code indicating the metric to
#' quantify pairwise site distances / dissimilarities. Calls the
#' \code{\link[vegan]{vegdist}} function from the \pkg{vegan} package to
#' calculate dissimilarity and therefore accepts any method available from
#' \code{\link[vegan]{vegdist}}.
#'
#' @param abundance Default = FALSE. Indicates whether the biological data are
#' abundance data (TRUE) or presence-absence (0, 1) data (FALSE).
#'
#' @param siteColumn The name of the column in either the biological or
#' environmental data table containing a unique site identifier. If a site column
#' is provided in both the biological and environmental data, the site column name
#' must be the same in both tables.
#'
#' @param XColumn The name of the column containing x-coordinates of sites.
#' X-coordinates can be provided in either the biological or environmental data
#' tables, but MUST be in at least one of them. If an x-coordinate column is
#' provided in both the biological and environmental data, the column name must
#' be identical. Site coordinates ideally should be in a projected coordinate
#' system (i.e., not longitude-latitude) to ensure proper calculation of
#' geographic distances. Note that if you are using rasters, they must be in the
#' same coordinate system as the site coordinates.
#'
#' @param YColumn The name of the column containing y-coordinates of sample sites.
#' Y-coordinates can be provided in either the biological or environmental data
#' tables, but MUST be in at least one of them. If a y-coordinate column is
#' provided in both the biological and environmental data, the column name must
#' be identical. Site coordinates ideally should be in a projected coordinate
#' system (i.e., not longitude-latitude) to ensure proper calculation of
#' geographic distances. Note that if you are using rasters, they must be in the
#' same coordinate system as the site coordinates.
#'
#' @param sppColumn Only used if bioFormat = 2 (x, y, species list). The name of
#' the column containing unique name / identifier for each species.
#'
#' @param abundColumn If abundance = TRUE, this parameter identifies the column
#' containing the measure of abundance at each site. Only used if bioFormat = 2
#' (i.e., x, y, species list), though in the case of abundance data, the table
#' would have four columns: x, y, species, abundance.
#'
#' @param sppFilter Default = 0. To account for limited sampling effort at some
#' sites, sppFilter removes all sites at which the number of recorded species
#' (i.e., observed species richness) is less than the specified value. For
#' example, if sppFilter = 5, all sites with fewer than 5 recorded species
#' will be removed.
#'
#' @param predData The environmental predictor data. Accepts either a
#' site-by-predictor table or a terra object SpatRaster.
#'
#' @param distPreds An optional list of distance matrices to be used as predictors
#' in combination with predData. For example, a site-by-site dissimilarity matrix
#' for one biological group (e.g., trees) can be used as a predictor for another
#' group (e.g., ferns). Each distance matrix must have as the first column the
#' names of the sites (therefore the matrix will not be square). The name of the
#' column containing the site names should have the same name as that provided
#' for the siteColumn argument. Site IDs are required here to ensure correct ordering
#' of sites in the construction of the site-pair table. Note that the formatsitepair
#' function will not accept distance matrices in the as the only predictors
#' (i.e., at least one additional, non-distPreds predictor variable is required). If you wish to fit GDM
#' using only distance matrices provided using distPreds, provide one fake predictor (e.g., with all sites
#' having the same value), plus site and coordinate columns. The s1 and
#' s2 columns for this fake variable can then be removed by hand before fitting the GDM.
#'
#' @param weightType Default = "equal". Defines the weighting for sites. Can be
#' either: (1) "equal" (weights for all sites set = 1), (2) "richness" (each
#' site weighted according to number of species recorded), or (3) "custom"
#' (user defined). If weightType="custom", the user must provide a vector of
#' site weights equal to the number of rows in the full site-pair table (i.e.,
#' before species filtering (sppFilter argument) or sub-sampling is taken into
#' account (sampleSites argument)).
#'
#' @param custWeights A two column matrix or data frame of user-defined site
#' weights. The first column should be the site name and should be named the same
#' as that provided for the siteColumn argument. The second column should be numeric
#' weight values and should be named "weights". The weight values represent the
#' importance of each site in model fitting, and the values in the output
#' site-pair table is an average of the two sites in each site-pair. Required
#' when weightType = "custom". Ignored otherwise.
#'
#' @param sampleSites Default = 1. A number between 0-1 indicating the fraction
#' of sites to be used to construct the site-pair table. This argument can be
#' used to reduce the number of sites to overcome possible memory limitations
#' when fitting models with very large numbers of sites.
#'
#' @param verbose Default = FALSE. If TRUE, summary of information regarding
#' dimensions of the site-pair table will be printed that can be useful for diagnostics.
#'
#' @details
#' bioData and bioFormat:
#' The function accepts biological data in the following formats:
#'
#' bioData = site-by-species matrix; bioFormat = 1: assumes that the response
#' data are provided with a site ID column (specified by siteCol) and, optionally,
#' two columns for the x & y coordinates of the sites. All remaining columns
#' contain the biological data, with a column for each biological entity (most
#' commonly species). In the case that a raster stack (a terra object SpatRaster) is provided for the
#' environmental data (predData), x-y coordinates MUST be provided in bioData
#' to allow extraction of the environmental data at site locations. The x-y
#' coordinates will be intersected with the raster stack and, if the number of
#' unique cells intersected by the points is less than the number of unique site
#' IDs (i.e. multiple sites fall within a single cell), the function will use
#' the raster cell as the site ID and aggregate sites accordingly. Therefore,
#' model fitting will be sensitive to raster cell size. If the environmental
#' data are in tabular format, they should have the same number of sites
#' (i.e., same number of rows) as bioData. The x-y coordinate and site ID
#' columns must have the same names in bioData and predData.
#'
#' bioData = x, y, species list (optionally a fourth column with abundance can
#' be provided); bioFormat = 2: assumes a table of 3 or 4 columns, the first two
#' being the x & y coordinates of species records, the third (sppCol) being the
#' name / identifier of the species observed at that location, and optionally a
#' fourth column indicating a measure of abundance. If an abundance column is
#' not provided, presence-only data are assumed. In the case that a raster stack
#' (a terra object SpatRaster) is provided for the environmental data (predData),
#' the x-y coordinates will be intersected with the raster stack and, if the
#' number of unique cells intersected by the points is less than the number of
#' unique site IDs (i.e. multiple sites fall within a single cell), the function
#' will use the raster cell as the site ID and aggregate sites accordingly.
#' Therefore, model fitting will be sensitive to raster cell size.
#'
#' bioData = site-site distance (dissimilarity) matrix; bioFormat = 3. This option
#' allows the use of an existing site-site distance (dissimilarity) matrix, such as
#' genetic distance matrix calculated outside of the gdm package. Only the lower
#' triangle of the matrix is required to create the site-pair table, but the
#' function will automatically removes the upper triangle if present. The code
#' checks and aligns the order of sites in the distance matrix and the predictor
#' data to ensure they match. To do so, (1) a site column is required in both
#' the distance matrix and the predictor data and (2) site IDs are required to
#' be a number. This is the only bioFormat in which the environmental data MAY
#' NOT be a raster stack.
#'
#' bioData = site-pair table; bioFormat = 4: with an already created site-pair
#' table, this option allows the user to add one or more distance matrices (see
#' distPreds above) to the existing site-pair table and/or sub-sample the
#' site-pair table (see sample above). If the site-pair table was not created
#' using the formatsitepair function, the user will need to ensure the order of
#' the sites matches that in other tables being provided to the function.
#'
#' NOTES: (1) The function assumes that the x-y coordinates and the raster stack
#' (if used) are in the same coordinate system. No checking is performed to
#' confirm this is the case. (2) The function assumes that the association between
#' the provided site and x-y coordinate columns are singular and unique.
#' Therefore, the function will fail should a given site has more than one sets of
#' coordinates associated with it, as well as multiple sites being given the
#' exact same coordinates.
#'
#' @return A formatted site-pair table containing the response (biological
#' distance or dissimilarity), predictors, and weights as required for fitting
#' Generalized Dissimilarity Models.
#'
#' @examples
#' ## tabular data
#' # start with the southwest data table
#' head(southwest)
#' sppData <- southwest[c(1,2,13,14)]
#' envTab <- southwest[c(2:ncol(southwest))]
#'
#' #########table type 1
#' ## site-species table without coordinates
#' testData1a <- reshape2::dcast(sppData, site~species)
#' ##site-species table with coordinates
#' coords <- unique(sppData[, 2:ncol(sppData)])
#' testData1b <- merge(testData1a, coords, by="site")
#' ## site-species, table-table
#' exFormat1a <- formatsitepair(testData1a, 1, siteColumn="site", XColumn="Long",
#' YColumn="Lat", predData=envTab)
#'
#' #' # next, let's try environmental raster data
#' ## not run
#' # rastFile <- system.file("./extdata/swBioclims.grd", package="gdm")
#' # envRast <- terra::rast(rastFile)
#'
#' ## site-species, table-raster
#' ## not run
#' # exFormat1b <- formatsitepair(testData1b, 1, siteColumn="site", XColumn="Long",
#' # YColumn="Lat", predData=envRast)
#'
#' #########table type 2
#' ## site xy spp list, table-table
#' exFormat2a <- formatsitepair(sppData, 2, XColumn="Long", YColumn="Lat",
#' sppColumn="species", siteColumn="site", predData=envTab)
#'
#' ## site xy spp list, table-raster
#' ## not run
#' # exFormat2b <- formatsitepair(sppData, 2, XColumn="Long", YColumn="Lat",
#' # sppColumn="species", siteColumn="site", predData=envRast)
#'
#' #########table type 3
#' ## It is possible to format a site-pair table by starting
#' # with a pre-calculated matrix of biological distances
#' dim(gdmDissim) # pairwise distance matrix + 1 column for site IDs
#' gdmDissim[1:5, 1:5]
#' # now we can format the table:
#' exFormat3 <- formatsitepair(gdmDissim, 3, XColumn="Long", YColumn="Lat",
#' predData=envTab, siteColumn="site")
#'
#' #########table type 4
#' ## adds a predictor matrix to an existing site-pair table, in this case,
#' ## predData needs to be provided, but is not actually used
#' exFormat4 <- formatsitepair(exFormat2a, 4, predData=envTab, siteColumn="site",
#' distPreds=list(as.matrix(gdmDissim)))
#'
#' @keywords gdm
#'
#' @importFrom methods is
#' @importFrom stats as.dist
#' @importFrom vegan vegdist
#' @importFrom reshape2 dcast
#'
#' @export
formatsitepair <- function(bioData, bioFormat, dist="bray", abundance=FALSE,
siteColumn=NULL, XColumn, YColumn, sppColumn=NULL,
abundColumn=NULL, sppFilter=0, predData, distPreds=NULL,
weightType="equal", custWeights=NULL, sampleSites=1,
verbose=FALSE){
###########################
##lines used to quickly test function
#bioData <- southwest[, c(1,2,13,14)]
#bioFormat <- 2
#dist <- "bray"
#abundance <- F
#siteColumn <- "site"
#XColumn <- "Long"
#YColumn <- "Lat"
#sppColumn <- "species"
#sppFilter <- 0
#abundColumn <- NULL
#predData <- southwest[, c(2:ncol(southwest))]
#distPreds <- NULL
#weightType <- "equal"
#custWeights <- NULL
#sampleSites <- 1
#################
#bioData <- bio
#bioFormat <- 2
#dist <- "bray"
#abundance <- T
#siteColumn <- "site"
#XColumn <- "X"
#YColumn <- "Y"
#sppColumn <- "locus"
#sppFilter <- 0
#abundColumn <- "abund"
#predData <- preds
#distPreds <- NULL
#weightType <- "equal"
#custWeights <- NULL
#sampleSites <- 1
###########################
##input error checking
##makes sure bioData has the required format
if(!(is(bioData, "data.frame") | is(bioData, "matrix") | is(bioData, "gdmData"))){
"bioData object needs to be either of class data.frame or matrix in one of
the acceptable formats listed in the help document."
}
##transforms bioData into a data frame in order to proceed without error, just to make sure,
##basically remove the affects of factors
if(is(bioData, "data.frame") | is(bioData, "matrix")){
bioData <- as.data.frame(bioData, stringsAsFactors=F)
}
##makes sure predData is in an acceptable format
if(!(is(predData, "data.frame") | is(predData, "matrix") | .is_raster(predData))){
"predData object needs to either of class data.frame, matrix, or raster."
}
if(is(predData, "data.frame") | is(predData, "matrix")){
predData <- as.data.frame(predData, stringsAsFactors=F)
}
# make sure predData is a terra object or convert it to terra
if (.is_raster(predData)) {
# check terra package is available
.check_pkgs("terra")
predData <- .check_rast(predData, "predData")
}
##if bioFormat is not an acceptable number, exit function
if(bioFormat %in% c(1:4)){} else{
stop("Acceptable values for the bioFormat argument are: 1, 2, 3, or 4")
}
##checks that geo has either TRUE or FALSE
if(!(abundance==TRUE | abundance==FALSE)){
stop("abundance argument must be either TRUE or FALSE")
}
##if sampleSites is not a number, then exit function
#if(is.null(sampleSites)==TRUE){
# stop("sampleSites argument must be a number between 0-1")
#}
if (is.numeric(sampleSites) == FALSE | sampleSites <= 0 |
sampleSites > 1) {
stop("sampleSites argument must be a number 0 < x <= 1")
}
##makes sure that sppFilter is a number, if not exit function
if((is.numeric(sppFilter)==FALSE & is.null(sppFilter)==FALSE) | sppFilter<0){
stop("sppFilter argument must be a positive integer")
}
##makes sure a proper weightType is used
if(weightType %in% c("equal", "richness", "custom")){} else{
stop("Acceptable values for the weightType argument are: equal, richness, or custom")
}
##if weightType == custom, makes sure a custWeights is attached
if(weightType=="custom" & is.null(custWeights)==T){
stop("weightType argument = 'custom', but no custWeights vector provided.")
}
##with bioFormat 2, makes sure spp data has been included
if(bioFormat==2 & is.null(sppColumn)==TRUE){
stop("Need to define sppColumn argument when bioFormat==2")
}
##makes sure that the sppColumn name can be found in the bioData with bioFormat 2
#if(bioFormat==2 & (sppColumn %in% names(bioData))){} else{
# stop("Cannot find sppColumn in bioData - check name?")
#}
##makes sure that a site column is provided when using table type 2 and raster environmental data
if(bioFormat==2 & is.null(siteColumn)==TRUE){
if(!.is_raster(predData)){
stop("A siteColumn needs to be provided in either the bioData or predData objects.")
}
}
##when a site column is provided
if(is.null(siteColumn)==FALSE){
##makes sure the site column name is of type character
if(!is(siteColumn, "character")){
stop("siteColumn argument needs to be of class = 'character'.")
##checks to see if siteColumn exists in the bioData for bioFormats 1 and 2
}else if(!(siteColumn %in% colnames(bioData)) & (bioFormat==1 | bioFormat==2)){
stop("Cannot find a match for the name of the siteColumn in the columns
of the bioData object.")
}
##if the siteColumn is provided with input type 3, remove it
#if(bioFormat==3 & siteColumn %in% colnames(bioData)){
# wSite <- which(colnames(bioData)==siteColumn)
# bioData <- bioData[-wSite]
#}
}
##checks to make sure that the coordinate columns are characters and can be
# found in either the biological or environmental data
if(bioFormat!=4){
if(!is(XColumn, "character")){
stop("XColumn argument needs to be of class 'character'.")
}else if(!is(YColumn, "character")){
stop("YColumn argument needs to be of class 'character'.")
}else if(!(XColumn %in% colnames(bioData) | XColumn %in% colnames(predData))){
stop("XColumn not found in either the bioData or predData arguments")
}else if(!(YColumn %in% colnames(bioData) | YColumn %in% colnames(predData))){
stop("YColumn not found in either the bioData or predData arguments")
}
}
##checks table type 3 specific requirements
if(bioFormat==3){
if(weightType=="richness"){
stop("Cannot weight by site richness when supplying the biological data
as a distance matrix.")
}else if(nrow(bioData)!=(ncol(bioData)-1)){
stop("Check dimensions of the bioData object. Does the biological dissimilarity matrix have a column for site IDs as required?")
}
}
##warns if distPreds are not matrices
for(mat in distPreds){
if(!is(mat, "matrix") & !is(mat, "data.frame")){
warning("One or more of the provided predictor distance matrices are not
of class 'matrix'.")
}
}
##if a custom weight vector is provided, makes sure it is a vector
if(is.null(custWeights)==FALSE & (!is(custWeights, "data.frame") & !is(custWeights, "matrix"))){
stop("The argument custWeights needs to be of class 'data.frame' or 'matrix'.")
}
##sets up variables to be used later
toRemove <- NULL ##removed from sppfilter
removeRand <- NULL ##remove from random sample out sites
distData <- NULL
##checks input data format
##species data as site-species matrix
if(bioFormat==1 | bioFormat==2){
##first, if bioFormat=2, then transforms it into a site-spp matrix (bioFormat 1)
if(bioFormat==2){
##makes sure that the sppColumn name can be found in the bioData with bioFormat 2
if((sppColumn %in% colnames(bioData))){} else{
stop("Cannot find sppColumn in bioData - check name?")
}
##insert a site column if one was not given
if(is.null(siteColumn)){
colnames(bioData)[which(colnames(bioData)==XColumn)] <- "bioData_FSP_xColumn"
colnames(bioData)[which(colnames(bioData)==YColumn)] <- "bioData_FSP_yColumn"
bioData <- transform(bioData,
griddedSiteID=as.numeric(interaction(bioData$bioData_FSP_xColumn,
bioData$bioData_FSP_yColumn,
drop=TRUE)))
siteColumn <- "griddedSiteID"
colnames(bioData)[which(colnames(bioData)=="bioData_FSP_xColumn")] <- XColumn
colnames(bioData)[which(colnames(bioData)=="bioData_FSP_yColumn")] <- YColumn
}
##insert presence if abundance was not given
if(is.null(abundColumn)){
warning("No abundance column was specified, so the biological data are assumed to be presences.")
bioData[, "bioData_FSP_abundColumn"] <- 1
abundColumn <- "bioData_FSP_abundColumn"
}
##rename the siteColumn and sppColumn in order to cast the data into a siteXspp matrix
preCastBio <- bioData
colnames(preCastBio)[which(colnames(preCastBio)==siteColumn)] <- "griddedSiteID"
colnames(preCastBio)[which(colnames(preCastBio)==sppColumn)] <- "bioData_FSP_sppColumn"
castData <- dcast(preCastBio, fill=0, griddedSiteID~bioData_FSP_sppColumn, value.var=abundColumn)
##adds coordinates to the cast data
uniqueCoords <- unique(preCastBio[which(colnames(preCastBio) %in% c("griddedSiteID", XColumn, YColumn))])
bioData <- merge(castData, uniqueCoords, by="griddedSiteID")
colnames(bioData)[which(colnames(bioData)=="griddedSiteID")] <- siteColumn
}
##checks to see if the coordinates can be found in bioData, if not, checks to see if
##they can be found in envData
if((XColumn %in% colnames(bioData))==FALSE | (YColumn %in% colnames(bioData)==FALSE)){
xCol <- which(colnames(predData)==XColumn)
yCol <- which(colnames(predData)==YColumn)
locs <- predData[c(xCol,yCol)]
}else{
xCol <- which(colnames(bioData)==XColumn)
yCol <- which(colnames(bioData)==YColumn)
locs <- bioData[c(xCol,yCol)]
}
# checks unique sites against rasters
if(.is_raster(predData)){
# when using rasters, uses the cell as the site
warning("When using rasters for environmental covariates (predictors), each site is assigned to the
raster cell in which the site is located. If more than one site occurs within the same raster cell,
the biological data of those sites are aggregated (more likely as raster resolution decreases).")
# gets the cell location of the given coordinates
cellID <- data.frame(cellName = terra::cellFromXY(predData, locs))
# if none of the points intersected with the predictor raster
if (all(is.na(cellID$cellName))) {
stop("None of the data points provided intersect with the rasters. Double check spatial data.")
}
cellLocs <- as.data.frame(terra::xyFromCell(predData, cellID$cellName))
##temporarily keeps old site in to identify what to remove from other objects
rastBioData <- cbind(cellID, cellLocs, bioData[-c(which(colnames(bioData) %in% c(XColumn, YColumn)))])
##if custom weights selected, gives weight table new cell site names
if(weightType=="custom" & !is.null(custWeights)){
nameTab <- unique(rastBioData[c("cellName", siteColumn)])
tempWeightTab <- merge(x=nameTab, y=custWeights, by=siteColumn)
siteNum <- which(colnames(tempWeightTab)=="cellName")
custWeights <- tempWeightTab[-siteNum]
colnames(custWeights)[1] <- siteColumn
}
##removes original site column from the rastBioData table
siteNum <- which(colnames(rastBioData)==siteColumn)
rastBioData <- rastBioData[-siteNum]
##aggregates species data by cell
cellNum <- which(colnames(rastBioData)=="cellName")
bioData <- stats::aggregate(rastBioData, rastBioData[cellNum], FUN=mean)
bioData <- bioData[-cellNum]
# extracts raster data into environmental predictor data table
rastEx <- terra::extract(predData, bioData$cellName)
##renames bioData columns which have been updated from rasters
colnames(bioData)[which(colnames(bioData)=="cellName")] <- siteColumn
colnames(bioData)[which(colnames(bioData)=="x")] <- XColumn
colnames(bioData)[which(colnames(bioData)=="y")] <- YColumn
##updates locs object based on raster coordinates
xCol <- which(colnames(bioData)==XColumn)
yCol <- which(colnames(bioData)==YColumn)
locs <- bioData[c(xCol,yCol)]
##recreates the predData
siteCol <- which(colnames(bioData)==siteColumn)
predData <- cbind(bioData[siteCol], locs, rastEx)
}
##filters out sites with low species count
##first isolates the species data
siteCol <- which(colnames(bioData)==siteColumn)
xCol <- which(colnames(bioData)==XColumn)
yCol <- which(colnames(bioData)==YColumn)
sppDat <- bioData[, -c(siteCol, xCol, yCol)]
##totals the number of species per site
sppDat[sppDat>=1] <- 1
sppDat[sppDat==0] <- 0
sppDat[is.na(sppDat)] <- 0
sppTotals <- cbind.data.frame(bioData[,siteCol], apply(sppDat, 1, function(m){sum(as.numeric(m))}))
colnames(sppTotals) <- c(siteColumn, "totals")
##filters out sites with less species than filter
filterBioDat <- subset(sppTotals, sppTotals[colnames(sppTotals)[2]] >= sppFilter)
toRemove <- bioData[,siteCol][which(!(bioData[,siteCol] %in% filterBioDat[,siteColumn]))]
##reassembles bioData after filtering
spSiteCol <- filterBioDat[1]
bioData <- unique(merge(spSiteCol, bioData, by=siteColumn))
##subsample sites using random sampling
if (sampleSites < 1) {
fullSites <- bioData[, siteCol]
randRows <- sort(sample(1:nrow(bioData),
round(nrow(bioData) * sampleSites, 0)))
##actual selection of the random rows to keep
bioData <- bioData[c(randRows),]
#removeRand <- fullLength[-(randRows)]
##records the sites that have been removed, for distPreds later in function
removeRand <- fullSites[which(! (fullSites %in% bioData[,siteCol]))]
}
##identifies and removes filtered out sites and sampled sites from predData
##renames siteColumn in order to access objects correctly
colnames(bioData)[colnames(bioData)==siteColumn] <- "theSiteColumn"
colnames(predData)[colnames(predData)==siteColumn] <- "theSiteColumn"
predData <- unique(predData)
#predData3 <- predData[which(predData$theSiteColumn %in%
# as.character(as.numeric(bioData$theSiteColumn))),]
predData <- predData[which(predData$theSiteColumn %in%
as.character(bioData$theSiteColumn)),]
##remove custom weights from any sites removed by species filtering and sampling
if(weightType=="custom" & !is.null(custWeights)){
colnames(custWeights)[colnames(custWeights)==siteColumn] <- "theSiteColumn"
custWeights <- custWeights[which(predData$theSiteColumn %in% custWeights[,"theSiteColumn"]),]
hwap <- custWeights[,"theSiteColumn"]#[,1]
hwap <- order(hwap)
custWeights <- custWeights[hwap,]
colnames(custWeights)[colnames(custWeights)=="theSiteColumn"] <- siteColumn
}
##rename site columns for results
colnames(bioData)[colnames(bioData)=="theSiteColumn"] <- siteColumn
colnames(predData)[colnames(predData)=="theSiteColumn"] <- siteColumn
##as a final check, makes sure bioData and predData sites are in same order
predSite <- which(names(predData) == siteColumn)
bioSite <- which(names(bioData)==siteColumn)
hwap <- predData[,predSite]
hwap <- order(hwap)
predData <- predData[hwap,]
rosetta <- bioData[,bioSite]
rosetta <- order(rosetta)
bioData <- bioData[rosetta,]
##sets up species data for calculating dissimilarity
bx <- which(names(bioData)==XColumn)
by <- which(names(bioData)==YColumn)
sppData <- bioData[-c(bioSite, bx, by)]
##creates distance matrix
if(abundance==F){
sppData[sppData>=1] <- 1
sppData[sppData==0] <- 0
sppData[is.na(sppData)] <- 0
distData <- vegdist(sppData, dist, binary=T)
}else{
sppData[is.na(sppData)] <- 0
distData <- vegdist(sppData, dist, binary=F)
}
########################################################################
##species data as site-by-site distance matrix
}else if(bioFormat==3){
holdSite <- bioData[, which(siteColumn %in% colnames(bioData))]
if(!is.numeric(holdSite)){
stop("Site IDs must be numeric when providing a site-by-site distance matrix (bioFormat=3).")
}
bioData <- bioData[, -which(siteColumn %in% colnames(bioData))]
orderedData <- as.matrix(as.dist(bioData[order(holdSite),
order(holdSite)]))
# Date: 04/16/2024
# Note: The upper triangle of the distance matrix should be converted to a vector by row so we need to transpose the distance matrix first
# to make sure the the order of pair-wise distances in the vector matches with other of predictors in the data frame generated by gdm.formatsitepair function.
# This was fixed by Xin Chen.
distData <- lower.tri(as.matrix(orderedData), diag = FALSE)
# distData <- as.vector(orderedData[distData])
distData <- as.vector(t(orderedData)[distData])
predData <- unique(predData)
hwap <- predData[siteColumn][, 1]
hwap <- order(hwap)
predData <- predData[order(holdSite), ]
########################################################################
##site pair table, already prepped
}else if(bioFormat==4){
##site-pair distance value
outTable <- bioData
########################################################################
}else{
##return error, bioFormat argument out of bounds
stop(paste("bioFormat argument of '", as.character(bioFormat), "' is not an accepted input value", sep=""))
}
########################################################################
##With the dissim distance calculated, creates and fills the table in gdm format
if(bioFormat!=4){
##creates base site-pair table
outTable <- as.data.frame(createsitepair(dist=distData,
spdata=bioData,
envInfo=predData,
dXCol=XColumn,
dYCol=YColumn,
siteCol=siteColumn,
weightsType=weightType,
custWeights=custWeights))
}else{
outTable <- bioData
}
##first checks that the size of the dissimilarity matrices, if any were provided
##then pastes any dissimilarity matrices onto the created site-pair table
if(length(distPreds)>0){
baseMat <- distPreds[[1]]
##checks to size of each dissimilarity matrix, to make sure they are all the same
lapply(distPreds, function(mat, mat1){
if((dim(mat1)[1]!=dim(mat)[1]) & (dim(mat1)[2]!=dim(mat)[2])){
stop("The dimensions of your predictor matrices are not the same.")
}
}, mat1=baseMat)
#print(dim(baseMat))
##hold site columns
holdSiteCols <- lapply(distPreds, function(dP){dP[,which(siteColumn %in% colnames(dP))]})
#remove site column from matrices
distPreds <- lapply(distPreds, function(dP){dP[,-which(siteColumn %in% colnames(dP))]})
#print(dim(distPreds[[1]]))
##orders the distance matrices of distPreds
distPreds <- mapply(function(dP, hSC){as.matrix(as.dist(dP[order(hSC),order(hSC)]))},
dP=distPreds, hSC=holdSiteCols, SIMPLIFY=FALSE)
#print(dim(distPreds[[1]]))
##orders the site columns to match the distance matrices
orderSiteCols <- lapply(holdSiteCols, function(hSC){hSC[order(hSC)]})
##compares sites with sites removed, for one reason or another
rmSites <- c(toRemove, removeRand)
##removes the sites removed above when creating the site-pair table
if(length(rmSites)>0){
rmIndex <- lapply(orderSiteCols, function(hSC, tR){which((hSC %in% tR))}, tR=rmSites)
distPreds <- mapply(function(mat, tR){mat[-c(tR), -c(tR)]}, mat=distPreds, tR=rmIndex, SIMPLIFY=FALSE)
}
##set new baseMat
baseMat <- distPreds[[1]]
#print(dim(baseMat))
##checks the size of the dissimilarity matrices against the size of distData
baseMatDat <- lower.tri(as.matrix(baseMat),diag=FALSE)
baseMatDat <- as.vector(baseMat[baseMatDat])
#print(nrow(outTable))
#print(length(baseMatDat))
if(nrow(outTable)!=length(baseMatDat)){
stop("The dimensions of the distance predictor matrices do not match the biological data.")
}
##adds any associated distance predictors to the sitepair table
for(num in 1:length(distPreds)){
#num <- 2
##isolate matrix
matrixDat <- lower.tri(as.matrix(distPreds[[num]], diag=FALSE))
sweetFreakinPredMatrix <- as.vector(distPreds[[num]][matrixDat])
##add matrix to table
if(ncol(outTable)>6){
##break table up to insert matrix data columns
baseSitePair <- outTable[,1:6]
otherSitePair <- outTable[,7:ncol(outTable)]
otherNames <- colnames(otherSitePair)
s1SitePair <- as.data.frame(otherSitePair[,1:(ncol(otherSitePair)/2)])
colnames(s1SitePair) <- otherNames[1:(ncol(otherSitePair)/2)]
s2SitePair <- as.data.frame(otherSitePair[,(ncol(otherSitePair)/2+1):ncol(otherSitePair)])
colnames(s2SitePair) <- otherNames[(ncol(otherSitePair)/2+1):ncol(otherSitePair)]
##formats data from dissimilarity matrices
s1Zeros <- as.data.frame(rep(0,length(sweetFreakinPredMatrix)))
colnames(s1Zeros) <- paste("s1.matrix_", num, sep="")
s2Mat <- as.data.frame(sweetFreakinPredMatrix)
colnames(s2Mat) <- paste("s2.matrix_", num, sep="")
##restructures the output table
outTable <- cbind(baseSitePair, s1SitePair, s1Zeros, s2SitePair, s2Mat)
}else{
##formats data from dissimilarity matrices
s1Zeros <- as.data.frame(rep(0,length(sweetFreakinPredMatrix)))
colnames(s1Zeros) <- paste("s1.matrix_", num, sep="")
s2Mat <- as.data.frame(sweetFreakinPredMatrix)
colnames(s2Mat) <- paste("s2.matrix_", num, sep="")
##restructures the output table
outTable <- cbind(outTable, s1Zeros, s2Mat)
}
}
}
if(verbose){
if(weightType[1]=="equal"){
print("Site weighting type: Equal")
}else if(weightType[1]=="custom"){
print("Site weighting type: Custom")
}else{
print("Site weighting type: Richness")
}
print(paste0("Site-pair table created with ", nrow(outTable), " rows ",
"(", nrow(unique(outTable[,3:4]))+1, " unique sites)", " and ",
ncol(outTable) , " columns (", (ncol(outTable)-6)/2,
" environmental variables)."))
}
##return output table
class(outTable) <- c("gdmData", "data.frame")
return(outTable)
}