-
-
Notifications
You must be signed in to change notification settings - Fork 26
/
visualizations.r
executable file
路1384 lines (1335 loc) 路 57.9 KB
/
visualizations.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#
# This software was authored by Zhian N. Kamvar and Javier F. Tabima, graduate
# students at Oregon State University; Jonah C. Brooks, undergraduate student at
# Oregon State University; and Dr. Nik Gr眉nwald, an employee of USDA-ARS.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for educational, research and non-profit purposes, without fee,
# and without a written agreement is hereby granted, provided that the statement
# above is incorporated into the material, giving appropriate attribution to the
# authors.
#
# Permission to incorporate this software into commercial products may be
# obtained by contacting USDA ARS and OREGON STATE UNIVERSITY Office for
# Commercialization and Corporate Development.
#
# The software program and documentation are supplied "as is", without any
# accompanying services from the USDA or the University. USDA ARS or the
# University do not warrant that the operation of the program will be
# uninterrupted or error-free. The end-user understands that the program was
# developed for research purposes and is advised not to rely exclusively on the
# program for any reason.
#
# IN NO EVENT SHALL USDA ARS OR OREGON STATE UNIVERSITY BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
# LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
# EVEN IF THE OREGON STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE. USDA ARS OR OREGON STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY STATUTORY
# WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
# BASIS, AND USDA ARS AND OREGON STATE UNIVERSITY HAVE NO OBLIGATIONS TO PROVIDE
# MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#==============================================================================#
#' Internal function to plot the results from ia() and poppr()
#'
#' @param sample either an object of class "ialist" or a list of ialists
#' @param pval a named vector specifying the p values to display
#' @param pop The name of the population
#' @param file The name of the source file
#' @param N The number of samples in the population
#' @param observed observed values of Ia and rbarD
#' @param index The index to plot (defaults to "rbarD")
#' @param labsize size of the in-plot label
#' @param linesize size of the in-plot line
#'
#' @return a ggplot2 object
#' @keywords internal
#'
#' @examples
#' \dontrun{
#' data(Pinf)
#' x <- Pinf %>% seppop() %>% lapply(ia, sample = 99, valuereturn = TRUE, quiet = TRUE, plot = FALSE)
#' x
#' poppr:::poppr.plot(sample = x, file = "hey") # plots multiple populations
#' # plot.ialist takes care of the single populations.
#' for (i in x){
#' print(plot(i))
#' }
#' }
#'
poppr.plot <- function(sample, pval = c(Ia = 0.05, rbarD = 0.05),
pop = NULL, file = NULL, N = NULL,
observed = c(Ia = 0, rbarD = 0),
index = c("rbarD", "Ia"),
labsize = rel(3),
linesize = rel(1)){
INDEX_ARGS <- c("rbarD", "Ia")
index <- match.arg(index, INDEX_ARGS)
tidy_ialist <- function(ialist){
res <- stats::reshape(ialist$samples,
direction = "long",
varying = 1:2,
times = c("Ia", "rbarD"),
timevar = "variable",
v.names = "value")[-3] %>%
dplyr::as_tibble()
res$variable <- as.factor(res$variable)
res
}
if (!class(sample) %in% "ialist" & class(sample) %in% "list"){
ggsamps <- dplyr::bind_rows(lapply(sample, tidy_ialist), .id = "population")
ggvals <- vapply(sample, "[[", numeric(4), "index")
ggvals <- setNames(as.data.frame.table(ggvals, stringsAsFactors = FALSE),
c("index", "population", "value"))
ggsamps <- ggsamps[ggsamps$variable == index, ]
ggvals <- ggvals[grep(ifelse(index == "Ia", "I", "D"), ggvals$index), ]
ggindex <- ggvals[ggvals$index == index, ]
ggpval <- ggvals[ggvals$index == ifelse(index == "Ia", "p.Ia", "p.rD"), ]
ggsamps$population <- factor(ggsamps$population, names(sample))
srange <- range(ggsamps$value, na.rm = TRUE)
binw <- diff(srange/30)
plot_title <- make_poppr_plot_title(sample[[1]][["samples"]][[index]],
file = file)
thePlot <- ggplot(ggsamps, aes_string(x = "value", group = "population")) +
geom_histogram(binwidth = binw, position = "identity") +
geom_rug(alpha = 0.5) +
geom_vline(aes_string(xintercept = "value"), color = "blue", linetype = 2,
data = ggindex) +
facet_wrap(~population, scales = "free_x") +
ggtitle(plot_title)
if (index == "rbarD"){
thePlot <- thePlot + xlab(expression(paste(bar(r)[d])))
} else if (index == "Ia"){
thePlot <- thePlot + xlab(expression(paste(I[A])))
}
return(thePlot)
}
plot_title <- make_poppr_plot_title(sample[[index]], file, N, pop)
if (all(is.nan(sample$rbarD))){
oops <- ggplot(as.data.frame(list(x=-10:9)), aes_string(x = "x")) +
geom_histogram(binwidth=1, fill="orange") +
geom_text(aes(label="Warning:", x=0, y=0.8), color="black", size=rel(15)) +
geom_text(aes(label="Data contains only NaNs and\ncannot be displayed graphically",
x=0, y=0.5, hjust=0.5), color="black", size=rel(10)) +
theme(line = element_blank(),
axis.text = element_blank(),
axis.title = element_blank()) +
ggtitle(plot_title)
return(oops)
}
if (any(is.na(sample[[index]]))){
sample[[index]][is.na(sample[[index]])] <- mean(sample[[index]], na.rm=TRUE)
}
srange <- range(sample[[index]])
labs <- c(Ia = "I[A]", rbarD = "bar(r)[d]")
names(pval) <- names(labs)
binw <- diff(srange/30)
if (is.na(observed[index])){
xval <- srange[2]
just <- 1
vline <- geom_blank()
annot_color <- "red"
obs <- "NaN"
} else if (all(observed[index] > srange)){
xval <- observed[index]
just <- 1.1
} else {
maxobsmin <- c(srange[1], observed[index], srange[2])
xjust <- which.max(abs(diff(maxobsmin)))
xval <- srange[xjust]
just <- ifelse(xjust < 2, 0, 1)
}
if (!exists("annot_color")){
annot_color <- "blue"
vline <- geom_vline(xintercept = observed[index], color = "blue",
linetype = 2, size = linesize)
obs <- signif(observed[index], 3)
}
obslab <- paste(labs[index], ":", obs, sep = "")
plab <- paste("p =", signif(pval[index], 3))
ggdata <- tidy_ialist(list(samples = sample))
ggdata <- ggdata[ggdata$variable == index, ]
thePlot <- ggplot(ggdata, aes_string(x = "value"))
thePlot <- thePlot +
geom_histogram(binwidth = binw, position = "identity") +
geom_rug() +
vline +
annotate(geom = "text", x = xval, y = Inf, label = obslab, color = annot_color,
vjust = 2, hjust = just, parse = TRUE, size = labsize) +
annotate(geom = "text", x = xval, y = Inf, label = plab, color = annot_color,
vjust = 4, hjust = just, size = labsize)
if (index == "rbarD"){
thePlot <- thePlot + xlab(expression(paste(bar(r)[d])))
} else if (index == "Ia"){
thePlot <- thePlot + xlab(expression(paste(I[A])))
}
thePlot <- thePlot + ggtitle(plot_title)
return(thePlot)
}
#==============================================================================#
#
#' Create a minimum spanning network of selected populations using a distance
#' matrix.
#'
#' @param gid a \code{\link{genind}}, \code{\link{genclone}},
#' \code{\link{genlight}}, or \code{\link{snpclone}} object
#'
#' @param distmat a distance matrix that has been derived from your data set.
#'
#' @param mlg.compute if the multilocus genotypes are set to "custom" (see
#' \code{\link{mll.custom}} for details) in your genclone object, this will
#' specify which mlg level to calculate the nodes from. See details.
#'
#' @param palette a \code{vector} or \code{function} defining the color palette
#' to be used to color the populations on the graph. It defaults to
#' \code{\link{topo.colors}}. See examples for details.
#'
#' @param sublist a \code{vector} of population names or indexes that the user
#' wishes to keep. Default to "ALL".
#'
#' @param exclude a \code{vector} of population names or indexes that the user
#' wishes to discard. Default to \code{NULL}.
#'
#' @param blacklist DEPRECATED, use exclude.
#'
#' @param vertex.label a \code{vector} of characters to label each vertex. There
#' are two defaults: \code{"MLG"} will label the nodes with the multilocus
#' genotype from the original data set and \code{"inds"} will label the nodes
#' with the representative individual names.
#'
#' @param gscale "grey scale". If this is \code{TRUE}, this will scale the color
#' of the edges proportional to the observed distance, with the lines becoming
#' darker for more related nodes. See \code{\link{greycurve}} for details.
#'
#' @param glim "grey limit". Two numbers between zero and one. They determine
#' the upper and lower limits for the \code{\link{gray}} function. Default is
#' 0 (black) and 0.8 (20\% black). See \code{\link{greycurve}} for details.
#'
#' @param gadj "grey adjust". a positive \code{integer} greater than zero that
#' will serve as the exponent to the edge weight to scale the grey value to
#' represent that weight. See \code{\link{greycurve}} for details.
#'
#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale
#' will be weighted to emphasize the differences between closely related
#' nodes. If it is 2, the grey scale will be weighted to emphasize the
#' differences between more distantly related nodes. See
#' \code{\link{greycurve}} for details.
#'
#' @param wscale "width scale". If this is \code{TRUE}, the edge widths will be
#' scaled proportional to the inverse of the observed distance , with the
#' lines becoming thicker for more related nodes.
#'
#' @param showplot logical. If \code{TRUE}, the graph will be plotted. If
#' \code{FALSE}, it will simply be returned.
#'
#' @param include.ties logical. If \code{TRUE}, the graph will include all edges
#' that were arbitrarily passed over in favor of another edge of equal weight.
#' If \code{FALSE}, which is the default, one edge will be arbitrarily
#' selected when two or more edges are tied, resulting in a pure minimum
#' spanning network.
#'
#' @param threshold numeric. By default, this is \code{NULL}, which will have no
#' effect. Any threshold value passed to this argument will be used in
#' \code{\link{mlg.filter}} prior to creating the MSN. If you have a data set
#' that contains contracted MLGs, this argument will override the threshold in
#' the data set. See Details.
#'
#' @param clustering.algorithm string. By default, this is \code{NULL}. If
#' \code{threshold = NULL}, this argument will have no effect. When supplied
#' with either "farthest_neighbor", "average_neighbor", or "nearest_neighbor",
#' it will be passed to \code{\link{mlg.filter}} prior to creating the MSN. If
#' you have a data set that contains contracted MLGs, this argument will
#' override the algorithm in the data set. See Details.
#'
#' @param ... any other arguments that could go into plot.igraph
#'
#' @return \item{graph}{a minimum spanning network with nodes corresponding to
#' MLGs within the data set. Colors of the nodes represent population
#' membership. Width and color of the edges represent distance.}
#' \item{populations}{a vector of the population names corresponding to the
#' vertex colors} \item{colors}{a vector of the hexadecimal representations of
#' the colors used in the vertex colors}
#'
#'
#' @details The minimum spanning network generated by this function is generated
#' via igraph's \code{\link[igraph:mst]{minimum.spanning.tree}}. The resultant
#' graph produced can be plotted using igraph functions, or the entire object
#' can be plotted using the function \code{\link{plot_poppr_msn}}, which will
#' give the user a scale bar and the option to layout your data.
#' \subsection{node sizes}{
#' The area of the nodes are representative of the number of samples. Because
#' \pkg{igraph} scales nodes by radius, the node sizes in the graph are
#' represented as the square root of the number of samples.}
#' \subsection{mlg.compute}{
#' Each node on the graph represents a different multilocus genotype.
#' The edges on the graph represent genetic distances that connect the
#' multilocus genotypes. In genclone objects, it is possible to set the
#' multilocus genotypes to a custom definition. This creates a problem for
#' clone correction, however, as it is very possible to define custom lineages
#' that are not monophyletic. When clone correction is performed on these
#' definitions, information is lost from the graph. To circumvent this, The
#' clone correction will be done via the computed multilocus genotypes, either
#' "original" or "contracted". This is specified in the \code{mlg.compute}
#' argument, above.}
#' \subsection{contracted multilocus genotypes}{
#' If your incoming data set is of the class \code{\linkS4class{genclone}},
#' and it contains contracted multilocus genotypes, this function will retain
#' that information for creating the minimum spanning network. You can use the
#' arguments \code{threshold} and \code{clustering.algorithm} to change the
#' threshold or clustering algorithm used in the network. For example, if you
#' have a data set that has a threshold of 0.1 and you wish to have a minimum
#' spanning network without a threshold, you can simply add
#' \code{threshold = 0.0}, and no clustering will happen.
#'
#' The \code{threshold} and \code{clustering.algorithm} arguments can also be
#' used to filter un-contracted data sets.
#'
#' All filtering will use the distance matrix supplied in the argument
#' \code{distmat}.
#' }
#'
#' @note The edges of these graphs may cross each other if the graph becomes too
#' large.
#'
#' @seealso \code{\link{plot_poppr_msn}} \code{\link{nancycats}},
#' \code{\link{upgma}}, \code{\link{nj}}, \code{\link{nodelabels}},
#' \code{\link{tab}}, \code{\link{missingno}}, \code{\link{bruvo.msn}},
#' \code{\link{greycurve}}
#'
#' @export
#' @aliases msn.poppr
#' @author Javier F. Tabima, Zhian N. Kamvar, Jonah C. Brooks
#' @examples
#'
#' # Load the data set and calculate the distance matrix for all individuals.
#' data(Aeut)
#' A.dist <- diss.dist(Aeut)
#'
#' # Graph it.
#' A.msn <- poppr.msn(Aeut, A.dist, gadj = 15, vertex.label = NA)
#'
#' # Find the sizes of the nodes (number of individuals per MLL):
#' igraph::vertex_attr(A.msn$graph, "size")^2
#'
#' \dontrun{
#' # Set subpopulation structure.
#' Aeut.sub <- as.genclone(Aeut)
#' setPop(Aeut.sub) <- ~Pop/Subpop
#'
#' # Plot respective to the subpopulation structure
#' As.msn <- poppr.msn(Aeut.sub, A.dist, gadj=15, vertex.label=NA)
#'
#' # Show only the structure of the Athena population.
#' As.msn <- poppr.msn(Aeut.sub, A.dist, gadj=15, vertex.label=NA, sublist=1:10)
#'
#' # Let's look at the structure of the microbov data set
#'
#' library("igraph")
#' data(microbov)
#' micro.dist <- diss.dist(microbov, percent = TRUE)
#' micro.msn <- poppr.msn(microbov, micro.dist, vertex.label=NA)
#'
#' # Let's plot it and show where individuals have < 15% of their genotypes
#' # different.
#'
#' edge_weight <- E(micro.msn$graph)$weight
#' edge_labels <- ifelse(edge_weight < 0.15, round(edge_weight, 3), NA)
#' plot.igraph(micro.msn$graph, edge.label = edge_labels, vertex.size = 2,
#' edge.label.color = "red")
#'
#' }
#'
#==============================================================================#
poppr.msn <- function (gid, distmat, palette = topo.colors, mlg.compute = "original",
sublist = "All", exclude = NULL, blacklist = NULL, vertex.label = "MLG",
gscale=TRUE, glim = c(0,0.8), gadj = 3, gweight = 1,
wscale=TRUE, showplot = TRUE,
include.ties = FALSE, threshold = NULL,
clustering.algorithm = NULL, ...){
# testing gid -------------------------------------------------------------
if (!inherits(gid, c("genclone", "snpclone"))){
if (inherits(gid, "genind")){
gid <- as.genclone(gid)
} else if (inherits(gid, "genlight")){
gid <- as.snpclone(gid)
} else {
stop("gid must be a genind/genclone or genlight/snpclone object.")
}
}
if (!inherits(gid@mlg, "MLG")){
gid@mlg <- new("MLG", gid@mlg)
}
if (!is.null(blacklist)) {
warning(
option_deprecated(
match.call(),
"blacklist",
"exclude",
"2.8.7.",
"Please use `exclude` in the future"
),
immediate. = TRUE
)
exclude <- blacklist
}
# Handling MLG type -------------------------------------------------------
visible_mlg <- visible(gid@mlg)
if (visible_mlg == "custom"){
mll(gid) <- mlg.compute
} else if (visible_mlg == "contracted"){
mll(gid) <- "original"
if (is.null(threshold)){
threshold <- cutoff(gid@mlg)["contracted"]
}
if (is.null(clustering.algorithm)){
clustering.algorithm <- distalgo(gid@mlg)
}
}
# testing dist ------------------------------------------------------------
is_dist <- inherits(distmat, "dist")
is_mat <- inherits(distmat, "matrix")
if (is_dist | is_mat){
n <- nInd(gid)
eq_size <- if (is_dist) n == attr(distmat, "Size") else n == nrow(distmat)
if (!eq_size){
stop("The size of the distance matrix does not match the size of the data.\n")
}
} else {
stop("The distance matrix is neither a dist object nor a matrix.\n")
}
distmat <- as.matrix(distmat)
gadj <- ifelse(gweight == 1, gadj, -gadj)
# Subsetting the population -----------------------------------------------
# This will subset both the population and the matrix.
if (toupper(sublist[1]) != "ALL" | !is.null(exclude)){
sublist_exclude <- sub_index(gid, sublist, exclude)
distmat <- distmat[sublist_exclude, sublist_exclude, drop = FALSE]
gid <- popsub(gid, sublist, exclude)
}
# Clone correcting the matrix ---------------------------------------------
if (!is.null(threshold)){
filtered <- filter_at_threshold(gid,
threshold,
indist = distmat,
clustering.algorithm,
bruvo_args = NULL)
distmat <- filtered$indist
cgid <- filtered$cgid
gid <- filtered$gid
} else {
cgid <- gid[.clonecorrector(gid), ]
singles <- !duplicated(mll(gid))
distmat <- distmat[singles, singles, drop = FALSE]
}
rownames(distmat) <- indNames(cgid) -> colnames(distmat)
poppr_msn_list <- msn_constructor(
gid = gid,
cgid = cgid,
palette = palette,
indist = distmat,
include.ties = include.ties,
mlg.compute = mlg.compute,
vlab = vertex.label,
visible_mlg = visible_mlg,
wscale = wscale,
gscale = gscale,
glim = glim,
gadj = gadj,
showplot = showplot,
...)
return(poppr_msn_list)
}
#==============================================================================#
#' Create a table summarizing missing data or ploidy information of a genind or
#' genclone object
#'
#' @param gen a \linkS4class{genind} or \linkS4class{genclone} object.
#'
#' @param type \code{character}. What information should be returned. Choices
#' are "missing" (Default) and "ploidy". See Description.
#'
#' @param percent \code{logical}. (ONLY FOR \code{type = 'missing'}) If
#' \code{TRUE} (default), table and plot will represent missing data as a
#' percentage of each cell. If \code{FALSE}, the table and plot will represent
#' missing data as raw counts. (See details)
#'
#' @param plot \code{logical}. If \code{TRUE}, a simple heatmap will be
#' produced. If \code{FALSE} (default), no heatmap will be produced.
#'
#' @param df \code{logical}. If \code{TRUE}, the data will be returned as a long
#' form data frame. If \code{FALSE} (default), a matrix with samples in rows
#' and loci in columns will be returned.
#'
#' @param returnplot \code{logical}. If \code{TRUE}, a list is returned with two
#' elements: \code{table} - the normal output and \code{plot} - the ggplot
#' object. If \code{FALSE}, the table is returned.
#'
#' @param low \code{character}. What color should represent no missing data or
#' lowest observed ploidy? (default: "blue")
#'
#' @param high \code{character}. What color should represent the highest amount
#' of missing data or observed ploidy? (default: "red")
#'
#' @param plotlab \code{logical}. (ONLY FOR \code{type = 'missing'}) If
#' \code{TRUE} (default), values of missing data greater than 0\% will be
#' plotted. If \code{FALSE}, the plot will appear un-appended.
#'
#' @param scaled \code{logical}. (ONLY FOR \code{type = 'missing'}) This is for
#' when \code{percent = TRUE}. If \code{TRUE} (default), the color specified
#' in \code{high} will represent the highest observed value of missing data.
#' If \code{FALSE}, the color specified in \code{high} will represent 100\%.
#'
#' @return a matrix, data frame (\code{df = TRUE}), or a list (\code{returnplot
#' = TRUE}) representing missing data per population (\code{type = 'missing'})
#' or ploidy per individual (\code{type = 'ploidy'}) in a \linkS4class{genind}
#' or \linkS4class{genclone} object.
#'
#' @details
#' Missing data is accounted for on a per-population level.\cr
#' Ploidy is accounted for on a per-individual level.
#'
#' \subsection{For type = 'missing'}{
#' This data is potentially useful for identifying areas of systematic missing
#' data. There are a few caveats to be aware of. \itemize{ \item
#' \strong{Regarding counts of missing data}: Each count represents the number
#' of individuals with missing data at each locus. The last column, "mean" can
#' be thought of as the average number of individuals with missing data per
#' locus. \item \strong{Regarding percentage missing data}: This percentage is
#' \strong{relative to the population and locus}, not to the entire data set.
#' The last column, "mean" represents the average percent of the population
#' with missing data per locus. }}
#' \subsection{For type = 'ploidy'}{
#' This option is useful for data that has been imported with mixed ploidies.
#' It will summarize the relative levels of ploidy per individual per locus.
#' This is simply based off of observed alleles and does not provide any
#' further estimates.}
#'
#' @export
#' @keywords missing ploidy
#' @author Zhian N. Kamvar
#' @examples
#' data(nancycats)
#' nancy.miss <- info_table(nancycats, plot = TRUE, type = "missing")
#' data(Pinf)
#' Pinf.ploid <- info_table(Pinf, plot = TRUE, type = "ploidy")
#'
#==============================================================================#
info_table <- function(gen, type = c("missing", "ploidy"), percent = TRUE, plot = FALSE,
df = FALSE, returnplot = FALSE, low = "blue",
high = "red", plotlab = TRUE, scaled = TRUE){
datalabel <- as.character(match.call()[2])
ARGS <- c("missing", "ploidy")
type <- match.arg(type, ARGS)
if (type == "missing"){
valname <- "Missing"
pops <- seppop(gen, drop = FALSE)
pops$Total <- gen
inds <- 1
if (percent){
inds <- c(table(pop(gen)), nInd(gen))
}
data_table <- matrix(0, nrow = nLoc(gen) + 1, ncol = length(pops))
data_table[1:nLoc(gen), ] <- vapply(pops, number_missing_locus, numeric(nLoc(gen)), 1)
data_table[-nrow(data_table), ] <- t(apply(data_table[-nrow(data_table), ], 1, "/", inds))
data_table[nrow(data_table), ] <- colMeans(data_table[-nrow(data_table), ])
rownames(data_table) <- c(locNames(gen), "Mean")
colnames(data_table) <- names(pops)
dimnames(data_table) <- list(Locus = c(locNames(gen), "Mean"), Population = names(pops))
if (all(data_table == 0)){
message("No Missing Data Found!")
return(NULL)
}
if (plot){
data_df <- as.data.frame.table(data_table,
responseName = valname,
stringsAsFactors = FALSE)
leg_title <- valname
data_df[1:2] <- data.frame(lapply(data_df[1:2],
function(x) factor(x, levels = unique(x))))
plotdf <- data_df -> textdf
if (percent) {
plotdf$Missing <- round(plotdf$Missing*100, 2)
textdf$Missing <- paste(plotdf$Missing, "%")
miss <- "0 %"
title <- paste("Percent missing data per locus and population of",
datalabel)
leg_title <- paste("Percent", leg_title)
if(!scaled){
lims <- c(0, 100)
}
} else {
textdf$Missing <- round(textdf$Missing, 2)
miss <- 0
title <- paste("Missing data per locus and population of",
datalabel)
}
if (scaled | !percent){
lims <- c(0, max(plotdf$Missing))
}
linedata <- data.frame(list(yint = 1.5, xint = nrow(data_table) - 0.5))
textdf$Missing <- ifelse(textdf$Missing == miss, "", textdf$Missing)
plotdf$Missing[plotdf$Locus == "Mean" & plotdf$Population == "Total"] <- NA
outplot <- ggplot(plotdf, aes_string(x = "Locus", y = "Population")) +
geom_tile(aes_string(fill = valname)) +
labs(list(title = title, x = "Locus", y = "Population")) +
labs(fill = leg_title) +
scale_fill_gradient(low = low, high = high, na.value = "white",
limits = lims) +
geom_hline(aes_string(yintercept = "yint"), data = linedata) +
geom_vline(aes_string(xintercept = "xint"), data = linedata)
if (plotlab){
outplot <- outplot + geom_text(aes_string(label = valname),
data = textdf)
}
outplot <- outplot +
theme_classic() +
scale_x_discrete(expand = c(0, -1)) +
scale_y_discrete(expand = c(0, -1),
limits = rev(unique(plotdf$Population))) +
theme(axis.text.x = element_text(size = 10, angle = -45,
hjust = 0, vjust = 1))
print(outplot)
}
} else if (type == "ploidy"){
valname <- "Observed_Ploidy"
data_table <- get_local_ploidy(gen)
dimnames(data_table) <- list(Samples = indNames(gen), Loci = locNames(gen))
if (plot){
data_df <- as.data.frame.table(data_table,
responseName = valname,
stringsAsFactors = FALSE)
data_df[1:2] <- data.frame(lapply(data_df[1:2],
function(x) factor(x, levels = unique(x))))
vars <- aes_string(x = "Loci", y = "Samples", fill = valname)
mytheme <- theme_classic() +
theme(axis.text.x = element_text(size = 10, angle = -45,
hjust = 0, vjust = 1))
legvars <- unique(data_df[[valname]])
title <- paste("Observed ploidy of", datalabel)
outplot <- ggplot(data_df) + geom_tile(vars) +
scale_fill_gradient(low = low, high = high, guide = "legend",
breaks = legvars) +
scale_x_discrete(expand = c(0, -1)) +
scale_y_discrete(expand = c(0, -1),
limits = rev(unique(data_df$Samples))) +
labs(list(title = title, x = "Locus", y = "Sample",
fill = "Observed\nPloidy")) +
mytheme
print(outplot)
}
}
if (df){
if (!exists("data_df")){
data_df <- as.data.frame.table(data_table,
responseName = valname,
stringsAsFactors = FALSE)
}
data_table <- data_df
} else {
if (type == "missing"){
data_table <- t(data_table)
}
class(data_table) <- c("locustable", "matrix")
}
if (returnplot & exists("outplot")){
data_table <- list(table = data_table, plot = outplot)
}
return(data_table)
}
#==============================================================================#
#' Display a greyscale gradient adjusted to specific parameters
#'
#' This function has one purpose. It is for deciding the appropriate scaling for
#' a grey palette to be used for edge weights of a minimum spanning network.
#'
#'
#' @param data a sequence of numbers to be converted to greyscale.
#'
#' @param glim "grey limit". Two numbers between zero and one. They determine
#' the upper and lower limits for the \code{\link{gray}} function. Default is 0
#' (black) and 0.8 (20\% black).
#'
#' @param gadj "grey adjust". a positive \code{integer} greater than zero that
#' will serve as the exponent to the edge weight to scale the grey value to
#' represent that weight.
#'
#' @param gweight "grey weight". an \code{integer}. If it's 1, the grey scale
#' will be weighted to emphasize the differences between closely related nodes.
#' If it is 2, the grey scale will be weighted to emphasize the differences
#' between more distantly related nodes.
#'
#' @param scalebar When this is set to \code{TRUE}, two scalebars will be
#' plotted. The purpose of this is for adding a scale bar to minimum spanning
#' networks produced in earlier versions of poppr.
#'
#' @return A plot displaying a grey gradient from 0.001 to 1 with minimum and
#' maximum values displayed as yellow lines, and an equation for the correction
#' displayed in red.
#'
#' @author Zhian N. Kamvar
#'
#' @examples
#' # Normal grey curve with an adjustment of 3, an upper limit of 0.8, and
#' # weighted towards smaller values.
#' greycurve()
#' \dontrun{
#' # 1:1 relationship grey curve.
#' greycurve(gadj=1, glim=1:0)
#'
#' # Grey curve weighted towards larger values.
#' greycurve(gweight=2)
#'
#' # Same as the first, but the limit is 1.
#' greycurve(glim=1:0)
#'
#' # Setting the lower limit to 0.1 and weighting towards larger values.
#' greycurve(glim=c(0.1,0.8), gweight=2)
#' }
#' @export
#==============================================================================#
greycurve <- function(data = seq(0, 1, length = 1000), glim = c(0,0.8),
gadj = 3, gweight = 1, scalebar = FALSE){
gadj <- ifelse(gweight == 1, gadj, -gadj)
adjustcurve(data, glim, correction = gadj, show = TRUE, scalebar = scalebar)
}
#==============================================================================#
#' Plot minimum spanning networks produced in poppr.
#'
#' This function allows you to take the output of poppr.msn and bruvo.msn and
#' customize the plot by labeling groups of individuals, size of nodes, and
#' adjusting the palette and scale bar.
#'
#' @param x a \code{\linkS4class{genind}}, \code{\linkS4class{genclone}},
#' \code{\linkS4class{genlight}}, or \code{\linkS4class{snpclone}} object from
#' which \code{poppr_msn} was derived.
#'
#' @param poppr_msn a \code{list} produced from either \code{\link{poppr.msn}}
#' or \code{\link{bruvo.msn}}. This list should contain a graph, a vector of
#' population names and a vector of hexadecimal color definitions for each
#' population.
#'
#' @inheritParams greycurve
#'
#' @inheritParams poppr.msn
#'
#'
#' @param nodescale a \code{numeric} indicating how to scale the node sizes (scales by area).
#' @param nodebase \strong{deprecated} a \code{numeric} indicating what base logarithm should be
#' used to scale the node sizes. Defaults to 1.15. See details.
#'
#' @param nodelab an \code{integer} specifying the smallest size of node to
#' label. See details.
#'
#' @param inds a \code{character} or \code{numeric} vector indicating which
#' samples or multilocus genotypes to label on the graph. See details.
#'
#' @param mlg \code{logical} When \code{TRUE}, the nodes will be labeled by
#' multilocus genotype. When \code{FALSE} (default), nodes will be labeled by
#' sample names.
#'
#' @param quantiles \code{logical}. When set to \code{TRUE} (default), the scale
#' bar will be composed of the quantiles from the observed edge weights. When
#' set to \code{FALSE}, the scale bar will be composed of a smooth gradient
#' from the minimum edge weight to the maximum edge weight.
#'
#' @param cutoff a number indicating the longest distance to display in your
#' graph. This is performed by removing edges with weights greater than this
#' number.
#'
#' @param palette a function or character corresponding to a specific palette
#' you want to use to delimit your populations. The default is whatever
#' palette was used to produce the original graph.
#'
#' @param layfun a function specifying the layout of nodes in your graph. It
#' defaults to \code{\link[igraph:layout_nicely]{layout.auto}}.
#'
#' @param beforecut if \code{TRUE}, the layout of the graph will be computed
#' before any edges are removed with \code{cutoff}. If \code{FALSE} (Default),
#' the layout will be computed after any edges are removed.
#'
#' @param pop.leg if \code{TRUE}, a legend indicating the populations will
#' appear in the top right corner of the graph, but will not overlap. Setting
#' \code{pop.leg = FALSE} disables this legend. See details.
#'
#' @param size.leg if \code{TRUE}, a legend displyaing the number of samples per
#' node will appear either below the population legend or in the top right
#' corner of the graph. Setting \code{size.leg = FALSE} disables this legend.
#'
#' @param scale.leg if \code{TRUE}, a scale bar indicating the distance will
#' appear under the graph. Setting \code{scale.leg = FALSE} suppresses this
#' bar. See details.
#'
#' @param ... any other parameters to be passed on to
#' \code{\link[igraph]{plot.igraph}}.
#'
#' @details The previous incarnation of msn plotting in poppr simply plotted the
#' minimum spanning network with the legend of populations, but did not
#' provide a scale bar and it did not provide the user a simple way of
#' manipulating the layout or labels. This function allows the user to
#' manipulate many facets of graph creation, making the creation of minimum
#' spanning networks ever so slightly more user friendly.
#'
#' This function must have both the source data and the output msn to work.
#' The source data must contain the same population structure as the graph.
#' Every other parameter has a default setting.
#'
#' \subsection{Parameter details}{ \itemize{
#' \item \code{inds} By default, the graph will label each node (circle) with
#' all of the samples (individuals) that are contained within that node. As
#' each node represents a single multilocus genotype (MLG) or individuals (n
#' >= 1), this argument is designed to allow you to selectively label the
#' nodes based on query of sample name or MLG number. If the option \code{mlg
#' = TRUE}, the multilocus genotype assignment will be used to label the node.
#' If you do not want to label the nodes by individual or multilocus genotype,
#' simply set this to a name that doesn't exist in your data.
#' \item \code{nodescale} The nodes (circles) on the graph represent different
#' multilocus genotypes. The area of the nodes represent the number of
#' individuals. Setting nodescale will scale the area of the nodes.
#' \item \code{nodelab} If a node is not labeled by individual, this will
#' label the size of the nodes greater than or equal to this value. If you
#' don't want to label the size of the nodes, simply set this to a very high
#' number.
#' \item \code{cutoff} This is useful for when you want to investigate groups
#' of multilocus genotypes separated by a specific distance or if you have two
#' distinct populations and you want to physically separate them in your
#' network.
#' \item \code{beforecut} This is an indicator useful if you want to maintain
#' the same position of the nodes before and after removing edges with the
#' \code{cutoff} argument. This works best if you set a seed before you run
#' the function.
#' }}
#'
#' \subsection{mlg.compute}{
#' Each node on the graph represents a different multilocus genotype.
#' The edges on the graph represent genetic distances that connect the
#' multilocus genotypes. In genclone objects, it is possible to set the
#' multilocus genotypes to a custom definition. This creates a problem for
#' clone correction, however, as it is very possible to define custom lineages
#' that are not monophyletic. When clone correction is performed on these
#' definitions, information is lost from the graph. To circumvent this, The
#' clone correction will be done via the computed multilocus genotypes, either
#' "original" or "contracted". This is specified in the \code{mlg.compute}
#' argument, above.}
#'
#' \subsection{legends}{ To avoid drawing the legend over the graph, legends
#' are separated by different plotting areas. This means that if legends are
#' included, you cannot plot multiple MSNs in a single plot. The scale bar (to
#' be added in manually) can be obtained from \code{\link{greycurve}} and the
#' legend can be plotted with \code{\link{legend}}.}
#'
#' @return the modified msn list, invisibly.
#'
#' @seealso \code{\link[igraph:layout_nicely]{layout.auto}} \code{\link[igraph]{plot.igraph}}
#' \code{\link{poppr.msn}} \code{\link{bruvo.msn}} \code{\link{greycurve}}
#' \code{\link[igraph]{delete_edges}} \code{\link{palette}}
#'
#' @author Zhian N. Kamvar
#' @export
#' @examples
#' # Using a data set of the Aphanomyces eutieches root rot pathogen.
#' data(Aeut)
#' adist <- diss.dist(Aeut, percent = TRUE)
#' amsn <- poppr.msn(Aeut, adist, showplot = FALSE)
#'
#' # Default
#' library("igraph") # To get all the layouts.
#' set.seed(500)
#' plot_poppr_msn(Aeut, amsn, gadj = 15)
#'
#' \dontrun{
#'
#' # Different layouts (from igraph) can be used by supplying the function name.
#' set.seed(500)
#' plot_poppr_msn(Aeut, amsn, gadj = 15, layfun = layout_with_kk)
#'
#' # Removing link between populations (cutoff = 0.2) and labelling no individuals
#' set.seed(500)
#' plot_poppr_msn(Aeut, amsn, inds = "none", gadj = 15, beforecut = TRUE, cutoff = 0.2)
#'
#' # Labelling individual #57 because it is an MLG that crosses populations
#' # Showing clusters of MLGS with at most 5% variation
#' # Notice that the Mt. Vernon population appears to be more clonal
#' set.seed(50)
#' plot_poppr_msn(Aeut, amsn, gadj = 15, cutoff = 0.05, inds = "057")
#'
#'
#' data(partial_clone)
#' pcmsn <- bruvo.msn(partial_clone, replen = rep(1, 10))
#'
#' # You can plot using a color palette or a vector of named colors
#' # Here's a way to define the colors beforehand
#' pc_colors <- nPop(partial_clone) %>%
#' RColorBrewer::brewer.pal("Set2") %>%
#' setNames(popNames(partial_clone))
#'
#' pc_colors
#'
#' # Labelling the samples contained in multilocus genotype 9
#' set.seed(999)
#' plot_poppr_msn(partial_clone, pcmsn, palette = pc_colors, inds = 9)
#'
#' # Doing the same thing, but using one of the sample names as input.
#' set.seed(999)
#' plot_poppr_msn(partial_clone, pcmsn, palette = pc_colors, inds = "sim 20")
#'
#' # Note that this is case sensitive. Nothing is labeled.
#' set.seed(999)
#' plot_poppr_msn(partial_clone, pcmsn, palette = pc_colors, inds = "Sim 20")
#'
#' # Something pretty
#' data(microbov)
#' mdist <- diss.dist(microbov, percent = TRUE)
#' micmsn <- poppr.msn(microbov, mdist, showplot = FALSE)
#'
#' plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n",
#' quantiles = FALSE)
#' plot_poppr_msn(microbov, micmsn, palette = "terrain.colors", inds = "n",
#' cutoff = 0.3, quantiles = FALSE)
#'
#' ### Utilizing vectors for palettes
#'
#' data(Pram)
#' Pram_sub <- popsub(Pram, exclude = c("Nursery_CA", "Nursery_OR"))
#'
#' # Creating the network for the forest
#' min_span_net_sub <- bruvo.msn(Pram_sub, replen = other(Pram)$REPLEN,
#' add = TRUE, loss = TRUE, showplot = FALSE,
#' include.ties = TRUE)
#'
#' # Creating the network with nurseries
#' min_span_net <- bruvo.msn(Pram, replen = other(Pram)$REPLEN,
#' add = TRUE, loss = TRUE, showplot = FALSE,
#' include.ties = TRUE)
#'
#' # Only forest genotypes
#' set.seed(70)
#' plot_poppr_msn(Pram,
#' min_span_net_sub,
#' inds = "ALL",
#' mlg = TRUE,
#' gadj = 9,
#' nodescale = 5,
#' palette = other(Pram)$comparePal,
#' cutoff = NULL,
#' quantiles = FALSE,
#' beforecut = TRUE)
#'
#' # With Nurseries
#' set.seed(70)
#' plot_poppr_msn(Pram,
#' min_span_net,
#' inds = "ALL",
#' mlg = TRUE,
#' gadj = 9,
#' nodescale = 5,
#' palette = other(Pram)$comparePal,
#' cutoff = NULL,
#' quantiles = FALSE,
#' beforecut = TRUE)
#' }
#==============================================================================#
#' @importFrom igraph layout.auto delete.edges
plot_poppr_msn <- function(x,
poppr_msn,
gscale = TRUE,
gadj = 3,
mlg.compute = "original",
glim = c(0, 0.8),
gweight = 1,
wscale = TRUE,
nodescale = 10,
nodebase = NULL,
nodelab = 2,
inds = "ALL",
mlg = FALSE,
quantiles = TRUE,
cutoff = NULL,
palette = NULL,
layfun = layout.auto,
beforecut = FALSE,
pop.leg = TRUE,
size.leg = TRUE,
scale.leg = TRUE,
...) {
if (!is(x, "genlight") && !is.genind(x)){
stop(paste(substitute(x), "is not a genind or genclone object."))
}
if (!identical(names(poppr_msn), c("graph", "populations", "colors"))){
stop("graph not compatible")
}
if (!is.null(palette)){
pal <- palette