/
plot.R
149 lines (136 loc) · 5.99 KB
/
plot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#' Plot distributions of SDGs identified in text
#'
#' \code{plot_sdg} creates a (stacked) barplot of the frequency distribution of
#' SDGs identified via \link{detect_sdg} or \link{detect_sdg_systems}.
#'
#' The function is built using \code{\link[ggplot2]{ggplot}} and can thus be
#' flexibly extended. See examples.
#'
#' For \code{normalize = "documents"} the number of documents is taken from
#' the levels of the \code{document} column when it is a factor, and from the
#' number of distinct values in that column otherwise.
#'
#' @param hits \code{data frame} as returned by \code{\link{detect_sdg}} or
#'   \code{\link{detect_sdg_systems}}. Must include columns \code{document},
#'   \code{sdg}, and \code{system}.
#' @param systems \code{character} vector specifying the query systems to be
#'   visualized. Values must be available in the \code{system} column of
#'   \code{hits}. \code{systems} of length greater than 1 result, by default,
#'   in a stacked barplot. Defaults to \code{NULL} in which case available
#'   values are retrieved from \code{hits}.
#' @param sdgs \code{numeric} vector with integers between 1 and 17 specifying
#'   the SDGs to be visualized. Values must be available in the \code{sdg}
#'   column of \code{hits}. Defaults to \code{NULL} in which case available
#'   values are retrieved from \code{hits}.
#' @param normalize \code{character} specifying whether results should be
#'   presented as frequencies (\code{normalize = "none"}), the default, or
#'   whether the frequencies should be normalized using either the total
#'   frequencies of each system (\code{normalize = "systems"}) or the total
#'   number of documents (\code{normalize = "documents"}).
#' @param color \code{character} vector used to color the bars according to
#'   systems. The default, \code{"unibas"}, uses three colors of University of
#'   Basel's corporate design. Alternatively, \code{color} must be specified
#'   using \code{\link[grDevices]{colors}} names or color hex values.
#'   \code{color} will be interpolated to match the length of \code{systems}.
#' @param sdg_titles \code{logical} specifying whether the titles of the SDGs
#'   should be added to the axis annotation.
#' @param remove_duplicates \code{logical} specifying the handling of multiple
#'   hits of the same SDG for a given document and system. Defaults to
#'   \code{TRUE} implying that no more than one hit is counted per SDG, system,
#'   and document.
#' @param ... arguments passed to \code{\link[ggplot2]{geom_bar}}.
#'
#' @return The function returns a \code{\link[ggplot2]{ggplot}} object that can
#'   either be stored in an object or printed to produce the plot.
#'
#' @examples
#' \donttest{
#' # run sdg detection
#' hits <- detect_sdg_systems(projects)
#'
#' # create barplot
#' plot_sdg(hits)
#'
#' # create barplot with facets
#' plot_sdg(hits) + ggplot2::facet_wrap(~system)
#' }
#'
#' @export
plot_sdg <- function(hits,
                     systems = NULL,
                     sdgs = NULL,
                     normalize = "none",
                     color = "unibas",
                     sdg_titles = FALSE,
                     remove_duplicates = TRUE,
                     ...) {
  # Validate inputs up front so that bad arguments fail before any work is
  # done. `document` is needed for duplicate handling and for the document
  # count, so it is checked together with `sdg` and `system`.
  required_columns <- c("document", "sdg", "system")
  missing_columns <- setdiff(required_columns, names(hits))
  if (length(missing_columns) > 0) {
    stop(paste0(
      "Data object must include columns [",
      paste0(missing_columns, collapse = ", "),
      "]."
    ))
  }

  # Only the first element of `normalize` is used.
  normalize <- normalize[1]
  if (!isTRUE(normalize %in% c("none", "systems", "documents"))) {
    stop('Argument normalize must be "none", "systems", or "documents".')
  }

  # Replace NULLs with the values available in `hits`.
  if (is.null(systems)) {
    systems <- hits %>%
      dplyr::arrange(system) %>%
      dplyr::pull(system) %>%
      as.character() %>%
      unique()
  }
  if (is.null(sdgs)) {
    sdgs <- unique(
      as.numeric(stringr::str_extract(hits$sdg, "[:digit:]{2}"))
    )
  }

  # Check sdgs and systems.
  if (any(!sdgs %in% 1:17)) stop("sdgs can only take numbers in 1:17.")
  if (any(!systems %in% hits$system)) {
    stop(paste0(
      "Data object only contains systems [",
      paste0(unique(hits$system), collapse = ", "),
      "]."
    ))
  }

  # Drop repeated (document, sdg, system) combinations if requested.
  if (isTRUE(remove_duplicates)) {
    is_duplicate <- hits %>%
      dplyr::select(document, sdg, system) %>%
      duplicated()
    if (any(is_duplicate)) {
      # Base subsetting avoids any clash between the local mask and a column
      # of the same name in `hits`.
      hits <- hits[!is_duplicate, , drop = FALSE]
      message(paste0(
        sum(is_duplicate),
        " duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates."
      ))
    }
  }

  # Number of documents used by normalize = "documents". Factor levels are
  # preferred because they can include documents without any hit; for
  # non-factor columns fall back to the distinct values so the denominator is
  # never zero for non-empty input.
  n_documents <- if (is.factor(hits$document)) {
    nlevels(hits$document)
  } else {
    length(unique(hits$document))
  }

  # Resolve colors: use the built-in palette, then interpolate to one color
  # per system when the lengths differ.
  if (color[1] == "unibas") {
    color <- c("#D2EBE9", "#A5D7D2", "#46505A")
  }
  if (length(color) != length(systems)) {
    color <- grDevices::colorRampPalette(color)(length(systems))
  }

  # Build the "SDG-XX" labels. sprintf() returns character(0) for empty input
  # (paste0() would yield "SDG-"), and unique() guards against duplicated
  # factor levels when `sdgs` contains repeats.
  sdg_levels <- sort(unique(sprintf("SDG-%02d", as.integer(sdgs))))

  # Keep only the requested SDGs and systems.
  hits <- hits %>%
    dplyr::filter(
      sdg %in% sdg_levels,
      system %in% systems
    ) %>%
    dplyr::mutate(
      sdg = factor(sdg, levels = sdg_levels),
      system = factor(system)
    )

  # Optionally replace the SDG codes with their titles.
  if (isTRUE(sdg_titles)) {
    # Named character vector: names are SDG codes, values are titles.
    # NOTE(review): assumes aurora_queries$sdg uses the same "SDG-XX" codes as
    # `hits$sdg` -- confirm against the package data.
    title_lookup <- aurora_queries %>%
      dplyr::mutate(sdg_title = stringr::str_to_title(sdg_title)) %>%
      dplyr::select(sdg, sdg_title) %>%
      unique() %>%
      dplyr::arrange(sdg) %>%
      dplyr::pull(sdg_title, sdg)
    # `sdg` is a factor here; indexing by a factor uses its integer codes, so
    # convert to character to look titles up by name. Levels are restricted to
    # the selected SDGs so the axis matches the non-title variant.
    title_levels <- unname(title_lookup[sdg_levels])
    hits <- hits %>%
      dplyr::mutate(
        sdg = factor(
          unname(title_lookup[as.character(sdg)]),
          levels = title_levels
        )
      )
  }

  # Get frequencies per system and SDG; count() returns an ungrouped result.
  hits <- hits %>%
    dplyr::count(system, sdg, name = "n")
  y_label <- "Frequency"

  # Transform to proportions.
  if (normalize == "systems") {
    hits <- hits %>%
      dplyr::group_by(system) %>%
      dplyr::mutate(n = n / sum(n)) %>%
      dplyr::ungroup()
    y_label <- "Proportion"
  } else if (normalize == "documents") {
    hits <- hits %>%
      dplyr::mutate(n = n / n_documents)
    y_label <- "Proportion"
  }

  # Generate and return the plot. drop = FALSE keeps selected SDGs without
  # hits on the axis.
  hits %>%
    ggplot2::ggplot(mapping = ggplot2::aes(x = sdg, y = n, fill = system)) +
    ggplot2::geom_bar(..., stat = "identity") +
    ggplot2::scale_x_discrete(drop = FALSE) +
    ggplot2::scale_fill_manual(name = "Query\nsystem", values = color) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
      axis.title.x = ggplot2::element_blank()
    ) +
    ggplot2::labs(y = y_label)
}