/
cas_build_urls.R
204 lines (197 loc) · 6.94 KB
/
cas_build_urls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#' URL builder
#'
#' Convenience function typically used to generate urls to index pages listing
#' articles.
#'
#' @section Date formats: It is not uncommon, in particular for index pages, to
#' include dates in the URL, along the lines of
#' `example.com/archive/2022-01-01`, `example.com/archive/2022-01-02`, etc. To
#' build such urls, \code{cas_build_urls} needs a `start_date` and an
#' `end_date`. The formatting of the date can be defined either by providing to
#' the parameter `date_format` a string that \code{\link{strptime}} is able to
#' interpret directly, or a simplified string (such as "Ymd", without the
#' "%"), adding a `date_separator` such as "-" as needed.
#'
#'
#' @param url First part of index link that does not change in other
#' index pages.
#' @param glue Logical, defaults to FALSE. If TRUE, the url is parsed with
#' `glue`, enabling custom or repeated location for the variable part of the
#' url. If `glue` is set to TRUE, it is expected that the url will include the
#' string `{here}` within curly brackets, e.g.
#' `https://example.com/archive/?from_date={here}&to_date={here}`.
#' @param url_ending Part of index link appended after the part of the link
#' that varies. If not relevant, may be left empty.
#' @param start_page If the urls include a numerical component, define first
#' number of the sequence. Defaults to NULL. If given, coerced to numeric,
#' expected to be an integer.
#' @param end_page If the urls include a numerical component, define last
#' number of the sequence. Defaults to NULL. If given, coerced to numeric,
#' expected to be an integer.
#' @param increase_by Defines by how much the number in the link should be
#' increased in the numerical sequence. Defaults to 1.
#' @param date_format A character string, defaults to "Ymd". Check
#' \code{\link{strptime}} for valid values used to define the format of the
#' date that is part of the URL. Simplified formats such as the following are
#' also accepted: "Y" (e.g. 2022), "Ym" (e.g. 2022-10), "Ymd" (e.g.
#' 2022-10-24). See the "Date formats" section.
#' @param start_date Defaults to NULL. If given, a date, or a character vector
#' of length one coercible to date with \code{\link{as.Date}}. When given,
#' urls are built based on dates, and parameters `start_page`, `end_page`, and
#' `increase_by`, are ignored.
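#' @param end_date Defaults to the day before the current date
#' (\code{Sys.Date() - 1}). If given, a date, or a character vector of length
#' one coercible to date with \code{\link{as.Date}}.
#' @param date_separator Defaults to NULL. If given, a character string such as
#' "-" or "." that is inserted between the components of `date_format` (i.e.
#' before each "%" except the first one).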
#' @param increase_date_by Defaults to "day". See \code{\link{seq.Date}} for
#' valid values.
#' @param reversed_order Logical, defaults to FALSE. If TRUE, the order of urls
#' in the output is reversed.
#' @param index_group A character vector, defaults to "index". Used for
#' differentiating among different types of index or links in local databases.
#' @param index Defaults to TRUE. Relevant only if `write_to_db` is also set to
#' TRUE. If TRUE, urls are stored in the local database in the index table,
#' otherwise they are stored in the contents table.
#' @param write_to_db Defaults to FALSE. If set to TRUE, stores the newly
#' created URLs to the local database.
#' @param ... Passed to `cas_write_db_urls()` when `write_to_db` is set to
#' TRUE.
#' @return A data frame with three columns, `id`, `url`, and `index_group`.
#' Typically, `url` corresponds to a vector of unique urls.
#' @export
#' @examples
#' cas_build_urls(
#' url = "https://www.example.com/news/",
#' start_page = 1,
#' end_page = 10
#' )
#'
#' cas_build_urls(
#' url = "https://example.com/news/?skip=",
#' start_page = 0,
#' end_page = 100,
#' increase_by = 10
#' )
#'
#'
#' cas_build_urls(
#' url = "https://example.com/archive/",
#' start_date = "2022-01-01",
#' end_date = "2022-12-31",
#' date_separator = "-"
#' ) %>%
#' head()
#'
#' cas_build_urls(
#' url = "https://example.com/archive/?from={here}&to={here}",
#' glue = TRUE,
#' start_date = "2011-01-01",
#' end_date = "2022-12-31",
#' date_separator = ".",
#' date_format = "dmY",
#' index_group = "news"
#' )
cas_build_urls <- function(url,
                           url_ending = "",
                           glue = FALSE,
                           start_page = NULL,
                           end_page = NULL,
                           increase_by = 1,
                           date_format = "Ymd",
                           start_date = NULL,
                           end_date = Sys.Date() - 1,
                           date_separator = NULL,
                           increase_date_by = "day",
                           reversed_order = FALSE,
                           index_group = "index",
                           index = TRUE,
                           write_to_db = FALSE,
                           ...) {
  # Execution environment of this call: passed as `call` to `cli::cli_abort()`
  # so that errors raised by the local helper below are attributed to
  # `cas_build_urls()` rather than to the helper.
  fn_env <- environment()

  # Coerce one page bound to a single number, or abort with a readable
  # message. Returns NULL when the bound was not supplied.
  #   page:     the value received for `start_page` / `end_page`
  #   page_arg: name of that argument, used in the error message
  #   date_arg: name of the matching date argument, suggested in the hint
  check_page <- function(page, page_arg, date_arg) {
    if (is.null(page)) {
      return(NULL)
    }
    if (length(page) != 1) {
      cli::cli_abort(
        message = c(
          x = "{.var {page_arg}} must be a numeric vector of length 1."
        ),
        call = fn_env
      )
    }
    # The coercion warning is redundant: a failed coercion is reported through
    # the abort just below.
    page_n <- suppressWarnings(as.numeric(page))
    if (is.na(page_n)) {
      problem <- c(x = "{.var {page_arg}} must be an integer.")
      # A dash suggests that a date such as "2022-12-31" was given by mistake.
      # `grepl()` returns FALSE (not NA) for missing values, so an NA input
      # still reaches the abort below.
      if (grepl("-", page, fixed = TRUE)) {
        problem <- c(
          problem,
          i = "Check your inputs. Perhaps you meant to set {.var {date_arg}} instead?"
        )
      }
      cli::cli_abort(message = problem, call = fn_env)
    }
    page_n
  }

  # Both bounds are checked up front, even when `start_date` is given and the
  # page sequence ends up not being used, so that a date passed to the wrong
  # argument is reported instead of being silently ignored.
  end_page_n <- check_page(end_page, "end_page", "end_date")
  start_page_n <- check_page(start_page, "start_page", "start_date")

  if (!is.null(start_date)) {
    # Simplified formats such as "Ymd" contain no "%": prefix every character
    # with "%" to obtain a format string such as "%Y%m%d".
    if (stringr::str_detect(string = date_format, pattern = "%", negate = TRUE)) {
      format_chars <- stringr::str_split(
        string = date_format,
        pattern = "",
        simplify = TRUE
      )
      prefixed_chars <- stringr::str_c("%", c(format_chars))
      date_format <- stringr::str_c(prefixed_chars, collapse = "")
    }

    if (!is.null(date_separator)) {
      # Insert the separator before every "%" except one at the very start of
      # the format string, e.g. "%Y%m%d" becomes "%Y-%m-%d".
      date_format <- stringr::str_replace_all(
        string = date_format,
        pattern = "(?!^)%",
        replacement = stringr::str_c(date_separator, "%")
      )
    }

    date_sequence <- base::seq.Date(
      as.Date(start_date),
      as.Date(end_date),
      by = increase_date_by
    )
    # Coarse formats (e.g. "%Y") map many dates to the same string, hence the
    # de-duplication.
    variable_part <- base::unique(base::format(date_sequence, date_format))
  } else if (is.null(start_page_n) || is.null(end_page_n)) {
    # Neither a date range nor a complete page range: no variable part.
    variable_part <- ""
  } else {
    page_sequence <- base::seq(start_page_n, end_page_n, increase_by)
    # `scientific = FALSE` keeps large page numbers as plain digits;
    # `format()` pads to a common width, so the padding is trimmed.
    variable_part <- stringr::str_trim(
      format(page_sequence, scientific = FALSE)
    )
  }

  if (isTRUE(glue)) {
    # `glue::glue()` resolves `{here}` from this function's environment, so
    # the variable part must be available under that exact name.
    # `url_ending` is not used in this branch.
    here <- variable_part
    urls <- glue::glue(url)
  } else {
    urls <- stringr::str_c(url, variable_part, url_ending)
  }

  if (isTRUE(reversed_order)) {
    urls <- base::rev(urls)
  }

  clean_urls <- as.character(stringr::str_trim(base::unique(urls)))

  output_df <- tibble::tibble(
    id = as.numeric(seq_along(clean_urls)),
    url = clean_urls,
    index_group = index_group
  )

  if (isTRUE(write_to_db)) {
    # `index` and any extra arguments are forwarded as they are.
    cas_write_db_urls(
      urls = output_df,
      index = index,
      ...
    )
  }

  output_df
}