Skip to content

Commit

Permalink
Merge pull request #19 from coderaanalytics/econdata_structures
Browse files Browse the repository at this point in the history
Econdata structures
  • Loading branch information
byrongibby committed Jan 15, 2024
2 parents 69407fe + cb6fb80 commit 5174793
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
*.prj
.lsp
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: econdatar
Title: Automation of time series uploads and downloads
Version: 2.0.4
Version: 2.0.5
Date: 2023-03-13
Authors@R: c(person("Byron", "Botha", role = c("aut", "cre"), email = "byron@codera.co.za"),
person("Sebastian", "Krantz", role = "ctb"))
Expand Down
65 changes: 42 additions & 23 deletions R/read_econdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,23 @@ read_econdata <- function(id, ..., tidy = FALSE) {

data_structure <- data_message[[2]]$structures[["data-structures"]][[1]][[2]]

series_dims <- sapply(data_structure$components, function(component) {
series_ids <- sapply(data_structure$components, function(component) {
if (component[[1]] == "#sdmx.infomodel.datastructure.Dimension") {
component[[2]][["concept-identity"]][[2]]$id
} else {
NA
}
}) |>
na.omit()
})

series_pos <- sapply(data_structure$components, function(component) {
if (component[[1]] == "#sdmx.infomodel.datastructure.Dimension") {
component[[2]]$position
} else {
NA
}
})

series_dims <- na.omit(series_ids[order(series_pos)])

obs_attrs <- sapply(data_structure$components, function(component) {
if (component[[1]] == "#sdmx.infomodel.datastructure.Attribute") {
Expand Down Expand Up @@ -148,13 +157,13 @@ read_econdata <- function(id, ..., tidy = FALSE) {

query_params <- list()

if (is.null(params$release) || params$release != "unreleased") {

tryCatch(query_params$release <- strftime(params$release, "%Y-%m-%dT%H:%M:%S"),
error = function(e) { query_params$release <- NULL })
if (is.null(params$release)) params$release = "latest"

if (is.null(query_params$release)) {
if (params$release != "unreleased") {

query_params$release <- tryCatch({
strftime(params$release, "%Y-%m-%dT%H:%M:%S")
}, error = function(e) {
response <- GET(env$repository$url,
path = paste(env$repository$path,
"datasets",
Expand All @@ -169,36 +178,39 @@ read_econdata <- function(id, ..., tidy = FALSE) {

data_message <- content(response, type = "application/json", encoding = "UTF-8")

if (is.null(params$release) || params$release == "latest") {
release <- tail(data_message$releases, n = 1)[[1]]$release |>
if (params$release == "latest") {
release <- head(data_message$releases, n = 1)[[1]]$release |>
as.POSIXct(x, tz = "UTC", format = "%Y-%m-%dT%H:%M:%SZ")
attr(release, "tzone") <- "Africa/Johannesburg"
query_params$release <- strftime(release, "%Y-%m-%dT%H:%M:%S")

return(strftime(release, "%Y-%m-%dT%H:%M:%S"))

} else {
release <- sapply(data_message$releases, function(release) {
if(params$release == release$description) {
release$release
} else {
NA
}
}) |>
na.omit() |>
head(n = 1)
if(params$release == release$description) {
release$release
} else {
NA
}
}) |>
na.omit() |>
head(n = 1)

if (length(release) != 0) {
release <- as.POSIXct(release, tz = "UTC", format = "%Y-%m-%dT%H:%M:%SZ")
attr(release, "tzone") <- "Africa/Johannesburg"
query_params$release <- strftime(release, "%Y-%m-%dT%H:%M:%S")

return(strftime(release, "%Y-%m-%dT%H:%M:%S"))
} else {
message("Release not found, returning latest release instead.")
release <- tail(data_message$releases, n = 1)[[1]]$release |>
as.POSIXct(x, tz = "UTC", format = "%Y-%m-%dT%H:%M:%SZ")
attr(release, "tzone") <- "Africa/Johannesburg"
query_params$release <- strftime(release, "%Y-%m-%dT%H:%M:%S")

return(strftime(release, "%Y-%m-%dT%H:%M:%S"))
}
}
}
})
}

if (!is.null(params$series_key)) {
Expand Down Expand Up @@ -277,6 +289,13 @@ read_econdata <- function(id, ..., tidy = FALSE) {
if (length(database) == 1) {
return(database[[1]])
} else {
return(list("data-sets", database[[1]]))
if (tidy) {
names(database) <-
paste0("v", sapply(database,
function(x) attr(x, "metadata")$version))
return(database)
} else {
return(database)
}
}
}
Empty file removed R/read_structure.R
Empty file.
4 changes: 3 additions & 1 deletion R/write_release.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ write_release <- function(id, version, providerid, description, reset = FALSE, r
if (!is.null(params$release)) {
query_params$release <- params$release
} else {
query_params$release <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S")
query_params$release <- format(Sys.time(),
"%Y-%m-%dT%H:%M:%S",
tz = "Africa/Johannesburg")
}


Expand Down
3 changes: 0 additions & 3 deletions R/write_structure.R

This file was deleted.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
```r
install.packages(c("remotes", "tcltk"), repos = "https://cran.mirror.ac.za")
library("remotes")
install_github("coderaanalytics/econdatar", ref = "2.0.4")
install_github("coderaanalytics/econdatar", ref = "2.0.5")
```

Install from disk
Expand All @@ -30,7 +30,7 @@ Or if selecting a particular release **(recommended)**, [see](https://github.com
```r
library("remotes")
remove.packages("econdatar")
install_github("coderaanalytics/econdatar", ref = "2.0.4")
install_github("coderaanalytics/econdatar", ref = "2.0.5")
```

Please see the [EconData blog](https://randomsample.co.za) for in depth tutorials
Expand Down
32 changes: 17 additions & 15 deletions man/read_econdata.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
read_econdata
}
\description{
Returns the data set for the given dataflow - ECONDATA:id(version) and data provider - ECONDATA:id, as a list, or as tidy \emph{data.table}'s. Available data sets can be looked up from the data registry (http://www.econdata.co.za/FusionRegistry). Tidying can be done directly within \code{read_econdata()}, or ex-post using \code{econdata_tidy()}.
Returns the data for the given data set - ECONDATA:id(version), as a list, or as tidy \emph{data.table}'s. Available data sets can be looked up from the web platform (http://www.econdata.co.za). Tidying can be done directly within \code{read_econdata()}, or ex-post using \code{econdata_tidy()}.
}
\usage{
read_econdata(id, \dots, tidy = FALSE)
Expand All @@ -18,13 +18,13 @@ econdata_tidy(x, \dots)
\item{\dots}{Further \emph{Optional} arguments:
\tabular{llll}{
\code{agencyid} \tab\tab Agency responsible for the data definition. \cr\cr
\code{version} \tab\tab Version of the data definition. \cr\cr
\code{provideragencyid} \tab\tab Agency responsible for making the data available. \cr\cr
\code{providerid} \tab\tab Provider of the data. \cr\cr
\code{file} \tab\tab character. File name for retrieving JSON data from disk. \cr\cr
\code{username} \tab\tab character. EconData username. \cr\cr
\code{password} \tab\tab character. EconData password. \cr\cr
\code{agencyid} \tab\tab character. Agency responsible for the metadata creation/maintenance. \cr
\code{version} \tab\tab character. Version(s) of the data (different versions will have different metadata), or 'all' to return all available versions. \cr
\code{series_key} \tab\tab character. A character vector specifying a subset of time series (see the web platform (export function) for details). \cr
\code{release} \tab\tab character or time object with format \%Y-\%m-\%dT\%H:\%M:\%S. The release description, or a date/time which will return the data as it was at that moment, or 'latest', or 'unreleased'. \cr
\code{file} \tab\tab character. File name for retrieving JSON data from disk. \cr
\code{username} \tab\tab character. Web username. \cr
\code{password} \tab\tab character. Web password. \cr
}
}
Expand All @@ -47,17 +47,15 @@ econdata_tidy(x, \dots)
}
}
\details{
Specifying the full dataflow and data provider details (as opposed to only using the data id) allows more fine-grained control over the data set being queried. This is not necessary if there is only a single definition of the data and a single provider, which is typically the case.

An EconData account (http://www.econdata.co.za) is required to use this function. The user must provide their credentials either through the function arguments, or by setting the ECONDATA_CREDENTIALS environment variable using the syntax: "username;password", e.g. \code{Sys.setenv(ECONDATA_CREDENTIALS="username;password")}. If credentials are not supplied by the aforementioned methods a GUI dialog will prompt the user for credentials.
}
\value{
%% ~Describe the value returned
If \code{tidy = FALSE}, a list of data frames is returned, where the names of the list are the EconData series codes, and each data frame has a single column named 'OBS_VALUE' containing the data, with corresponding dates attached as rownames. Each data frame further has a \code{"metadata"} attribute providing information about the series. The entire list of data frames also has a \code{"metadata"} attribute, providing information about the dataset. If multiple datasets (or versions of a dataset if \code{version} is left empty) are being queried, a list of such lists is returned.
If \code{tidy = FALSE}, a list of data frames is returned, where the names of the list are the EconData series codes, and each data frame has a single column named 'OBS_VALUE' containing the data, with corresponding dates attached as rownames. Each data frame further has a \code{"metadata"} attribute providing information about the series. The entire list of data frames also has a \code{"metadata"} attribute, providing information about the dataset. If multiple datasets (or versions of a dataset if \code{version} is specified as 'all') are being queried, a list of such lists is returned.

If \code{tidy = TRUE} and \code{wide = TRUE} (the default), a single \emph{data.table} is returned where the first column is the date, and the remaining columns are series named by their EconData codes. Each series has two attributes: \code{"label"} provides a variable label combining important metadata from the \code{"metadata"} attribute in the non-tidy format, and \code{"source.code"} gives the series code assigned by the original data provider. The table has the same dataset-level \code{"metadata"} attribute as the list of data frames if \code{tidy = FALSE}. If multiple datasets (or versions of a dataset if \code{version} is left empty) are being queried, a list of such \emph{data.table}'s is returned.
If \code{tidy = TRUE} and \code{wide = TRUE} (the default), a single \emph{data.table} is returned where the first column is the date, and the remaining columns are series named by their EconData codes. Each series has two attributes: \code{"label"} provides a variable label combining important metadata from the \code{"metadata"} attribute in the non-tidy format, and \code{"source.code"} gives the series code assigned by the original data provider. The table has the same dataset-level \code{"metadata"} attribute as the list of data frames if \code{tidy = FALSE}. If multiple datasets (or versions of a dataset if \code{version} is specified as 'all') are being queried, a list of such \emph{data.table}'s is returned.
If \code{tidy = TRUE} and \code{wide = FALSE} and \code{compact = FALSE} (the default), a named list of two \emph{data.table}'s is returned. The first, \code{"data"}, has columns 'code', 'date' and 'value' providing the data in a long format. The second, \code{"metadata"}, provides dataset and series-level matadata, with one row for each series. If \code{compact = TRUE}, these two datasets are combined, where all repetitive content is converted to factors for more efficient storage. If multiple datasets (or versions of a dataset if \code{version} is left empty) are being queried, \code{compact = FALSE} gives a nested list, whereas \code{compact = TRUE} binds everything together to a single long frame. In general, if \code{wide = FALSE}, no attributes are attached to the tables or columns in the tables.
If \code{tidy = TRUE} and \code{wide = FALSE} and \code{compact = FALSE} (the default), a named list of two \emph{data.table}'s is returned. The first, \code{"data"}, has columns 'code', 'date' and 'value' providing the data in a long format. The second, \code{"metadata"}, provides dataset and series-level metadata, with one row for each series. If \code{compact = TRUE}, these two datasets are combined, where all repetitive content is converted to factors for more efficient storage. If multiple datasets (or versions of a dataset if \code{version} is specified as 'all') are being queried, \code{compact = FALSE} gives a nested list, whereas \code{compact = TRUE} binds everything together into a single long frame. In general, if \code{wide = FALSE}, no attributes are attached to the tables or columns in the tables.

%% \item{comp1 }{Description of 'comp1'}
%% \item{comp2 }{Description of 'comp2'}
Expand All @@ -82,12 +80,16 @@ ELECTRICITY_LONG <- econdata_tidy(ELECTRICITY, wide = FALSE)
with(ELECTRICITY_LONG, metadata[data, on = "data_key"])

# CPI Analytical Series: Different Revisions
CPI_ANL <- read_econdata(id = "CPI_ANL_SERIES")
CPI_ANL <- read_econdata(id = "CPI_ANL_SERIES", version = "all")
CPI_ANL_WIDE <- econdata_tidy(CPI_ANL)
CPI_ANL_LONG <- econdata_tidy(CPI_ANL, wide = FALSE, combine = TRUE)
CPI_ANL_ALLMETA <- econdata_tidy(CPI_ANL, wide = FALSE, allmeta = TRUE) # v2.0 has some 0-obs series

# Can query a specific version by adding e.g. version = "2.0" to the call
# Can query a specific version by adding e.g. version = "2.0.0" to the call

# Returns 5-10 years (daily average bond yields) not yet contained in the latest release
# (particularly useful for daily data that is released monthly)
MARKET_RATES <- read_econdata(id = "MARKET_RATES", series_key = "CMJD003.B.A", release = "unreleased")

}
}
Expand Down

0 comments on commit 5174793

Please sign in to comment.