-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean-and-separate.r
179 lines (159 loc) · 6.12 KB
/
clean-and-separate.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
library("tidyr")
library('readr')
source('translate.R')
# avoid overwriting existing files
if_doesnot_exist_write <- function(.fun, df, path, ...) {
if (file.exists(path)) {
message("File exists: `", path, "`\n")
ans <- readline("Do you wanna overwrite it? [y/n] ")
if (tolower(ans) == "y") {
.fun(df, path, ...)
} else if (tolower(ans) == 'n') {
message("You chose not to overwrite the existing file, out of caution, we chose to save it under new name.")
.fun(df, sprintf("%s-%s%s",
sub("[.][a-zA-Z]{3}$", "", path),
Sys.time(),
sub("(.*)([.][a-zA-Z]{3}$)", "\\2", path), ...)
)
} else {
stop(sprintf("Cannot proceed because your choice (`%s`) is not either `y` or ` n`", ans), call. = FALSE)
}
} else {
.fun(df, path, ...)
}
}
read_in <- function(file,
delim = ";",
skip = 0,
loc = locale(decimal_mark = ",",
grouping_mark = "."), ...) {
read_delim(file = file, delim = delim, locale = loc, skip = skip, ...)
}
# get the time series dim len of the variables, takes a df what read in returns
meta_info <- function(df) {
freq <- table(nms <- sub("[_.]\\d+", "", names(df)))
freq <- as.data.frame(freq)
names(freq) <- c("var", "freq")
# since `table` annoyingly orders the result
freq$var <- factor(freq$var, levels = unique(nms))
freq <- freq[order(freq$var), ]
freq
}
# takes a data frame that read_in returns
correct_names <- function(df) {
nms <- colnames(df)
# remove any digit following a . or _,
# this is a result of read_csv/read.csv making unique names
nms <- sub("[_.]\\d+", "", nms)
# concatenate the var names and years
firstrow <- unlist(df[1, ]) # first row holds years
firstrow <- ifelse(is.na(firstrow), "", paste0("YEAR", firstrow))
paste0(nms, firstrow)
}
# varying for reshape
# freq = from meta_info, the freq of vars
# a0 = from meta_info the len of vars that are ids
construct_varying <- function(freq, a0) {
to <- vector("list", length(freq))
for (i in seq_along(to)) {
to[[i]] <- a0 + sum(freq[seq_len(i)])
}
from <- lapply(seq_along(to), function(i) if (i == 1) {
return(a0 + 1)
} else {
to[[i - 1]] + 1
})
Map(seq, from, to)
}
# return the data in a long format
# the header of the tidy data would look like
# Kennziffer,Raumeinheit,Aggregat,var, year,value
# the column var contains all time-varying variables
#
make_tidy_data <- function(df) {
static.vars = grepl("([Ee]ntwicklung)|([Vv]eränderung)", names(df))
if (any(static.vars)){
message(paste0(names(df)[static.vars], collapse = "\n"),
" are dropped. We suspect they are static or hard to combine with the rest of the vars.")
}
df = df[, !static.vars]
meta <- meta_info(df)
idvar <- c("Kennziffer", "Raumeinheit", "Aggregat")
stopifnot(all(idvar %in% meta$var))
vary <- meta[!(meta$var %in% idvar), "freq"]
vary <- construct_varying(vary, length(idvar))
times = range(na.omit(as.integer(df[1, ])))
good.names <- correct_names(df)
df <- df[-1, ] # remove the row that contains the years
df <- setNames(df, good.names)
# handle cases when the var is named like "BaulandpreiseYEAR2016/2017"
# becomes "BaulandpreiseYEAR2016"
# NB: if you want "BaulandpreiseYEAR2017" change the below code to replacement="YEAR\\2"
patt = "YEAR(\\d{4})[/.-](\\d{4})"
problematics = grepl(patt, names(df))
df[, problematics] = lapply(df[, problematics], readr::parse_number) # German delimiter , for .
names(df) = sub(patt, replacement = "YEAR\\1", names(df))
# reshape to long
# sorry, stats::reshape does not work for unbalanced panel
# reshape(df,
# direction = 'long',
# varying = vary,
# idvar = idvar,
# timevar = "year",
# times = seq(times[[1]], times[[2]]),
# v.names = setdiff(names(df), idvar)
# )
# tidyverse alternative
pivot_longer(df, cols = unlist(vary)) %>%
separate(col = name, into = c("var", "year"), sep = "YEAR") # %>%
# pivot_wider(names_from = "var", values_from = "value")
# uncomment the commented part if you need all vars in one dataset (i.e back to wide format but tidy)
}
# nest the data by var and then write each var to disk
nest_write_each <- function(df, dir = NULL) {
df <- tidyr::nest(df, data = -var)
by = vapply(df[["data"]], function(df) df[["Aggregat"]][[1]], "")
by = gsub("\\s|[.]", "", by)
varname = gsub("/", "-or-", trimws(df[['var']])) # / is dir sep must be removed
path <- sprintf("%s/%s-by-%s.csv", dir, varname, by)
lapply(1:nrow(df), function(i) {
if_doesnot_exist_write(write_csv, df$data[[i]], path[[i]])
})
}
# a fucntion that does it all -----
# read in the .csv file with `read_in`, `make_tidy_data` and `nest_write_each`
## then read in the metadata from the .xls (sheet = Metadaten),
## translate it
## write it to a file with name Meta-*
## append the first two rows of meta_trans to columns-list.csv
## write each var to separated/*
do_all_steps <- function(path) {
.dir <- dirname(path)
xls.path <- sub("[.][A-Za-z]{3}$", ".xls", path)
meta.path <- sprintf("%s/Metadata-%s", .dir, basename(path))
if (file.exists(meta.path)) {
message("File exists: `", meta.path, "`\n")
message("No need to translate the metadata. It seems it already is.\n")
meta_trans = read_csv(meta.path)
}
else {
meta_trans <- translate_metadata(read_metadata_xls(xls.path))
names(meta_trans) <- make_metadata_names(meta_trans)
write_csv(meta_trans, meta.path)
}
.table <- read_csv("data/column-names.csv")
if (any(newones <- !(meta_trans$Indikator %in% .table$de)))
write_csv(with(meta_trans[newones,], data.frame(de = Indikator, en = Indicator)),
"data/column-names.csv", append = TRUE)
.table <- read_csv("data/column-names.csv")
if (!dir.exists(new.dir <- paste0(.dir, "/separated"))) {
dir.create(new.dir)
}
df_cleaned <- make_tidy_data(read_in(path))
nest_write_each(df_cleaned, new.dir)
if_doesnot_exist_write(
write_tsv,
translate_colnames(nest(df_cleaned, data = -var)$var, .table),
paste0(new.dir, "/readme.txt")
)
}