-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
list to Json string improvements #156
Comments
To overcome this, we can utilise a JSON parser. Possible solutions:
|
|
A batched method for jsonlite is needed for a fair comparison:
tmp1 <- tempfile()
tmp2 <- tempfile()
con <- file(tmp1)
system.time(jsonlite::stream_out(dt[, .(var2)], con))
# user system elapsed
# 230.323 9.498 259.813
system.time(data.table::fwrite(
x=as.list(jsonify::to_ndjson(dt$var2,unbox = T)),
file="test.jsonl",
quote =F,
col.names=F)
)
# user system elapsed
# 3.547 0.444 4.191
If possible, we should switch from jsonlite to jsonify. It will need a batched method for writing the table out to a file, but the current method is pretty fast.
Possible alternatives using jsonlite.
col_to_json_raw_1 <- function(dt, col, batch = 1e4) {
  # Serialise one column of `dt` to newline-delimited JSON strings by
  # streaming batches of rows through an in-memory raw connection, then
  # splitting the accumulated text on newlines.
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @param batch Number of rows serialised per stream_out() call.
  # @return Character vector with one JSON string per row of `dt`.
  max_len <- nrow(dt)
  start <- seq(1, max_len, batch)
  end <- c(start[-1] - 1, max_len)
  output <- unlist(
    lapply(seq_along(start), function(i) {
      con <- rawConnection(raw(), open = "w")
      # Release the connection even if stream_out() errors (the original
      # only closed it on the success path, leaking on failure).
      on.exit(close(con), add = TRUE)
      jsonlite::stream_out(
        subset(dt[start[i]:end[i], ], select = col),
        con, verbose = FALSE, pagesize = batch
      )
      txt <- rawToChar(rawConnectionValue(con))
      # fixed = TRUE: split on the literal newline, not a regex.
      strsplit(txt, split = "\n", fixed = TRUE)[[1]]
    }),
    recursive = FALSE
  )
  return(output)
}
col_to_json_raw_2 <- function(dt, col) {
  # Serialise one column of `dt` to newline-delimited JSON strings by
  # locating the newline bytes in the raw output and converting each
  # record's byte range back to a character string.
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @return Character vector with one JSON string per row of `dt`.
  con <- rawConnection(raw(), open = "w")
  on.exit(close(con))
  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)
  obj <- rawConnectionValue(con)
  end <- which(obj == charToRaw("\n"))
  start <- c(1, end[-length(end)] + 1)
  # End each slice one byte before the newline so results do not carry a
  # trailing "\n" (the original included it, unlike the strsplit/readLines
  # variants). vapply() instead of sapply() guarantees a character vector.
  return(vapply(
    seq_along(start),
    function(i) rawToChar(obj[start[i]:(end[i] - 1)]),
    character(1)
  ))
}
col_to_json_raw_3 <- function(dt, col) {
  # Serialise one column of `dt` to newline-delimited JSON strings,
  # reading the raw buffer back with readr::read_lines().
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @return Character vector with one JSON string per row of `dt`.
  con <- rawConnection(raw(), open = "w")
  on.exit(close(con))
  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)
  return(readr::read_lines(rawConnectionValue(con), progress = FALSE))
}
col_to_json_raw_4 <- function(dt, col) {
  # Serialise one column of `dt` to newline-delimited JSON strings,
  # re-reading the raw buffer through a second (read-mode) raw connection
  # with base readLines().
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @return Character vector with one JSON string per row of `dt`.
  con_raw <- rawConnection(raw(), open = "w")
  # Register cleanup immediately: the original deferred on.exit() until
  # after stream_out(), leaking con_raw if serialisation errored.
  on.exit(close(con_raw), add = TRUE)
  jsonlite::stream_out(subset(dt, select = col), con_raw, verbose = FALSE)
  con_out <- rawConnection(rawConnectionValue(con_raw))
  on.exit(close(con_out), add = TRUE)
  return(readLines(con_out))
}
col_to_json_text <- function(dt, col) {
  # Serialise one column of `dt` to newline-delimited JSON strings via an
  # in-memory text connection.
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @return Character vector with one JSON string per row of `dt`.
  #
  # local = TRUE keeps the connection's backing variable in this function's
  # frame; the original default (local = FALSE) silently created/overwrote
  # a variable named `character` in the global environment.
  con <- textConnection("character", open = "w", local = TRUE)
  on.exit(close(con))
  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)
  return(textConnectionValue(con))
}
library(data.table)
# Benchmark the five jsonlite-based converters across increasing row counts.
sizes <- c(1e1, 1e2, 1e3, 1e4, 1e5)
bench_list <- lapply(sizes, function(rows) {
  # One list-column row: nested lists to exercise JSON serialisation depth.
  nested <- list("var3" = 1:3, "var4" = list("var5" = letters[1:5]))
  dt <- data.table::data.table(
    var1 = 1:rows,
    var2 = rep(list(nested), rows)
  )
  microbenchmark::microbenchmark(
    "split_text" = col_to_json_raw_1(dt, "var2"),
    "split_raw"  = col_to_json_raw_2(dt, "var2"),
    "raw_readr"  = col_to_json_raw_3(dt, "var2"),
    "raw_base"   = col_to_json_raw_4(dt, "var2"),
    "text_base"  = col_to_json_text(dt, "var2"),
    times = 10
  )
})
These methods seem promising. Plus, if a solution using jsonlite can be found, the overall dependencies can be kept low :)
col_to_json_jsonify <- function(dt, col, batch = 1e4) {
  # Serialise one list-column of `dt` to newline-delimited JSON strings
  # using jsonify::to_ndjson(), processing the column in batches.
  #
  # @param dt A data.frame/data.table containing list-column `col`.
  # @param col Column name (string) to serialise.
  # @param batch Number of rows serialised per to_ndjson() call.
  # @return Character vector with one JSON string per row of `dt`.
  max_len <- nrow(dt)
  start <- seq(1, max_len, batch)
  end <- c(start[-1] - 1, max_len)
  splits <- lapply(seq_along(start), function(i) dt[[col]][start[i]:end[i]])
  # BUG FIX: the original indexed the lapply() result with [[1]], which
  # returned only the FIRST batch and silently dropped every row past
  # `batch`. Flatten all batches instead.
  output <- unlist(
    lapply(splits, function(chunk) {
      strsplit(
        as.character(jsonify::to_ndjson(chunk, unbox = TRUE, numeric_dates = FALSE)),
        split = "\n", fixed = TRUE
      )[[1]]
    }),
    recursive = FALSE
  )
  return(output)
}
library(data.table)
# Re-run the benchmark, swapping the text-connection variant for jsonify.
sizes <- c(1e1, 1e2, 1e3, 1e4, 1e5)
bench_list <- lapply(sizes, function(rows) {
  nested <- list("var3" = 1:3, "var4" = list("var5" = letters[1:5]))
  dt <- data.table::data.table(
    var1 = 1:rows,
    var2 = rep(list(nested), rows)
  )
  microbenchmark::microbenchmark(
    "split_text"   = col_to_json_raw_1(dt, "var2"),
    "split_raw"    = col_to_json_raw_2(dt, "var2"),
    "raw_readr"    = col_to_json_raw_3(dt, "var2"),
    "raw_base"     = col_to_json_raw_4(dt, "var2"),
    "text_jsonify" = col_to_json_jsonify(dt, "var2"),
    times = 10
  )
})
Even with the new jsonlite functions, it looks like jsonify is still faster.
col_to_json_raw_4 <- function(dt, col, batch = 500) {
  # Batched variant of col_to_json_raw_4: forwards `batch` to stream_out()
  # as pagesize so rows are serialised in larger chunks.
  #
  # @param dt A data.frame/data.table containing column `col`.
  # @param col Column name (string) to serialise.
  # @param batch Rows per page passed to stream_out()'s pagesize.
  # @return Character vector with one JSON string per row of `dt`.
  con_raw <- rawConnection(raw(), open = "w")
  # Register cleanup immediately: the original deferred on.exit() until
  # after stream_out(), leaking con_raw if serialisation errored.
  on.exit(close(con_raw), add = TRUE)
  jsonlite::stream_out(subset(dt, select = col), con_raw,
                       verbose = FALSE, pagesize = batch)
  con_out <- rawConnection(rawConnectionValue(con_raw))
  on.exit(close(con_out), add = TRUE)
  return(readLines(con_out))
}
library(data.table)
# Compare pagesize settings for the batched raw_base variant against jsonify.
sizes <- c(1e3, 1e4, 1e5)
bench_list <- lapply(sizes, function(rows) {
  nested <- list("var3" = 1:3, "var4" = list("var5" = letters[1:5]))
  dt <- data.table::data.table(
    var1 = 1:rows,
    var2 = rep(list(nested), rows)
  )
  microbenchmark::microbenchmark(
    "raw_base_500"    = col_to_json_raw_4(dt, "var2"),
    "raw_base_1000"   = col_to_json_raw_4(dt, "var2", 1e3),
    "raw_base_10000"  = col_to_json_raw_4(dt, "var2", 1e4),
    "raw_base_100000" = col_to_json_raw_4(dt, "var2", 1e5),
    "text_jsonify"    = col_to_json_jsonify(dt, "var2"),
    times = 10
  )
})
Increasing the pagesize with
Currently noctua just collapses lists using `paste`. This is incorrect, as Athena won't be able to interpret the result.