Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

list to Json string improvements #156

Closed
DyfanJones opened this issue Sep 16, 2021 · 7 comments
Closed

list to Json string improvements #156

DyfanJones opened this issue Sep 16, 2021 · 7 comments
Labels
enhancement New feature or request

Comments

@DyfanJones
Copy link
Owner

DyfanJones commented Sep 16, 2021

Currently noctua just collapses lists using `paste`. This is incorrect, as Athena won't be able to interpret the result. For example:

# Demonstrates the current (incorrect) behaviour: paste() deparses the R list
# literally instead of producing JSON Athena can read.
# (Fixed: the original snippet had an extra closing parenthesis and did not parse.)
paste(list(list("var3" = 1:3, "var4" = list("var5" = letters))), collapse = "|")

'list(var3 = 1:3, var4 = list(var5 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z")))'
@DyfanJones DyfanJones added the enhancement New feature or request label Sep 16, 2021
@DyfanJones
Copy link
Owner Author

To overcome this, we can utilise a JSON parser.

Possible solutions:

  • jsonlite
  • jsonify
  • rcppsimdjson

@DyfanJones
Copy link
Owner Author

DyfanJones commented Sep 16, 2021

library(data.table)

# Benchmark: per-row jsonlite::toJSON vs noctua's internal batch converter
# on a 1e6-row list-column.
x = 1e6

# dt1/dt2 are identical; one copy per candidate so set() mutation of one
# does not affect the other's timing run.
dt1 = data.table(
  var1 = 1:x,
  var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)

dt2 = data.table(
  var1 = 1:x,
  var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)

col = "var2"
# Baseline: convert each row individually with jsonlite::toJSON.
system.time(set(dt1, j=col, value=sapply(dt1[[col]], jsonlite::toJSON, auto_unbox = T)))
#   user  system elapsed 
# 196.737   1.269 199.344 

# Candidate: noctua's internal batch converter (project-internal; not visible here).
system.time(set(dt2, j=col, value=noctua:::list_to_json(dt2[[col]])))
#   user  system elapsed 
#  6.409   0.094   6.582 

head(dt1)
   var1                                                   var2
1:    1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
2:    2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
3:    3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
4:    4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
5:    5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
6:    6 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
head(dt2)
   var1                                                   var2
1:    1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
2:    2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
3:    3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
4:    4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
5:    5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
6:    6 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}

@DyfanJones
Copy link
Owner Author

Need a batch method for jsonlite so that the comparison is fair.

@DyfanJones
Copy link
Owner Author

# Benchmark: jsonlite::stream_out to a file connection vs jsonify::to_ndjson
# written out with data.table::fwrite.
# NOTE(review): `dt` is not defined in this snippet (presumably dt1/dt2 from
# the earlier comment); `tmp2` is unused and `con` is never closed — transcript
# as posted, kept verbatim.
tmp1 <- tempfile()
tmp2 <- tempfile()
con <- file(tmp1)

system.time(jsonlite::stream_out(dt[, .(var2)], con))
user  system elapsed 
230.323   9.498 259.813 

# jsonify converts the whole column in one call; fwrite just dumps the
# resulting NDJSON lines (quote/col.names disabled so lines are untouched).
system.time(data.table::fwrite(
  x=as.list(jsonify::to_ndjson(dt$var2,unbox = T)),
  file="test.jsonl",
  quote =F,
  col.names=F)
 )
# user  system elapsed 
# 3.547   0.444   4.191 

If possible, we should switch from jsonlite to jsonify. This will need a batch method for writing the table out to file, but the current method is pretty fast.

@DyfanJones
Copy link
Owner Author

Possible alternatives using jsonlite.

col_to_json_raw_1 <- function(dt, col, batch = 1e4) {
  # Convert one column of `dt` to a character vector of JSON strings
  # (one element per row) by streaming batches of rows through an
  # in-memory raw connection with jsonlite::stream_out.
  #
  # dt    - data.frame/data.table
  # col   - column name (string) to serialise
  # batch - rows per stream_out call
  max_len <- nrow(dt)
  start <- seq(1, max_len, batch)
  end <- c(start[-1] - 1, max_len)
  output <- unlist(
    lapply(seq_along(start), function(i) {
      con <- rawConnection(raw(), open = "w")
      # on.exit guarantees the connection is released even if
      # stream_out() errors (the original leaked it in that case).
      on.exit(close(con), add = TRUE)
      jsonlite::stream_out(
        subset(dt[start[i]:end[i], ], select = col),
        con, verbose = FALSE, pagesize = batch
      )
      txt <- rawToChar(rawConnectionValue(con))
      # stream_out writes NDJSON: split on newlines into one string per row.
      strsplit(txt, split = "\n")[[1]]
    }),
    recursive = FALSE
  )
  return(output)
}

col_to_json_raw_2 <- function(dt, col) {
  # Stream the whole column to NDJSON in an in-memory raw connection,
  # then split the raw buffer on newline bytes into one JSON string
  # per row.
  con <- rawConnection(raw(), open = "w")
  on.exit(close(con))

  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)

  obj <- rawConnectionValue(con)

  # Positions of the newline bytes delimit the records.
  end <- which(obj == charToRaw("\n"))
  start <- c(1, end[-length(end)] + 1)

  # FIX: stop at end[i] - 1 so the trailing "\n" is excluded from each
  # string (the original kept it, unlike the other implementations).
  # vapply replaces sapply for a guaranteed character return type.
  return(vapply(
    seq_along(start),
    function(i) rawToChar(obj[start[i]:(end[i] - 1)]),
    character(1)
  ))
}


col_to_json_raw_3 <- function(dt, col) {
  # Stream the column to NDJSON in memory, then let readr split the raw
  # buffer into lines (one JSON string per row).
  con <- rawConnection(raw(), open = "w")
  on.exit(close(con))

  # TRUE/FALSE spelled out (T/F are reassignable and unsafe).
  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)

  return(readr::read_lines(rawConnectionValue(con), progress = FALSE))
}

col_to_json_raw_4 <- function(dt, col) {
  # Stream the column to NDJSON in memory, then re-read the raw buffer
  # line-by-line with base readLines (one JSON string per row).
  con_raw <- rawConnection(raw(), open = "w")
  # FIX: register cleanup immediately — the original only set on.exit
  # after BOTH connections existed, so con_raw leaked if stream_out()
  # errored in between.
  on.exit(close(con_raw), add = TRUE)

  jsonlite::stream_out(subset(dt, select = col), con_raw, verbose = FALSE)

  con_out <- rawConnection(rawConnectionValue(con_raw))
  on.exit(close(con_out), add = TRUE)

  return(readLines(con_out))
}

col_to_json_text <- function(dt, col) {
  # Stream the column to NDJSON through a text connection and return the
  # captured lines (one JSON string per row).
  #
  # FIX: local = TRUE keeps the backing "character" object inside this
  # function's frame; the original's default local = FALSE created (or
  # clobbered) a variable literally named `character` in the caller's
  # environment.
  con <- textConnection("character", open = "w", local = TRUE)
  on.exit(close(con))
  jsonlite::stream_out(subset(dt, select = col), con, verbose = FALSE)
  return(textConnectionValue(con))
}
library(data.table)

# Compare the five jsonlite-based converters across increasing row counts.
n = c(1e1, 1e2,1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
    # Same fixture shape as the earlier benchmark: a nested list-column.
    dt = data.table::data.table(
        var1 = 1:x,
        var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
    )
    
    microbenchmark::microbenchmark(
        "split_text" = col_to_json_raw_1(dt,"var2"),
        "split_raw" = col_to_json_raw_2(dt,"var2"),
        "raw_readr" = col_to_json_raw_3(dt,"var2"),
        "raw_base" = col_to_json_raw_4(dt,"var2"),
        "text_base" = col_to_json_text(dt,"var2"),
        times = 10
    )
})
# NOTE(review): benchplot is not defined in this snippet — presumably a
# user-defined plotting helper producing the screenshot below.
benchplot(bench_list, n)

Screenshot 2021-09-21 at 15 39 38

These methods seem promising. Plus, if a solution using jsonlite can be found, the overall dependency count can be kept low :)

@DyfanJones
Copy link
Owner Author

col_to_json_jsonify <- function(dt, col, batch = 1e4) {
  # Convert a list-column to a character vector of JSON strings using
  # jsonify::to_ndjson, processing the column in batches of `batch` rows.
  max_len <- nrow(dt)
  start <- seq(1, max_len, batch)
  end <- c(start[-1] - 1, max_len)
  splits <- lapply(seq_along(start), function(i) dt[[col]][start[i]:end[i]])
  # BUG FIX: the original ended the lapply with `[[1]]`, which returned
  # ONLY the first batch and silently dropped every row past `batch`
  # whenever nrow(dt) > batch. Flatten all batches instead.
  output <- unlist(
    lapply(splits, function(s) {
      # to_ndjson emits one JSON document per line; split into a vector.
      strsplit(
        as.character(jsonify::to_ndjson(s, unbox = TRUE, numeric_dates = FALSE)),
        split = "\n"
      )[[1]]
    }),
    use.names = FALSE
  )
  return(output)
}
library(data.table)

# Re-run the comparison with the jsonify-based converter replacing the
# text-connection variant.
n = c(1e1, 1e2,1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
    dt = data.table::data.table(
        var1 = 1:x,
        var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
    )
    
    microbenchmark::microbenchmark(
        "split_text" = col_to_json_raw_1(dt,"var2"),
        "split_raw" = col_to_json_raw_2(dt,"var2"),
        "raw_readr" = col_to_json_raw_3(dt,"var2"),
        "raw_base" = col_to_json_raw_4(dt,"var2"),
        "text_jsonify" = col_to_json_jsonify(dt,"var2"),
        times = 10
    )
})

Screenshot 2021-09-22 at 10 26 43

Even with the new jsonlite functions it looks like jsonify is still faster.

@DyfanJones
Copy link
Owner Author

DyfanJones commented Sep 22, 2021

col_to_json_raw_4 <- function(dt, col, batch = 500){
  # Revised raw_base: exposes stream_out's pagesize as `batch` so its
  # effect on throughput can be benchmarked. Default 500 matches
  # jsonlite::stream_out's own default.
  con_raw <- rawConnection(raw(), open = "w")
  # FIX: register cleanup immediately — the original only set on.exit
  # after BOTH connections existed, so con_raw leaked if stream_out()
  # errored in between.
  on.exit(close(con_raw), add = TRUE)

  jsonlite::stream_out(subset(dt, select = col), con_raw, verbose = FALSE, pagesize = batch)

  con_out <- rawConnection(rawConnectionValue(con_raw))
  on.exit(close(con_out), add = TRUE)

  return(readLines(con_out))
}
library(data.table)

# Benchmark: does increasing stream_out's pagesize help raw_base close
# the gap with jsonify?
n = c(1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
    dt = data.table::data.table(
        var1 = 1:x,
        var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
    )
    
    microbenchmark::microbenchmark(
        "raw_base_500" = col_to_json_raw_4(dt,"var2"),
        "raw_base_1000" = col_to_json_raw_4(dt,"var2", 1e3),
        "raw_base_10000" = col_to_json_raw_4(dt,"var2", 1e4),
        "raw_base_100000" = col_to_json_raw_4(dt,"var2", 1e5),
        "text_jsonify" = col_to_json_jsonify(dt,"var2"),
        times = 10
    )
})

Screenshot 2021-09-22 at 11 23 22

Increasing the pagesize with jsonlite::stream_out doesn't seem to yield any performance improvement.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

1 participant