Skip to content

Commit

Permalink
Initial array support
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Winkler committed Aug 21, 2020
1 parent 9cd1047 commit 7afa32f
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 1 deletion.
17 changes: 17 additions & 0 deletions src/JSONLines.jl
Expand Up @@ -10,6 +10,7 @@ import Base.Threads.@spawn

export readfile,
readlazy,
readarrays,
reset!,
writefile,
@MStructType,
Expand Down Expand Up @@ -90,4 +91,20 @@ function writefile(file, data, mode = "w")
close(fi)
end

"""
    readarrays(file; namesline = 1, nrows = nothing, skip = nothing)

Read a JSONLines file in which the rows are arrays.

* `file`: JSONLines file with JSON arrays (`[val1, val2, ...]`) as rows
* Keyword Arguments:
    * `namesline = 1`: Line of the file that contains the names of the columns
    * `nrows = nothing`: Number of rows to load (`nothing` loads all rows)
    * `skip = nothing`: Number of rows to skip after the names line before loading
"""
function readarrays(file; namesline = 1, nrows = nothing, skip = nothing)
    # Thin public entry point: the keyword `nrows` is forwarded as the positional
    # row limit of `getarrays`, and whatever `getarrays` produces is returned as is.
    return getarrays(file, namesline, nrows, skip)
end

end # Module
44 changes: 44 additions & 0 deletions src/file.jl
Expand Up @@ -12,6 +12,50 @@ function getfile(file, nlines, skip, usemmap)
return ff
end

# Read a JSONLines file whose rows are JSON arrays.
#
# * `file`: path handed to `Mmap.mmap`
# * `namesline`: 1-based line that holds the column names
# * `nlines`: maximum number of row detections to perform (`nothing` = no limit);
#   at least one detection is always attempted
# * `skip`: number of rows to skip after the names line (`nothing` = 0)
#
# Returns a vector with one `NamedTuple` per detected row, keyed by the column names.
function getarrays(file, namesline, nlines, skip)
    nlines = isnothing(nlines) ? _INT_MAX : nlines
    skip = isnothing(skip) ? 0 : skip

    fi = Mmap.mmap(file)
    len = lastindex(fi)

    # Locate and parse the header row; everything before `namesline` is ignored.
    namesbeg = namesline > 1 ? skiprows(fi, namesline - 1, 0) : 0
    namesr = detectarrayrow(fi, namesbeg)
    colnames = tuple(Symbol.(JSON3.read(fi[namesr[1]:namesr[2]]))...)

    if skip > 0
        filestart = skiprows(fi, skip, namesr[2])
        if filestart == len
            # NOTE(review): this branch returns a single NamedTuple of `missing`
            # instead of a vector; kept unchanged so existing callers see the
            # same result when skipping runs to the end of the file.
            return NamedTuple{colnames}(tuple(fill(missing, length(colnames))...))
        end
    else
        filestart = namesr[2]
    end

    rowindices = Pair{Int, Int}[]
    # Track the end of the last *scanned* line separately from `rowindices`:
    # lines for which `isrow` is false are not stored, so indexing `rowindices`
    # by the loop counter would run out of bounds as soon as one line is rejected.
    prevend = filestart
    for _ in 1:max(nlines, 1)
        row = detectarrayrow(fi, prevend)
        isrow(row) && push!(rowindices, row)
        iseof(row, len) && break
        prevend = row[2]
    end

    return [NamedTuple{colnames}(tuple(JSON3.read(fi[r[1]:r[2]])...)) for r in rowindices]
end

# Read everything into ram
function readstr(file)
fi = read(file)
Expand Down
14 changes: 14 additions & 0 deletions src/helpers.jl
@@ -1,6 +1,7 @@
# Byte values compared against raw file contents (`Vector{UInt8}`) while scanning.
const _LSEP = UInt8('\n') # newline byte; `detectarrayrow` searches for it as the row end
const _EOL = UInt8('}') # closing-brace byte (presumably the end of an object row; usage not shown here)
const _BOL = UInt8('{') # opening-brace byte (presumably the start of an object row; usage not shown here)
const _ABOL = UInt8('[') # opening-bracket byte; `detectarrayrow` searches for it as the row start
const _INT_MAX = typemax(Int) # largest `Int`; `getarrays` uses it when no row limit is given

# Detect space in UInt8
Expand All @@ -22,6 +23,19 @@ function detectrow(file::Vector{UInt8}, prevend::Int)
return rowstart => rowend
end

# Find the next array row after byte index `prevend`.
# Returns `start => stop`, where `start` is the index of the next `[` byte and `stop`
# the index of the next newline byte, both searched from the index following `prevend`.
# Whichever of the two is not found is replaced by the last index of `file`.
function detectarrayrow(file::Vector{UInt8}, prevend::Int)
    from = nextind(file, prevend)
    fallback = lastindex(file)
    openidx = something(findnext(isequal(_ABOL), file, from), fallback)
    eolidx = something(findnext(isequal(_LSEP), file, from), fallback)
    return openidx => eolidx
end

function skiprows(file::Vector{UInt8}, n::Int, prevend::Int = 0)
ind = nextind(file, prevend)
for _ in 1:n
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Expand Up @@ -2,4 +2,5 @@
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Pipe = "b98c9c47-44ae-5843-9183-064241ee97a0"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
9 changes: 8 additions & 1 deletion test/runtests.jl
@@ -1,5 +1,5 @@
using JSONLines
using Test, DataFrames, RDatasets, Pipe
using Test, DataFrames, RDatasets, Pipe, Tables

full_web = readfile("testfiles/jsonlwebsite.jsonl") |> DataFrame;
nrow_fw = nrow(full_web)
Expand Down Expand Up @@ -45,6 +45,12 @@ end
@test [x for x in readlazy("testfiles/escapedeol.jsonl")] |> DataFrame == escaped
end

@testset "Read arrays" begin
    # Column-table view of the parsed file; called inside `@test` so that a
    # parsing error is recorded against the individual test.
    columns(path; kwargs...) = Tables.columntable(readarrays(path; kwargs...))
    # First line of this file is a comment, the names are on line 2.
    commented = "testfiles/array.jsonl"
    @test columns(commented, namesline = 2).a[2] == 4
    @test columns(commented, skip = 1, namesline = 2).a[1] == 4
    # Names on the default first line.
    @test columns("testfiles/jsonlwebsitearray.jsonl").Score == [24, 29, 14, 19]
end

@testset "select" begin
webl = @pipe readlazy("testfiles/jsonlwebsite.jsonl", returnparsed = false) |> JSONLines.select(_, :name) |> DataFrame
@test webl == full_web[:, [:name]]
Expand Down Expand Up @@ -179,6 +185,7 @@ end
@MStructType EscType name
@test readfile("testfiles/escapedeol.jsonl", structtype = EscType) |> DataFrame == escaped[:, [:name]]
end

# Cleanup
rm("full_web.jsonl")
rm("full_mtcars.jsonl")
Expand Down
4 changes: 4 additions & 0 deletions test/testfiles/array.jsonl
@@ -0,0 +1,4 @@
% My comment
["a", "bcd", "efg"]
[1, 2, 3]
[4, 5, 6]
5 changes: 5 additions & 0 deletions test/testfiles/jsonlwebsitearray.jsonl
@@ -0,0 +1,5 @@
["Name", "Session", "Score", "Completed"]
["Gilbert", "2013", 24, true]
["Alexa", "2013", 29, true]
["May", "2012B", 14, false]
["Deloise", "2012A", 19, true]

0 comments on commit 7afa32f

Please sign in to comment.