# JSON and serialization

JSON can be used to represent tabular data in a string format and is often used for sending data over a network.

In [1]:
import io
from collections import OrderedDict
from pathlib import Path

import polars as pl

In [2]:
# The `b` prefix makes this a bytes literal, so `jsonString` holds
# the JSON document as a `bytes` object rather than a `str`.
jsonString = b"""
    [
        {"id":1,"values":"a"},
        {"id":2,"values":"b"},
        {"id":3,"values":null}
    ]
"""

In [3]:
# Parse the JSON bytes into a DataFrame; the column dtypes are inferred from the data
pl.read_json(jsonString)

id,values
i64,str
1,"""a"""
2,"""b"""
3,


In [4]:
# The same payload written as a `str` and encoded to UTF-8 bytes explicitly
pl.read_json(
    """
        [
            {"id":1,"values":"a"},
            {"id":2,"values":"b"},
            {"id":3,"values":null}
        ]
    """.encode("utf-8")
)

id,values
i64,str
1,"""a"""
2,"""b"""
3,


In [7]:
# With an explicit schema, only the columns named in it appear in the result
pl.read_json(jsonString, schema={"id": pl.Int64})

id
i64
1
2
3


In [8]:
# The schema can also be given as an OrderedDict mapping column name to dtype
pl.read_json(jsonString, schema=OrderedDict(id=pl.Int64))

id
i64
1
2
3


In [9]:
# Cap the number of rows scanned for schema inference at 2
pl.read_json(jsonString, infer_schema_length=2)

id,values
i64,str
1,"""a"""
2,"""b"""
3,


### Writing JSON

In [10]:
# Load the example payload into a DataFrame
df = pl.read_json(jsonString)

# Called without a target file, `write_json` returns the JSON as a string
df.write_json()

'[{"id":1,"values":"a"},{"id":2,"values":"b"},{"id":3,"values":null}]'

Be aware that we can lose dtype information when we write a `DataFrame` to JSON and then read it back.

In [11]:
# Downcast `id` to Int8, then write the frame out as a JSON string
json_output = (
    pl.read_json(jsonString)
    .with_columns(pl.col("id").cast(pl.Int8))
    .write_json()
)

# Reading the JSON back yields `id` as Int64 again: the Int8 dtype did not survive
pl.read_json(json_output.encode("utf-8"))

id,values
i64,str
1,"""a"""
2,"""b"""
3,


## Nested data

JSON may contain arbitrarily nested structures.

### Nested key-value pairs
Polars converts the key-value pairs in `values` to a `pl.Struct` dtype *if the types in the nested structure are consistent*; otherwise it raises an `Exception`.

In [15]:
# Each `values` entry is a nested object with keys `a` and `b`;
# the last record has a null `values` instead of an object.
nestedJsonString = b"""
    [
        {"id":1,"values":{"a":0,"b":1}},
        {"id":2,"values":{"a":0,"b":1}},
        {"id":3,"values":null}
    ]
"""

In [16]:
# The nested objects are read into a single struct column
pl.read_json(nestedJsonString)

id,values
i64,struct[2]
1,"{0,1}"
2,"{0,1}"
3,


### Nested arrays
Polars attempts to convert arrays to a `pl.List` dtype.

In [17]:
# Each `values` entry is an array; the second one mixes an integer (0) with a float (1.0)
nestedArrayJsonString = b"""
    [
        {"id":1,"values":[0,1]},
        {"id":2,"values":[0,1.0]}
    ]
"""

In [18]:
# The arrays become a list column; with mixed ints and floats it comes back as list[f64]
pl.read_json(nestedArrayJsonString)

id,values
i64,list[f64]
1,"[0.0, 1.0]"
2,"[0.0, 1.0]"


## Newline delimited JSON
In a similar way to standard JSON, we can read newline-delimited JSON with `pl.read_ndjson`.

In [19]:
# One JSON object per line. This is a non-raw bytes literal, so each `\n` escape is a
# real newline in addition to the physical line break: records are separated by blank lines.
newlineDelimitedJsonString = b"""
        {"id":1,"values":"a"}\n
        {"id":2,"values":"b"}\n
        {"id":3,"values":null}\n
"""

In [20]:
# The three newline-delimited records are read into three rows
pl.read_ndjson(newlineDelimitedJsonString)

id,values
i64,str
1,"""a"""
2,"""b"""
3,


In [21]:
# Specify a directory to hold the ndJSON file
ndjson_dir = Path('data/ndjson')

ndjson_file = "example.json"

# Create the ndjson sub-directory if it doesn't exist already
ndjson_dir.mkdir(parents=True,exist_ok=True)

# Set the path to the ndJSON file
ndjson_path = ndjson_dir / ndjson_file

In [22]:
# Parse the in-memory ndJSON and write it back out to the file at `ndjson_path`
pl.read_ndjson(newlineDelimitedJsonString).write_ndjson(ndjson_path)

Lazy mode for ndjson

In [23]:
# Lazily scan the file, select only `id`, and print the resulting query plan
print(pl.scan_ndjson(ndjson_path).select("id").explain())

NDJson SCAN [data/ndjson/example.json]
PROJECT 1/2 COLUMNS


Lazy mode for ndjson doesn't support `streaming`

In [24]:
# Same lazy query, this time asking `explain` for the plan with engine="streaming"
print(pl.scan_ndjson(ndjson_path).select("id").explain(engine="streaming"))

NDJson SCAN [data/ndjson/example.json]
PROJECT 1/2 COLUMNS


## Serialization
Polars has an alternative way to serialize a `DataFrame` to a string representation **encoded as bytes**.

In [25]:
# Rebuild the example DataFrame from the JSON bytes for the serialization examples
df = pl.read_json(jsonString)

In [None]:
# Serialize the DataFrame; with no `format` argument the result is a `bytes` object
bytes_rep = df.serialize()

bytes_rep  # raw binary payload: it is not meant to be human-readable

b'\xff\xff\xff\xff\xe8\x00\x00\x00\x04\x00\x00\x00\xf2\xff\xff\xff\x14\x00\x00\x00\x04\x00\x01\x00\x00\x00\n\x00\x0b\x00\x08\x00\n\x00\x04\x00\xf2\xff\xff\xffL\x00\x00\x00\x10\x00\x00\x00\x00\x00\n\x00\x0c\x00\x00\x00\x04\x00\x08\x00\x01\x00\x00\x00\x04\x00\x00\x00\xf4\xff\xff\xff\x1c\x00\x00\x00\x0c\x00\x00\x00\x08\x00\x0c\x00\x04\x00\x08\x00\x05\x00\x00\x00[0,0]\x00\x00\x00\t\x00\x00\x00_PL_FLAGS\x00\x00\x00\x02\x00\x00\x004\x00\x00\x00\x04\x00\x00\x00\xc0\xff\xff\xff\x1c\x00\x00\x00\x10\x00\x00\x00\x08\x00\x00\x00\x01\x18\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xff\x04\x00\x04\x00\x06\x00\x00\x00values\x00\x00\xec\xff\xff\xff8\x00\x00\x00 \x00\x00\x00\x18\x00\x00\x00\x01\x02\x00\x00\x10\x00\x12\x00\x04\x00\x10\x00\x11\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x00\xf4\xff\xff\xff@\x00\x00\x00\x01\x00\x00\x00\x08\x00\t\x00\x04\x00\x08\x00\x02\x00\x00\x00id\x00\x00\xff\xff\xff\xff\xc8\x00\x00\x00\x04\x00\x00\x00\xec\xff\xff\xff\xc0\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x04\x00\x03\x00\

Deserialize

In [29]:
# Wrap the serialized bytes in a file-like object and rebuild the DataFrame from it
pl.DataFrame.deserialize(io.BytesIO(bytes_rep))

id,values
i64,str
1,"""a"""
2,"""b"""
3,


Serialize JSON representation

In [30]:
# With format="json" the serialized DataFrame is returned as a `str` instead of bytes
df.serialize(format="json")

'[255,255,255,255,232,0,0,0,4,0,0,0,242,255,255,255,20,0,0,0,4,0,1,0,0,0,10,0,11,0,8,0,10,0,4,0,242,255,255,255,76,0,0,0,16,0,0,0,0,0,10,0,12,0,0,0,4,0,8,0,1,0,0,0,4,0,0,0,244,255,255,255,28,0,0,0,12,0,0,0,8,0,12,0,4,0,8,0,5,0,0,0,91,48,44,48,93,0,0,0,9,0,0,0,95,80,76,95,70,76,65,71,83,0,0,0,2,0,0,0,52,0,0,0,4,0,0,0,192,255,255,255,28,0,0,0,16,0,0,0,8,0,0,0,1,24,0,0,0,0,0,0,252,255,255,255,4,0,4,0,6,0,0,0,118,97,108,117,101,115,0,0,236,255,255,255,56,0,0,0,32,0,0,0,24,0,0,0,1,2,0,0,16,0,18,0,4,0,16,0,17,0,8,0,0,0,12,0,0,0,0,0,244,255,255,255,64,0,0,0,1,0,0,0,8,0,9,0,4,0,8,0,2,0,0,0,105,100,0,0,255,255,255,255,200,0,0,0,4,0,0,0,236,255,255,255,192,0,0,0,0,0,0,0,20,0,0,0,4,0,3,0,12,0,19,0,16,0,18,0,12,0,4,0,230,255,255,255,3,0,0,0,0,0,0,0,116,0,0,0,40,0,0,0,20,0,0,0,0,0,14,0,24,0,4,0,12,0,16,0,0,0,20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,64,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0

In [33]:
# Downcast `id` to Int8 before serializing, to see whether the dtype survives a round trip
string_rep = df.cast({"id": pl.Int8}).serialize(format="json")

In [34]:
# Deserialize from a file-like object, as in the binary example, instead of raw bytes.
# Unlike the plain JSON round trip, the Int8 dtype of `id` is preserved here.
pl.DataFrame.deserialize(io.StringIO(string_rep), format="json")

id,values
i8,str
1,"""a"""
2,"""b"""
3,
