feat: add serialization to base64 (#33)
hanxiao committed Jan 11, 2022
1 parent 5c1fb55 commit 74b9405
Showing 8 changed files with 160 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ DocArray is a library for nested, unstructured data such as text, image, audio,

🧑‍🔬 **Data science powerhouse**: greatly accelerate data scientists' work on embedding, matching, visualizing, and evaluating via Torch/Tensorflow/ONNX/PaddlePaddle on CPU/GPU.

🚡 **Portable**: ready-to-wire at any time with efficient and compact serialization from/to Protobuf, bytes, JSON, CSV, dataframe.
🚡 **Portable**: ready-to-wire at any time with efficient and compact serialization from/to Protobuf, bytes, base64, JSON, CSV, dataframe.

<!-- end elevator-pitch -->

26 changes: 25 additions & 1 deletion docarray/array/mixins/io/binary.py
@@ -1,5 +1,6 @@
import io
import os.path
import base64
import pickle
from contextlib import nullcontext
from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional
@@ -12,7 +13,7 @@


class BinaryIOMixin:
"""Save/load an array to a binary file. """
"""Save/load an array to a binary file."""

@classmethod
def load_binary(
@@ -175,3 +176,26 @@ def from_protobuf(cls: Type['T'], pb_msg: 'DocumentArrayProto') -> 'T':

    def __bytes__(self):
        return self.to_bytes()

    @classmethod
    def from_base64(
        cls: Type['T'],
        data: str,
        protocol: str = 'pickle-array',
        compress: Optional[str] = None,
        _show_progress: bool = False,
    ) -> 'T':
        """Build an array object from a base64 encoded string
        :param data: a base64 encoded string
        :param protocol: protocol to use
        :param compress: compress method to use
        :return: an array object
        """
        return cls.load_binary(
            base64.b64decode(data),
            protocol=protocol,
            compress=compress,
            _show_progress=_show_progress,
        )

    def to_base64(
        self,
        protocol: str = 'pickle-array',
        compress: Optional[str] = None,
        _show_progress: bool = False,
    ) -> str:
        """Serialize itself into a base64 encoded string
        :param protocol: protocol to use
        :param compress: compress method to use
        :return: a base64 encoded string
        """
        return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
28 changes: 28 additions & 0 deletions docarray/document/mixins/porting.py
@@ -1,6 +1,7 @@
import dataclasses
import pickle
from typing import Optional, TYPE_CHECKING, Type, Dict, Any
import base64

from ...helper import compress_bytes, decompress_bytes

@@ -86,3 +87,30 @@ def to_json(self) -> str:
        return MessageToJson(
            self.to_protobuf(), preserving_proto_field_name=True, sort_keys=True
        )

    def to_base64(
        self, protocol: str = 'pickle', compress: Optional[str] = None
    ) -> str:
        """Serialize a Document object into a base64 string
        :param protocol: protocol to use
        :param compress: compress method to use
        :return: a base64 encoded string
        """
        return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

    @classmethod
    def from_base64(
        cls: Type['T'],
        data: str,
        protocol: str = 'pickle',
        compress: Optional[str] = None,
    ) -> 'T':
        """Build a Document object from a base64 encoded string
        :param data: a base64 encoded string
        :param protocol: protocol to use
        :param compress: compress method to use
        :return: a Document object
        """
        return cls.from_bytes(base64.b64decode(data), protocol, compress)
6 changes: 4 additions & 2 deletions docarray/proto/io/ndarray.py
@@ -39,7 +39,7 @@ def read_ndarray(pb_msg: 'NdArrayProto') -> 'ArrayType':

            return sparse_coo_tensor(idx, val, shape)
    else:
        if framework in {'numpy', 'torch', 'paddle', 'tensorflow'}:
        if framework in {'numpy', 'torch', 'paddle', 'tensorflow', 'list'}:
            x = _get_dense_array(pb_msg.dense)
            return _to_framework_array(x, framework)

@@ -68,7 +68,7 @@ def flush_ndarray(pb_msg: 'NdArrayProto', value: 'ArrayType'):
        pb_msg.cls_name = 'numpy'
        _set_dense_array(pb_msg.dense, value)
    if framework == 'python':
        pb_msg.cls_name = 'numpy'
        pb_msg.cls_name = 'list'
        _set_dense_array(pb_msg.dense, np.array(value))
    if framework == 'tensorflow':
        pb_msg.cls_name = 'tensorflow'
@@ -144,3 +144,5 @@ def _to_framework_array(x, framework):
        from paddle import to_tensor

        return to_tensor(x)
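    # arrays flushed from a Python list are stored densely with cls_name 'list'
    # (see flush_ndarray above); convert them back to a plain Python list on read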
    elif framework == 'list':
        return x.tolist()
39 changes: 38 additions & 1 deletion docs/fundamentals/document/serialization.md
@@ -101,6 +101,43 @@ If you go with default `protocol` and `compress` settings, you can simply use `by
```


## From/to base64

```{important}
Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install it.
```

In some cases, such as a REST API, you are only allowed to send/receive strings, not bytes. You can serialize a Document into a base64 string via {meth}`~docarray.document.mixins.porting.PortingMixin.to_base64` and load it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_base64`.

```python
from docarray import Document
d = Document(text='hello', embedding=[1, 2, 3])

print(d.to_base64())
```

```text
gANjZG9jYXJyYXkuZG9jdW1lbnQKRG9jdW1lbnQKcQApgXEBfXECWAUAAABfZGF0YXEDY2RvY2FycmF5LmRvY3VtZW50LmRhdGEKRG9jdW1lbnREYXRhCnEEKYFxBX1xBihYDgAAAF9yZWZlcmVuY2VfZG9jcQdoAVgCAAAAaWRxCFggAAAAZmZjNTY3ODg3MzAyMTFlY2E4NjMxZTAwOGEzNjZkNDlxCVgJAAAAcGFyZW50X2lkcQpOWAsAAABncmFudWxhcml0eXELTlgJAAAAYWRqYWNlbmN5cQxOWAYAAABidWZmZXJxDU5YBAAAAGJsb2JxDk5YCQAAAG1pbWVfdHlwZXEPWAoAAAB0ZXh0L3BsYWlucRBYBAAAAHRleHRxEVgFAAAAaGVsbG9xElgHAAAAY29udGVudHETTlgGAAAAd2VpZ2h0cRROWAMAAAB1cmlxFU5YBAAAAHRhZ3NxFk5YBgAAAG9mZnNldHEXTlgIAAAAbG9jYXRpb25xGE5YCQAAAGVtYmVkZGluZ3EZXXEaKEsBSwJLA2VYCAAAAG1vZGFsaXR5cRtOWAsAAABldmFsdWF0aW9uc3EcTlgGAAAAc2NvcmVzcR1OWAYAAABjaHVua3NxHk5YBwAAAG1hdGNoZXNxH051YnNiLg==
```

You can set `protocol` and `compress` to get a more compact string:

```python
from docarray import Document
d = Document(text='hello', embedding=[1, 2, 3])

print(len(d.to_base64()))
print(len(d.to_base64(protocol='protobuf', compress='lz4')))
```

```text
664
156
```

Note that the same `protocol` and `compress` settings must be used when calling `.from_base64`.
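
For example, a minimal sketch of the full round trip with matching settings:

```python
from docarray import Document

d = Document(text='hello', embedding=[1, 2, 3])
d_str = d.to_base64(protocol='protobuf', compress='lz4')

d_r = Document.from_base64(d_str, protocol='protobuf', compress='lz4')
assert d_r.text == 'hello'
```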


## From/to dict

```{important}
@@ -165,4 +202,4 @@ One can refer to the [Protobuf specification of `Document`](../../proto/index.md

## What's next?

Serializing a single Document can be useful, but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing them one by one is inefficient. In DocumentArray, we will introduce similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allow one to serialize multiple Documents much faster and more compactly.
Serializing a single Document can be useful, but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing them one by one is inefficient. In DocumentArray, we will introduce similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allow one to [serialize multiple Documents much faster and more compactly](../documentarray/serialization.md).
44 changes: 43 additions & 1 deletion docs/fundamentals/documentarray/serialization.md
@@ -4,7 +4,8 @@
DocArray is designed to be "ready-to-wire" at any time. Serialization is important. DocumentArray provides multiple serialization methods that allow one to transfer a DocumentArray object over the network and across different microservices.

- JSON string: `.from_json()`/`.to_json()`
- Bytes (compressed): `.from_bytes()`/`.to_bytes()`
- Base64 (compressed): `.from_base64()`/`.to_base64()`
- Protobuf Message: `.from_protobuf()`/`.to_protobuf()`
- Python List: `.from_list()`/`.to_list()`
- Pandas Dataframe: `.from_dataframe()`/`.to_dataframe()`
@@ -141,6 +142,47 @@ When set `protocol=pickle` or `protobuf`, the result binary string looks like the

Here `Delimiter` is a 16-byte separator, such as `b'g\x81\xcc\x1c\x0f\x93L\xed\xa2\xb0s)\x9c\xf9\xf6\xf2'`, used to mark the boundary of each Document's serialization. Given a `to_bytes(protocol='pickle/protobuf')` binary string, once we know the first 16 bytes, every boundary is clear. Consequently, one can leverage this format to stream Documents, or to drop, skip and early-stop, etc.
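
As a minimal sketch, assuming exactly the layout described above (the stream starts with the 16-byte delimiter and each Document's payload follows, separated by that delimiter); the helper name below is illustrative, not part of DocArray:

```python
def iter_document_payloads(stream: bytes):
    """Yield the raw bytes of each serialized Document in `stream`."""
    delimiter, rest = stream[:16], stream[16:]
    for chunk in rest.split(delimiter):
        if chunk:  # skip empty splits around the delimiters
            yield chunk  # one Document's serialized bytes
```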

## From/to base64

```{important}
Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install it.
```

Serializing into base64 can be useful when a binary string is not allowed, e.g. in a REST API. This can easily be done via {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_base64` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_base64`. As with binary serialization, one can specify `protocol` and `compress`:

```python
from docarray import DocumentArray
da = DocumentArray.empty(10)

d_str = da.to_base64(protocol='protobuf', compress='lz4')
print(len(d_str), d_str)
```

```text
176 BCJNGEBAwHUAAAD/Iw+uQdpL9UDNsfvomZb8m7sKIGRkNTIyOTQyNzMwMzExZWNiM2I1MWUwMDhhMzY2ZDQ5MgAEP2FiNDIAHD9iMTgyAB0vNWUyAB0fYTIAHh9myAAdP2MzYZYAHD9jODAyAB0fZDIAHT9kMTZkAABQNjZkNDkAAAAA
```

To deserialize, remember to set the correct `protocol` and `compress`:

```python
from docarray import DocumentArray

da = DocumentArray.from_base64(d_str, protocol='protobuf', compress='lz4')
da.summary()
```

```text
Length 10
Homogenous Documents True
Common Attributes ('id',)
Attributes Summary
Attribute Data type #Unique values Has empty value
──────────────────────────────────────────────────────────
id ('str',) 10 False
```

## From/to Protobuf

Serializing to Protobuf Message is less frequently used, unless you are using Python Protobuf API. Nonetheless, you can use {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_protobuf` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_protobuf` to get a Protobuf Message object in Python.
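
For example, a minimal round-trip sketch:

```python
from docarray import DocumentArray

da = DocumentArray.empty(3)

pb_msg = da.to_protobuf()  # a `DocumentArrayProto` message
da_r = DocumentArray.from_protobuf(pb_msg)

assert da_r == da
```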
12 changes: 12 additions & 0 deletions tests/unit/array/mixins/test_io.py
@@ -105,3 +105,15 @@ def test_push_pull_io(da_cls, show_progress):

    assert len(da1) == len(da2) == 10
    assert da1.texts == da2.texts == random_texts


@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
def test_from_to_base64(protocol, compress):
    da = DocumentArray.empty(10)
    da.embeddings = [[1, 2, 3]] * len(da)
    da_r = DocumentArray.from_base64(
        da.to_base64(protocol, compress), protocol, compress
    )
    assert da_r == da
    assert da_r[0].embedding == [1, 2, 3]
9 changes: 9 additions & 0 deletions tests/unit/document/test_porting.py
@@ -20,3 +20,12 @@ def test_dict_json(target):
    d1 = Document.from_dict(d.to_dict())
    d2 = Document.from_json(d.to_json())
    assert d1 == d2


@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
def test_to_from_base64(protocol, compress):
    d = Document(text='hello', embedding=[1, 2, 3])
    d_r = Document.from_base64(d.to_base64(protocol, compress), protocol, compress)
    assert d_r == d
    assert d_r.embedding == [1, 2, 3]
