Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support serializing numpy and pandas types #1180

Merged
merged 4 commits into from Mar 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions dev-requirements.txt
Expand Up @@ -6,6 +6,8 @@ nosexcover
sphinx<1.7
sphinx_rtd_theme
jinja2
numpy
pandas

# PyYAML 5.3 dropped support for Python 3.4 while
# not amending that requirement to the package. :(
Expand Down
59 changes: 56 additions & 3 deletions elasticsearch/serializer.py
Expand Up @@ -2,13 +2,49 @@
import simplejson as json
except ImportError:
import json

import uuid
from datetime import date, datetime
from decimal import Decimal

from .exceptions import SerializationError, ImproperlyConfigured
from .compat import string_types

# Type tables consulted by JSONSerializer.default(): instances of these types
# are converted with int(), float() and .isoformat() respectively. The tables
# start with the standard-library types and are extended below when the
# optional numpy / pandas packages can be imported.
INTEGER_TYPES = ()
FLOAT_TYPES = (Decimal,)
TIME_TYPES = (date, datetime)

try:
    import numpy as np
except ImportError:
    # numpy is optional; ``np`` doubles as the "is numpy available" flag.
    np = None
else:
    INTEGER_TYPES += (
        np.int_,
        np.intc,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    )
    # ``np.float_`` is intentionally not referenced: it was only an alias of
    # ``np.float64`` and was removed in NumPy 2.0, where touching it raises
    # AttributeError (not ImportError) and would break importing this module.
    FLOAT_TYPES += (
        np.float16,
        np.float32,
        np.float64,
    )

try:
    import pandas as pd
except ImportError:
    # pandas is optional; ``pd`` doubles as the "is pandas available" flag.
    pd = None
else:
    TIME_TYPES += (pd.Timestamp,)


class TextSerializer(object):
mimetype = "text/plain"
Expand All @@ -27,12 +63,29 @@ class JSONSerializer(object):
mimetype = "application/json"

def default(self, data):
    """Return a JSON-serializable stand-in for ``data``.

    Presumably used as the ``default=`` hook of ``json.dumps`` -- the call
    site is not part of this excerpt.

    Conversions, in the order they are tried:

    * ``TIME_TYPES`` instances -> ``data.isoformat()``
    * ``uuid.UUID`` -> ``str(data)``
    * ``FLOAT_TYPES`` instances (includes ``Decimal``) -> ``float(data)``
    * ``INTEGER_TYPES`` instances -> ``int(data)``
    * if numpy was imported: ``np.bool_`` -> ``bool``, ``np.datetime64``
      -> ISO 8601 string, ``np.ndarray`` -> (nested) list
    * if pandas was imported: ``Series`` / ``Categorical`` -> list,
      missing values (``pd.isna``) -> ``None``

    :arg data: the object that could not be serialized natively
    :raises TypeError: if none of the conversions above applies
    """
    if isinstance(data, TIME_TYPES):
        return data.isoformat()
    elif isinstance(data, uuid.UUID):
        return str(data)
    elif isinstance(data, FLOAT_TYPES):
        return float(data)
    # isinstance() against the empty tuple is simply False, but the explicit
    # emptiness check documents that INTEGER_TYPES is empty without numpy.
    elif INTEGER_TYPES and isinstance(data, INTEGER_TYPES):
        return int(data)

    # Special cases for numpy and pandas types
    if np:
        if isinstance(data, np.bool_):
            return bool(data)
        elif isinstance(data, np.datetime64):
            value = data.item()
            if not hasattr(value, "isoformat"):
                # At nanosecond (or finer) resolution .item() yields a plain
                # integer instead of a datetime; retry at microsecond
                # resolution, the finest that datetime.datetime can carry.
                value = data.astype("datetime64[us]").item()
            if hasattr(value, "isoformat"):
                return value.isoformat()
            # Still not convertible: fall through to the checks below and,
            # ultimately, the TypeError.
        elif isinstance(data, np.ndarray):
            return data.tolist()
    if pd:
        if isinstance(data, (pd.Series, pd.Categorical)):
            return data.tolist()
        # pandas.NA only exists in pandas >= 1.0
        elif hasattr(pd, "NA") and pd.isna(data):
            return None

    raise TypeError("Unable to serialize %r (type: %s)" % (data, type(data)))

def loads(self, s):
Expand Down
83 changes: 83 additions & 0 deletions test_elasticsearch/test_serializer.py
Expand Up @@ -5,6 +5,9 @@
from datetime import datetime
from decimal import Decimal

import numpy as np
import pandas as pd

from elasticsearch.serializer import (
JSONSerializer,
Deserializer,
Expand Down Expand Up @@ -36,6 +39,86 @@ def test_uuid_serialization(self):
),
)

def test_serializes_numpy_bool(self):
    # A numpy boolean scalar must be emitted as a JSON boolean.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual('{"d":true}', JSONSerializer().dumps({"d": np.bool_(True)}))

def test_serializes_numpy_integers(self):
    # Every numpy integer scalar type must be emitted as a plain JSON number.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    ser = JSONSerializer()

    # Signed types are exercised with a negative value.
    for np_type in (
        np.int_,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
    ):
        self.assertEqual(ser.dumps({"d": np_type(-1)}), '{"d":-1}')

    # Unsigned types cannot hold -1, so they are exercised with 1.
    for np_type in (
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    ):
        self.assertEqual(ser.dumps({"d": np_type(1)}), '{"d":1}')

def test_serializes_numpy_floats(self):
    # numpy float scalars must be emitted as JSON numbers. float32 cannot
    # represent 1.2 exactly, hence the regex allowing trailing digits.
    ser = JSONSerializer()

    # assertRegexpMatches was removed in Python 3.12; assertRegex does not
    # exist on Python 2.7. Pick whichever this interpreter provides.
    assert_regex = getattr(self, "assertRegex", None) or self.assertRegexpMatches

    # np.float_ is not listed: it was an alias of np.float64 and was removed
    # in NumPy 2.0.
    for np_type in (
        np.float32,
        np.float64,
    ):
        assert_regex(ser.dumps({"d": np_type(1.2)}), r'^\{"d":1\.2[\d]*}$')

def test_serializes_numpy_datetime(self):
    # A numpy datetime64 scalar must be emitted as an ISO 8601 string.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": np.datetime64("2010-10-01T02:30:00")}),
    )

def test_serializes_numpy_ndarray(self):
    # numpy arrays must be emitted as (nested) JSON arrays.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        '{"d":[0,0,0,0,0]}',
        JSONSerializer().dumps({"d": np.zeros((5,), dtype=np.uint8)}),
    )
    # This isn't useful for Elasticsearch, just want to make sure it works.
    self.assertEqual(
        '{"d":[[0,0],[0,0]]}',
        JSONSerializer().dumps({"d": np.zeros((2, 2), dtype=np.uint8)}),
    )

def test_serializes_pandas_timestamp(self):
    # A pandas Timestamp must be emitted as an ISO 8601 string.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        '{"d":"2010-10-01T02:30:00"}',
        JSONSerializer().dumps({"d": pd.Timestamp("2010-10-01T02:30:00")}),
    )

def test_serializes_pandas_series(self):
    # A pandas Series must be emitted as a JSON array of its values.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        '{"d":["a","b","c","d"]}',
        JSONSerializer().dumps({"d": pd.Series(["a", "b", "c", "d"])}),
    )

def test_serializes_pandas_na(self):
    # pandas.NA must be emitted as JSON null.
    if not hasattr(pd, "NA"):  # pandas.NA added in v1
        raise SkipTest("pandas.NA required")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        '{"d":null}',
        JSONSerializer().dumps({"d": pd.NA}),
    )

def test_serializes_pandas_category(self):
    # A pandas Categorical must be emitted as a JSON array of its values
    # (not its category codes), for both string and integer categories.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    cat = pd.Categorical(["a", "c", "b", "a"], categories=["a", "b", "c"])
    self.assertEqual(
        '{"d":["a","c","b","a"]}',
        JSONSerializer().dumps({"d": cat}),
    )

    cat = pd.Categorical([1, 2, 3], categories=[1, 2, 3])
    self.assertEqual(
        '{"d":[1,2,3]}',
        JSONSerializer().dumps({"d": cat}),
    )

def test_raises_serialization_error_on_dump_error(self):
    # Dumping an object the serializer has no conversion for must surface
    # as SerializationError.
    serializer = JSONSerializer()
    unserializable = object()
    with self.assertRaises(SerializationError):
        serializer.dumps(unserializable)

Expand Down
1 change: 1 addition & 0 deletions test_elasticsearch/test_server/test_common.py
Expand Up @@ -39,6 +39,7 @@
"TestIndicesGetAlias10Basic",
# Disallowing expensive queries is 7.7+
"TestSearch320DisallowQueries",
"TestIndicesPutIndexTemplate10Basic",
}
}

Expand Down