Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a generic data type converter to the Cursor object #442

Merged
merged 2 commits into from Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -4,6 +4,8 @@ Changes for crate

Unreleased
==========
- Added a generic data type converter to the ``Cursor`` object, for converting
fetched data from CrateDB data types to Python data types.


2022/10/10 0.27.2
Expand Down
48 changes: 48 additions & 0 deletions docs/query.rst
Expand Up @@ -199,7 +199,55 @@ You can turn this into something more manageable with a `list comprehension`_::
>>> [column[0] for column in cursor.description]
['date', 'datetime_tz', 'datetime_notz', ..., 'nullable_datetime', 'position']


Data type conversion
====================

The cursor object can optionally convert database types to native Python data
types. There is a default implementation for the CrateDB data types ``IP`` and
``TIMESTAMP`` on behalf of the ``DefaultTypeConverter``.

::

>>> from crate.client.converter import DefaultTypeConverter
>>> from crate.client.cursor import Cursor
>>> cursor = connection.cursor(converter=DefaultTypeConverter())

>>> cursor.execute("SELECT datetime_tz, datetime_notz FROM locations ORDER BY name")

>>> cursor.fetchone()
[datetime.datetime(2022, 7, 18, 18, 10, 36, 758000), datetime.datetime(2022, 7, 18, 18, 10, 36, 758000)]


Custom data type conversion
===========================

By providing a custom converter instance, you can define your own data type
conversions. For investigating the list of available data types, please either
inspect the ``DataType`` enum, or the documentation about the list of available
`CrateDB data type identifiers for the HTTP interface`_.

This example creates and applies a simple custom converter for converging
CrateDB's ``BOOLEAN`` type to Python's ``str`` type. It is using a simple
converter function defined as ``lambda``, which assigns ``yes`` for boolean
``True``, and ``no`` otherwise.

::

>>> from crate.client.converter import Converter, DataType

>>> converter = Converter()
>>> converter.set(DataType.BOOLEAN, lambda value: value is True and "yes" or "no")
>>> cursor = connection.cursor(converter=converter)

>>> cursor.execute("SELECT flag FROM locations ORDER BY name")

>>> cursor.fetchone()
['no']


.. _Bulk inserts: https://crate.io/docs/crate/reference/en/latest/interfaces/http.html#bulk-operations
.. _CrateDB data type identifiers for the HTTP interface: https://crate.io/docs/crate/reference/en/latest/interfaces/http.html#column-types
.. _Database API: http://www.python.org/dev/peps/pep-0249/
.. _database cursor: https://en.wikipedia.org/wiki/Cursor_(databases)
.. _DB API 2.0: http://www.python.org/dev/peps/pep-0249/
Expand Down
2 changes: 1 addition & 1 deletion docs/sqlalchemy.rst
Expand Up @@ -629,7 +629,7 @@ column on the ``Character`` class.
.. _operator: https://docs.python.org/2/library/operator.html
.. _any: http://docs.sqlalchemy.org/en/latest/core/type_basics.html#sqlalchemy.types.ARRAY.Comparator.any
.. _tuple: https://docs.python.org/3/library/stdtypes.html#sequence-types-list-tuple-range
.. _count result rows: http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#counting
.. _count result rows: http://docs.sqlalchemy.org/en/14/orm/tutorial.html#counting
.. _MATCH predicate: https://crate.io/docs/crate/reference/en/latest/general/dql/fulltext.html#match-predicate
.. _arguments reference: https://crate.io/docs/crate/reference/en/latest/general/dql/fulltext.html#arguments
.. _boost values: https://crate.io/docs/crate/reference/en/latest/general/dql/fulltext.html#arguments
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -0,0 +1,5 @@
[tool.mypy]

# Needed until `mypy-0.990` for `ConverterDefinition` in `converter.py`.
# https://github.com/python/mypy/issues/731#issuecomment-1260976955
enable_recursive_aliases = true
15 changes: 13 additions & 2 deletions src/crate/client/connection.py
Expand Up @@ -46,6 +46,7 @@ def __init__(self,
socket_tcp_keepidle=None,
socket_tcp_keepintvl=None,
socket_tcp_keepcnt=None,
converter=None,
):
"""
:param servers:
Expand Down Expand Up @@ -99,7 +100,13 @@ def __init__(self,
Set the ``TCP_KEEPCNT`` socket option, which overrides
``net.ipv4.tcp_keepalive_probes`` kernel setting if ``socket_keepalive``
is ``True``.
:param converter:
(optional, defaults to ``None``)
A `Converter` object to propagate to newly created `Cursor` objects.
"""

self._converter = converter

if client:
self.client = client
else:
Expand All @@ -123,12 +130,16 @@ def __init__(self,
self.lowest_server_version = self._lowest_server_version()
self._closed = False

def cursor(self):
def cursor(self, **kwargs) -> Cursor:
"""
Return a new Cursor Object using the connection.
"""
converter = kwargs.pop("converter", self._converter)
if not self._closed:
return Cursor(self)
return Cursor(
connection=self,
converter=converter,
)
else:
raise ProgrammingError("Connection closed")

Expand Down
135 changes: 135 additions & 0 deletions src/crate/client/converter.py
@@ -0,0 +1,135 @@
# -*- coding: utf-8; -*-
#
# Licensed to CRATE Technology GmbH ("Crate") under one or more contributor
# license agreements. See the NOTICE file distributed with this work for
# additional information regarding copyright ownership. Crate licenses
# this file to you under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# However, if you have executed another commercial license agreement
# with Crate these terms will supersede the license and you may use the
# software solely pursuant to the terms of the relevant commercial agreement.
"""
Machinery for converting CrateDB database types to native Python data types.

https://crate.io/docs/crate/reference/en/latest/interfaces/http.html#column-types
"""
import ipaddress
from copy import deepcopy
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Union

ConverterFunction = Callable[[Optional[Any]], Optional[Any]]
ColTypesDefinition = Union[int, List[Union[int, "ColTypesDefinition"]]]


def _to_ipaddress(value: Optional[str]) -> Optional[Union[ipaddress.IPv4Address, ipaddress.IPv6Address]]:
"""
https://docs.python.org/3/library/ipaddress.html
"""
if value is None:
return None
return ipaddress.ip_address(value)


def _to_datetime(value: Optional[float]) -> Optional[datetime]:
"""
https://docs.python.org/3/library/datetime.html
"""
if value is None:
return None
return datetime.utcfromtimestamp(value / 1e3)
Comment on lines +45 to +51
Copy link
Member Author

@amotl amotl Sep 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dear @mfussenegger, @matriv, and @seut,

regarding @mfussenegger's proposal to let the user optionally specify a time zone, and make the machinery return a timezone-aware datetime object, I see two options here.

Let's assume the user defines a custom timezone object like

>>> import datetime
>>> tz_mst = datetime.timezone(datetime.timedelta(hours=7), name="MST")

and a timestamp in Epoch like

# Fri, 16 Sep 2022 11:09:16 GMT
timestamp = 1663326556

On this spot, which variant would be semantically the right one? Note that datetime.utcfromtimestamp does not accept the tz keyword argument, it will always return a naive datetime object with tzinfo=None.

  1. We can either use datetime.fromtimestamp, which does accept the tz keyword argument.

    >>> datetime.datetime.fromtimestamp(timestamp, tz=tz_mst)
    datetime.datetime(2022, 9, 16, 18, 9, 16, tzinfo=datetime.timezone(datetime.timedelta(seconds=25200), 'MST'))
    
    >>> datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
    datetime.datetime(2022, 9, 16, 11, 9, 16, tzinfo=datetime.timezone.utc)
  2. Or we can replace the tzinfo on the naive datetime object returned by utcfromtimestamp, making it timezone-aware.

    >>> datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=tz_mst)
    datetime.datetime(2022, 9, 16, 11, 9, 16, tzinfo=datetime.timezone(datetime.timedelta(seconds=25200), 'MST'))

I think option 1. is the right choice. So the implementation would look like something along the lines of:

def set_timezone(self, tz):
  """
  When the user specifies a time zone upfront, all returned datetime objects
  will be timezone-aware.
  """
  if tz is not None and not isinstance(tz, datetime.timezone):
    raise TypeError(f"Timezone object has wrong type: {tz}")
  self.timezone = tz

def _to_datetime(self, value):
  """
  Convert CrateDB's `TIMESTAMP` column to the native Python `datetime` object.
  When a timezone is given, return a timezone-aware object.
  Otherwise, return a naive object which is meant to be the time in the UTC time zone.
  """
  if self.timezone is not None:
    return datetime.fromtimestamp(value / 1e3, tz=self.timezone)
  else:
    return datetime.utcfromtimestamp(value / 1e3)

What do you think?

With kind regards,
Andreas.

Copy link
Member Author

@amotl amotl Sep 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another implementation could look like this, it is always using datetime.fromtimestamp(value / 1e3, tz=self.timezone), where self.timezone would be datetime.timezone.utc by default, i.e. it will always return timezone-aware datetime objects.

While definitively easier and more straight-forward, it could be less performant when not aiming to use timezone-aware datetime objects and stay in "naive" land on purpose.

def __init__(self):
  # Return datetype objects in UTC by default.
  self.timezone = datetime.timezone.utc

def set_timezone(self, tz):
  """
  Set the time zone you want your `datetime` objects to be returned as.
  """
  if not isinstance(tz, datetime.timezone):
    raise TypeError(f"Timezone object has wrong type: {tz}")
  self.timezone = tz

def _to_datetime(self, value):
  """
  Convert CrateDB's `TIMESTAMP` column to a timezone-aware native Python `datetime` object.
  """
  return datetime.fromtimestamp(value / 1e3, tz=self.timezone)

Copy link
Member Author

@amotl amotl Sep 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

definitively easier and more straight-forward

When thinking about it once more, I think I would prefer the implementation variant outlined in my latest post. In this way, it is always consistent that timezone-aware datetime objects are returned from the data converter machinery.

it could be less performant when not aiming to use timezone-aware datetime objects and stay in "naive" land on purpose.

I think when users are aiming for raw speed, they would completely opt out of the data type converter machinery at all.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi again. Talk is cheap. I've converged a few more thoughts about this into a proposed patch at #445, completely separating the topic of timezone-aware datetime handling from this one.



def _to_default(value: Optional[Any]) -> Optional[Any]:
return value


# Symbolic aliases for the numeric data type identifiers defined by the CrateDB HTTP interface.
# https://crate.io/docs/crate/reference/en/latest/interfaces/http.html#column-types
class DataType(Enum):
NULL = 0
NOT_SUPPORTED = 1
CHAR = 2
BOOLEAN = 3
TEXT = 4
IP = 5
DOUBLE = 6
REAL = 7
SMALLINT = 8
INTEGER = 9
BIGINT = 10
TIMESTAMP_WITH_TZ = 11
OBJECT = 12
GEOPOINT = 13
GEOSHAPE = 14
TIMESTAMP_WITHOUT_TZ = 15
UNCHECKED_OBJECT = 16
REGPROC = 19
TIME = 20
OIDVECTOR = 21
NUMERIC = 22
REGCLASS = 23
DATE = 24
BIT = 25
JSON = 26
CHARACTER = 27
ARRAY = 100


ConverterMapping = Dict[DataType, ConverterFunction]


# Map data type identifier to converter function.
_DEFAULT_CONVERTERS: ConverterMapping = {
DataType.IP: _to_ipaddress,
DataType.TIMESTAMP_WITH_TZ: _to_datetime,
DataType.TIMESTAMP_WITHOUT_TZ: _to_datetime,
}


class Converter:
def __init__(
self,
mappings: Optional[ConverterMapping] = None,
default: ConverterFunction = _to_default,
) -> None:
self._mappings = mappings or {}
self._default = default

def get(self, type_: ColTypesDefinition) -> ConverterFunction:
if isinstance(type_, int):
return self._mappings.get(DataType(type_), self._default)
type_, inner_type = type_
if DataType(type_) is not DataType.ARRAY:
raise ValueError(f"Data type {type_} is not implemented as collection type")

inner_convert = self.get(inner_type)

def convert(value: Any) -> Optional[List[Any]]:
if value is None:
return None
return [inner_convert(x) for x in value]

return convert


class DefaultTypeConverter(Converter):
def __init__(self, more_mappings: Optional[ConverterMapping] = None) -> None:
mappings: ConverterMapping = {}
mappings.update(deepcopy(_DEFAULT_CONVERTERS))
if more_mappings:
mappings.update(deepcopy(more_mappings))
super().__init__(
mappings=mappings, default=_to_default
)
39 changes: 35 additions & 4 deletions src/crate/client/cursor.py
Expand Up @@ -19,9 +19,11 @@
# with Crate these terms will supersede the license and you may use the
# software solely pursuant to the terms of the relevant commercial agreement.

from .exceptions import ProgrammingError
import warnings

from .converter import Converter
from .exceptions import ProgrammingError


class Cursor(object):
"""
Expand All @@ -30,9 +32,10 @@ class Cursor(object):
"""
lastrowid = None # currently not supported

def __init__(self, connection):
def __init__(self, connection, converter: Converter):
self.arraysize = 1
self.connection = connection
self._converter = converter
self._closed = False
self._result = None
self.rows = None
Expand All @@ -50,7 +53,10 @@ def execute(self, sql, parameters=None, bulk_parameters=None):
self._result = self.connection.client.sql(sql, parameters,
bulk_parameters)
if "rows" in self._result:
self.rows = iter(self._result["rows"])
if self._converter is None:
self.rows = iter(self._result["rows"])
else:
self.rows = iter(self._convert_rows())

def executemany(self, sql, seq_of_parameters):
"""
Expand All @@ -73,9 +79,13 @@ def executemany(self, sql, seq_of_parameters):
"duration": sum(durations) if durations else -1,
"rows": [],
"cols": self._result.get("cols", []),
"col_types": self._result.get("col_types", []),
"results": self._result.get("results")
}
self.rows = iter(self._result["rows"])
if self._converter is None:
self.rows = iter(self._result["rows"])
else:
self.rows = iter(self._convert_rows())
return self._result["results"]

def fetchone(self):
Expand Down Expand Up @@ -210,3 +220,24 @@ def duration(self):
"duration" not in self._result:
return -1
return self._result.get("duration", 0)

def _convert_rows(self):
"""
Iterate rows, apply type converters, and generate converted rows.
"""
assert "col_types" in self._result and self._result["col_types"], \
"Unable to apply type conversion without `col_types` information"

# Resolve `col_types` definition to converter functions. Running the lookup
# redundantly on each row loop iteration would be a huge performance hog.
types = self._result["col_types"]
converters = [
self._converter.get(type) for type in types
]

# Process result rows with conversion.
for row in self._result["rows"]:
yield [
convert(value)
for convert, value in zip(converters, row)
]