Skip to content

Commit

Permalink
apacheGH-39013: [Go][Integration] Support cABI import/export of StringView (apache#39019)
Browse files Browse the repository at this point in the history

### Rationale for this change

The Go implementation should support import/export of the new data types. This will enable integration testing between the C++ and Go implementations.

### What changes are included in this PR?

Added import/export for the new data types and arrays of data of those types.

### Are these changes tested?

Yes, they will be covered by the integration tests and existing Go unit tests.

### Are there any user-facing changes?

Yes, this is a user-facing change.

* Closes: apache#39013

Lead-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Co-authored-by: Matt Topol <zotthewizard@gmail.com>
Co-authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
3 people authored and clayburn committed Jan 23, 2024
1 parent 2ae8961 commit 030dca9
Show file tree
Hide file tree
Showing 73 changed files with 917 additions and 859 deletions.
93 changes: 92 additions & 1 deletion dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,83 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
pass


class ListViewField(Field):
    """Field of logical type ``listview``.

    Unlike list offsets, list-view offsets need not be monotonic and
    views may overlap, so offsets and sizes are generated independently
    of each other.
    """

    def __init__(self, name, value_field, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        # Child field describing the element values viewed by this field.
        self.value_field = value_field

    @property
    def column_class(self):
        return ListViewColumn

    def _get_type(self):
        return OrderedDict([
            ('name', 'listview')
        ])

    def _get_children(self):
        return [self.value_field.get_json()]

    def generate_column(self, size, name=None):
        """Generate a column of ``size`` random list views.

        Parameters
        ----------
        size : int
            Number of list views to generate (may be 0).
        name : str, optional
            Column name; defaults to this field's name.
        """
        MAX_LIST_SIZE = 4
        VALUES_SIZE = size * MAX_LIST_SIZE

        is_valid = self._make_is_valid(size)

        # Clamp to 0 so that an empty batch (size == 0) yields
        # randint(0, 1, size=0) instead of raising
        # "ValueError: low >= high" — batch_sizes includes 0.
        # Any offset in [0, VALUES_SIZE - MAX_LIST_SIZE] is safe: even the
        # largest view (MAX_LIST_SIZE elements) stays inside the values
        # child array.
        MAX_OFFSET = max(VALUES_SIZE - MAX_LIST_SIZE, 0)
        offsets = np.random.randint(0, MAX_OFFSET + 1, size=size)
        sizes = np.random.randint(0, MAX_LIST_SIZE + 1, size=size)

        values = self.value_field.generate_column(VALUES_SIZE)

        if name is None:
            name = self.name
        return self.column_class(name, size, is_valid, offsets, sizes, values)


class LargeListViewField(ListViewField):
    """Field of logical type ``largelistview`` (64-bit offsets/sizes)."""

    @property
    def column_class(self):
        return LargeListViewColumn

    def _get_type(self):
        # Single-key mapping; kwargs preserve insertion order.
        return OrderedDict(name='largelistview')


class _BaseListViewColumn(Column):
    """Common JSON encoding for list-view columns.

    Subclasses mix in 32-bit or 64-bit offset encoding via
    ``_encode_offsets``.
    """

    def __init__(self, name, count, is_valid, offsets, sizes, values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.offsets = offsets
        self.sizes = sizes
        self.values = values

    def _get_buffers(self):
        validity = [int(flag) for flag in self.is_valid]
        # Sizes use the same integer width as offsets, so they share
        # the mixin's encoder.
        return [
            ('VALIDITY', validity),
            ('OFFSET', self._encode_offsets(self.offsets)),
            ('SIZE', self._encode_offsets(self.sizes)),
        ]

    def _get_children(self):
        return [self.values.get_json()]


# List-view column with 32-bit offsets and sizes.
class ListViewColumn(_BaseListViewColumn, _NarrowOffsetsMixin):
    pass


# Large-list-view column with 64-bit offsets and sizes (encoded as strings
# in the JSON integration format).
class LargeListViewColumn(_BaseListViewColumn, _LargeOffsetsMixin):
    pass


class MapField(Field):

def __init__(self, name, key_field, item_field, *, nullable=True,
Expand Down Expand Up @@ -1663,6 +1740,15 @@ def generate_binary_view_case():
return _generate_file("binary_view", fields, batch_sizes)


def generate_list_view_case():
    """Produce the ``list_view`` integration file: one 32-bit and one
    64-bit list-view column of float32 values, including an empty batch."""
    fields = [
        ListViewField('lv', get_field('item', 'float32')),
        LargeListViewField('llv', get_field('item', 'float32')),
    ]
    return _generate_file("list_view", fields, [0, 7, 256])


def generate_nested_large_offsets_case():
fields = [
LargeListField('large_list_nullable', get_field('item', 'int32')),
Expand Down Expand Up @@ -1847,7 +1933,12 @@ def _temp_path():

generate_binary_view_case()
.skip_tester('C#')
.skip_tester('Go')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),

generate_list_view_case()
.skip_tester('C#')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),
Expand Down
2 changes: 2 additions & 0 deletions dev/archery/archery/integration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def _run_test_cases(self,
``case_runner`` ran against ``test_cases``
"""
def case_wrapper(test_case):
    # NOTE(review): `serial`, `case_runner` and `printer` come from the
    # enclosing scope (not visible in this hunk) — confirm against
    # _run_test_cases.
    # In serial mode, run directly so output streams as the case runs.
    if serial:
        return case_runner(test_case)
    # Otherwise cork the printer, presumably so each case's output is
    # emitted atomically rather than interleaved across workers —
    # TODO confirm cork() semantics.
    with printer.cork():
        return case_runner(test_case)

Expand Down
23 changes: 18 additions & 5 deletions docs/source/format/Integration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden
**Type**: ::

{
"name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map"
"name" : "null|struct|list|largelist|listview|largelistview|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|utf8view|binaryview|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map|runendencoded"
}

A ``Type`` will have other fields as defined in
Expand Down Expand Up @@ -446,12 +446,22 @@ or ``DATA``.

``BufferData`` is encoded based on the type of buffer:

* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable
  ``Field`` still has a ``VALIDITY`` array, even though all values are 1.
* ``OFFSET``: a JSON array of integers for 32-bit offsets or
string-formatted integers for 64-bit offsets
* ``TYPE_ID``: a JSON array of integers
* ``DATA``: a JSON array of encoded values
string-formatted integers for 64-bit offsets.
* ``TYPE_ID``: a JSON array of integers.
* ``DATA``: a JSON array of encoded values.
* ``VARIADIC_DATA_BUFFERS``: a JSON array of data buffers represented as
hex encoded strings.
* ``VIEWS``: a JSON array of encoded views, which are JSON objects with:
* ``SIZE``: an integer indicating the size of the view,
* ``INLINED``: an encoded value (this field will be present if ``SIZE``
is smaller than 12, otherwise the next three fields will be present),
* ``PREFIX_HEX``: the first four bytes of the view encoded as hex,
* ``BUFFER_INDEX``: the index in ``VARIADIC_DATA_BUFFERS`` of the buffer
viewed,
* ``OFFSET``: the offset in the buffer viewed.

The value encoding for ``DATA`` is different depending on the logical
type:
Expand Down Expand Up @@ -527,6 +537,9 @@ in ``datagen.py``):
- Signed indices
- Unsigned indices
- Nested dictionaries
* Run end encoded
* Binary view and string view
* List view and large list view
* Extension Types


Expand Down
6 changes: 3 additions & 3 deletions go/arrow/array/encoded.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,19 @@ func (r *RunEndEncoded) LogicalRunEndsArray(mem memory.Allocator) arrow.Array {
case *Int16:
for _, v := range e.Int16Values()[physOffset : physOffset+physLength] {
v -= int16(r.data.offset)
v = int16(utils.MinInt(int(v), r.data.length))
v = int16(utils.Min(int(v), r.data.length))
bldr.(*Int16Builder).Append(v)
}
case *Int32:
for _, v := range e.Int32Values()[physOffset : physOffset+physLength] {
v -= int32(r.data.offset)
v = int32(utils.MinInt(int(v), r.data.length))
v = int32(utils.Min(int(v), r.data.length))
bldr.(*Int32Builder).Append(v)
}
case *Int64:
for _, v := range e.Int64Values()[physOffset : physOffset+physLength] {
v -= int64(r.data.offset)
v = int64(utils.MinInt(int(v), r.data.length))
v = int64(utils.Min(int(v), r.data.length))
bldr.(*Int64Builder).Append(v)
}
}
Expand Down
Loading

0 comments on commit 030dca9

Please sign in to comment.