Skip to content

Commit

Permalink
apacheGH-39013: [Go][Integration] Support cABI import/export of StringView (apache#39019)
Browse files Browse the repository at this point in the history

### Rationale for this change

The Go implementation should support import/export of the new data types. This will enable integration testing between the C++ and Go implementations.

### What changes are included in this PR?

Added import/export for the new data types and arrays of data of those types.

### Are these changes tested?

Yes, they will be covered by the integration tests and existing Go unit tests.

### Are there any user-facing changes?

Yes, this is a user-facing change.

* Closes: apache#39013

Lead-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Co-authored-by: Matt Topol <zotthewizard@gmail.com>
Co-authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
3 people authored and clayburn committed Jan 23, 2024
1 parent 2ae8961 commit 030dca9
Show file tree
Hide file tree
Showing 73 changed files with 917 additions and 859 deletions.
93 changes: 92 additions & 1 deletion dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,83 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
pass


class ListViewField(Field):
    """Field of logical type ``listview``.

    Unlike list offsets, list-view offsets need not be monotonic and
    views may overlap, so offsets and sizes are generated independently
    of each other.
    """

    def __init__(self, name, value_field, *, nullable=True,
                 metadata=None):
        super().__init__(name, nullable=nullable,
                         metadata=metadata)
        # Child field describing the element values viewed by this field.
        self.value_field = value_field

    @property
    def column_class(self):
        return ListViewColumn

    def _get_type(self):
        return OrderedDict([
            ('name', 'listview')
        ])

    def _get_children(self):
        return [self.value_field.get_json()]

    def generate_column(self, size, name=None):
        """Generate a column of ``size`` random list views.

        Parameters
        ----------
        size : int
            Number of list views to generate (may be 0).
        name : str, optional
            Column name; defaults to this field's name.
        """
        MAX_LIST_SIZE = 4
        VALUES_SIZE = size * MAX_LIST_SIZE

        is_valid = self._make_is_valid(size)

        # Clamp to 0 so that an empty batch (size == 0) yields
        # randint(0, 1, size=0) instead of raising
        # "ValueError: low >= high" — batch_sizes includes 0.
        # Any offset in [0, VALUES_SIZE - MAX_LIST_SIZE] is safe: even the
        # largest view (MAX_LIST_SIZE elements) stays inside the values
        # child array.
        MAX_OFFSET = max(VALUES_SIZE - MAX_LIST_SIZE, 0)
        offsets = np.random.randint(0, MAX_OFFSET + 1, size=size)
        sizes = np.random.randint(0, MAX_LIST_SIZE + 1, size=size)

        values = self.value_field.generate_column(VALUES_SIZE)

        if name is None:
            name = self.name
        return self.column_class(name, size, is_valid, offsets, sizes, values)


class LargeListViewField(ListViewField):
    """Field of logical type ``largelistview`` (64-bit offsets/sizes)."""

    @property
    def column_class(self):
        return LargeListViewColumn

    def _get_type(self):
        # Single-key mapping; kwargs preserve insertion order.
        return OrderedDict(name='largelistview')


class _BaseListViewColumn(Column):
    """Common JSON encoding for list-view columns.

    Subclasses mix in 32-bit or 64-bit offset encoding via
    ``_encode_offsets``.
    """

    def __init__(self, name, count, is_valid, offsets, sizes, values):
        super().__init__(name, count)
        self.is_valid = is_valid
        self.offsets = offsets
        self.sizes = sizes
        self.values = values

    def _get_buffers(self):
        validity = [int(flag) for flag in self.is_valid]
        # Sizes use the same integer width as offsets, so they share
        # the mixin's encoder.
        return [
            ('VALIDITY', validity),
            ('OFFSET', self._encode_offsets(self.offsets)),
            ('SIZE', self._encode_offsets(self.sizes)),
        ]

    def _get_children(self):
        return [self.values.get_json()]


# List-view column with 32-bit offsets and sizes.
class ListViewColumn(_BaseListViewColumn, _NarrowOffsetsMixin):
    pass


# Large-list-view column with 64-bit offsets and sizes (encoded as strings
# in the JSON integration format).
class LargeListViewColumn(_BaseListViewColumn, _LargeOffsetsMixin):
    pass


class MapField(Field):

def __init__(self, name, key_field, item_field, *, nullable=True,
Expand Down Expand Up @@ -1663,6 +1740,15 @@ def generate_binary_view_case():
return _generate_file("binary_view", fields, batch_sizes)


def generate_list_view_case():
    """Produce the ``list_view`` integration file: one 32-bit and one
    64-bit list-view column of float32 values, including an empty batch."""
    fields = [
        ListViewField('lv', get_field('item', 'float32')),
        LargeListViewField('llv', get_field('item', 'float32')),
    ]
    return _generate_file("list_view", fields, [0, 7, 256])


def generate_nested_large_offsets_case():
fields = [
LargeListField('large_list_nullable', get_field('item', 'int32')),
Expand Down Expand Up @@ -1847,7 +1933,12 @@ def _temp_path():

generate_binary_view_case()
.skip_tester('C#')
.skip_tester('Go')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),

generate_list_view_case()
.skip_tester('C#')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),
Expand Down
2 changes: 2 additions & 0 deletions dev/archery/archery/integration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def _run_test_cases(self,
``case_runner`` ran against ``test_cases``
"""
def case_wrapper(test_case):
    # NOTE(review): `serial`, `case_runner` and `printer` come from the
    # enclosing scope (not visible in this hunk) — confirm against
    # _run_test_cases.
    # In serial mode, run directly so output streams as the case runs.
    if serial:
        return case_runner(test_case)
    # Otherwise cork the printer, presumably so each case's output is
    # emitted atomically rather than interleaved across workers —
    # TODO confirm cork() semantics.
    with printer.cork():
        return case_runner(test_case)

Expand Down
23 changes: 18 additions & 5 deletions docs/source/format/Integration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden
**Type**: ::

{
"name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map"
"name" : "null|struct|list|largelist|listview|largelistview|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|utf8view|binaryview|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map|runendencoded"
}

A ``Type`` will have other fields as defined in
Expand Down Expand Up @@ -446,12 +446,22 @@ or ``DATA``.

``BufferData`` is encoded based on the type of buffer:

* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable
  ``Field`` still has a ``VALIDITY`` array, even though all values are 1.
* ``OFFSET``: a JSON array of integers for 32-bit offsets or
string-formatted integers for 64-bit offsets
* ``TYPE_ID``: a JSON array of integers
* ``DATA``: a JSON array of encoded values
string-formatted integers for 64-bit offsets.
* ``TYPE_ID``: a JSON array of integers.
* ``DATA``: a JSON array of encoded values.
* ``VARIADIC_DATA_BUFFERS``: a JSON array of data buffers represented as
hex encoded strings.
* ``VIEWS``: a JSON array of encoded views, which are JSON objects with:
* ``SIZE``: an integer indicating the size of the view,
* ``INLINED``: an encoded value (this field will be present if ``SIZE``
is smaller than 12, otherwise the next three fields will be present),
* ``PREFIX_HEX``: the first four bytes of the view encoded as hex,
* ``BUFFER_INDEX``: the index in ``VARIADIC_DATA_BUFFERS`` of the buffer
viewed,
* ``OFFSET``: the offset in the buffer viewed.

The value encoding for ``DATA`` is different depending on the logical
type:
Expand Down Expand Up @@ -527,6 +537,9 @@ in ``datagen.py``):
- Signed indices
- Unsigned indices
- Nested dictionaries
* Run end encoded
* Binary view and string view
* List view and large list view
* Extension Types


Expand Down
6 changes: 3 additions & 3 deletions go/arrow/array/encoded.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,19 @@ func (r *RunEndEncoded) LogicalRunEndsArray(mem memory.Allocator) arrow.Array {
case *Int16:
for _, v := range e.Int16Values()[physOffset : physOffset+physLength] {
v -= int16(r.data.offset)
v = int16(utils.MinInt(int(v), r.data.length))
v = int16(utils.Min(int(v), r.data.length))
bldr.(*Int16Builder).Append(v)
}
case *Int32:
for _, v := range e.Int32Values()[physOffset : physOffset+physLength] {
v -= int32(r.data.offset)
v = int32(utils.MinInt(int(v), r.data.length))
v = int32(utils.Min(int(v), r.data.length))
bldr.(*Int32Builder).Append(v)
}
case *Int64:
for _, v := range e.Int64Values()[physOffset : physOffset+physLength] {
v -= int64(r.data.offset)
v = int64(utils.MinInt(int(v), r.data.length))
v = int64(utils.Min(int(v), r.data.length))
bldr.(*Int64Builder).Append(v)
}
}
Expand Down
Loading

0 comments on commit 030dca9

Please sign in to comment.