diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py
index e9ccee08f25..56347ec618f 100644
--- a/sdk/python/feast/type_map.py
+++ b/sdk/python/feast/type_map.py
@@ -739,7 +739,7 @@ def _validate_collection_item_types(
     """
     if sample is None:
         return
-    if all(type(item) in valid_types for item in sample):
+    if all(type(item) in valid_types for item in sample if item is not None):
         return
 
     # to_numpy() upcasts INT32/INT64 with NULL to Float64 automatically
@@ -750,6 +750,10 @@ def _validate_collection_item_types(
         ValueType.INT64_SET,
     ]
     for item in sample:
+        # None elements are not type errors: list types replace them with a
+        # per-type default (see _LIST_NONE_DEFAULTS) before proto conversion.
+        if item is None:
+            continue
         if type(item) not in valid_types:
             if feast_value_type in int_collection_types:
                 # Check if the float values are due to NULL upcast
@@ -868,6 +872,56 @@ def convert_set_to_list(value: Any) -> Any:
     ]
 
 
+# Per-type default values substituted for None elements inside list columns.
+# Protobuf repeated fields do not accept None, so we replace with a
+# type-appropriate zero/empty value.
+# NOTE(review): "" is not a parseable UUID or Decimal -- confirm that readers of
+# UUID_LIST / TIME_UUID_LIST / DECIMAL_LIST values tolerate the empty string.
+_LIST_NONE_DEFAULTS: Dict[ValueType, Any] = {
+    ValueType.STRING_LIST: "",
+    ValueType.BYTES_LIST: b"",
+    ValueType.INT32_LIST: 0,
+    ValueType.INT64_LIST: 0,
+    ValueType.FLOAT_LIST: 0.0,
+    ValueType.DOUBLE_LIST: 0.0,
+    ValueType.BOOL_LIST: False,
+    ValueType.UNIX_TIMESTAMP_LIST: NULL_TIMESTAMP_INT_VALUE,
+    ValueType.UUID_LIST: "",
+    ValueType.TIME_UUID_LIST: "",
+    ValueType.DECIMAL_LIST: "",
+}
+
+
+def _sanitize_list_value(value: Any, feast_value_type: ValueType) -> Any:
+    """Normalise one list-column cell before it is packed into a proto message.
+
+    Arrow/Athena may deserialize array columns as numpy.ndarray with object dtype
+    instead of plain Python lists. Protobuf repeated fields do not accept ndarrays
+    or None elements, so we normalise here before building proto messages.
+
+    Args:
+        value: The cell to normalise. ndarrays are converted with ``tolist()``;
+            any value that is then not a list is returned unchanged.
+        feast_value_type: Selects the None replacement from
+            ``_LIST_NONE_DEFAULTS``; types without an entry keep None elements.
+
+    Returns:
+        None for an empty list or ndarray, a list with None elements replaced
+        by the per-type default otherwise (non-list inputs pass through).
+    """
+    if isinstance(value, np.ndarray):
+        value = value.tolist()
+    if not isinstance(value, list):
+        return value
+    if not value:
+        # An empty list is reported as None, i.e. the same as a missing cell.
+        return None
+    none_default = _LIST_NONE_DEFAULTS.get(feast_value_type)
+    if none_default is None:
+        return value
+    return [none_default if v is None else v for v in value]
+
+
 def _convert_list_values_to_proto(
     feast_value_type: ValueType,
     values: List[Any],
@@ -890,6 +944,11 @@ def _convert_list_values_to_proto(
         feast_value_type
     ]
 
+    # _sanitize_list_value returns non-list inputs (including None rows)
+    # unchanged, so no separate None guard is needed here.
+    values = [_sanitize_list_value(v, feast_value_type) for v in values]
+    sample = _sanitize_list_value(sample, feast_value_type)
+
     # Bytes to array type conversion
     if isinstance(sample, (bytes, bytearray)):
         if feast_value_type == ValueType.BYTES_LIST:
diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py
index bdaea63a607..6025cdaf590 100644
--- a/sdk/python/tests/unit/test_type_map.py
+++ b/sdk/python/tests/unit/test_type_map.py
@@ -2065,3 +2065,164 @@ def test_proto_field_name_in_map(self):
         from feast.type_map import PROTO_VALUE_TO_VALUE_TYPE_MAP
 
         assert PROTO_VALUE_TO_VALUE_TYPE_MAP["scalar_map_val"] == ValueType.SCALAR_MAP
+
+
+class TestArrowArrayStringListMaterialization:
+    """Regression tests for Array(String) columns from Arrow/Athena materialization.
+
+    Arrow/Athena deserializes Array(String) feature columns as numpy.ndarray with
+    object dtype. Two bugs were triggered:
+
+    1. ValueError: "The truth value of an empty array is ambiguous"
+       — when an empty ndarray reached the scalar null-check `elif not pd.isnull(value)`.
+
+    2. TypeError: "bad argument type for built-in operation"
+       — when proto_type(val=) was called; protobuf rejects ndarrays.
+
+    Both are fixed by _sanitize_list_value, which converts ndarrays to plain Python
+    lists and replaces None elements with a type-appropriate zero/empty default
+    (see _LIST_NONE_DEFAULTS).
+    """
+
+    def test_sanitize_list_value_ndarray(self):
+        """ndarray is converted to a plain Python list."""
+        from feast.type_map import _sanitize_list_value
+
+        arr = np.array(["foo", "bar"], dtype=object)
+        result = _sanitize_list_value(arr, ValueType.STRING_LIST)
+        assert result == ["foo", "bar"]
+        assert isinstance(result, list)
+
+    def test_sanitize_list_value_empty_ndarray(self):
+        """Empty ndarray is converted to None (treated as a missing row)."""
+        from feast.type_map import _sanitize_list_value
+
+        arr = np.array([], dtype=object)
+        result = _sanitize_list_value(arr, ValueType.STRING_LIST)
+        assert result is None
+
+    def test_sanitize_list_value_ndarray_with_none(self):
+        """None elements inside a STRING_LIST ndarray are replaced with empty string."""
+        from feast.type_map import _sanitize_list_value
+
+        arr = np.array(["foo", None, "baz"], dtype=object)
+        result = _sanitize_list_value(arr, ValueType.STRING_LIST)
+        assert result == ["foo", "", "baz"]
+
+    def test_sanitize_list_value_plain_list(self):
+        """Plain Python lists without None pass through unchanged."""
+        from feast.type_map import _sanitize_list_value
+
+        lst = ["foo", "bar"]
+        result = _sanitize_list_value(lst, ValueType.STRING_LIST)
+        assert result == ["foo", "bar"]
+
+    def test_sanitize_list_value_plain_list_with_none(self):
+        """None elements in a STRING_LIST plain list are replaced with empty string."""
+        from feast.type_map import _sanitize_list_value
+
+        lst = ["foo", None]
+        result = _sanitize_list_value(lst, ValueType.STRING_LIST)
+        assert result == ["foo", ""]
+
+    def test_sanitize_list_value_numeric_none_replaced(self):
+        """None elements in numeric lists are replaced with a type-appropriate default."""
+        from feast.type_map import _sanitize_list_value
+
+        assert _sanitize_list_value([1, None, 2], ValueType.INT32_LIST) == [1, 0, 2]
+        assert _sanitize_list_value([1, None, 2], ValueType.INT64_LIST) == [1, 0, 2]
+        assert _sanitize_list_value([1.0, None, 2.0], ValueType.FLOAT_LIST) == [
+            1.0,
+            0.0,
+            2.0,
+        ]
+        assert _sanitize_list_value([1.0, None, 2.0], ValueType.DOUBLE_LIST) == [
+            1.0,
+            0.0,
+            2.0,
+        ]
+        assert _sanitize_list_value([True, None, False], ValueType.BOOL_LIST) == [
+            True,
+            False,
+            False,
+        ]
+
+    def test_sanitize_list_value_bytes_none_replaced(self):
+        """None elements in BYTES_LIST are replaced with b''."""
+        from feast.type_map import _sanitize_list_value
+
+        result = _sanitize_list_value([b"x", None], ValueType.BYTES_LIST)
+        assert result == [b"x", b""]
+
+    def test_sanitize_list_value_scalar_passthrough(self):
+        """Non-list, non-ndarray values are returned unchanged."""
+        from feast.type_map import _sanitize_list_value
+
+        assert _sanitize_list_value("hello", ValueType.STRING_LIST) == "hello"
+        assert _sanitize_list_value(42, ValueType.INT32_LIST) == 42
+
+    def test_string_list_from_ndarray(self):
+        """STRING_LIST column with ndarray values materializes without TypeError."""
+        values = [
+            np.array(["foo", "bar"], dtype=object),
+            np.array(["baz"], dtype=object),
+        ]
+        protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
+        assert len(protos) == 2
+        assert list(protos[0].string_list_val.val) == ["foo", "bar"]
+        assert list(protos[1].string_list_val.val) == ["baz"]
+
+    def test_string_list_from_empty_ndarray(self):
+        """Empty ndarray in a STRING_LIST column must not raise ValueError."""
+        values = [
+            np.array([], dtype=object),
+            np.array(["foo"], dtype=object),
+        ]
+        protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
+        assert list(protos[0].string_list_val.val) == []
+        assert list(protos[1].string_list_val.val) == ["foo"]
+
+    def test_string_list_from_ndarray_with_none_elements(self):
+        """None elements inside an ndarray must not cause TypeError in protobuf."""
+        values = [
+            np.array(["foo", None, "baz"], dtype=object),
+        ]
+        protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
+        # None is replaced with empty string
+        assert list(protos[0].string_list_val.val) == ["foo", "", "baz"]
+
+    def test_string_list_null_row_produces_empty_proto(self):
+        """A None row (missing user) produces an empty ProtoValue."""
+        from feast.protos.feast.types.Value_pb2 import Value as ProtoValue
+
+        values = [
+            None,
+            np.array(["foo"], dtype=object),
+        ]
+        protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
+        assert protos[0] == ProtoValue()
+        assert list(protos[1].string_list_val.val) == ["foo"]
+
+    def test_mixed_batch_simulating_athena_chunk(self):
+        """Simulate a real Athena chunk: mix of ndarray, empty ndarray, and None rows.
+
+        This is the exact scenario that triggered the TypeError during
+        string_list_features materialization.
+        """
+        from feast.protos.feast.types.Value_pb2 import Value as ProtoValue
+
+        # tags / labels column from Athena
+        values = [
+            np.array(["foo", "bar"], dtype=object),  # normal entity
+            np.array([], dtype=object),  # entity with no values set
+            None,  # missing entity (NULL row)
+            np.array(["baz"], dtype=object),  # normal entity
+            np.array(["qux", None], dtype=object),  # entity with partial null
+        ]
+        protos = python_values_to_proto_values(values, ValueType.STRING_LIST)
+
+        assert list(protos[0].string_list_val.val) == ["foo", "bar"]
+        assert list(protos[1].string_list_val.val) == []
+        assert protos[2] == ProtoValue()
+        assert list(protos[3].string_list_val.val) == ["baz"]
+        assert list(protos[4].string_list_val.val) == ["qux", ""]