From ddc531400944ee1d820428a5f175118e10d92489 Mon Sep 17 00:00:00 2001
From: Casey Clements
Date: Tue, 3 Mar 2026 17:05:06 -0500
Subject: [PATCH 01/30] feat: Add MongoDB offline store (ibis-based PIT join,
 v1 alpha)

- MongoDBSource: DataSource backed by a MongoDB collection, schema sampled
  via $sample aggregation (default N=100)
- MongoDBOfflineStoreConfig: connection_string + default database
- MongoDBOfflineStore: delegates to ibis PIT join engine via in-memory
  memtable approach
- SavedDatasetMongoDBStorage: persist training datasets to MongoDB
- _build_data_source_reader/_build_data_source_writer closures capture
  config (connection_string, database) for MongoDB access

Signed-off-by: Casey Clements
---
 .../contrib/mongodb_offline_store/__init__.py |   1 +
 .../contrib/mongodb_offline_store/mongodb.py  | 224 ++++++++++++++
 .../mongodb_offline_store/mongodb_source.py   | 276 ++++++++++++++++++
 3 files changed, 501 insertions(+)
 create mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py
 create mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py
 create mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py

diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py
@@ -0,0 +1 @@
+
diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py
new file mode 100644
index 0000000000..482f2cdc88
--- /dev/null
+++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py
@@ -0,0 +1,224 @@
+# Copyright 2025 The Feast Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from datetime import datetime
+from typing import Any, Callable, List, Optional, Union
+
+import ibis
+import pandas as pd
+from ibis.expr.types import Table
+from pydantic import StrictStr
+
+try:
+    from pymongo import MongoClient
+except ImportError:
+    MongoClient = None  # type: ignore[assignment,misc]
+
+from feast.data_source import DataSource
+from feast.errors import (
+    FeastExtrasDependencyImportError,
+    SavedDatasetLocationAlreadyExists,
+)
+from feast.feature_view import FeatureView
+from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import (
+    MongoDBSource,
+)
+from feast.infra.offline_stores.ibis import (
+    get_historical_features_ibis,
+    pull_all_from_table_or_query_ibis,
+    pull_latest_from_table_or_query_ibis,
+)
+from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob
+from feast.infra.registry.base_registry import BaseRegistry
+from feast.repo_config import FeastConfigBaseModel, RepoConfig
+
+# Print RuntimeWarning only once per process.
+warnings.simplefilter("once", RuntimeWarning) + + +class MongoDBOfflineStoreConfig(FeastConfigBaseModel): + """Configuration for the MongoDB offline store.""" + + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStore" + """Offline store type selector""" + + connection_string: StrictStr = "mongodb://localhost:27017" + """MongoDB connection URI""" + + database: StrictStr = "feast" + """Default MongoDB database name""" + + +class MongoDBOfflineStore(OfflineStore): + """Offline store backed by MongoDB, using ibis for point-in-time joins.""" + + @staticmethod + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert isinstance(data_source, MongoDBSource) + warnings.warn( + "MongoDB offline store is in alpha. API may change without notice.", + RuntimeWarning, + ) + return pull_latest_from_table_or_query_ibis( + config=config, + data_source=data_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + data_source_reader=_build_data_source_reader(config), + data_source_writer=_build_data_source_writer(config), # type: ignore[arg-type] + ) + + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + warnings.warn( + "MongoDB offline store is in alpha. API may change without notice.", + RuntimeWarning, + ) + return get_historical_features_ibis( + config=config, + feature_views=feature_views, + feature_refs=feature_refs, + entity_df=entity_df, + registry=registry, + project=project, + full_feature_names=full_feature_names, + data_source_reader=_build_data_source_reader(config), + data_source_writer=_build_data_source_writer(config), # type: ignore[arg-type] + ) + + @staticmethod + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str] = None, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, + ) -> RetrievalJob: + assert isinstance(data_source, MongoDBSource) + warnings.warn( + "MongoDB offline store is in alpha. API may change without notice.", + RuntimeWarning, + ) + return pull_all_from_table_or_query_ibis( + config=config, + data_source=data_source, + join_key_columns=join_key_columns, + feature_name_columns=feature_name_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + start_date=start_date, + end_date=end_date, + data_source_reader=_build_data_source_reader(config), + data_source_writer=_build_data_source_writer(config), # type: ignore[arg-type] + ) + + +def _build_data_source_reader(config: RepoConfig) -> Callable[[DataSource, str], Table]: + """Return a closure that fetches a MongoDB collection as an ibis in-memory table.""" + + def reader(data_source: DataSource, repo_path: str) -> Table: + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." 
+ ) + assert isinstance(data_source, MongoDBSource) + connection_string = config.offline_store.connection_string + db_name = data_source.database or config.offline_store.database + client: Any = MongoClient(connection_string, tz_aware=True) + try: + docs = list(client[db_name][data_source.collection].find({}, {"_id": 0})) + finally: + client.close() + + df = pd.DataFrame(docs) + if df.empty: + return ibis.memtable(df) + + # Ensure datetime-like columns are timezone-aware UTC pandas timestamps. + for col in df.columns: + if pd.api.types.is_datetime64_any_dtype(df[col]): + if df[col].dt.tz is None: + df[col] = pd.to_datetime(df[col], utc=True) + elif df[col].dtype == object and len(df[col].dropna()) > 0: + sample = df[col].dropna().iloc[0] + if isinstance(sample, datetime): + try: + df[col] = pd.to_datetime(df[col], utc=True) + except Exception: + pass + + return ibis.memtable(df) + + return reader + + +def _build_data_source_writer( + config: RepoConfig, +) -> Callable[[Table, DataSource, str, str, bool], None]: + """Return a closure that writes an ibis table to a MongoDB collection.""" + + def writer( + table: Table, + data_source: DataSource, + repo_path: str, + mode: str = "append", + allow_overwrite: bool = False, + ) -> None: + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." + ) + assert isinstance(data_source, MongoDBSource) + connection_string = config.offline_store.connection_string + db_name = data_source.database or config.offline_store.database + location = f"{db_name}.{data_source.collection}" + client: Any = MongoClient(connection_string) + try: + coll = client[db_name][data_source.collection] + if mode == "overwrite": + if not allow_overwrite and coll.estimated_document_count() > 0: + raise SavedDatasetLocationAlreadyExists(location=location) + coll.drop() + records = table.to_pyarrow().to_pylist() + if records: + coll.insert_many(records) + finally: + client.close() + + return writer diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py new file mode 100644 index 0000000000..825f2910f7 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py @@ -0,0 +1,276 @@ +# Copyright 2025 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +from datetime import datetime +from typing import Any, Callable, Dict, Iterable, Optional, Tuple + +try: + from pymongo import MongoClient +except ImportError: + MongoClient = None # type: ignore[assignment,misc] + +from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException, FeastExtrasDependencyImportError +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.repo_config import RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.value_type import ValueType + + +def _infer_python_type_str(value: Any) -> Optional[str]: + """Infer a Feast-compatible type string from a Python value returned by pymongo.""" + if value is None: + return None + if isinstance(value, bool): + return "bool" + if isinstance(value, int): + return "int" + if isinstance(value, float): + return "float" + if isinstance(value, str): + return "str" + if isinstance(value, bytes): + return "bytes" + if isinstance(value, datetime): + return "datetime" + if isinstance(value, list): + if not value: + return "list[str]" + elem_type = _infer_python_type_str(value[0]) + if elem_type: + return f"list[{elem_type}]" + return "list[str]" + return None + + +def mongodb_to_feast_value_type(type_str: str) -> ValueType: + """Map a Python type string (from pymongo) to a Feast ValueType.""" + _MAP: Dict[str, ValueType] = { + "str": ValueType.STRING, + "int": ValueType.INT64, + "float": ValueType.DOUBLE, + "bool": ValueType.BOOL, + "bytes": ValueType.BYTES, + "datetime": ValueType.UNIX_TIMESTAMP, + "list[str]": ValueType.STRING_LIST, + "list[int]": ValueType.INT64_LIST, + "list[float]": ValueType.DOUBLE_LIST, + "list[bool]": ValueType.BOOL_LIST, + "list[bytes]": ValueType.BYTES_LIST, + "list[datetime]": ValueType.UNIX_TIMESTAMP_LIST, + } + return _MAP.get(type_str, ValueType.UNKNOWN) + + +class MongoDBOptions: + """Options for a MongoDB data source (database + collection).""" + + def __init__(self, database: str, collection: str): + self._database = database + self._collection = collection + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + return DataSourceProto.CustomSourceOptions( + configuration=json.dumps( + {"database": self._database, "collection": self._collection} + ).encode() + ) + + @classmethod + def from_proto( + cls, options_proto: DataSourceProto.CustomSourceOptions + ) -> "MongoDBOptions": + config = json.loads(options_proto.configuration.decode("utf8")) + return cls(database=config["database"], collection=config["collection"]) + + +class MongoDBSource(DataSource): + """A MongoDB collection as a Feast offline data source.""" + + def source_type(self) -> DataSourceProto.SourceType.ValueType: + return DataSourceProto.CUSTOM_SOURCE + + def __init__( + self, + name: Optional[str] = None, + database: Optional[str] = None, + collection: Optional[str] = None, + timestamp_field: Optional[str] = "", + created_timestamp_column: Optional[str] = "", + field_mapping: Optional[Dict[str, str]] = None, + description: Optional[str] = "", + tags: Optional[Dict[str, str]] = None, + owner: Optional[str] = "", + schema_sample_size: int = 100, + ): + if name is None and collection is None: + raise DataSourceNoNameException() + name = name or collection + assert name + + self._mongodb_options = MongoDBOptions( + database=database or "", + collection=collection or name, + ) + self._schema_sample_size = 
schema_sample_size + + super().__init__( + name=name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + description=description, + tags=tags, + owner=owner, + ) + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, MongoDBSource): + raise TypeError( + "Comparisons should only involve MongoDBSource class objects." + ) + return ( + super().__eq__(other) + and self._mongodb_options._database == other._mongodb_options._database + and self._mongodb_options._collection == other._mongodb_options._collection + and self.timestamp_field == other.timestamp_field + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + @property + def database(self) -> str: + return self._mongodb_options._database + + @property + def collection(self) -> str: + return self._mongodb_options._collection + + @staticmethod + def from_proto(data_source: DataSourceProto) -> "MongoDBSource": + assert data_source.HasField("custom_options") + options = json.loads(data_source.custom_options.configuration) + return MongoDBSource( + name=data_source.name, + database=options["database"], + collection=options["collection"], + field_mapping=dict(data_source.field_mapping), + timestamp_field=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + description=data_source.description, + tags=dict(data_source.tags), + owner=data_source.owner, + ) + + def _to_proto_impl(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + name=self.name, + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source.MongoDBSource", + field_mapping=self.field_mapping, + custom_options=self._mongodb_options.to_proto(), + description=self.description, + tags=self.tags, + owner=self.owner, + ) + data_source_proto.timestamp_field = self.timestamp_field + data_source_proto.created_timestamp_column = self.created_timestamp_column + return data_source_proto + + def validate(self, config: RepoConfig): + pass + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return mongodb_to_feast_value_type + + def get_table_query_string(self) -> str: + return f"{self._mongodb_options._database}.{self._mongodb_options._collection}" + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." 
+ ) + connection_string = config.offline_store.connection_string + db_name = self.database or config.offline_store.database + client: Any = MongoClient(connection_string, tz_aware=True) + try: + docs = list( + client[db_name][self.collection].aggregate( + [{"$sample": {"size": self._schema_sample_size}}] + ) + ) + finally: + client.close() + + field_type_counts: Dict[str, Dict[str, int]] = {} + for doc in docs: + for field, value in doc.items(): + if field == "_id": + continue + type_str = _infer_python_type_str(value) + if type_str is None: + continue + field_type_counts.setdefault(field, {}) + field_type_counts[field][type_str] = ( + field_type_counts[field].get(type_str, 0) + 1 + ) + + return [ + (field, max(counts, key=lambda t: counts[t])) + for field, counts in field_type_counts.items() + ] + + +class SavedDatasetMongoDBStorage(SavedDatasetStorage): + """Persists a Feast SavedDataset into a MongoDB collection.""" + + _proto_attr_name = "custom_storage" + + mongodb_options: MongoDBOptions + + def __init__(self, database: str, collection: str): + self.mongodb_options = MongoDBOptions( + database=database, + collection=collection, + ) + + @staticmethod + def from_proto( + storage_proto: SavedDatasetStorageProto, + ) -> "SavedDatasetMongoDBStorage": + options = json.loads(storage_proto.custom_storage.configuration) + return SavedDatasetMongoDBStorage( + database=options["database"], + collection=options["collection"], + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto(custom_storage=self.mongodb_options.to_proto()) + + def to_data_source(self) -> DataSource: + return MongoDBSource( + database=self.mongodb_options._database, + collection=self.mongodb_options._collection, + ) From 8b7f7105b679dfb928026982f188524b274f2823 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 4 Mar 2026 12:35:19 -0500 Subject: [PATCH 02/30] refactor: improve MongoDB offline store code quality - Update copyright headers to 2026 - Move mongodb_to_feast_value_type to feast/type_map.py, consistent with pg_type_to_feast_value_type and cb_columnar_type_to_feast_value_type - Add docstrings to MongoDBOptions.to_proto/from_proto, MongoDBSource class, and get_table_column_names_and_types - Replace dead 'assert name' with cast(str, ...) 
for type-checker safety - Add explanatory comment to validate() stub - Remove module-level warnings.simplefilter('once', RuntimeWarning), which was a process-wide side effect; per-call warnings.warn is enough - Convert all assert isinstance(data_source, MongoDBSource) guards to ValueError with descriptive messages in both public API methods and the reader/writer closures - Fix bug: add tz_aware=True to MongoClient in the writer closure, matching the reader, to ensure consistent timezone-aware datetime handling across read and write paths Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/mongodb.py | 31 ++++++++--- .../mongodb_offline_store/mongodb_source.py | 55 +++++++++++-------- sdk/python/feast/type_map.py | 24 ++++++++ 3 files changed, 77 insertions(+), 33 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index 482f2cdc88..23b1295286 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Feast Authors +# Copyright 2026 The Feast Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,9 +44,6 @@ from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig -# Print RuntimeWarning only once per process. -warnings.simplefilter("once", RuntimeWarning) - class MongoDBOfflineStoreConfig(FeastConfigBaseModel): """Configuration for the MongoDB offline store.""" @@ -75,7 +72,11 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: - assert isinstance(data_source, MongoDBSource) + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStore expected a MongoDBSource, " + f"got {type(data_source).__name__!r}." + ) warnings.warn( "MongoDB offline store is in alpha. API may change without notice.", RuntimeWarning, @@ -130,7 +131,11 @@ def pull_all_from_table_or_query( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, ) -> RetrievalJob: - assert isinstance(data_source, MongoDBSource) + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStore expected a MongoDBSource, " + f"got {type(data_source).__name__!r}." + ) warnings.warn( "MongoDB offline store is in alpha. API may change without notice.", RuntimeWarning, @@ -157,7 +162,11 @@ def reader(data_source: DataSource, repo_path: str) -> Table: raise FeastExtrasDependencyImportError( "mongodb", "pymongo is not installed." ) - assert isinstance(data_source, MongoDBSource) + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStore reader expected a MongoDBSource, " + f"got {type(data_source).__name__!r}." + ) connection_string = config.offline_store.connection_string db_name = data_source.database or config.offline_store.database client: Any = MongoClient(connection_string, tz_aware=True) @@ -204,11 +213,15 @@ def writer( raise FeastExtrasDependencyImportError( "mongodb", "pymongo is not installed." ) - assert isinstance(data_source, MongoDBSource) + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStore writer expected a MongoDBSource, " + f"got {type(data_source).__name__!r}." 
+ ) connection_string = config.offline_store.connection_string db_name = data_source.database or config.offline_store.database location = f"{db_name}.{data_source.collection}" - client: Any = MongoClient(connection_string) + client: Any = MongoClient(connection_string, tz_aware=True) try: coll = client[db_name][data_source.collection] if mode == "overwrite": diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py index 825f2910f7..ee55fe24e6 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py @@ -1,4 +1,4 @@ -# Copyright 2025 The Feast Authors +# Copyright 2026 The Feast Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ import json from datetime import datetime -from typing import Any, Callable, Dict, Iterable, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, cast try: from pymongo import MongoClient @@ -29,6 +29,7 @@ ) from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import mongodb_to_feast_value_type from feast.value_type import ValueType @@ -58,25 +59,6 @@ def _infer_python_type_str(value: Any) -> Optional[str]: return None -def mongodb_to_feast_value_type(type_str: str) -> ValueType: - """Map a Python type string (from pymongo) to a Feast ValueType.""" - _MAP: Dict[str, ValueType] = { - "str": ValueType.STRING, - "int": ValueType.INT64, - "float": ValueType.DOUBLE, - "bool": ValueType.BOOL, - "bytes": ValueType.BYTES, - "datetime": ValueType.UNIX_TIMESTAMP, - "list[str]": ValueType.STRING_LIST, - "list[int]": ValueType.INT64_LIST, - "list[float]": ValueType.DOUBLE_LIST, - "list[bool]": ValueType.BOOL_LIST, - "list[bytes]": ValueType.BYTES_LIST, - "list[datetime]": ValueType.UNIX_TIMESTAMP_LIST, - } - return _MAP.get(type_str, ValueType.UNKNOWN) - - class MongoDBOptions: """Options for a MongoDB data source (database + collection).""" @@ -85,6 +67,7 @@ def __init__(self, database: str, collection: str): self._collection = collection def to_proto(self) -> DataSourceProto.CustomSourceOptions: + """Serialize database and collection names as JSON into a CustomSourceOptions proto.""" return DataSourceProto.CustomSourceOptions( configuration=json.dumps( {"database": self._database, "collection": self._collection} @@ -95,12 +78,28 @@ def to_proto(self) -> DataSourceProto.CustomSourceOptions: def from_proto( cls, options_proto: DataSourceProto.CustomSourceOptions ) -> "MongoDBOptions": + """Deserialize a CustomSourceOptions proto back into a MongoDBOptions instance.""" config = json.loads(options_proto.configuration.decode("utf8")) return cls(database=config["database"], collection=config["collection"]) class MongoDBSource(DataSource): - """A MongoDB collection as a Feast offline data source.""" + """A MongoDB collection used as a Feast offline data source. + + ``name`` is the logical Feast name for this source. If omitted, it defaults + to the value of ``collection``. At least one of ``name`` or ``collection`` + must be supplied. + + ``database`` is the MongoDB database that contains the collection. 
When + omitted it falls back to ``MongoDBOfflineStoreConfig.database`` at query + time, so a single store-level default can be shared across many sources. + + ``schema_sample_size`` controls how many documents are randomly sampled + when Feast infers the collection schema (used by ``feast apply`` and + ``get_table_column_names_and_types``). Increase it for collections with + highly variable document shapes; decrease it to speed up ``feast apply`` + at the cost of schema coverage. + """ def source_type(self) -> DataSourceProto.SourceType.ValueType: return DataSourceProto.CUSTOM_SOURCE @@ -120,8 +119,8 @@ def __init__( ): if name is None and collection is None: raise DataSourceNoNameException() - name = name or collection - assert name + # At least one of name / collection is non-None; cast to satisfy the type checker. + name = cast(str, name or collection) self._mongodb_options = MongoDBOptions( database=database or "", @@ -196,6 +195,8 @@ def _to_proto_impl(self) -> DataSourceProto: return data_source_proto def validate(self, config: RepoConfig): + # No upfront schema validation is required for MongoDB; the connection + # is exercised lazily when features are actually retrieved. pass @staticmethod @@ -208,6 +209,12 @@ def get_table_query_string(self) -> str: def get_table_column_names_and_types( self, config: RepoConfig ) -> Iterable[Tuple[str, str]]: + """Sample documents from the collection to infer field names and their Feast type strings. + + Uses ``$sample`` to fetch up to ``schema_sample_size`` documents, then + picks the most-frequent Python type observed per field. The ``_id`` + field is always excluded. + """ if MongoClient is None: raise FeastExtrasDependencyImportError( "mongodb", "pymongo is not installed." diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 5e77f532c9..b383963c0c 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -1762,6 +1762,30 @@ def cb_columnar_type_to_feast_value_type(type_str: str) -> ValueType: return value +def mongodb_to_feast_value_type(type_str: str) -> ValueType: + """Map a Python type string (as inferred from pymongo documents) to a Feast ValueType. + + The type strings are produced by + ``feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source._infer_python_type_str``. + Unrecognised strings are mapped to ``ValueType.UNKNOWN``. + """ + type_map: Dict[str, ValueType] = { + "str": ValueType.STRING, + "int": ValueType.INT64, + "float": ValueType.DOUBLE, + "bool": ValueType.BOOL, + "bytes": ValueType.BYTES, + "datetime": ValueType.UNIX_TIMESTAMP, + "list[str]": ValueType.STRING_LIST, + "list[int]": ValueType.INT64_LIST, + "list[float]": ValueType.DOUBLE_LIST, + "list[bool]": ValueType.BOOL_LIST, + "list[bytes]": ValueType.BYTES_LIST, + "list[datetime]": ValueType.UNIX_TIMESTAMP_LIST, + } + return type_map.get(type_str, ValueType.UNKNOWN) + + def convert_scalar_column( series: pd.Series, value_type: ValueType, target_pandas_type: str ) -> pd.Series: From 62695aa3be52916e09aa98192e9cba19475af5fa Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 9 Mar 2026 10:13:47 -0400 Subject: [PATCH 03/30] Started work on full Mongo/MQL implementation. 
Kept MongoDBOfflineStoreIbis and MongoDBOfflineStoreNative Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/mongodb.py | 421 +++++++++++++++++- 1 file changed, 413 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index 23b1295286..89794e3ba8 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -13,11 +13,12 @@ # limitations under the License. import warnings -from datetime import datetime -from typing import Any, Callable, List, Optional, Union +from datetime import datetime, timezone +from typing import Any, Callable, Dict, List, Optional, Union import ibis import pandas as pd +import pyarrow from ibis.expr.types import Table from pydantic import StrictStr @@ -40,15 +41,23 @@ pull_all_from_table_or_query_ibis, pull_latest_from_table_or_query_ibis, ) -from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) +from feast.infra.offline_stores.offline_utils import ( + infer_event_timestamp_from_entity_df, +) from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage -class MongoDBOfflineStoreConfig(FeastConfigBaseModel): - """Configuration for the MongoDB offline store.""" +class MongoDBOfflineStoreIbisConfig(FeastConfigBaseModel): + """Configuration for the MongoDB Ibis-backed offline store.""" - type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStore" + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStoreIbis" """Offline store type selector""" connection_string: StrictStr = "mongodb://localhost:27017" @@ -58,8 +67,8 @@ class MongoDBOfflineStoreConfig(FeastConfigBaseModel): """Default MongoDB database name""" -class MongoDBOfflineStore(OfflineStore): - """Offline store backed by MongoDB, using ibis for point-in-time joins.""" +class MongoDBOfflineStoreIbis(OfflineStore): + """Offline store backed by MongoDB, using Ibis for point-in-time joins.""" @staticmethod def pull_latest_from_table_or_query( @@ -235,3 +244,399 @@ def writer( client.close() return writer + + +# --------------------------------------------------------------------------- +# Native MQL implementation +# --------------------------------------------------------------------------- + + +class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): + """Configuration for the MongoDB native-MQL offline store.""" + + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStoreNative" + """Offline store type selector""" + + connection_string: StrictStr = "mongodb://localhost:27017" + """MongoDB connection URI""" + + database: StrictStr = "feast" + """Default MongoDB database name""" + + +def _fetch_collection_as_arrow( + connection_string: str, + db_name: str, + collection: str, + pipeline: Optional[List[Dict]] = None, +) -> pyarrow.Table: + """Run an aggregation pipeline (or full scan) via PyMongo and return a pyarrow Table. + + If *pipeline* is None the entire collection is scanned (``_id`` excluded). 
+ The ``_id`` field is stripped from every result document before conversion. + """ + if MongoClient is None: + raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") + client: Any = MongoClient(connection_string, tz_aware=True) + try: + if pipeline is not None: + docs = list(client[db_name][collection].aggregate(pipeline)) + else: + docs = list(client[db_name][collection].find({}, {"_id": 0})) + finally: + client.close() + + if not docs: + return pyarrow.table({}) + + for doc in docs: + doc.pop("_id", None) + + return pyarrow.Table.from_pylist(docs) + + +class MongoDBNativeRetrievalJob(RetrievalJob): + """A RetrievalJob whose results come from a lazy PyMongo query callable. + + The callable is only executed when the caller materialises the job (e.g. + ``to_df()``, ``to_arrow()``, ``persist()``). + """ + + def __init__( + self, + query_fn: Callable[[], pyarrow.Table], + full_feature_names: bool, + on_demand_feature_views: List, + metadata: Optional[RetrievalMetadata], + config: RepoConfig, + ) -> None: + super().__init__() + self._query_fn = query_fn + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views or [] + self._metadata = metadata + self._config = config + + def _to_arrow_internal(self, timeout: Optional[int] = None) -> pyarrow.Table: + return self._query_fn() + + def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: + return self._to_arrow_internal().to_pandas() + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> List: + return self._on_demand_feature_views + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + return self._metadata + + def persist( + self, + storage: SavedDatasetStorage, + allow_overwrite: bool = False, + timeout: Optional[int] = None, + ) -> None: + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." + ) + data_source = storage.to_data_source() + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBNativeRetrievalJob.persist expected a MongoDBSource storage, " + f"got {type(data_source).__name__!r}." + ) + table = self._to_arrow_internal() + connection_string = self._config.offline_store.connection_string + db_name = data_source.database or self._config.offline_store.database + location = f"{db_name}.{data_source.collection}" + client: Any = MongoClient(connection_string, tz_aware=True) + try: + coll = client[db_name][data_source.collection] + if not allow_overwrite and coll.estimated_document_count() > 0: + raise SavedDatasetLocationAlreadyExists(location=location) + coll.drop() + records = table.to_pylist() + if records: + coll.insert_many(records) + finally: + client.close() + + +class MongoDBOfflineStoreNative(OfflineStore): + """Offline store backed by MongoDB using native MQL aggregation pipelines. + + Compared with :class:`MongoDBOfflineStoreIbis`, this implementation avoids + the Ibis dependency entirely. 
The four main workflows map to:
+
+    * ``offline_write_batch`` – Arrow → ``insert_many``
+    * ``pull_latest_from_table_or_query`` – ``$match`` → ``$sort`` → ``$group``
+    * ``pull_all_from_table_or_query`` – ``$match`` → ``$project``
+    * ``get_historical_features`` – per-collection fetch + ``merge_asof``
+    """
+
+    @staticmethod
+    def offline_write_batch(
+        config: RepoConfig,
+        feature_view: FeatureView,
+        table: pyarrow.Table,
+        progress: Optional[Callable[[int], Any]],
+    ) -> None:
+        if MongoClient is None:
+            raise FeastExtrasDependencyImportError(
+                "mongodb", "pymongo is not installed."
+            )
+        data_source = feature_view.batch_source
+        if not isinstance(data_source, MongoDBSource):
+            raise ValueError(
+                f"MongoDBOfflineStoreNative.offline_write_batch expected a MongoDBSource, "
+                f"got {type(data_source).__name__!r}."
+            )
+        connection_string = config.offline_store.connection_string
+        db_name = data_source.database or config.offline_store.database
+        records = table.to_pylist()
+        client: Any = MongoClient(connection_string, tz_aware=True)
+        try:
+            coll = client[db_name][data_source.collection]
+            if records:
+                coll.insert_many(records)
+            if progress:
+                progress(len(records))
+        finally:
+            client.close()
+
+    @staticmethod
+    def pull_latest_from_table_or_query(
+        config: RepoConfig,
+        data_source: DataSource,
+        join_key_columns: List[str],
+        feature_name_columns: List[str],
+        timestamp_field: str,
+        created_timestamp_column: Optional[str],
+        start_date: datetime,
+        end_date: datetime,
+    ) -> RetrievalJob:
+        if not isinstance(data_source, MongoDBSource):
+            raise ValueError(
+                f"MongoDBOfflineStoreNative expected a MongoDBSource, "
+                f"got {type(data_source).__name__!r}."
+            )
+        warnings.warn(
+            "MongoDB offline store (native) is in alpha. API may change without notice.",
+            RuntimeWarning,
+        )
+        start_utc = start_date.astimezone(tz=timezone.utc)
+        end_utc = end_date.astimezone(tz=timezone.utc)
+        connection_string = config.offline_store.connection_string
+        db_name = data_source.database or config.offline_store.database
+        collection = data_source.collection
+
+        sort_spec: Dict = {timestamp_field: -1}
+        if created_timestamp_column:
+            sort_spec[created_timestamp_column] = -1
+
+        group_id = {k: f"${k}" for k in join_key_columns}
+        group_stage: Dict = {
+            "_id": group_id, # todo this isn't correct.
or i don't follow + **{f: {"$first": f"${f}"} for f in feature_name_columns}, + timestamp_field: {"$first": f"${timestamp_field}"}, + } + if created_timestamp_column: + group_stage[created_timestamp_column] = { + "$first": f"${created_timestamp_column}" + } + + project_stage: Dict = { + "_id": 0, + **{k: f"$_id.{k}" for k in join_key_columns}, # todo here too + **{f: 1 for f in feature_name_columns}, + timestamp_field: 1, + } + if created_timestamp_column: + project_stage[created_timestamp_column] = 1 + + pipeline = [ + {"$match": {timestamp_field: {"$gte": start_utc, "$lte": end_utc}}}, + {"$sort": sort_spec}, + {"$group": group_stage}, + {"$project": project_stage}, + ] + + def _run() -> pyarrow.Table: + return _fetch_collection_as_arrow( + connection_string, db_name, collection, pipeline + ) + + return MongoDBNativeRetrievalJob( + query_fn=_run, + full_feature_names=False, + on_demand_feature_views=[], + metadata=None, + config=config, + ) + + @staticmethod + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str] = None, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, + ) -> RetrievalJob: + if not isinstance(data_source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStoreNative expected a MongoDBSource, " + f"got {type(data_source).__name__!r}." + ) + warnings.warn( + "MongoDB offline store (native) is in alpha. API may change without notice.", + RuntimeWarning, + ) + connection_string = config.offline_store.connection_string + db_name = data_source.database or config.offline_store.database + collection = data_source.collection + + fields = join_key_columns + feature_name_columns + [timestamp_field] + if created_timestamp_column: + fields.append(created_timestamp_column) + + match_filter: Dict = {} + if start_date or end_date: + ts_filter: Dict = {} + if start_date: + ts_filter["$gte"] = start_date.astimezone(tz=timezone.utc) + if end_date: + ts_filter["$lte"] = end_date.astimezone(tz=timezone.utc) + match_filter[timestamp_field] = ts_filter + + pipeline = [ + {"$match": match_filter}, + {"$project": {"_id": 0, **{f: 1 for f in fields}}}, + ] + + def _run() -> pyarrow.Table: + return _fetch_collection_as_arrow( + connection_string, db_name, collection, pipeline + ) + + return MongoDBNativeRetrievalJob( + query_fn=_run, + full_feature_names=False, + on_demand_feature_views=[], + metadata=None, + config=config, + ) + + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + if isinstance(entity_df, str): + raise ValueError( + "MongoDBOfflineStoreNative does not support SQL entity_df strings. " + "Pass a pandas DataFrame instead." + ) + warnings.warn( + "MongoDB offline store (native) is in alpha. 
API may change without notice.", # todo change wording: alpha -> preview + RuntimeWarning, + ) + connection_string = config.offline_store.connection_string + default_db = config.offline_store.database + + entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)) + event_timestamp_col = infer_event_timestamp_from_entity_df(entity_schema) + + # Map "feature_view:feature" refs → {fv_name: [feature, ...]} + fv_to_features: Dict[str, List[str]] = {} + for ref in feature_refs: + fv_name, feat_name = ref.split(":", 1) + fv_to_features.setdefault(fv_name, []).append(feat_name) + + fv_by_name = {fv.name: fv for fv in feature_views} + + def _run() -> pyarrow.Table: + result = entity_df.copy() + # Ensure the entity timestamp is tz-aware UTC for merge_asof + if result[event_timestamp_col].dt.tz is None: + result[event_timestamp_col] = pd.to_datetime( + result[event_timestamp_col], utc=True + ) + result = result.sort_values(event_timestamp_col) + + for fv_name, features in fv_to_features.items(): + fv = fv_by_name[fv_name] + source = fv.batch_source + if not isinstance(source, MongoDBSource): + raise ValueError( + f"MongoDBOfflineStoreNative: feature view {fv_name!r} has " + f"a non-MongoDBSource batch source ({type(source).__name__!r})." + ) + db_name = source.database or default_db + ts_field = source.timestamp_field + join_keys = [e.name for e in fv.entity_columns] + + arrow_table = _fetch_collection_as_arrow( + connection_string, db_name, source.collection + ) + if arrow_table.num_rows == 0: + for f in features: + col = f"{fv_name}__{f}" if full_feature_names else f + result[col] = None + continue + + feature_df = arrow_table.to_pandas() + # Ensure tz-aware UTC + if feature_df[ts_field].dt.tz is None: + feature_df[ts_field] = pd.to_datetime( + feature_df[ts_field], utc=True + ) + feature_df = feature_df.sort_values(ts_field) + + col_rename = { + f: (f"{fv_name}__{f}" if full_feature_names else f) + for f in features + } + cols_to_select = join_keys + features + [ts_field] + feature_df = feature_df[cols_to_select].rename(columns=col_rename) + out_features = list(col_rename.values()) + + merged = pd.merge_asof( + result, + feature_df, + left_on=event_timestamp_col, + right_on=ts_field, + by=join_keys, + direction="backward", + ) + # Apply TTL: null out features whose timestamp is too far in the past + if fv.ttl: + cutoff = merged[event_timestamp_col] - fv.ttl + too_old = merged[ts_field] < cutoff + for col in out_features: + merged.loc[too_old, col] = None + + result = merged.drop(columns=[ts_field], errors="ignore") + + return pyarrow.Table.from_pandas(result, preserve_index=False) + + return MongoDBNativeRetrievalJob( + query_fn=_run, + full_feature_names=full_feature_names, + on_demand_feature_views=[], + metadata=None, + config=config, + ) From 812d03d4583e56b745d53ddd5263a1ff12930ee9 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 17 Mar 2026 11:14:25 -0400 Subject: [PATCH 04/30] refactor: rename alpha to preview, clarify MQL pipeline comments Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/mongodb.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index 89794e3ba8..ee37b11c41 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -87,7 +87,7 
@@ def pull_latest_from_table_or_query( f"got {type(data_source).__name__!r}." ) warnings.warn( - "MongoDB offline store is in alpha. API may change without notice.", + "MongoDB offline store is in preview. API may change without notice.", RuntimeWarning, ) return pull_latest_from_table_or_query_ibis( @@ -114,7 +114,7 @@ def get_historical_features( full_feature_names: bool = False, ) -> RetrievalJob: warnings.warn( - "MongoDB offline store is in alpha. API may change without notice.", + "MongoDB offline store is in preview. API may change without notice.", RuntimeWarning, ) return get_historical_features_ibis( @@ -146,7 +146,7 @@ def pull_all_from_table_or_query( f"got {type(data_source).__name__!r}." ) warnings.warn( - "MongoDB offline store is in alpha. API may change without notice.", + "MongoDB offline store is in preview. API may change without notice.", RuntimeWarning, ) return pull_all_from_table_or_query_ibis( @@ -178,7 +178,7 @@ def reader(data_source: DataSource, repo_path: str) -> Table: ) connection_string = config.offline_store.connection_string db_name = data_source.database or config.offline_store.database - client: Any = MongoClient(connection_string, tz_aware=True) + client: Any = MongoClient(connection_string) try: docs = list(client[db_name][data_source.collection].find({}, {"_id": 0})) finally: @@ -188,17 +188,17 @@ def reader(data_source: DataSource, repo_path: str) -> Table: if df.empty: return ibis.memtable(df) - # Ensure datetime-like columns are timezone-aware UTC pandas timestamps. + # Localize naive datetime columns to UTC. MongoDB stores all dates as UTC, + # and with tz_aware=False (default), pymongo returns naive datetime objects. + # We convert them to timezone-aware UTC timestamps for pyarrow compatibility. for col in df.columns: - if pd.api.types.is_datetime64_any_dtype(df[col]): - if df[col].dt.tz is None: - df[col] = pd.to_datetime(df[col], utc=True) - elif df[col].dtype == object and len(df[col].dropna()) > 0: + if df[col].dtype == object and len(df[col].dropna()) > 0: sample = df[col].dropna().iloc[0] if isinstance(sample, datetime): try: df[col] = pd.to_datetime(df[col], utc=True) - except Exception: + except (ValueError, TypeError): + # Skip columns that can't be converted (e.g., mixed types) pass return ibis.memtable(df) @@ -427,7 +427,7 @@ def pull_latest_from_table_or_query( f"got {type(data_source).__name__!r}." ) warnings.warn( - "MongoDB offline store (native) is in alpha. API may change without notice.", + "MongoDB offline store (native) is in preview. API may change without notice.", RuntimeWarning, ) start_utc = start_date.astimezone(tz=timezone.utc) @@ -436,13 +436,17 @@ def pull_latest_from_table_or_query( db_name = data_source.database or config.offline_store.database collection = data_source.collection + # Sort by timestamp descending so $first in $group gets the latest document sort_spec: Dict = {timestamp_field: -1} if created_timestamp_column: sort_spec[created_timestamp_column] = -1 + # Group by entity/join keys. _id becomes a subdocument like {driver_id: 1}. + # $first grabs values from the first document in each group (the latest, + # due to prior $sort). group_id = {k: f"${k}" for k in join_key_columns} group_stage: Dict = { - "_id": group_id, # todo this isn't correct. 
or i don't follow
+            "_id": group_id,
             **{f: {"$first": f"${f}"} for f in feature_name_columns},
             timestamp_field: {"$first": f"${timestamp_field}"},
         }
@@ -451,9 +455,11 @@ def pull_latest_from_table_or_query(
                 "$first": f"${created_timestamp_column}"
             }
 
+        # Project to flatten the output: extract join keys from _id subdocument,
+        # include feature columns directly. Excludes the _id field from output.
         project_stage: Dict = {
             "_id": 0,
-            **{k: f"$_id.{k}" for k in join_key_columns}, # todo here too
+            **{k: f"$_id.{k}" for k in join_key_columns},
             **{f: 1 for f in feature_name_columns},
             timestamp_field: 1,
         }
@@ -497,7 +503,7 @@ def pull_all_from_table_or_query(
             f"got {type(data_source).__name__!r}."
         )
         warnings.warn(
-            "MongoDB offline store (native) is in alpha. API may change without notice.",
+            "MongoDB offline store (native) is in preview. API may change without notice.",
             RuntimeWarning,
         )
         connection_string = config.offline_store.connection_string
@@ -551,7 +557,7 @@ def get_historical_features(
             "Pass a pandas DataFrame instead."
         )
         warnings.warn(
-            "MongoDB offline store (native) is in alpha. API may change without notice.", # todo change wording: alpha -> preview
+            "MongoDB offline store (native) is in preview. API may change without notice.",
             RuntimeWarning,
        )
         connection_string = config.offline_store.connection_string
         default_db = config.offline_store.database

From c3401ea2524cee785d9dd96a070ea539e0816726 Mon Sep 17 00:00:00 2001
From: Casey Clements
Date: Tue, 17 Mar 2026 11:27:44 -0400
Subject: [PATCH 05/30] Added unit tests for offline store retrieval,
 requiring docker and pymongo, skipping when unavailable.

Signed-off-by: Casey Clements
---
 .../contrib/test_mongodb_offline_retrieval.py | 388 ++++++++++++++++++
 1 file changed, 388 insertions(+)
 create mode 100644 sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py

diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py
new file mode 100644
index 0000000000..cd83e33c0d
--- /dev/null
+++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py
@@ -0,0 +1,388 @@
+"""
+Unit tests for MongoDB offline store (Ibis-based implementation).
+
+Docker-dependent tests are marked with ``@_requires_docker`` and are skipped when
+Docker is unavailable.
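+
+For example, assuming a standard pytest setup at the repo root, this module can
+be run on its own with::
+
+    pytest sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py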
+""" + +from datetime import datetime, timedelta +from typing import Generator +from unittest.mock import MagicMock + +import pandas as pd +import pytest +import pytz + +pytest.importorskip("pymongo") + +from pymongo import MongoClient +from testcontainers.mongodb import MongoDbContainer + +from feast import Entity, FeatureView, Field +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( + MongoDBOfflineStoreIbis, + MongoDBOfflineStoreIbisConfig, +) +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( + MongoDBSource, +) +from feast.repo_config import RepoConfig +from feast.types import Float64, Int64 +from feast.value_type import ValueType + +# Check if Docker is available +docker_available = False +try: + import docker + + try: + client = docker.from_env() + client.ping() + docker_available = True + except Exception: + pass +except ImportError: + pass + +_requires_docker = pytest.mark.skipif( + not docker_available, + reason="Docker is not available or not running.", +) + + +@pytest.fixture(scope="module") +def mongodb_container() -> Generator[MongoDbContainer, None, None]: + """Start a MongoDB container for testing.""" + container = MongoDbContainer( + "mongo:latest", + username="test", + password="test", # pragma: allowlist secret + ).with_exposed_ports(27017) + container.start() + yield container + container.stop() + + +@pytest.fixture +def mongodb_connection_string(mongodb_container: MongoDbContainer) -> str: + """Get MongoDB connection string from the container.""" + exposed_port = mongodb_container.get_exposed_port(27017) + return f"mongodb://test:test@localhost:{exposed_port}" # pragma: allowlist secret + + +@pytest.fixture +def repo_config(mongodb_connection_string: str) -> RepoConfig: + """Create a RepoConfig with MongoDB offline store.""" + return RepoConfig( + project="test_project", + registry="memory://", + provider="local", + offline_store=MongoDBOfflineStoreIbisConfig( + connection_string=mongodb_connection_string, + database="feast_test", + ), + online_store={"type": "sqlite"}, + entity_key_serialization_version=3, + ) + + +@pytest.fixture +def sample_data(mongodb_connection_string: str) -> datetime: + """Insert sample driver stats data into MongoDB. + + Returns the 'now' timestamp used as the latest event_timestamp. + + Note: The collection name 'driver_stats' is defined in the MongoDBSource + (see driver_source fixture), not in the RepoConfig. RepoConfig provides + connection_string and database; the source defines the collection. 
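+
+    A minimal sketch of how the two halves combine (names below are taken
+    from the fixtures in this module)::
+
+        MongoDBOfflineStoreIbisConfig(connection_string=..., database="feast_test")
+        MongoDBSource(name="driver_stats", collection="driver_stats", ...)
+        # together these read from / write to feast_test.driver_stats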
+ """ + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["driver_stats"] + collection.drop() + + now = datetime.now(tz=pytz.UTC) + docs = [ + { + "driver_id": 1, + "conv_rate": 0.5, + "acc_rate": 0.9, + "event_timestamp": now - timedelta(hours=2), + }, + { + "driver_id": 1, + "conv_rate": 0.6, + "acc_rate": 0.85, + "event_timestamp": now - timedelta(hours=1), + }, + {"driver_id": 1, "conv_rate": 0.7, "acc_rate": 0.8, "event_timestamp": now}, + { + "driver_id": 2, + "conv_rate": 0.3, + "acc_rate": 0.95, + "event_timestamp": now - timedelta(hours=2), + }, + # Driver 2 has no "now" timestamp - only data from 2 hours ago + # This tests that pull_latest correctly handles entities with different latest timestamps + ] + collection.insert_many(docs) + client.close() + return now + + +@pytest.fixture +def driver_source() -> MongoDBSource: + """Create a MongoDBSource for driver stats.""" + return MongoDBSource( + name="driver_stats", + database="feast_test", + collection="driver_stats", + timestamp_field="event_timestamp", + ) + + +@pytest.fixture +def driver_fv(driver_source: MongoDBSource) -> FeatureView: + """Create a FeatureView for driver stats. + + The ttl (time-to-live) parameter defines how far back in time Feast will look + for feature values during point-in-time joins. If a feature's event_timestamp + is older than (entity_timestamp - ttl), that feature value is considered stale + and will be returned as NULL. + + This is different from MongoDB TTL indexes which automatically delete documents + after a period of time. Feast TTL is a query-time filter, not a storage policy. + """ + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + return FeatureView( + name="driver_stats", + entities=[driver_entity], + schema=[ + # Include entity column in schema so entity_columns is populated + Field(name="driver_id", dtype=Int64), + Field(name="conv_rate", dtype=Float64), + Field(name="acc_rate", dtype=Float64), + ], + source=driver_source, + ttl=timedelta(days=1), + ) + + +@_requires_docker +def test_pull_latest_from_table_or_query( + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSource +) -> None: + """Test pulling latest features per entity from MongoDB. + + This test verifies that pull_latest returns only the most recent feature + values for each entity (driver_id), even when entities have different + latest timestamps. Driver 1 has data at now, but driver 2's latest data + is from 2 hours ago. 
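+
+    A sketch of the expected output, derived from the sample_data fixture
+    (one row per driver; column order is not guaranteed)::
+
+        driver_id  conv_rate  acc_rate  event_timestamp
+        1          0.7        0.8       now
+        2          0.3        0.95      now - 2h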
+ """ + now = sample_data + job = MongoDBOfflineStoreIbis.pull_latest_from_table_or_query( + config=repo_config, + data_source=driver_source, + join_key_columns=["driver_id"], + feature_name_columns=["conv_rate", "acc_rate"], + timestamp_field="event_timestamp", + created_timestamp_column=None, + start_date=now - timedelta(days=1), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + + # Validate DataFrame structure + assert isinstance(df, pd.DataFrame) + assert set(df.columns) == {"driver_id", "conv_rate", "acc_rate", "event_timestamp"} + assert len(df) == 2 # Two unique drivers + + # Extract rows for each driver + driver1_rows = df[df["driver_id"] == 1] + driver2_rows = df[df["driver_id"] == 2] + + # Each driver should have exactly one row (the latest) + assert len(driver1_rows) == 1 + assert len(driver2_rows) == 1 + + driver1 = driver1_rows.iloc[0] + driver2 = driver2_rows.iloc[0] + + # Validate types + assert isinstance(driver1["conv_rate"], float) + assert isinstance(driver1["acc_rate"], float) + + # Driver 1's latest values (from "now") + assert driver1["conv_rate"] == pytest.approx(0.7) + assert driver1["acc_rate"] == pytest.approx(0.8) + + # Driver 2's latest values (from 2 hours ago - driver 2 has no "now" data) + # This demonstrates that pull_latest correctly handles entities with + # different "latest" timestamps + assert driver2["conv_rate"] == pytest.approx(0.3) + assert driver2["acc_rate"] == pytest.approx(0.95) + + +@_requires_docker +def test_get_historical_features_pit_join( + repo_config: RepoConfig, sample_data: datetime, driver_fv: FeatureView +) -> None: + """Test point-in-time join retrieves correct feature values. + + Point-in-time (PIT) join ensures that for each entity row, we get the + feature values that were valid AT THAT POINT IN TIME - not future data + that would cause data leakage in ML training. 
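+
+    Timeline sketch for driver 1, using the write times from the sample_data
+    fixture::
+
+        writes:  conv_rate=0.5 @ now-2h   0.6 @ now-1h   0.7 @ now
+        ask "as of now-1h30m" -> 0.5  (latest write at or before that time)
+        ask "as of now-30m"   -> 0.6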
+ """ + now = sample_data + + # Entity dataframe: request features at specific timestamps + # Each row says "give me driver X's features as they were at time T" + entity_df = pd.DataFrame( + { + "driver_id": [1, 1, 2], + "event_timestamp": [ + now + - timedelta( + hours=1, minutes=30 + ), # Should get conv_rate=0.5 (before 0.6 was written) + now + - timedelta( + minutes=30 + ), # Should get conv_rate=0.6 (before 0.7 was written) + now + - timedelta(hours=1), # Should get conv_rate=0.3 (only data available) + ], + } + ) + + job = MongoDBOfflineStoreIbis.get_historical_features( + config=repo_config, + feature_views=[driver_fv], + feature_refs=["driver_stats:conv_rate", "driver_stats:acc_rate"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df() + assert isinstance(result_df, pd.DataFrame) + assert len(result_df) == 3 + + # Sort by driver_id and event_timestamp for predictable assertions + result_df = result_df.sort_values(["driver_id", "event_timestamp"]).reset_index( + drop=True + ) + + # Driver 1, first request (1.5 hours ago) → should get value from 2 hours ago + assert result_df.loc[0, "conv_rate"] == pytest.approx(0.5) + + # Driver 1, second request (30 min ago) → should get value from 1 hour ago + assert result_df.loc[1, "conv_rate"] == pytest.approx(0.6) + + # Driver 2, request (1 hour ago) → should get value from 2 hours ago + assert result_df.loc[2, "conv_rate"] == pytest.approx(0.3) + + +@_requires_docker +def test_pull_all_from_table_or_query( + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSource +) -> None: + """Test pulling all features within a time range (no deduplication).""" + now = sample_data + job = MongoDBOfflineStoreIbis.pull_all_from_table_or_query( + config=repo_config, + data_source=driver_source, + join_key_columns=["driver_id"], + feature_name_columns=["conv_rate", "acc_rate"], + timestamp_field="event_timestamp", + created_timestamp_column=None, + start_date=now - timedelta(hours=1, minutes=30), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + assert isinstance(df, pd.DataFrame) + # Should get 2 rows: driver 1 (1hr ago, now) + # Excludes: driver 1 row from 2 hours ago (before start_date) + # driver 2 row from 2 hours ago (before start_date) + assert len(df) == 2 + + +@_requires_docker +def test_ttl_excludes_stale_features( + repo_config: RepoConfig, + mongodb_connection_string: str, + driver_source: MongoDBSource, +) -> None: + """Test that TTL causes stale feature values to be returned as NULL. + + Feast TTL (time-to-live) is a query-time filter: if a feature's event_timestamp + is older than (entity_timestamp - ttl), that feature is considered stale. + This is different from MongoDB TTL indexes which delete documents. 
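+
+    Worked example with the documents inserted below (ttl = 1 day, entity
+    timestamp = now, so the staleness cutoff is now - 1 day)::
+
+        driver 1: event_timestamp = now - 1h >= cutoff -> conv_rate 0.9 kept
+        driver 2: event_timestamp = now - 2d <  cutoff -> conv_rate is NULL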
+ """ + # Insert data with a very old timestamp + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["driver_stats_ttl_test"] + collection.drop() + + now = datetime.now(tz=pytz.UTC) + docs = [ + # Fresh data (within TTL) + {"driver_id": 1, "conv_rate": 0.9, "event_timestamp": now - timedelta(hours=1)}, + # Stale data (outside 1-day TTL when queried from "now") + {"driver_id": 2, "conv_rate": 0.5, "event_timestamp": now - timedelta(days=2)}, + ] + collection.insert_many(docs) + client.close() + + # Create source and feature view with 1-day TTL + ttl_source = MongoDBSource( + name="driver_stats_ttl_test", + database="feast_test", + collection="driver_stats_ttl_test", + timestamp_field="event_timestamp", + ) + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + ttl_fv = FeatureView( + name="driver_stats_ttl_test", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="conv_rate", dtype=Float64), + ], + source=ttl_source, + ttl=timedelta(days=1), # Features older than 1 day are stale + ) + + # Request features "as of now" for both drivers + entity_df = pd.DataFrame( + { + "driver_id": [1, 2], + "event_timestamp": [now, now], + } + ) + + job = MongoDBOfflineStoreIbis.get_historical_features( + config=repo_config, + feature_views=[ttl_fv], + feature_refs=["driver_stats_ttl_test:conv_rate"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df().sort_values("driver_id").reset_index(drop=True) + + # Driver 1: fresh data within TTL → should have value + assert result_df.loc[0, "conv_rate"] == pytest.approx(0.9) + + # Driver 2: stale data outside TTL → should be NULL + assert pd.isna(result_df.loc[1, "conv_rate"]) From ec2e7ba7b1828ef04f6abedc9981c0747e06c19e Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 17 Mar 2026 12:42:09 -0400 Subject: [PATCH 06/30] Added test of multiple feature views and compound join keys Signed-off-by: Casey Clements --- .../contrib/test_mongodb_offline_retrieval.py | 262 +++++++++++++++++- 1 file changed, 261 insertions(+), 1 deletion(-) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py index cd83e33c0d..225d18d3e9 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py @@ -27,7 +27,7 @@ MongoDBSource, ) from feast.repo_config import RepoConfig -from feast.types import Float64, Int64 +from feast.types import Float64, Int64, String from feast.value_type import ValueType # Check if Docker is available @@ -386,3 +386,263 @@ def test_ttl_excludes_stale_features( # Driver 2: stale data outside TTL → should be NULL assert pd.isna(result_df.loc[1, "conv_rate"]) + + +@_requires_docker +def test_multiple_feature_views( + repo_config: RepoConfig, mongodb_connection_string: str +) -> None: + """Test joining features from multiple MongoDB collections/FeatureViews. + + This simulates a real-world scenario where features come from different + data sources (e.g., driver stats from one collection, vehicle stats from another). 
+ """ + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + + # Collection 1: Driver stats + driver_collection = db["driver_stats_multi"] + driver_collection.drop() + now = datetime.now(tz=pytz.UTC) + driver_docs = [ + {"driver_id": 1, "rating": 4.8, "event_timestamp": now - timedelta(hours=1)}, + {"driver_id": 2, "rating": 4.5, "event_timestamp": now - timedelta(hours=1)}, + ] + driver_collection.insert_many(driver_docs) + + # Collection 2: Vehicle stats (same driver_id, different features) + vehicle_collection = db["vehicle_stats_multi"] + vehicle_collection.drop() + vehicle_docs = [ + { + "driver_id": 1, + "vehicle_age": 2, + "mileage": 50000, + "event_timestamp": now - timedelta(hours=1), + }, + { + "driver_id": 2, + "vehicle_age": 5, + "mileage": 120000, + "event_timestamp": now - timedelta(hours=1), + }, + ] + vehicle_collection.insert_many(vehicle_docs) + client.close() + + # Create sources for each collection + driver_source = MongoDBSource( + name="driver_stats_multi", + database="feast_test", + collection="driver_stats_multi", + timestamp_field="event_timestamp", + ) + vehicle_source = MongoDBSource( + name="vehicle_stats_multi", + database="feast_test", + collection="vehicle_stats_multi", + timestamp_field="event_timestamp", + ) + + # Create entities and feature views + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + + driver_fv = FeatureView( + name="driver_stats_multi", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="rating", dtype=Float64), + ], + source=driver_source, + ttl=timedelta(days=1), + ) + + vehicle_fv = FeatureView( + name="vehicle_stats_multi", + entities=[ + driver_entity + ], # todo these two FeatureViews have the same entities list [driver_entity] + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="vehicle_age", dtype=Int64), + Field(name="mileage", dtype=Int64), + ], + source=vehicle_source, + ttl=timedelta(days=1), + ) + + # Entity dataframe requesting features for both drivers + entity_df = pd.DataFrame( + { + "driver_id": [1, 2], + "event_timestamp": [now, now], + } + ) + + # Request features from BOTH feature views + job = MongoDBOfflineStoreIbis.get_historical_features( + config=repo_config, + feature_views=[driver_fv, vehicle_fv], + feature_refs=[ + "driver_stats_multi:rating", + "vehicle_stats_multi:vehicle_age", + "vehicle_stats_multi:mileage", + ], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df().sort_values("driver_id").reset_index(drop=True) + + # Verify we got features from both collections joined correctly + assert len(result_df) == 2 + assert set(result_df.columns) >= {"driver_id", "rating", "vehicle_age", "mileage"} + + # Driver 1 + assert result_df.loc[0, "rating"] == pytest.approx(4.8) + assert result_df.loc[0, "vehicle_age"] == 2 + assert result_df.loc[0, "mileage"] == 50000 + + # Driver 2 + assert result_df.loc[1, "rating"] == pytest.approx(4.5) + assert result_df.loc[1, "vehicle_age"] == 5 + assert result_df.loc[1, "mileage"] == 120000 + + +@_requires_docker +def test_compound_join_keys( + repo_config: RepoConfig, mongodb_connection_string: str +) -> None: + """Test with compound/composite join keys (multiple entity columns). + + This tests scenarios where entities are identified by multiple keys, + e.g., (user_id, device_id) or (store_id, product_id). 
+ """ + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + + # Create collection with compound key (user_id + device_id) + collection = db["user_device_features"] + collection.drop() + now = datetime.now(tz=pytz.UTC) + + # Same user_id can have different device_ids with different features + docs = [ + { + "user_id": 1, + "device_id": "mobile", + "app_opens": 50, + "event_timestamp": now - timedelta(hours=2), + }, + { + "user_id": 1, + "device_id": "mobile", + "app_opens": 55, + "event_timestamp": now - timedelta(hours=1), + }, + { + "user_id": 1, + "device_id": "desktop", + "app_opens": 10, + "event_timestamp": now - timedelta(hours=1), + }, + { + "user_id": 2, + "device_id": "mobile", + "app_opens": 100, + "event_timestamp": now - timedelta(hours=1), + }, + { + "user_id": 2, + "device_id": "tablet", + "app_opens": 25, + "event_timestamp": now - timedelta(hours=1), + }, + ] + collection.insert_many(docs) + client.close() + + # Create source + source = MongoDBSource( + name="user_device_features", + database="feast_test", + collection="user_device_features", + timestamp_field="event_timestamp", + ) + + # Create entities with compound keys + user_entity = Entity( + name="user_id", join_keys=["user_id"], value_type=ValueType.INT64 + ) + device_entity = Entity( + name="device_id", join_keys=["device_id"], value_type=ValueType.STRING + ) + + fv = FeatureView( + name="user_device_features", + entities=[user_entity, device_entity], + schema=[ + Field(name="user_id", dtype=Int64), + Field(name="device_id", dtype=String), + Field(name="app_opens", dtype=Int64), + ], + source=source, + ttl=timedelta(days=1), + ) + + # Test pull_latest: should get one row per unique (user_id, device_id) combination + job = MongoDBOfflineStoreIbis.pull_latest_from_table_or_query( + config=repo_config, + data_source=source, + join_key_columns=["user_id", "device_id"], + feature_name_columns=["app_opens"], + timestamp_field="event_timestamp", + created_timestamp_column=None, + start_date=now - timedelta(days=1), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + assert len(df) == 4 # 4 unique (user_id, device_id) combinations + + # Verify user 1, mobile got the LATEST value (55, not 50) + user1_mobile = df[(df["user_id"] == 1) & (df["device_id"] == "mobile")] + assert len(user1_mobile) == 1 + assert user1_mobile.iloc[0]["app_opens"] == 55 + + # Test get_historical_features with compound keys + entity_df = pd.DataFrame( + { + "user_id": [1, 1, 2], + "device_id": ["mobile", "desktop", "tablet"], + "event_timestamp": [now, now, now], + } + ) + + job = MongoDBOfflineStoreIbis.get_historical_features( + config=repo_config, + feature_views=[fv], + feature_refs=["user_device_features:app_opens"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df() + assert len(result_df) == 3 + + # Sort for predictable assertions + result_df = result_df.sort_values(["user_id", "device_id"]).reset_index(drop=True) + + # user 1, desktop + assert result_df.loc[0, "app_opens"] == 10 + # user 1, mobile (latest value) + assert result_df.loc[1, "app_opens"] == 55 + # user 2, tablet + assert result_df.loc[2, "app_opens"] == 25 From a4d2886138e2efc756e766660ef683c6d42f3c2f Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 17 Mar 2026 13:58:58 -0400 Subject: [PATCH 07/30] Initial implementation of native single-collection offline store Signed-off-by: Casey Clements --- 
.../mongodb_offline_store/mongodb_native.py | 622 ++++++++++++++++++ .../test_mongodb_offline_retrieval_native.py | 609 +++++++++++++++++ 2 files changed, 1231 insertions(+) create mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py create mode 100644 sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py new file mode 100644 index 0000000000..a6f0a8acfc --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -0,0 +1,622 @@ +# Copyright 2026 The Feast Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Native MongoDB Offline Store Implementation. + +This module implements a MongoDB offline store using native MQL aggregation +pipelines. It uses a single-collection schema where all feature views share +one collection, discriminated by a ``feature_view`` field. + +Schema: + { + "_id": ObjectId(), + "entity_id": "", + "feature_view": "driver_stats", + "features": { + "rating": 4.91, + "trips_last_7d": 132 + }, + "event_timestamp": ISODate("2026-01-20T12:00:00Z"), + "created_at": ISODate("2026-01-20T12:00:05Z") + } + +Recommended Index: + db.feature_history.create_index([ + ("entity_id", ASCENDING), + ("feature_view", ASCENDING), + ("event_timestamp", DESCENDING), + ]) +""" + +import json +import warnings +from datetime import datetime, timezone +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +import pandas as pd +import pyarrow + +try: + from pymongo import MongoClient +except ImportError: + MongoClient = None # type: ignore[assignment,misc] + +from pydantic import StrictStr + +from feast.data_source import DataSource +from feast.errors import DataSourceNoNameException, FeastExtrasDependencyImportError +from feast.feature_view import FeatureView +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) +from feast.infra.offline_stores.offline_utils import ( + infer_event_timestamp_from_entity_df, +) +from feast.infra.registry.base_registry import BaseRegistry +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.type_map import mongodb_to_feast_value_type +from feast.value_type import ValueType + + +class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): + """Configuration for the Native MongoDB offline store. 
+ + Uses a single shared collection for all feature views, with documents + containing an ``entity_id``, ``feature_view`` discriminator, and nested + ``features`` subdocument. + """ + + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBOfflineStoreNative" + """Offline store type selector""" + + connection_string: StrictStr = "mongodb://localhost:27017" + """MongoDB connection URI""" + + database: StrictStr = "feast" + """MongoDB database name""" + + collection: StrictStr = "feature_history" + """Single collection name for all feature views""" + + +class MongoDBSourceNative(DataSource): + """A MongoDB data source for the Native offline store. + + Unlike MongoDBSource (Ibis), this source does not specify a collection + per FeatureView. Instead, all FeatureViews share a single collection + (configured at the store level), and are discriminated by the + ``feature_view`` field in each document. + + The ``name`` parameter becomes the ``feature_view`` discriminator value + used to filter documents in queries. + """ + + def __init__( + self, + name: Optional[str] = None, + timestamp_field: str = "event_timestamp", + created_timestamp_column: str = "created_at", + field_mapping: Optional[Dict[str, str]] = None, + description: Optional[str] = "", + tags: Optional[Dict[str, str]] = None, + owner: Optional[str] = "", + ): + if name is None: + raise DataSourceNoNameException() + + super().__init__( + name=name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + description=description, + tags=tags, + owner=owner, + ) + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, MongoDBSourceNative): + raise TypeError( + "Comparisons should only involve MongoDBSourceNative class objects." 
+ ) + return ( + super().__eq__(other) + and self.timestamp_field == other.timestamp_field + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + @property + def feature_view_name(self) -> str: + """The feature_view discriminator value (same as source name).""" + return self.name + + def source_type(self) -> DataSourceProto.SourceType.ValueType: + return DataSourceProto.CUSTOM_SOURCE + + @staticmethod + def from_proto(data_source: DataSourceProto) -> "MongoDBSourceNative": + assert data_source.HasField("custom_options") + return MongoDBSourceNative( + name=data_source.name, + timestamp_field=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + field_mapping=dict(data_source.field_mapping), + description=data_source.description, + tags=dict(data_source.tags), + owner=data_source.owner, + ) + + def _to_proto_impl(self) -> DataSourceProto: + return DataSourceProto( + name=self.name, + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBSourceNative", + field_mapping=self.field_mapping, + custom_options=DataSourceProto.CustomSourceOptions( + configuration=json.dumps({"feature_view": self.name}).encode() + ), + description=self.description, + tags=self.tags, + owner=self.owner, + timestamp_field=self.timestamp_field, + created_timestamp_column=self.created_timestamp_column, + ) + + def validate(self, config: RepoConfig): + pass + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return mongodb_to_feast_value_type + + def get_table_query_string(self) -> str: + return f"feature_history[feature_view={self.name}]" + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + """Sample documents to infer feature names and types. + + Queries documents matching this source's feature_view name and + inspects the ``features`` subdocument to determine schema. + """ + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." 
+ ) + connection_string = config.offline_store.connection_string + db_name = config.offline_store.database + collection_name = config.offline_store.collection + client: Any = MongoClient(connection_string) + try: + pipeline = [ + {"$match": {"feature_view": self.name}}, + {"$sample": {"size": 100}}, + ] + docs = list(client[db_name][collection_name].aggregate(pipeline)) + finally: + client.close() + + field_type_counts: Dict[str, Dict[str, int]] = {} + for doc in docs: + features = doc.get("features", {}) + for field, value in features.items(): + type_str = _infer_python_type_str(value) + if type_str is None: + continue + field_type_counts.setdefault(field, {}) + field_type_counts[field][type_str] = ( + field_type_counts[field].get(type_str, 0) + 1 + ) + + return [ + (field, max(counts, key=lambda t: counts[t])) + for field, counts in field_type_counts.items() + ] + + +def _infer_python_type_str(value: Any) -> Optional[str]: + """Infer a Feast-compatible type string from a Python value.""" + if value is None: + return None + if isinstance(value, bool): + return "bool" + if isinstance(value, int): + return "int" + if isinstance(value, float): + return "float" + if isinstance(value, str): + return "str" + if isinstance(value, bytes): + return "bytes" + if isinstance(value, datetime): + return "datetime" + if isinstance(value, list): + if not value: + return "list[str]" + elem_type = _infer_python_type_str(value[0]) + if elem_type: + return f"list[{elem_type}]" + return "list[str]" + return None + + +def _fetch_documents( + connection_string: str, + database: str, + collection: str, + pipeline: List[Dict], +) -> List[Dict]: + """Execute an aggregation pipeline and return documents.""" + if MongoClient is None: + raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") + client: Any = MongoClient(connection_string) + try: + return list(client[database][collection].aggregate(pipeline)) + finally: + client.close() + + +class MongoDBNativeRetrievalJob(RetrievalJob): + """Retrieval job for native MongoDB offline store queries.""" + + def __init__( + self, + query_fn: Callable[[], pyarrow.Table], + full_feature_names: bool, + on_demand_feature_views: Optional[List[Any]] = None, + metadata: Optional[RetrievalMetadata] = None, + ): + self._query_fn = query_fn + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views or [] + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> List[Any]: + return self._on_demand_feature_views + + def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: + return self._to_arrow_internal(timeout).to_pandas() + + def _to_arrow_internal(self, timeout: Optional[int] = None) -> pyarrow.Table: + return self._query_fn() + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + return self._metadata + + def persist( + self, + storage: Any, + allow_overwrite: bool = False, + timeout: Optional[int] = None, + ) -> None: + # TODO: Implement persist for native store + raise NotImplementedError("persist() not yet implemented for native store") + + +def _serialize_entity_key_from_row( + row: pd.Series, join_keys: List[str], entity_key_serialization_version: int +) -> bytes: + """Serialize entity key from a DataFrame row.""" + entity_key = EntityKeyProto() + for key in sorted(join_keys): + entity_key.join_keys.append(key) + value = row[key] + val = ValueProto() + if 
isinstance(value, int): + val.int64_val = value + elif isinstance(value, str): + val.string_val = value + elif isinstance(value, float): + val.double_val = value + else: + val.string_val = str(value) + entity_key.entity_values.append(val) + return serialize_entity_key(entity_key, entity_key_serialization_version) + + +class MongoDBOfflineStoreNative(OfflineStore): + """Native MongoDB offline store using single-collection schema. + + All feature views share one collection (``feature_history``), with documents + containing: + - ``entity_id``: serialized entity key (bytes) + - ``feature_view``: discriminator field matching FeatureView name + - ``features``: subdocument with feature name/value pairs + - ``event_timestamp``: event time + - ``created_at``: ingestion time + """ + + @staticmethod + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + if not isinstance(data_source, MongoDBSourceNative): + raise ValueError( + f"MongoDBOfflineStoreNative expected MongoDBSourceNative, " + f"got {type(data_source).__name__!r}." + ) + warnings.warn( + "MongoDB offline store (native) is in preview. API may change without notice.", + RuntimeWarning, + ) + + connection_string = config.offline_store.connection_string + db_name = config.offline_store.database + collection = config.offline_store.collection + feature_view_name = data_source.feature_view_name + + start_utc = start_date.astimezone(tz=timezone.utc) + end_utc = end_date.astimezone(tz=timezone.utc) + + # Build aggregation pipeline + pipeline: List[Dict[str, Any]] = [ + { + "$match": { + "feature_view": feature_view_name, + "event_timestamp": {"$gte": start_utc, "$lte": end_utc}, + } + }, + {"$sort": {"entity_id": 1, "event_timestamp": -1}}, + { + "$group": { + "_id": "$entity_id", + "doc": {"$first": "$$ROOT"}, + } + }, + ] + + def _run() -> pyarrow.Table: + docs = _fetch_documents(connection_string, db_name, collection, pipeline) + if not docs: + return pyarrow.Table.from_pydict({}) + + # Flatten documents + rows = [] + for d in docs: + doc = d["doc"] + row = { + "entity_id": doc["entity_id"], + "event_timestamp": doc["event_timestamp"], + } + features = doc.get("features", {}) + for feat in feature_name_columns: + row[feat] = features.get(feat) + rows.append(row) + + df = pd.DataFrame(rows) + # Ensure timestamp is tz-aware + if not df.empty and df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + return pyarrow.Table.from_pandas(df, preserve_index=False) + + return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) + + @staticmethod + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str] = None, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, + ) -> RetrievalJob: + if not isinstance(data_source, MongoDBSourceNative): + raise ValueError( + f"MongoDBOfflineStoreNative expected MongoDBSourceNative, " + f"got {type(data_source).__name__!r}." + ) + warnings.warn( + "MongoDB offline store (native) is in preview. 
API may change without notice.", + RuntimeWarning, + ) + + connection_string = config.offline_store.connection_string + db_name = config.offline_store.database + collection = config.offline_store.collection + feature_view_name = data_source.feature_view_name + + # Build match filter + match_filter: Dict[str, Any] = {"feature_view": feature_view_name} + if start_date or end_date: + ts_filter: Dict[str, Any] = {} + if start_date: + ts_filter["$gte"] = start_date.astimezone(tz=timezone.utc) + if end_date: + ts_filter["$lte"] = end_date.astimezone(tz=timezone.utc) + match_filter["event_timestamp"] = ts_filter + + pipeline = [{"$match": match_filter}] + + def _run() -> pyarrow.Table: + docs = _fetch_documents(connection_string, db_name, collection, pipeline) + if not docs: + return pyarrow.Table.from_pydict({}) + + rows = [] + for doc in docs: + row = { + "entity_id": doc["entity_id"], + "event_timestamp": doc["event_timestamp"], + } + features = doc.get("features", {}) + for feat in feature_name_columns: + row[feat] = features.get(feat) + rows.append(row) + + df = pd.DataFrame(rows) + if not df.empty and df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + return pyarrow.Table.from_pandas(df, preserve_index=False) + + return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) + + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + if isinstance(entity_df, str): + raise ValueError( + "MongoDBOfflineStoreNative does not support SQL entity_df strings. " + "Pass a pandas DataFrame instead." + ) + warnings.warn( + "MongoDB offline store (native) is in preview. API may change without notice.", + RuntimeWarning, + ) + + connection_string = config.offline_store.connection_string + db_name = config.offline_store.database + collection = config.offline_store.collection + entity_key_version = config.entity_key_serialization_version + + entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)) + event_timestamp_col = infer_event_timestamp_from_entity_df(entity_schema) + + # Map "feature_view:feature" refs → {fv_name: [feature, ...]} + fv_to_features: Dict[str, List[str]] = {} + for ref in feature_refs: + fv_name, feat_name = ref.split(":", 1) + fv_to_features.setdefault(fv_name, []).append(feat_name) + + fv_by_name = {fv.name: fv for fv in feature_views} + + def _run() -> pyarrow.Table: + result = entity_df.copy() + + # Ensure entity timestamp is tz-aware UTC + if result[event_timestamp_col].dt.tz is None: + result[event_timestamp_col] = pd.to_datetime( + result[event_timestamp_col], utc=True + ) + result = result.sort_values(event_timestamp_col) + + # Get join keys from entity_df columns (excluding event_timestamp) + entity_columns = [c for c in result.columns if c != event_timestamp_col] + + # Serialize entity keys for lookup + result["_entity_id"] = result.apply( + lambda row: _serialize_entity_key_from_row( + row, entity_columns, entity_key_version + ), + axis=1, + ) + + for fv_name, features in fv_to_features.items(): + fv = fv_by_name[fv_name] + source = fv.batch_source + if not isinstance(source, MongoDBSourceNative): + raise ValueError( + f"MongoDBOfflineStoreNative: feature view {fv_name!r} has " + f"non-MongoDBSourceNative source ({type(source).__name__!r})." 
+ ) + + # Fetch all documents for this feature view + pipeline = [{"$match": {"feature_view": fv_name}}] + docs = _fetch_documents( + connection_string, db_name, collection, pipeline + ) + + if not docs: + for f in features: + col = f"{fv_name}__{f}" if full_feature_names else f + result[col] = None + continue + + # Build feature DataFrame + feature_rows = [] + for doc in docs: + row = { + "_entity_id": doc["entity_id"], + "_fv_ts": doc["event_timestamp"], + } + feat_data = doc.get("features", {}) + for f in features: + row[f] = feat_data.get(f) + feature_rows.append(row) + + feature_df = pd.DataFrame(feature_rows) + if feature_df["_fv_ts"].dt.tz is None: + feature_df["_fv_ts"] = pd.to_datetime( + feature_df["_fv_ts"], utc=True + ) + feature_df = feature_df.sort_values("_fv_ts") + + # Rename features if full_feature_names + col_rename = { + f: (f"{fv_name}__{f}" if full_feature_names else f) + for f in features + } + feature_df = feature_df.rename(columns=col_rename) + out_features = list(col_rename.values()) + + # Point-in-time join using merge_asof + merged = pd.merge_asof( + result, + feature_df, + left_on=event_timestamp_col, + right_on="_fv_ts", + by="_entity_id", + direction="backward", + ) + + # Apply TTL: null out stale features + if fv.ttl: + cutoff = merged[event_timestamp_col] - fv.ttl + too_old = merged["_fv_ts"] < cutoff + for col in out_features: + merged.loc[too_old, col] = None + + result = merged.drop(columns=["_fv_ts"], errors="ignore") + + # Remove internal entity_id column + result = result.drop(columns=["_entity_id"], errors="ignore") + return pyarrow.Table.from_pandas(result, preserve_index=False) + + return MongoDBNativeRetrievalJob( + query_fn=_run, + full_feature_names=full_feature_names, + ) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py new file mode 100644 index 0000000000..5c02299254 --- /dev/null +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py @@ -0,0 +1,609 @@ +""" +Unit tests for MongoDB Native offline store implementation. + +This tests the single-collection schema where all feature views share one +collection (``feature_history``), discriminated by ``feature_view`` field. + +Schema: + { + "entity_id": bytes, # serialized entity key + "feature_view": str, + "features": { "feat1": val, ... }, + "event_timestamp": datetime, + "created_at": datetime + } + +Docker-dependent tests are marked with ``@_requires_docker`` and are skipped +when Docker is unavailable. 
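+
+Entity keys are serialized with Feast's ``serialize_entity_key`` so offline
+documents match the online-store encoding; the ``_make_entity_id`` helper
+below builds them from plain dicts, e.g. ``_make_entity_id({"driver_id": 1})``.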
+""" + +from datetime import datetime, timedelta +from typing import Generator +from unittest.mock import MagicMock + +import pandas as pd +import pytest +import pytz + +pytest.importorskip("pymongo") + +from pymongo import MongoClient +from testcontainers.mongodb import MongoDbContainer + +from feast import Entity, FeatureView, Field +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native import ( + MongoDBOfflineStoreNative, + MongoDBOfflineStoreNativeConfig, + MongoDBSourceNative, +) +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import RepoConfig +from feast.types import Float64, Int64, String +from feast.value_type import ValueType + +# Check if Docker is available +docker_available = False +try: + import docker + + try: + client = docker.from_env() + client.ping() + docker_available = True + except Exception: + pass +except ImportError: + pass + +_requires_docker = pytest.mark.skipif( + not docker_available, + reason="Docker is not available or not running.", +) + +ENTITY_KEY_VERSION = 3 + + +def _make_entity_id(join_keys: dict) -> bytes: + """Create serialized entity key from join key dict.""" + entity_key = EntityKeyProto() + for key in sorted(join_keys.keys()): + entity_key.join_keys.append(key) + val = ValueProto() + value = join_keys[key] + if isinstance(value, int): + val.int64_val = value + elif isinstance(value, str): + val.string_val = value + else: + val.string_val = str(value) + entity_key.entity_values.append(val) + return serialize_entity_key(entity_key, ENTITY_KEY_VERSION) + + +@pytest.fixture(scope="module") +def mongodb_container() -> Generator[MongoDbContainer, None, None]: + """Start a MongoDB container for testing.""" + container = MongoDbContainer( + "mongo:latest", + username="test", + password="test", # pragma: allowlist secret + ).with_exposed_ports(27017) + container.start() + yield container + container.stop() + + +@pytest.fixture +def mongodb_connection_string(mongodb_container: MongoDbContainer) -> str: + """Get MongoDB connection string from the container.""" + exposed_port = mongodb_container.get_exposed_port(27017) + return f"mongodb://test:test@localhost:{exposed_port}" # pragma: allowlist secret + + +@pytest.fixture +def repo_config(mongodb_connection_string: str) -> RepoConfig: + """Create a RepoConfig with MongoDB Native offline store.""" + return RepoConfig( + project="test_project", + registry="memory://", + provider="local", + offline_store=MongoDBOfflineStoreNativeConfig( + connection_string=mongodb_connection_string, + database="feast_test", + collection="feature_history", + ), + online_store={"type": "sqlite"}, + entity_key_serialization_version=ENTITY_KEY_VERSION, + ) + + +@pytest.fixture +def sample_data(mongodb_connection_string: str) -> datetime: + """Insert sample data using the single-collection schema. + + Creates documents for 'driver_stats' feature view with entity_id, + feature_view discriminator, and nested features subdocument. 
+ """ + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["feature_history"] + collection.drop() + + now = datetime.now(tz=pytz.UTC) + + # Create documents using the native schema + docs = [ + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "driver_stats", + "features": {"conv_rate": 0.5, "acc_rate": 0.9}, + "event_timestamp": now - timedelta(hours=2), + "created_at": now - timedelta(hours=2), + }, + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "driver_stats", + "features": {"conv_rate": 0.6, "acc_rate": 0.85}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "driver_stats", + "features": {"conv_rate": 0.7, "acc_rate": 0.8}, + "event_timestamp": now, + "created_at": now, + }, + { + "entity_id": _make_entity_id({"driver_id": 2}), + "feature_view": "driver_stats", + "features": {"conv_rate": 0.3, "acc_rate": 0.95}, + "event_timestamp": now - timedelta(hours=2), + "created_at": now - timedelta(hours=2), + }, + ] + collection.insert_many(docs) + client.close() + return now + + +@pytest.fixture +def driver_source() -> MongoDBSourceNative: + """Create a MongoDBSourceNative for driver stats.""" + return MongoDBSourceNative( + name="driver_stats", + timestamp_field="event_timestamp", + created_timestamp_column="created_at", + ) + + +@pytest.fixture +def driver_fv(driver_source: MongoDBSourceNative) -> FeatureView: + """Create a FeatureView for driver stats.""" + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + return FeatureView( + name="driver_stats", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="conv_rate", dtype=Float64), + Field(name="acc_rate", dtype=Float64), + ], + source=driver_source, + ttl=timedelta(days=1), + ) + + +@_requires_docker +def test_pull_latest_from_table_or_query( + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceNative +) -> None: + """Test pulling latest features per entity from the single collection.""" + now = sample_data + job = MongoDBOfflineStoreNative.pull_latest_from_table_or_query( + config=repo_config, + data_source=driver_source, + join_key_columns=["driver_id"], + feature_name_columns=["conv_rate", "acc_rate"], + timestamp_field="event_timestamp", + created_timestamp_column="created_at", + start_date=now - timedelta(days=1), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + + assert isinstance(df, pd.DataFrame) + assert len(df) == 2 # Two unique entity_ids + + # Sort by entity_id for predictable assertions + # Note: entity_id is bytes, so we check features directly + conv_rates = sorted(df["conv_rate"].tolist()) + assert conv_rates[0] == pytest.approx(0.3) # Driver 2's only value + assert conv_rates[1] == pytest.approx(0.7) # Driver 1's latest value + + +@_requires_docker +def test_get_historical_features_pit_join( + repo_config: RepoConfig, sample_data: datetime, driver_fv: FeatureView +) -> None: + """Test point-in-time join retrieves correct feature values.""" + now = sample_data + + # Entity dataframe with driver_id column (must match join keys) + entity_df = pd.DataFrame( + { + "driver_id": [1, 1, 2], + "event_timestamp": [ + now - timedelta(hours=1, minutes=30), # Should get conv_rate=0.5 + now - timedelta(minutes=30), # Should get conv_rate=0.6 + now - timedelta(hours=1), # Should get conv_rate=0.3 
+ ], + } + ) + + job = MongoDBOfflineStoreNative.get_historical_features( + config=repo_config, + feature_views=[driver_fv], + feature_refs=["driver_stats:conv_rate", "driver_stats:acc_rate"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df() + assert isinstance(result_df, pd.DataFrame) + assert len(result_df) == 3 + + # Sort by driver_id and event_timestamp for predictable assertions + result_df = result_df.sort_values(["driver_id", "event_timestamp"]).reset_index( + drop=True + ) + + # Driver 1, first request (1.5 hours ago) → should get value from 2 hours ago + assert result_df.loc[0, "conv_rate"] == pytest.approx(0.5) + + # Driver 1, second request (30 min ago) → should get value from 1 hour ago + assert result_df.loc[1, "conv_rate"] == pytest.approx(0.6) + + # Driver 2, request (1 hour ago) → should get value from 2 hours ago + assert result_df.loc[2, "conv_rate"] == pytest.approx(0.3) + + +@_requires_docker +def test_pull_all_from_table_or_query( + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceNative +) -> None: + """Test pulling all features within a time range (no deduplication).""" + now = sample_data + job = MongoDBOfflineStoreNative.pull_all_from_table_or_query( + config=repo_config, + data_source=driver_source, + join_key_columns=["driver_id"], + feature_name_columns=["conv_rate", "acc_rate"], + timestamp_field="event_timestamp", + created_timestamp_column="created_at", + start_date=now - timedelta(hours=1, minutes=30), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + assert isinstance(df, pd.DataFrame) + # Should get 2 rows: driver 1 (1hr ago, now) + # Excludes: driver 1 from 2 hours ago, driver 2 from 2 hours ago + assert len(df) == 2 + + +@_requires_docker +def test_ttl_excludes_stale_features( + repo_config: RepoConfig, mongodb_connection_string: str +) -> None: + """Test that TTL causes stale feature values to be returned as NULL.""" + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["feature_history"] + + now = datetime.now(tz=pytz.UTC) + + # Insert docs with different ages + ttl_docs = [ + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "driver_stats_ttl", + "features": {"conv_rate": 0.9}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"driver_id": 2}), + "feature_view": "driver_stats_ttl", + "features": {"conv_rate": 0.5}, + "event_timestamp": now - timedelta(days=2), # Stale + "created_at": now - timedelta(days=2), + }, + ] + collection.insert_many(ttl_docs) + client.close() + + ttl_source = MongoDBSourceNative( + name="driver_stats_ttl", + timestamp_field="event_timestamp", + ) + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + ttl_fv = FeatureView( + name="driver_stats_ttl", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="conv_rate", dtype=Float64), + ], + source=ttl_source, + ttl=timedelta(days=1), + ) + + entity_df = pd.DataFrame( + { + "driver_id": [1, 2], + "event_timestamp": [now, now], + } + ) + + job = MongoDBOfflineStoreNative.get_historical_features( + config=repo_config, + feature_views=[ttl_fv], + feature_refs=["driver_stats_ttl:conv_rate"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = 
job.to_df().sort_values("driver_id").reset_index(drop=True) + + # Driver 1: fresh → has value + assert result_df.loc[0, "conv_rate"] == pytest.approx(0.9) + + # Driver 2: stale → NULL + assert pd.isna(result_df.loc[1, "conv_rate"]) + + +@_requires_docker +def test_multiple_feature_views( + repo_config: RepoConfig, mongodb_connection_string: str +) -> None: + """Test joining features from multiple feature views in the same collection.""" + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["feature_history"] + + now = datetime.now(tz=pytz.UTC) + + # Insert documents for two different feature views + multi_docs = [ + # driver_stats_multi + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "driver_stats_multi", + "features": {"rating": 4.8}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"driver_id": 2}), + "feature_view": "driver_stats_multi", + "features": {"rating": 4.5}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + # vehicle_stats_multi + { + "entity_id": _make_entity_id({"driver_id": 1}), + "feature_view": "vehicle_stats_multi", + "features": {"vehicle_age": 2, "mileage": 50000}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"driver_id": 2}), + "feature_view": "vehicle_stats_multi", + "features": {"vehicle_age": 5, "mileage": 120000}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + ] + collection.insert_many(multi_docs) + client.close() + + # Create sources and feature views + driver_source = MongoDBSourceNative(name="driver_stats_multi") + vehicle_source = MongoDBSourceNative(name="vehicle_stats_multi") + + driver_entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + + driver_fv = FeatureView( + name="driver_stats_multi", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="rating", dtype=Float64), + ], + source=driver_source, + ttl=timedelta(days=1), + ) + + vehicle_fv = FeatureView( + name="vehicle_stats_multi", + entities=[driver_entity], + schema=[ + Field(name="driver_id", dtype=Int64), + Field(name="vehicle_age", dtype=Int64), + Field(name="mileage", dtype=Int64), + ], + source=vehicle_source, + ttl=timedelta(days=1), + ) + + entity_df = pd.DataFrame( + { + "driver_id": [1, 2], + "event_timestamp": [now, now], + } + ) + + job = MongoDBOfflineStoreNative.get_historical_features( + config=repo_config, + feature_views=[driver_fv, vehicle_fv], + feature_refs=[ + "driver_stats_multi:rating", + "vehicle_stats_multi:vehicle_age", + "vehicle_stats_multi:mileage", + ], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df().sort_values("driver_id").reset_index(drop=True) + + assert len(result_df) == 2 + assert set(result_df.columns) >= {"driver_id", "rating", "vehicle_age", "mileage"} + + # Driver 1 + assert result_df.loc[0, "rating"] == pytest.approx(4.8) + assert result_df.loc[0, "vehicle_age"] == 2 + assert result_df.loc[0, "mileage"] == 50000 + + # Driver 2 + assert result_df.loc[1, "rating"] == pytest.approx(4.5) + assert result_df.loc[1, "vehicle_age"] == 5 + assert result_df.loc[1, "mileage"] == 120000 + + +@_requires_docker +def test_compound_join_keys( + repo_config: RepoConfig, 
mongodb_connection_string: str +) -> None: + """Test with compound/composite join keys (multiple entity columns).""" + client: MongoClient = MongoClient(mongodb_connection_string) + db = client["feast_test"] + collection = db["feature_history"] + + now = datetime.now(tz=pytz.UTC) + + # Insert documents with compound keys (user_id + device_id) + compound_docs = [ + { + "entity_id": _make_entity_id({"user_id": 1, "device_id": "mobile"}), + "feature_view": "user_device_features", + "features": {"app_opens": 50}, + "event_timestamp": now - timedelta(hours=2), + "created_at": now - timedelta(hours=2), + }, + { + "entity_id": _make_entity_id({"user_id": 1, "device_id": "mobile"}), + "feature_view": "user_device_features", + "features": {"app_opens": 55}, # Latest for this entity + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"user_id": 1, "device_id": "desktop"}), + "feature_view": "user_device_features", + "features": {"app_opens": 10}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + { + "entity_id": _make_entity_id({"user_id": 2, "device_id": "tablet"}), + "feature_view": "user_device_features", + "features": {"app_opens": 25}, + "event_timestamp": now - timedelta(hours=1), + "created_at": now - timedelta(hours=1), + }, + ] + collection.insert_many(compound_docs) + client.close() + + source = MongoDBSourceNative(name="user_device_features") + + user_entity = Entity( + name="user_id", join_keys=["user_id"], value_type=ValueType.INT64 + ) + device_entity = Entity( + name="device_id", join_keys=["device_id"], value_type=ValueType.STRING + ) + + fv = FeatureView( + name="user_device_features", + entities=[user_entity, device_entity], + schema=[ + Field(name="user_id", dtype=Int64), + Field(name="device_id", dtype=String), + Field(name="app_opens", dtype=Int64), + ], + source=source, + ttl=timedelta(days=1), + ) + + # Test pull_latest: should get one row per unique (user_id, device_id) + job = MongoDBOfflineStoreNative.pull_latest_from_table_or_query( + config=repo_config, + data_source=source, + join_key_columns=["user_id", "device_id"], + feature_name_columns=["app_opens"], + timestamp_field="event_timestamp", + created_timestamp_column="created_at", + start_date=now - timedelta(days=1), + end_date=now + timedelta(hours=1), + ) + + df = job.to_df() + assert len(df) == 3 # 3 unique (user_id, device_id) combinations + + # Verify we got the latest value (55) for user 1, mobile + app_opens_values = sorted(df["app_opens"].tolist()) + assert 55 in app_opens_values # Latest for user 1, mobile + assert 10 in app_opens_values # user 1, desktop + assert 25 in app_opens_values # user 2, tablet + + # Test get_historical_features with compound keys + entity_df = pd.DataFrame( + { + "user_id": [1, 1, 2], + "device_id": ["mobile", "desktop", "tablet"], + "event_timestamp": [now, now, now], + } + ) + + job = MongoDBOfflineStoreNative.get_historical_features( + config=repo_config, + feature_views=[fv], + feature_refs=["user_device_features:app_opens"], + entity_df=entity_df, + registry=MagicMock(), + project=repo_config.project, + full_feature_names=False, + ) + + result_df = job.to_df() + assert len(result_df) == 3 + + # Sort for predictable assertions + result_df = result_df.sort_values(["user_id", "device_id"]).reset_index(drop=True) + + # user 1, desktop + assert result_df.loc[0, "app_opens"] == 10 + # user 1, mobile (latest value) + assert result_df.loc[1, "app_opens"] == 55 + # 
user 2, tablet + assert result_df.loc[2, "app_opens"] == 25 From e9de6f3017d58f6f36846f570cf4bf16c5718469 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 11:30:42 -0400 Subject: [PATCH 08/30] Added DriverInfo to MongoDBClients Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/__init__.py | 7 +++++++ .../contrib/mongodb_offline_store/mongodb.py | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py index 8b13789179..535583bc38 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py @@ -1 +1,8 @@ +import feast.version +try: + from pymongo.driver_info import DriverInfo + + DRIVER_METADATA = DriverInfo(name="Feast", version=feast.version.get_version()) +except ImportError: + DRIVER_METADATA = None # type: ignore[assignment] diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index ee37b11c41..51100ef827 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -33,6 +33,7 @@ SavedDatasetLocationAlreadyExists, ) from feast.feature_view import FeatureView +from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( MongoDBSource, ) @@ -178,7 +179,7 @@ def reader(data_source: DataSource, repo_path: str) -> Table: ) connection_string = config.offline_store.connection_string db_name = data_source.database or config.offline_store.database - client: Any = MongoClient(connection_string) + client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) try: docs = list(client[db_name][data_source.collection].find({}, {"_id": 0})) finally: @@ -230,7 +231,9 @@ def writer( connection_string = config.offline_store.connection_string db_name = data_source.database or config.offline_store.database location = f"{db_name}.{data_source.collection}" - client: Any = MongoClient(connection_string, tz_aware=True) + client: Any = MongoClient( + connection_string, driver=DRIVER_METADATA, tz_aware=True + ) try: coll = client[db_name][data_source.collection] if mode == "overwrite": @@ -277,7 +280,7 @@ def _fetch_collection_as_arrow( """ if MongoClient is None: raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") - client: Any = MongoClient(connection_string, tz_aware=True) + client: Any = MongoClient(connection_string, driver=DRIVER_METADATA, tz_aware=True) try: if pipeline is not None: docs = list(client[db_name][collection].aggregate(pipeline)) @@ -355,7 +358,9 @@ def persist( connection_string = self._config.offline_store.connection_string db_name = data_source.database or self._config.offline_store.database location = f"{db_name}.{data_source.collection}" - client: Any = MongoClient(connection_string, tz_aware=True) + client: Any = MongoClient( + connection_string, driver=DRIVER_METADATA, tz_aware=True + ) try: coll = client[db_name][data_source.collection] if not allow_overwrite and coll.estimated_document_count() > 0: @@ -400,7 +405,9 @@ def offline_write_batch( connection_string = 
config.offline_store.connection_string db_name = data_source.database or config.offline_store.database records = table.to_pylist() - client: Any = MongoClient(connection_string, tz_aware=True) + client: Any = MongoClient( + connection_string, driver=DRIVER_METADATA, tz_aware=True + ) try: coll = client[db_name][data_source.collection] if records: From 81d194c55c8790c0aa1535f7a7c013dd431e5565 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 14:29:38 -0400 Subject: [PATCH 09/30] Optimized MQL. Applied FV-level TTL Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_native.py | 307 ++++++++++++------ 1 file changed, 203 insertions(+), 104 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index a6f0a8acfc..0b0dfcc06f 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -41,6 +41,7 @@ """ import json +import uuid import warnings from datetime import datetime, timezone from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -59,6 +60,7 @@ from feast.errors import DataSourceNoNameException, FeastExtrasDependencyImportError from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, @@ -209,7 +211,7 @@ def get_table_column_names_and_types( connection_string = config.offline_store.connection_string db_name = config.offline_store.database collection_name = config.offline_store.collection - client: Any = MongoClient(connection_string) + client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) try: pipeline = [ {"$match": {"feature_view": self.name}}, @@ -272,7 +274,7 @@ def _fetch_documents( """Execute an aggregation pipeline and return documents.""" if MongoClient is None: raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") - client: Any = MongoClient(connection_string) + client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) try: return list(client[database][collection].aggregate(pipeline)) finally: @@ -343,6 +345,55 @@ def _serialize_entity_key_from_row( return serialize_entity_key(entity_key, entity_key_serialization_version) +def _ttl_to_ms(fv: FeatureView) -> Optional[int]: + """Convert FeatureView TTL to milliseconds.""" + if fv.ttl is None: + return None + return int(fv.ttl.total_seconds() * 1000) + + +def _build_ttl_gte_expr(feature_views: List[FeatureView]) -> Optional[Dict[str, Any]]: + """Build a $gte expression with per-FV TTL using $switch. + + Returns a MongoDB expression that evaluates to: + event_timestamp >= (entity_timestamp - ttl_for_this_feature_view) + + Each feature_view can have a different TTL, handled via $switch branches. + If no feature views have TTL, returns None (no filtering needed). 
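+
+    For example, a single feature view "driver_stats" with a 1-day TTL
+    (86400000 ms) yields an expression of this shape, where ``$$ts`` is the
+    entity timestamp bound by the enclosing $lookup's ``let``:
+
+        {"$gte": ["$event_timestamp",
+                  {"$switch": {"branches": [
+                      {"case": {"$eq": ["$feature_view", "driver_stats"]},
+                       "then": {"$subtract": ["$$ts", 86400000]}}],
+                      "default": {"$literal": 0}}}]}
+
+    The numeric 0 default is safe because BSON comparison order sorts all
+    numbers before dates, so ``event_timestamp >= 0`` never filters rows
+    for feature views without a TTL.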
+ """ + branches = [] + + for fv in feature_views: + ttl_ms = _ttl_to_ms(fv) + if ttl_ms is None: + # No TTL for this FV - skip (effectively infinite history) + continue + + branches.append( + { + "case": {"$eq": ["$feature_view", fv.name]}, + "then": {"$subtract": ["$$ts", ttl_ms]}, + } + ) + + # If no TTLs at all, no lower bound needed + if not branches: + return None + + return { + "$gte": [ + "$event_timestamp", + { + "$switch": { + "branches": branches, + # Default: no lower bound (for FVs without TTL) + "default": {"$literal": 0}, + } + }, + ] + } + + class MongoDBOfflineStoreNative(OfflineStore): """Native MongoDB offline store using single-collection schema. @@ -384,6 +435,17 @@ def pull_latest_from_table_or_query( start_utc = start_date.astimezone(tz=timezone.utc) end_utc = end_date.astimezone(tz=timezone.utc) + # Build projection to flatten features subdoc to top-level fields + project_stage: Dict[str, Any] = { + "_id": 0, + "entity_id": "$doc.entity_id", + "event_timestamp": "$doc.event_timestamp", + } + if created_timestamp_column: + project_stage["created_at"] = "$doc.created_at" + for feat in feature_name_columns: + project_stage[feat] = f"$doc.features.{feat}" + # Build aggregation pipeline pipeline: List[Dict[str, Any]] = [ { @@ -399,6 +461,7 @@ def pull_latest_from_table_or_query( "doc": {"$first": "$$ROOT"}, } }, + {"$project": project_stage}, ] def _run() -> pyarrow.Table: @@ -406,23 +469,12 @@ def _run() -> pyarrow.Table: if not docs: return pyarrow.Table.from_pydict({}) - # Flatten documents - rows = [] - for d in docs: - doc = d["doc"] - row = { - "entity_id": doc["entity_id"], - "event_timestamp": doc["event_timestamp"], - } - features = doc.get("features", {}) - for feat in feature_name_columns: - row[feat] = features.get(feat) - rows.append(row) - - df = pd.DataFrame(rows) - # Ensure timestamp is tz-aware - if not df.empty and df["event_timestamp"].dt.tz is None: - df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + df = pd.DataFrame(docs) + if not df.empty and "event_timestamp" in df.columns: + if df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime( + df["event_timestamp"], utc=True + ) return pyarrow.Table.from_pandas(df, preserve_index=False) return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) @@ -453,7 +505,7 @@ def pull_all_from_table_or_query( collection = config.offline_store.collection feature_view_name = data_source.feature_view_name - # Build match filter + # Build match filter: feature_view + optional time range match_filter: Dict[str, Any] = {"feature_view": feature_view_name} if start_date or end_date: ts_filter: Dict[str, Any] = {} @@ -463,27 +515,35 @@ def pull_all_from_table_or_query( ts_filter["$lte"] = end_date.astimezone(tz=timezone.utc) match_filter["event_timestamp"] = ts_filter - pipeline = [{"$match": match_filter}] + # Build projection: flatten features subdoc to top-level fields + # This uses $getField to extract each feature from the features subdoc + project_stage: Dict[str, Any] = { + "_id": 0, + "entity_id": 1, + "event_timestamp": 1, + } + if created_timestamp_column: + project_stage["created_at"] = 1 + for feat in feature_name_columns: + project_stage[feat] = f"$features.{feat}" + + # Simple range scan pipeline - no sorting for efficiency + pipeline: List[Dict[str, Any]] = [ + {"$match": match_filter}, + {"$project": project_stage}, + ] def _run() -> pyarrow.Table: docs = _fetch_documents(connection_string, db_name, collection, pipeline) if not docs: return 
pyarrow.Table.from_pydict({}) - rows = [] - for doc in docs: - row = { - "entity_id": doc["entity_id"], - "event_timestamp": doc["event_timestamp"], - } - features = doc.get("features", {}) - for feat in feature_name_columns: - row[feat] = features.get(feat) - rows.append(row) - - df = pd.DataFrame(rows) - if not df.empty and df["event_timestamp"].dt.tz is None: - df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + df = pd.DataFrame(docs) + if not df.empty and "event_timestamp" in df.columns: + if df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime( + df["event_timestamp"], utc=True + ) return pyarrow.Table.from_pandas(df, preserve_index=False) return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) @@ -510,7 +570,7 @@ def get_historical_features( connection_string = config.offline_store.connection_string db_name = config.offline_store.database - collection = config.offline_store.collection + feature_collection = config.offline_store.collection entity_key_version = config.entity_key_serialization_version entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)) @@ -522,22 +582,28 @@ def get_historical_features( fv_name, feat_name = ref.split(":", 1) fv_to_features.setdefault(fv_name, []).append(feat_name) - fv_by_name = {fv.name: fv for fv in feature_views} + fv_names = list(fv_to_features.keys()) + + # Build per-FV TTL expression using $switch + ttl_expr = _build_ttl_gte_expr(feature_views) def _run() -> pyarrow.Table: - result = entity_df.copy() + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." + ) - # Ensure entity timestamp is tz-aware UTC + # Prepare entity_df: ensure timestamps are UTC and serialize entity keys + result = entity_df.copy() if result[event_timestamp_col].dt.tz is None: result[event_timestamp_col] = pd.to_datetime( result[event_timestamp_col], utc=True ) - result = result.sort_values(event_timestamp_col) - # Get join keys from entity_df columns (excluding event_timestamp) + # Get join keys (all columns except event_timestamp) entity_columns = [c for c in result.columns if c != event_timestamp_col] - # Serialize entity keys for lookup + # Serialize entity keys to bytes (same format as online store) result["_entity_id"] = result.apply( lambda row: _serialize_entity_key_from_row( row, entity_columns, entity_key_version @@ -545,76 +611,109 @@ def _run() -> pyarrow.Table: axis=1, ) - for fv_name, features in fv_to_features.items(): - fv = fv_by_name[fv_name] - source = fv.batch_source - if not isinstance(source, MongoDBSourceNative): - raise ValueError( - f"MongoDBOfflineStoreNative: feature view {fv_name!r} has " - f"non-MongoDBSourceNative source ({type(source).__name__!r})." 
- ) - - # Fetch all documents for this feature view - pipeline = [{"$match": {"feature_view": fv_name}}] - docs = _fetch_documents( - connection_string, db_name, collection, pipeline + # Build temp collection documents + temp_docs = [] + for _, row in result.iterrows(): + temp_docs.append( + { + "entity_id": row["_entity_id"], + "event_timestamp": row[event_timestamp_col], + "_row_idx": _, # Preserve original order + } ) - if not docs: - for f in features: - col = f"{fv_name}__{f}" if full_feature_names else f - result[col] = None - continue + # Create temp collection with unique name + temp_collection_name = f"entity_df_{uuid.uuid4().hex[:12]}" + + client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) + try: + db = client[db_name] + temp_collection = db[temp_collection_name] + temp_collection.insert_many(temp_docs) + + # Build $lookup subpipeline with PIT join logic + # Match: entity_id, feature_view in list, event_timestamp <= entity.ts + match_conditions: List[Dict[str, Any]] = [ + {"$eq": ["$entity_id", "$$entity_id"]}, + {"$in": ["$feature_view", fv_names]}, + {"$lte": ["$event_timestamp", "$$ts"]}, + ] + # Add per-FV TTL filter using $switch + if ttl_expr is not None: + match_conditions.append(ttl_expr) + + lookup_pipeline: List[Dict[str, Any]] = [ + {"$match": {"$expr": {"$and": match_conditions}}}, + {"$sort": {"feature_view": 1, "event_timestamp": -1}}, + { + "$group": { + "_id": "$feature_view", + "doc": {"$first": "$$ROOT"}, + } + }, + ] + + # Main aggregation pipeline + pipeline: List[Dict[str, Any]] = [ + { + "$lookup": { + "from": feature_collection, + "let": { + "entity_id": "$entity_id", + "ts": "$event_timestamp", + }, + "pipeline": lookup_pipeline, + "as": "feature_rows", + } + }, + {"$sort": {"_row_idx": 1}}, # Preserve original order + ] + + docs = list(temp_collection.aggregate(pipeline)) + + finally: + # Cleanup temp collection + client[db_name][temp_collection_name].drop() + client.close() - # Build feature DataFrame - feature_rows = [] - for doc in docs: - row = { - "_entity_id": doc["entity_id"], - "_fv_ts": doc["event_timestamp"], - } - feat_data = doc.get("features", {}) - for f in features: - row[f] = feat_data.get(f) - feature_rows.append(row) - - feature_df = pd.DataFrame(feature_rows) - if feature_df["_fv_ts"].dt.tz is None: - feature_df["_fv_ts"] = pd.to_datetime( - feature_df["_fv_ts"], utc=True - ) - feature_df = feature_df.sort_values("_fv_ts") + if not docs: + return pyarrow.Table.from_pydict({}) - # Rename features if full_feature_names - col_rename = { - f: (f"{fv_name}__{f}" if full_feature_names else f) - for f in features + # Build result DataFrame + rows = [] + for doc in docs: + # Start with entity columns from original entity_df + row_idx = doc["_row_idx"] + row = result.iloc[row_idx][ + entity_columns + [event_timestamp_col] + ].to_dict() + + # Extract features from each feature_view's matched doc + feature_rows_by_fv = { + fr["_id"]: fr["doc"] for fr in doc.get("feature_rows", []) } - feature_df = feature_df.rename(columns=col_rename) - out_features = list(col_rename.values()) - - # Point-in-time join using merge_asof - merged = pd.merge_asof( - result, - feature_df, - left_on=event_timestamp_col, - right_on="_fv_ts", - by="_entity_id", - direction="backward", - ) - # Apply TTL: null out stale features - if fv.ttl: - cutoff = merged[event_timestamp_col] - fv.ttl - too_old = merged["_fv_ts"] < cutoff - for col in out_features: - merged.loc[too_old, col] = None + # Extract features from each feature_view's matched doc + # TTL is 
already applied server-side via $switch expression + for fv_name, features in fv_to_features.items(): + fv_doc = feature_rows_by_fv.get(fv_name) - result = merged.drop(columns=["_fv_ts"], errors="ignore") + for feat in features: + col_name = f"{fv_name}__{feat}" if full_feature_names else feat + if fv_doc is None: + row[col_name] = None + else: + row[col_name] = fv_doc.get("features", {}).get(feat) - # Remove internal entity_id column - result = result.drop(columns=["_entity_id"], errors="ignore") - return pyarrow.Table.from_pandas(result, preserve_index=False) + rows.append(row) + + result_df = pd.DataFrame(rows) + if not result_df.empty and event_timestamp_col in result_df.columns: + if result_df[event_timestamp_col].dt.tz is None: + result_df[event_timestamp_col] = pd.to_datetime( + result_df[event_timestamp_col], utc=True + ) + return pyarrow.Table.from_pandas(result_df, preserve_index=False) return MongoDBNativeRetrievalJob( query_fn=_run, From ad853855db03cbdecc0a39c0c36d2bc7e6e37a23 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 17:10:40 -0400 Subject: [PATCH 10/30] filter TTL by relevant FVs only, cautiously reset df index; add created_at tie-breaker in sort Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/mongodb_native.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index 0b0dfcc06f..214d5657d3 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -454,7 +454,7 @@ def pull_latest_from_table_or_query( "event_timestamp": {"$gte": start_utc, "$lte": end_utc}, } }, - {"$sort": {"entity_id": 1, "event_timestamp": -1}}, + {"$sort": {"entity_id": 1, "event_timestamp": -1, "created_at": -1}}, { "$group": { "_id": "$entity_id", @@ -585,7 +585,8 @@ def get_historical_features( fv_names = list(fv_to_features.keys()) # Build per-FV TTL expression using $switch - ttl_expr = _build_ttl_gte_expr(feature_views) + relevant_fvs = [fv for fv in feature_views if fv.name in fv_to_features] + ttl_expr = _build_ttl_gte_expr(relevant_fvs) def _run() -> pyarrow.Table: if MongoClient is None: @@ -623,7 +624,7 @@ def _run() -> pyarrow.Table: ) # Create temp collection with unique name - temp_collection_name = f"entity_df_{uuid.uuid4().hex[:12]}" + temp_collection_name = f"tmp_entity_df_{uuid.uuid4().hex[:12]}" client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) try: @@ -680,6 +681,7 @@ def _run() -> pyarrow.Table: return pyarrow.Table.from_pydict({}) # Build result DataFrame + result = result.reset_index(drop=True) rows = [] for doc in docs: # Start with entity columns from original entity_df From 4d02febe572a5e129549be8eb5e9d1445af304d1 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 17:47:39 -0400 Subject: [PATCH 11/30] Updated docstrings Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_native.py | 82 +++++++++++++------ 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index 214d5657d3..ba2d9e29a0 100644 --- 
a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -17,9 +17,19 @@ This module implements a MongoDB offline store using native MQL aggregation pipelines. It uses a single-collection schema where all feature views share -one collection, discriminated by a ``feature_view`` field. +one collection. It is event-based: each document represents an observation +of a FeatureView at a specific point in time. Each document may contain a +subset (0 or more) of the features defined in that FeatureView, all sharing +a single event_timestamp. -Schema: +Collection Index: + db.feature_history.create_index([ + ("feature_view", ASCENDING), + ("entity_id", ASCENDING), + ("event_timestamp", DESCENDING), + ]) + +Document Schema (example): { "_id": ObjectId(), "entity_id": "", @@ -32,12 +42,42 @@ "created_at": ISODate("2026-01-20T12:00:05Z") } -Recommended Index: - db.feature_history.create_index([ - ("entity_id", ASCENDING), - ("feature_view", ASCENDING), - ("event_timestamp", DESCENDING), - ]) +Feature Freshness Semantics: + This implementation operates at *document-level freshness*, not + per-feature freshness. During retrieval (e.g. point-in-time joins), + the system selects the most recent document for a given + (entity_id, feature_view) that satisfies time constraints, and then + extracts all requested features from that document. + + As a result, if a newer document contains only a subset of features, + missing features will be returned as NULL—even if older documents + contained values for those features. The system does not backfill + individual feature values from earlier events. + + This behavior matches common Feast offline store semantics, but may + differ from systems that compute "latest value per feature". + +Schema Evolution ("Feature Creep"): + Because features are stored in a flexible subdocument, different + documents for the same FeatureView may contain different sets of + feature fields over time. This supports: + - adding new features without backfilling historical data + - partial writes or sparse feature computation + + However, it also implies: + - newly added features will be NULL for older events + - partially populated documents may lead to NULL values even + when older data contained those features + + Users should ensure that feature computation pipelines write + complete feature sets when consistent availability is required. + +Notes: + - Entity keys are serialized to ensure consistency with Feast’s + online store and to avoid type ambiguity. + - Point-in-time correctness is enforced per FeatureView. + - TTL (time-to-live) constraints are applied per FeatureView during + historical retrieval. """ import json @@ -79,12 +119,7 @@ class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): - """Configuration for the Native MongoDB offline store. - - Uses a single shared collection for all feature views, with documents - containing an ``entity_id``, ``feature_view`` discriminator, and nested - ``features`` subdocument. - """ + """Configuration for the Native MongoDB offline store.""" type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBOfflineStoreNative" """Offline store type selector""" @@ -100,15 +135,16 @@ class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): class MongoDBSourceNative(DataSource): - """A MongoDB data source for the Native offline store. 
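A hypothetical example to make the document-level freshness semantics above concrete: two events exist for the same entity and FeatureView, and the newer event carries only a partial feature set. A point-in-time lookup reads all requested features from the newest qualifying document alone, so the missing feature comes back NULL rather than being backfilled from the older event:

    from datetime import datetime, timezone

    # Hypothetical documents for one entity in feature view "driver_stats":
    older = {
        "feature_view": "driver_stats",
        "features": {"conv_rate": 0.85, "acc_rate": 0.91},
        "event_timestamp": datetime(2026, 1, 20, 12, 0, tzinfo=timezone.utc),
    }
    newer = {  # partial write: conv_rate was not computed for this event
        "feature_view": "driver_stats",
        "features": {"acc_rate": 0.93},
        "event_timestamp": datetime(2026, 1, 20, 12, 5, tzinfo=timezone.utc),
    }

    # A PIT lookup at 12:10 selects the most recent qualifying document
    # (newer) and extracts all requested features from it alone:
    selected = newer
    print(selected["features"].get("conv_rate"))  # None, not 0.85
    print(selected["features"].get("acc_rate"))   # 0.93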
+ """A MongoDB data source for the native offline store. - Unlike MongoDBSource (Ibis), this source does not specify a collection - per FeatureView. Instead, all FeatureViews share a single collection - (configured at the store level), and are discriminated by the - ``feature_view`` field in each document. + Unlike many data source implementations, this source does not map each + FeatureView to its own table or collection. Instead, all FeatureViews + share a single MongoDB collection (configured at the store level). - The ``name`` parameter becomes the ``feature_view`` discriminator value - used to filter documents in queries. + Each document in that collection includes a ``feature_view`` field that + identifies which FeatureView it belongs to. The ``name`` of this data + source corresponds to that value and is used to filter documents during + queries. """ def __init__( @@ -400,7 +436,7 @@ class MongoDBOfflineStoreNative(OfflineStore): All feature views share one collection (``feature_history``), with documents containing: - ``entity_id``: serialized entity key (bytes) - - ``feature_view``: discriminator field matching FeatureView name + - ``feature_view``: field matching FeatureView name - ``features``: subdocument with feature name/value pairs - ``event_timestamp``: event time - ``created_at``: ingestion time @@ -623,7 +659,7 @@ def _run() -> pyarrow.Table: } ) - # Create temp collection with unique name + # Create temporary collection for query temp_collection_name = f"tmp_entity_df_{uuid.uuid4().hex[:12]}" client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) From 8d86cdd54861c61eae71056f66c1e13330246b76 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 18:21:39 -0400 Subject: [PATCH 12/30] Lazy index creation via _get_client_and_ensure_indexes Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_native.py | 103 ++++++++++++------ 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index ba2d9e29a0..c9cbae587a 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -302,19 +302,13 @@ def _infer_python_type_str(value: Any) -> Optional[str]: def _fetch_documents( - connection_string: str, + client: Any, database: str, collection: str, pipeline: List[Dict], ) -> List[Dict]: """Execute an aggregation pipeline and return documents.""" - if MongoClient is None: - raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") - client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) - try: - return list(client[database][collection].aggregate(pipeline)) - finally: - client.close() + return list(client[database][collection].aggregate(pipeline)) class MongoDBNativeRetrievalJob(RetrievalJob): @@ -442,6 +436,42 @@ class MongoDBOfflineStoreNative(OfflineStore): - ``created_at``: ingestion time """ + _index_initialized: bool = False + + @staticmethod + def _ensure_indexes(client: Any, db_name: str, collection_name: str) -> None: + """Create recommended indexes on the feature_history collection.""" + collection = client[db_name][collection_name] + collection.create_index( + [ + ("entity_id", 1), + ("feature_view", 1), + ("event_timestamp", -1), + ], + name="entity_fv_ts_idx", + ) + + @classmethod + def 
_get_client_and_ensure_indexes(cls, config: RepoConfig) -> Any: + """Get a MongoClient and ensure indexes exist (once per process).""" + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." + ) + client: Any = MongoClient( + config.offline_store.connection_string, driver=DRIVER_METADATA + ) + + if not cls._index_initialized: + cls._ensure_indexes( + client, + config.offline_store.database, + config.offline_store.collection, + ) + cls._index_initialized = True + + return client + @staticmethod def pull_latest_from_table_or_query( config: RepoConfig, @@ -463,7 +493,6 @@ def pull_latest_from_table_or_query( RuntimeWarning, ) - connection_string = config.offline_store.connection_string db_name = config.offline_store.database collection = config.offline_store.collection feature_view_name = data_source.feature_view_name @@ -501,17 +530,21 @@ def pull_latest_from_table_or_query( ] def _run() -> pyarrow.Table: - docs = _fetch_documents(connection_string, db_name, collection, pipeline) - if not docs: - return pyarrow.Table.from_pydict({}) - - df = pd.DataFrame(docs) - if not df.empty and "event_timestamp" in df.columns: - if df["event_timestamp"].dt.tz is None: - df["event_timestamp"] = pd.to_datetime( - df["event_timestamp"], utc=True - ) - return pyarrow.Table.from_pandas(df, preserve_index=False) + client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) + try: + docs = _fetch_documents(client, db_name, collection, pipeline) + if not docs: + return pyarrow.Table.from_pydict({}) + + df = pd.DataFrame(docs) + if not df.empty and "event_timestamp" in df.columns: + if df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime( + df["event_timestamp"], utc=True + ) + return pyarrow.Table.from_pandas(df, preserve_index=False) + finally: + client.close() return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) @@ -536,7 +569,6 @@ def pull_all_from_table_or_query( RuntimeWarning, ) - connection_string = config.offline_store.connection_string db_name = config.offline_store.database collection = config.offline_store.collection feature_view_name = data_source.feature_view_name @@ -570,17 +602,21 @@ def pull_all_from_table_or_query( ] def _run() -> pyarrow.Table: - docs = _fetch_documents(connection_string, db_name, collection, pipeline) - if not docs: - return pyarrow.Table.from_pydict({}) - - df = pd.DataFrame(docs) - if not df.empty and "event_timestamp" in df.columns: - if df["event_timestamp"].dt.tz is None: - df["event_timestamp"] = pd.to_datetime( - df["event_timestamp"], utc=True - ) - return pyarrow.Table.from_pandas(df, preserve_index=False) + client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) + try: + docs = _fetch_documents(client, db_name, collection, pipeline) + if not docs: + return pyarrow.Table.from_pydict({}) + + df = pd.DataFrame(docs) + if not df.empty and "event_timestamp" in df.columns: + if df["event_timestamp"].dt.tz is None: + df["event_timestamp"] = pd.to_datetime( + df["event_timestamp"], utc=True + ) + return pyarrow.Table.from_pandas(df, preserve_index=False) + finally: + client.close() return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) @@ -604,7 +640,6 @@ def get_historical_features( RuntimeWarning, ) - connection_string = config.offline_store.connection_string db_name = config.offline_store.database feature_collection = config.offline_store.collection entity_key_version = config.entity_key_serialization_version @@ -662,7 
+697,7 @@ def _run() -> pyarrow.Table: # Create temporary collection for query temp_collection_name = f"tmp_entity_df_{uuid.uuid4().hex[:12]}" - client: Any = MongoClient(connection_string, driver=DRIVER_METADATA) + client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) try: db = client[db_name] temp_collection = db[temp_collection_name] From a1e3c9386b69dbd4158a4eee88fd06ade2c5f9a2 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 19:13:48 -0400 Subject: [PATCH 13/30] Add performance benchmarks comparing Ibis vs Native MongoDB offline stores Signed-off-by: Casey Clements --- .../benchmark_mongodb_offline_stores.py | 836 ++++++++++++++++++ 1 file changed, 836 insertions(+) create mode 100644 sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py new file mode 100644 index 0000000000..177023dd6f --- /dev/null +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py @@ -0,0 +1,836 @@ +""" +Performance benchmarks comparing Ibis vs Native MongoDB offline store implementations. + +These tests measure performance across different scaling dimensions: +1. Row count scaling (entity_df size) +2. Feature width scaling (features per FeatureView) +3. Entity distribution (unique vs skewed/repeated entity_ids) + +Metrics captured: +- Runtime (wall clock) +- Memory (peak Python memory via tracemalloc) +- MongoDB server metrics (opcounters, execution stats) + +Run with: pytest benchmark_mongodb_offline_stores.py -v -s +Skip slow tests: pytest benchmark_mongodb_offline_stores.py -v -s -m "not slow" +""" + +import time +import tracemalloc +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Any, Dict, Generator, Optional + +import pandas as pd +import pytest +import pytz + +pytest.importorskip("pymongo") + +from unittest.mock import MagicMock + +from pymongo import MongoClient +from testcontainers.mongodb import MongoDbContainer + +from feast import Entity, FeatureView, Field +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( + MongoDBOfflineStoreIbis, + MongoDBOfflineStoreIbisConfig, +) +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native import ( + MongoDBOfflineStoreNative, + MongoDBOfflineStoreNativeConfig, + MongoDBSourceNative, +) +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( + MongoDBSource, +) +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import RepoConfig +from feast.types import Float64, Int64 +from feast.value_type import ValueType + +# Check if Docker is available +docker_available = False +try: + import docker + + try: + client = docker.from_env() + client.ping() + docker_available = True + except Exception: + pass +except ImportError: + pass + +_requires_docker = pytest.mark.skipif( + not docker_available, + reason="Docker is not available or not running.", +) + +ENTITY_KEY_VERSION = 3 + + +@dataclass +class BenchmarkResult: + """Container for benchmark results.""" + + implementation: str + test_name: str + dimension: str + value: int + duration_seconds: float + 
rows_per_second: float + peak_memory_mb: float = 0.0 + mongo_docs_examined: int = 0 + mongo_keys_examined: int = 0 + mongo_execution_time_ms: int = 0 + + +@dataclass +class MongoMetrics: + """MongoDB server metrics captured before/after a query.""" + + opcounters: Dict[str, int] = field(default_factory=dict) + docs_examined: int = 0 + keys_examined: int = 0 + + @staticmethod + def capture(client: Any) -> "MongoMetrics": + """Capture current MongoDB server metrics.""" + status = client.admin.command("serverStatus") + return MongoMetrics( + opcounters=dict(status.get("opcounters", {})), + ) + + def delta(self, after: "MongoMetrics") -> Dict[str, int]: + """Calculate delta between two metric snapshots.""" + return { + k: after.opcounters.get(k, 0) - self.opcounters.get(k, 0) + for k in after.opcounters + } + + +def _make_entity_id(driver_id: int) -> bytes: + """Create serialized entity key.""" + entity_key = EntityKeyProto() + entity_key.join_keys.append("driver_id") + val = ValueProto() + val.int64_val = driver_id + entity_key.entity_values.append(val) + return serialize_entity_key(entity_key, ENTITY_KEY_VERSION) + + +@pytest.fixture(scope="module") +def mongodb_container() -> Generator[MongoDbContainer, None, None]: + """Start a MongoDB container for benchmarks.""" + container = MongoDbContainer( + "mongo:latest", + username="test", + password="test", # pragma: allowlist secret + ).with_exposed_ports(27017) + container.start() + yield container + container.stop() + + +@pytest.fixture +def mongodb_connection_string(mongodb_container: MongoDbContainer) -> str: + """Get MongoDB connection string.""" + exposed_port = mongodb_container.get_exposed_port(27017) + return f"mongodb://test:test@localhost:{exposed_port}" # pragma: allowlist secret + + +@pytest.fixture +def ibis_config(mongodb_connection_string: str) -> RepoConfig: + """RepoConfig for Ibis implementation.""" + return RepoConfig( + project="benchmark", + registry="memory://", + provider="local", + offline_store=MongoDBOfflineStoreIbisConfig( + connection_string=mongodb_connection_string, + database="benchmark_db", + ), + online_store={"type": "sqlite"}, + entity_key_serialization_version=ENTITY_KEY_VERSION, + ) + + +@pytest.fixture +def native_config(mongodb_connection_string: str) -> RepoConfig: + """RepoConfig for Native implementation.""" + return RepoConfig( + project="benchmark", + registry="memory://", + provider="local", + offline_store=MongoDBOfflineStoreNativeConfig( + connection_string=mongodb_connection_string, + database="benchmark_db", + collection="feature_history", + ), + online_store={"type": "sqlite"}, + entity_key_serialization_version=ENTITY_KEY_VERSION, + ) + + +def _generate_ibis_data( + client: MongoClient, + db_name: str, + collection_name: str, + num_entities: int, + num_features: int, + rows_per_entity: int = 5, +) -> datetime: + """Generate test data for Ibis (one collection per FV, flat schema).""" + collection = client[db_name][collection_name] + collection.drop() + + now = datetime.now(tz=pytz.UTC) + docs = [] + + for entity_id in range(num_entities): + for row in range(rows_per_entity): + doc = { + "driver_id": entity_id, + "event_timestamp": now - timedelta(hours=row), + } + for f in range(num_features): + doc[f"feature_{f}"] = float(entity_id * 100 + f + row * 0.1) + docs.append(doc) + + collection.insert_many(docs) + return now + + +def _generate_native_data( + client: MongoClient, + db_name: str, + collection_name: str, + feature_view_name: str, + num_entities: int, + num_features: int, + 
rows_per_entity: int = 5, +) -> datetime: + """Generate test data for Native (single collection, nested features).""" + collection = client[db_name][collection_name] + # Don't drop - may have multiple FVs in same collection + + now = datetime.now(tz=pytz.UTC) + docs = [] + + for entity_id in range(num_entities): + for row in range(rows_per_entity): + features = {} + for f in range(num_features): + features[f"feature_{f}"] = float(entity_id * 100 + f + row * 0.1) + + doc = { + "entity_id": _make_entity_id(entity_id), + "feature_view": feature_view_name, + "features": features, + "event_timestamp": now - timedelta(hours=row), + "created_at": now - timedelta(hours=row), + } + docs.append(doc) + + collection.insert_many(docs) + return now + + +def _create_ibis_fv(num_features: int) -> tuple: + """Create Ibis source and FeatureView.""" + source = MongoDBSource( + name="driver_benchmark", + database="benchmark_db", + collection="driver_benchmark", + timestamp_field="event_timestamp", + ) + entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + + schema = [Field(name="driver_id", dtype=Int64)] + for f in range(num_features): + schema.append(Field(name=f"feature_{f}", dtype=Float64)) + + fv = FeatureView( + name="driver_benchmark", + entities=[entity], + schema=schema, + source=source, + ttl=timedelta(days=1), + ) + return source, fv + + +def _create_native_fv(num_features: int) -> tuple: + """Create Native source and FeatureView.""" + source = MongoDBSourceNative( + name="driver_benchmark", + timestamp_field="event_timestamp", + ) + entity = Entity( + name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 + ) + + schema = [Field(name="driver_id", dtype=Int64)] + for f in range(num_features): + schema.append(Field(name=f"feature_{f}", dtype=Float64)) + + fv = FeatureView( + name="driver_benchmark", + entities=[entity], + schema=schema, + source=source, + ttl=timedelta(days=1), + ) + return source, fv + + +def _run_benchmark(func, name: str) -> float: + """Run a function and return elapsed time.""" + start = time.perf_counter() + func() # Execute the function + elapsed = time.perf_counter() - start + return elapsed + + +@dataclass +class FullBenchmarkResult: + """Full benchmark results with all metrics.""" + + elapsed_seconds: float + peak_memory_mb: float + mongo_opcounters_delta: Dict[str, int] + + +def _run_benchmark_full( + func, + mongo_client: Optional[Any] = None, +) -> FullBenchmarkResult: + """Run a benchmark capturing runtime, memory, and MongoDB metrics.""" + # Capture MongoDB metrics before + mongo_before = None + if mongo_client: + mongo_before = MongoMetrics.capture(mongo_client) + + # Start memory tracking + tracemalloc.start() + + # Run the benchmark + start = time.perf_counter() + func() + elapsed = time.perf_counter() - start + + # Capture peak memory + _, peak_memory = tracemalloc.get_traced_memory() + tracemalloc.stop() + peak_memory_mb = peak_memory / (1024 * 1024) + + # Capture MongoDB metrics after + mongo_delta = {} + if mongo_client and mongo_before: + mongo_after = MongoMetrics.capture(mongo_client) + mongo_delta = mongo_before.delta(mongo_after) + + return FullBenchmarkResult( + elapsed_seconds=elapsed, + peak_memory_mb=peak_memory_mb, + mongo_opcounters_delta=mongo_delta, + ) + + +def _print_benchmark_result( + impl: str, + dimension_name: str, + dimension_value: int, + result: FullBenchmarkResult, + num_rows: Optional[int] = None, +) -> None: + """Pretty print benchmark results.""" + print(f"\n[{impl}] 
{dimension_name}: {dimension_value:,}") + print(f" Time: {result.elapsed_seconds:.3f}s") + print(f" Memory: {result.peak_memory_mb:.1f} MB") + if num_rows: + rate = num_rows / result.elapsed_seconds if result.elapsed_seconds > 0 else 0 + print(f" Rate: {rate:,.0f} rows/s") + if result.mongo_opcounters_delta: + print(f" Mongo ops: {result.mongo_opcounters_delta}") + + +# ============================================================================= +# Test 1: Scale Rows (entity_df size) +# ============================================================================= + +ROW_COUNTS = [ + 1000, + 5000, + 10000, +] # Reduced for CI; use [10000, 50000, 100000, 500000] for full benchmark + + +@_requires_docker +@pytest.mark.parametrize("num_rows", ROW_COUNTS) +def test_scale_rows_ibis( + mongodb_connection_string: str, ibis_config: RepoConfig, num_rows: int +) -> None: + """Benchmark Ibis implementation with varying entity_df sizes. + + Measures: runtime, peak memory, MongoDB opcounters. + """ + num_features = 10 + num_entities = num_rows # One row per entity for simplicity + + client = MongoClient(mongodb_connection_string) + try: + now = _generate_ibis_data( + client, + "benchmark_db", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=3, + ) + + _, fv = _create_ibis_fv(num_features) + + entity_df = pd.DataFrame( + { + "driver_id": list(range(num_entities)), + "event_timestamp": [now] * num_entities, + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreIbis.get_historical_features( + config=ibis_config, + feature_views=[fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + _print_benchmark_result("IBIS", "Rows", num_rows, result, num_rows=num_rows) + + finally: + client.close() + + +@_requires_docker +@pytest.mark.parametrize("num_rows", ROW_COUNTS) +def test_scale_rows_native( + mongodb_connection_string: str, native_config: RepoConfig, num_rows: int +) -> None: + """Benchmark Native implementation with varying entity_df sizes. + + Measures: runtime, peak memory, MongoDB opcounters. 
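A condensed, simplified sketch of the measurement pattern these tests share (wall-clock timing via time.perf_counter plus peak Python memory via tracemalloc, as in the _run_benchmark_full helper above; the workload lambda is hypothetical):

    import time
    import tracemalloc

    def measure(func):
        """Capture wall-clock time and peak Python memory for one call."""
        tracemalloc.start()
        start = time.perf_counter()
        func()
        elapsed = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        return elapsed, peak / (1024 * 1024)  # (seconds, MB)

    elapsed_s, peak_mb = measure(lambda: sum(i * i for i in range(100_000)))
    print(f"{elapsed_s:.3f}s, {peak_mb:.1f} MB")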
+ """ + num_features = 10 + num_entities = num_rows + + client = MongoClient(mongodb_connection_string) + try: + client["benchmark_db"]["feature_history"].drop() + now = _generate_native_data( + client, + "benchmark_db", + "feature_history", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=3, + ) + + _, fv = _create_native_fv(num_features) + + entity_df = pd.DataFrame( + { + "driver_id": list(range(num_entities)), + "event_timestamp": [now] * num_entities, + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreNative.get_historical_features( + config=native_config, + feature_views=[fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + _print_benchmark_result("NATIVE", "Rows", num_rows, result, num_rows=num_rows) + + finally: + client.close() + + +# ============================================================================= +# Test 2: Wide Feature Views (features per FV) +# ============================================================================= + +FEATURE_COUNTS = [10, 50, 100] # Use [50, 100, 150, 200] for full benchmark + + +@_requires_docker +@pytest.mark.parametrize("num_features", FEATURE_COUNTS) +def test_wide_features_ibis( + mongodb_connection_string: str, ibis_config: RepoConfig, num_features: int +) -> None: + """Benchmark Ibis with varying feature width.""" + num_entities = 1000 + + client = MongoClient(mongodb_connection_string) + try: + now = _generate_ibis_data( + client, + "benchmark_db", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=3, + ) + + _, fv = _create_ibis_fv(num_features) + + entity_df = pd.DataFrame( + { + "driver_id": list(range(num_entities)), + "event_timestamp": [now] * num_entities, + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreIbis.get_historical_features( + config=ibis_config, + feature_views=[fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + _print_benchmark_result( + "IBIS", "Features", num_features, result, num_rows=num_entities + ) + + finally: + client.close() + + +@_requires_docker +@pytest.mark.parametrize("num_features", FEATURE_COUNTS) +def test_wide_features_native( + mongodb_connection_string: str, native_config: RepoConfig, num_features: int +) -> None: + """Benchmark Native with varying feature width.""" + num_entities = 1000 + + client = MongoClient(mongodb_connection_string) + try: + client["benchmark_db"]["feature_history"].drop() + now = _generate_native_data( + client, + "benchmark_db", + "feature_history", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=3, + ) + + _, fv = _create_native_fv(num_features) + + entity_df = pd.DataFrame( + { + "driver_id": list(range(num_entities)), + "event_timestamp": [now] * num_entities, + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreNative.get_historical_features( + config=native_config, + feature_views=[fv], + feature_refs=feature_refs, + 
entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + _print_benchmark_result( + "NATIVE", "Features", num_features, result, num_rows=num_entities + ) + + finally: + client.close() + + +# ============================================================================= +# Test 3: Skewed Entity Distribution +# ============================================================================= + + +@_requires_docker +@pytest.mark.parametrize("unique_ratio", [1.0, 0.5, 0.1]) # 100%, 50%, 10% unique +def test_entity_skew_ibis( + mongodb_connection_string: str, ibis_config: RepoConfig, unique_ratio: float +) -> None: + """Benchmark Ibis with varying entity uniqueness in entity_df.""" + import numpy as np + + total_rows = 5000 + num_features = 10 + num_unique_entities = int(total_rows * unique_ratio) + num_unique_entities = max(num_unique_entities, 1) + + client = MongoClient(mongodb_connection_string) + try: + now = _generate_ibis_data( + client, + "benchmark_db", + "driver_benchmark", + num_entities=num_unique_entities, + num_features=num_features, + rows_per_entity=5, + ) + + _, fv = _create_ibis_fv(num_features) + + # Create entity_df with repeated entity_ids + entity_ids = np.random.choice( + num_unique_entities, size=total_rows, replace=True + ) + entity_df = pd.DataFrame( + { + "driver_id": entity_ids, + "event_timestamp": [ + now - timedelta(minutes=i % 60) for i in range(total_rows) + ], + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreIbis.get_historical_features( + config=ibis_config, + feature_views=[fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + print( + f"\n[IBIS] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" + ) + print(f" Time: {result.elapsed_seconds:.3f}s") + print(f" Memory: {result.peak_memory_mb:.1f} MB") + print(f" Mongo ops: {result.mongo_opcounters_delta}") + + finally: + client.close() + + +@_requires_docker +@pytest.mark.parametrize("unique_ratio", [1.0, 0.5, 0.1]) +def test_entity_skew_native( + mongodb_connection_string: str, native_config: RepoConfig, unique_ratio: float +) -> None: + """Benchmark Native with varying entity uniqueness in entity_df.""" + import numpy as np + + total_rows = 5000 + num_features = 10 + num_unique_entities = int(total_rows * unique_ratio) + num_unique_entities = max(num_unique_entities, 1) + + client = MongoClient(mongodb_connection_string) + try: + client["benchmark_db"]["feature_history"].drop() + now = _generate_native_data( + client, + "benchmark_db", + "feature_history", + "driver_benchmark", + num_entities=num_unique_entities, + num_features=num_features, + rows_per_entity=5, + ) + + _, fv = _create_native_fv(num_features) + + entity_ids = np.random.choice( + num_unique_entities, size=total_rows, replace=True + ) + entity_df = pd.DataFrame( + { + "driver_id": entity_ids, + "event_timestamp": [ + now - timedelta(minutes=i % 60) for i in range(total_rows) + ], + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + def run_query(): + job = MongoDBOfflineStoreNative.get_historical_features( + config=native_config, + feature_views=[fv], + feature_refs=feature_refs, + 
entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + result = _run_benchmark_full(run_query, mongo_client=client) + print( + f"\n[NATIVE] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" + ) + print(f" Time: {result.elapsed_seconds:.3f}s") + print(f" Memory: {result.peak_memory_mb:.1f} MB") + print(f" Mongo ops: {result.mongo_opcounters_delta}") + + finally: + client.close() + + +# ============================================================================= +# Summary comparison test +# ============================================================================= + + +@_requires_docker +def test_summary_comparison( + mongodb_connection_string: str, ibis_config: RepoConfig, native_config: RepoConfig +) -> None: + """Run a standard comparison and print summary with full metrics.""" + num_entities = 2000 + num_features = 20 + + client = MongoClient(mongodb_connection_string) + try: + # Setup Ibis data + now = _generate_ibis_data( + client, + "benchmark_db", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=5, + ) + + # Setup Native data + client["benchmark_db"]["feature_history"].drop() + _generate_native_data( + client, + "benchmark_db", + "feature_history", + "driver_benchmark", + num_entities=num_entities, + num_features=num_features, + rows_per_entity=5, + ) + + entity_df = pd.DataFrame( + { + "driver_id": list(range(num_entities)), + "event_timestamp": [now] * num_entities, + } + ) + + feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] + + # Ibis benchmark + _, ibis_fv = _create_ibis_fv(num_features) + + def run_ibis(): + job = MongoDBOfflineStoreIbis.get_historical_features( + config=ibis_config, + feature_views=[ibis_fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + ibis_result = _run_benchmark_full(run_ibis, mongo_client=client) + + # Native benchmark + _, native_fv = _create_native_fv(num_features) + + def run_native(): + job = MongoDBOfflineStoreNative.get_historical_features( + config=native_config, + feature_views=[native_fv], + feature_refs=feature_refs, + entity_df=entity_df, + registry=MagicMock(), + project="benchmark", + full_feature_names=False, + ) + return job.to_df() + + native_result = _run_benchmark_full(run_native, mongo_client=client) + + # Print summary + print("\n" + "=" * 70) + print("SUMMARY COMPARISON") + print("=" * 70) + print(f"Entities: {num_entities:,} | Features: {num_features}") + print("-" * 70) + print(f"{'Metric':<20} {'Ibis':>20} {'Native':>20}") + print("-" * 70) + print( + f"{'Time (s)':<20} {ibis_result.elapsed_seconds:>20.3f} {native_result.elapsed_seconds:>20.3f}" + ) + print( + f"{'Memory (MB)':<20} {ibis_result.peak_memory_mb:>20.1f} {native_result.peak_memory_mb:>20.1f}" + ) + print( + f"{'Rows/sec':<20} {num_entities / ibis_result.elapsed_seconds:>20,.0f} {num_entities / native_result.elapsed_seconds:>20,.0f}" + ) + print("-" * 70) + + if native_result.elapsed_seconds > 0: + ratio = native_result.elapsed_seconds / ibis_result.elapsed_seconds + print(f"Ibis is {ratio:.1f}x faster than Native") + print("=" * 70) + + finally: + client.close() From b8fcba5d5735fd871bdfe4bd2caa5c8257147bb8 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Mar 2026 19:28:00 -0400 Subject: [PATCH 14/30] Refactor Native get_historical_features: replace with 
 fetch+pandas join

- Eliminate $lookup-based PIT join which scaled poorly (O(n×m))
- Use single query to fetch all matching feature data
- Batch entity_ids into chunks of 1000 for large queries
- Flatten features subdoc with pd.json_normalize
- Apply pd.merge_asof for efficient PIT join per FeatureView
- Handle TTL filtering in pandas instead of MQL
- Remove unused _ttl_to_ms and _build_ttl_gte_expr helpers

Performance improvement:
- Before: 10k rows in ~188s (53 rows/s)
- After: 10k rows in ~7.4s (1,354 rows/s)
- Now competitive with Ibis implementation

Signed-off-by: Casey Clements
---
 .../mongodb_offline_store/mongodb_native.py   | 273 ++++++++----------
 1 file changed, 124 insertions(+), 149 deletions(-)

diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py
index c9cbae587a..6e1f610d37 100644
--- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py
+++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py
@@ -81,7 +81,6 @@
 """
 
 import json
-import uuid
 import warnings
 from datetime import datetime, timezone
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
@@ -375,55 +374,6 @@ def _serialize_entity_key_from_row(
     return serialize_entity_key(entity_key, entity_key_serialization_version)
 
 
-def _ttl_to_ms(fv: FeatureView) -> Optional[int]:
-    """Convert FeatureView TTL to milliseconds."""
-    if fv.ttl is None:
-        return None
-    return int(fv.ttl.total_seconds() * 1000)
-
-
-def _build_ttl_gte_expr(feature_views: List[FeatureView]) -> Optional[Dict[str, Any]]:
-    """Build a $gte expression with per-FV TTL using $switch.
-
-    Returns a MongoDB expression that evaluates to:
-        event_timestamp >= (entity_timestamp - ttl_for_this_feature_view)
-
-    Each feature_view can have a different TTL, handled via $switch branches.
-    If no feature views have TTL, returns None (no filtering needed).
-    """
-    branches = []
-
-    for fv in feature_views:
-        ttl_ms = _ttl_to_ms(fv)
-        if ttl_ms is None:
-            # No TTL for this FV - skip (effectively infinite history)
-            continue
-
-        branches.append(
-            {
-                "case": {"$eq": ["$feature_view", fv.name]},
-                "then": {"$subtract": ["$$ts", ttl_ms]},
-            }
-        )
-
-    # If no TTLs at all, no lower bound needed
-    if not branches:
-        return None
-
-    return {
-        "$gte": [
-            "$event_timestamp",
-            {
-                "$switch": {
-                    "branches": branches,
-                    # Default: no lower bound (for FVs without TTL)
-                    "default": {"$literal": 0},
-                }
-            },
-        ]
-    }
-
-
 class MongoDBOfflineStoreNative(OfflineStore):
     """Native MongoDB offline store using single-collection schema.
 
@@ -630,6 +580,13 @@ def get_historical_features(
         project: str,
         full_feature_names: bool = False,
     ) -> RetrievalJob:
+        """Fetch historical features using a "fetch + pandas join" strategy.
+
+        Instead of using $lookup (which scales poorly), this:
+        1. Extracts unique entity_ids and computes timestamp bounds
+        2. Fetches all matching feature data in one query
+        3. Uses pd.merge_asof for efficient point-in-time joins in Python
+        """
         if isinstance(entity_df, str):
             raise ValueError(
                 "MongoDBOfflineStoreNative does not support SQL entity_df strings. 
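A toy, self-contained illustration of the pd.merge_asof strategy named in this commit message; the column names mirror the implementation's internal _entity_id and _fv_ts, and the data is hypothetical:

    import pandas as pd

    entity_rows = pd.DataFrame(
        {
            "_entity_id": [b"k1", b"k1"],
            "event_timestamp": pd.to_datetime(
                ["2026-01-02", "2026-01-05"], utc=True
            ),
        }
    ).sort_values("event_timestamp")

    feature_rows = pd.DataFrame(
        {
            "_entity_id": [b"k1"],
            "_fv_ts": pd.to_datetime(["2026-01-03"], utc=True),
            "conv_rate": [0.9],
        }
    ).sort_values("_fv_ts")

    # Backward as-of join: each entity row takes the most recent feature
    # row at or before its timestamp, matched per entity key.
    joined = pd.merge_asof(
        entity_rows,
        feature_rows,
        left_on="event_timestamp",
        right_on="_fv_ts",
        by="_entity_id",
        direction="backward",
    )
    # Row at 2026-01-02 gets NaN (no feature row at or before it);
    # row at 2026-01-05 picks up the 2026-01-03 value 0.9.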
" @@ -654,18 +611,10 @@ def get_historical_features( fv_to_features.setdefault(fv_name, []).append(feat_name) fv_names = list(fv_to_features.keys()) - - # Build per-FV TTL expression using $switch - relevant_fvs = [fv for fv in feature_views if fv.name in fv_to_features] - ttl_expr = _build_ttl_gte_expr(relevant_fvs) + fv_by_name = {fv.name: fv for fv in feature_views} def _run() -> pyarrow.Table: - if MongoClient is None: - raise FeastExtrasDependencyImportError( - "mongodb", "pymongo is not installed." - ) - - # Prepare entity_df: ensure timestamps are UTC and serialize entity keys + # Prepare entity_df: ensure timestamps are UTC result = entity_df.copy() if result[event_timestamp_col].dt.tz is None: result[event_timestamp_col] = pd.to_datetime( @@ -683,110 +632,136 @@ def _run() -> pyarrow.Table: axis=1, ) - # Build temp collection documents - temp_docs = [] - for _, row in result.iterrows(): - temp_docs.append( - { - "entity_id": row["_entity_id"], - "event_timestamp": row[event_timestamp_col], - "_row_idx": _, # Preserve original order - } - ) + # Extract unique entity_ids and timestamp bounds + unique_entity_ids = result["_entity_id"].unique().tolist() + max_ts = result[event_timestamp_col].max() - # Create temporary collection for query - temp_collection_name = f"tmp_entity_df_{uuid.uuid4().hex[:12]}" + # Batch entity_ids into chunks to avoid huge $in queries + BATCH_SIZE = 1000 + all_feature_docs: List[Dict] = [] client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) try: - db = client[db_name] - temp_collection = db[temp_collection_name] - temp_collection.insert_many(temp_docs) - - # Build $lookup subpipeline with PIT join logic - # Match: entity_id, feature_view in list, event_timestamp <= entity.ts - match_conditions: List[Dict[str, Any]] = [ - {"$eq": ["$entity_id", "$$entity_id"]}, - {"$in": ["$feature_view", fv_names]}, - {"$lte": ["$event_timestamp", "$$ts"]}, - ] - # Add per-FV TTL filter using $switch - if ttl_expr is not None: - match_conditions.append(ttl_expr) - - lookup_pipeline: List[Dict[str, Any]] = [ - {"$match": {"$expr": {"$and": match_conditions}}}, - {"$sort": {"feature_view": 1, "event_timestamp": -1}}, - { - "$group": { - "_id": "$feature_view", - "doc": {"$first": "$$ROOT"}, - } - }, - ] + coll = client[db_name][feature_collection] - # Main aggregation pipeline - pipeline: List[Dict[str, Any]] = [ - { - "$lookup": { - "from": feature_collection, - "let": { - "entity_id": "$entity_id", - "ts": "$event_timestamp", - }, - "pipeline": lookup_pipeline, - "as": "feature_rows", - } - }, - {"$sort": {"_row_idx": 1}}, # Preserve original order - ] + for i in range(0, len(unique_entity_ids), BATCH_SIZE): + batch_ids = unique_entity_ids[i : i + BATCH_SIZE] - docs = list(temp_collection.aggregate(pipeline)) + # Single query: fetch all matching feature data + query = { + "entity_id": {"$in": batch_ids}, + "feature_view": {"$in": fv_names}, + "event_timestamp": {"$lte": max_ts}, + } + docs = list(coll.find(query, {"_id": 0})) + all_feature_docs.extend(docs) finally: - # Cleanup temp collection - client[db_name][temp_collection_name].drop() client.close() - if not docs: - return pyarrow.Table.from_pydict({}) - - # Build result DataFrame - result = result.reset_index(drop=True) - rows = [] - for doc in docs: - # Start with entity columns from original entity_df - row_idx = doc["_row_idx"] - row = result.iloc[row_idx][ - entity_columns + [event_timestamp_col] - ].to_dict() - - # Extract features from each feature_view's matched doc - feature_rows_by_fv = { 
- fr["_id"]: fr["doc"] for fr in doc.get("feature_rows", []) - } - - # Extract features from each feature_view's matched doc - # TTL is already applied server-side via $switch expression + # Handle empty result + if not all_feature_docs: + # Return entity_df with NULL feature columns for fv_name, features in fv_to_features.items(): - fv_doc = feature_rows_by_fv.get(fv_name) + for feat in features: + col_name = f"{fv_name}__{feat}" if full_feature_names else feat + result[col_name] = None + result = result.drop(columns=["_entity_id"]) + return pyarrow.Table.from_pandas(result, preserve_index=False) + + # Convert to DataFrame and flatten features subdoc + feature_df = pd.DataFrame(all_feature_docs) + # Rename entity_id to _entity_id to match result DataFrame + feature_df = feature_df.rename(columns={"entity_id": "_entity_id"}) + + # Flatten nested 'features' dict into top-level columns + if "features" in feature_df.columns: + features_expanded = pd.json_normalize(feature_df["features"]) + feature_df = pd.concat( + [feature_df.drop(columns=["features"]), features_expanded], axis=1 + ) + + # Ensure timestamps are tz-aware + if feature_df["event_timestamp"].dt.tz is None: + feature_df["event_timestamp"] = pd.to_datetime( + feature_df["event_timestamp"], utc=True + ) + + # Split by feature_view and perform PIT join for each + result = result.sort_values(event_timestamp_col).reset_index(drop=True) + + for fv_name, features in fv_to_features.items(): + fv = fv_by_name.get(fv_name) + + # Filter to this feature_view's data + fv_df = feature_df[feature_df["feature_view"] == fv_name].copy() + + if fv_df.empty: + # No data for this FV - fill with NULLs for feat in features: col_name = f"{fv_name}__{feat}" if full_feature_names else feat - if fv_doc is None: - row[col_name] = None - else: - row[col_name] = fv_doc.get("features", {}).get(feat) - - rows.append(row) - - result_df = pd.DataFrame(rows) - if not result_df.empty and event_timestamp_col in result_df.columns: - if result_df[event_timestamp_col].dt.tz is None: - result_df[event_timestamp_col] = pd.to_datetime( - result_df[event_timestamp_col], utc=True + result[col_name] = None + continue + + # Sort by timestamp for merge_asof + fv_df = fv_df.sort_values("event_timestamp").reset_index(drop=True) + + # Select columns for merge + merge_cols = ["_entity_id", "event_timestamp"] + [ + f for f in features if f in fv_df.columns + ] + fv_df_subset = fv_df[ + [c for c in merge_cols if c in fv_df.columns] + ].copy() + + # Rename to avoid conflicts + fv_df_subset = fv_df_subset.rename( + columns={"event_timestamp": "_fv_ts"} + ) + + # Point-in-time join using merge_asof + result = pd.merge_asof( + result, + fv_df_subset, + left_on=event_timestamp_col, + right_on="_fv_ts", + by="_entity_id", + direction="backward", + ) + + # Apply TTL: null out stale features + if fv and fv.ttl: + cutoff = result[event_timestamp_col] - fv.ttl + stale_mask = result["_fv_ts"] < cutoff + for feat in features: + if feat in result.columns: + result.loc[stale_mask, feat] = None + + # Rename features if full_feature_names + for feat in features: + if feat in result.columns and full_feature_names: + result = result.rename(columns={feat: f"{fv_name}__{feat}"}) + elif feat not in result.columns: + # Feature wasn't in the data - add NULL column + col_name = f"{fv_name}__{feat}" if full_feature_names else feat + result[col_name] = None + + # Drop temporary column + result = result.drop(columns=["_fv_ts"], errors="ignore") + + # Remove internal entity_id column and restore original order 
+ result = result.drop(columns=["_entity_id"], errors="ignore") + result = result.sort_index().reset_index(drop=True) + + # Ensure timestamp column is still tz-aware + if not result.empty and event_timestamp_col in result.columns: + if result[event_timestamp_col].dt.tz is None: + result[event_timestamp_col] = pd.to_datetime( + result[event_timestamp_col], utc=True ) - return pyarrow.Table.from_pandas(result_df, preserve_index=False) + + return pyarrow.Table.from_pandas(result, preserve_index=False) return MongoDBNativeRetrievalJob( query_fn=_run, From 5d516a841ee84571b54c2e70edcbd23d845f71fd Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Mar 2026 07:39:30 -0400 Subject: [PATCH 15/30] Refactor get_historical_features with chunked processing for large entity_df - Add CHUNK_SIZE (5000) for entity_df processing to bound memory usage - Extract _run_single helper function for processing each chunk - Add _chunk_dataframe generator for yielding DataFrame slices - Preserve original row ordering via _row_idx column - Exclude internal columns (prefixed with _) from entity key serialization - Concat chunk results and restore ordering at the end This allows processing arbitrarily large entity_df while keeping memory bounded by processing in 5000-row chunks. Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_native.py | 110 +++++++++++------- 1 file changed, 70 insertions(+), 40 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index 6e1f610d37..8c7822bca4 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -83,7 +83,17 @@ import json import warnings from datetime import datetime, timezone -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Generator, + Iterable, + List, + Optional, + Tuple, + Union, +) import pandas as pd import pyarrow @@ -584,8 +594,10 @@ def get_historical_features( Instead of using $lookup (which scales poorly), this: 1. Extracts unique entity_ids and computes timestamp bounds - 2. Fetches all matching feature data in one query + 2. Fetches all matching feature data in batched queries 3. Uses pd.merge_asof for efficient point-in-time joins in Python + + For large entity_df, processing is chunked to bound memory usage. 
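A standalone sketch of the chunked-processing pattern this commit introduces, mirroring _chunk_dataframe plus the _row_idx ordering trick; the frame and transform here are hypothetical:

    import pandas as pd

    def chunk_dataframe(df, size):
        """Yield successive row slices of df."""
        for i in range(0, len(df), size):
            yield df.iloc[i : i + size]

    big = pd.DataFrame({"x": range(12)})
    big["_row_idx"] = range(len(big))  # remember original row order

    # Process each bounded chunk independently, then reassemble and
    # restore the caller's original ordering.
    parts = [chunk.assign(y=chunk["x"] * 2) for chunk in chunk_dataframe(big, 5)]
    out = (
        pd.concat(parts, ignore_index=True)
        .sort_values("_row_idx")
        .reset_index(drop=True)
        .drop(columns=["_row_idx"])
    )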
""" if isinstance(entity_df, str): raise ValueError( @@ -613,16 +625,33 @@ def get_historical_features( fv_names = list(fv_to_features.keys()) fv_by_name = {fv.name: fv for fv in feature_views} - def _run() -> pyarrow.Table: + # Chunk size for entity_df processing (bounds memory usage) + CHUNK_SIZE = 5000 + # Batch size for MongoDB $in queries + MONGO_BATCH_SIZE = 1000 + + def _chunk_dataframe( + df: pd.DataFrame, size: int + ) -> Generator[pd.DataFrame, None, None]: + """Yield successive chunks of a DataFrame.""" + for i in range(0, len(df), size): + yield df.iloc[i : i + size] + + def _run_single(entity_subset_df: pd.DataFrame) -> pd.DataFrame: + """Process a single chunk of entity_df and return joined features.""" # Prepare entity_df: ensure timestamps are UTC - result = entity_df.copy() + result = entity_subset_df.copy() if result[event_timestamp_col].dt.tz is None: result[event_timestamp_col] = pd.to_datetime( result[event_timestamp_col], utc=True ) - # Get join keys (all columns except event_timestamp) - entity_columns = [c for c in result.columns if c != event_timestamp_col] + # Get join keys (all columns except event_timestamp and internal columns) + entity_columns = [ + c + for c in result.columns + if c != event_timestamp_col and not c.startswith("_") + ] # Serialize entity keys to bytes (same format as online store) result["_entity_id"] = result.apply( @@ -632,22 +661,20 @@ def _run() -> pyarrow.Table: axis=1, ) - # Extract unique entity_ids and timestamp bounds + # Extract unique entity_ids and timestamp bounds for this chunk unique_entity_ids = result["_entity_id"].unique().tolist() max_ts = result[event_timestamp_col].max() - # Batch entity_ids into chunks to avoid huge $in queries - BATCH_SIZE = 1000 + # Fetch feature data in batches all_feature_docs: List[Dict] = [] client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) try: coll = client[db_name][feature_collection] - for i in range(0, len(unique_entity_ids), BATCH_SIZE): - batch_ids = unique_entity_ids[i : i + BATCH_SIZE] + for i in range(0, len(unique_entity_ids), MONGO_BATCH_SIZE): + batch_ids = unique_entity_ids[i : i + MONGO_BATCH_SIZE] - # Single query: fetch all matching feature data query = { "entity_id": {"$in": batch_ids}, "feature_view": {"$in": fv_names}, @@ -661,66 +688,53 @@ def _run() -> pyarrow.Table: # Handle empty result if not all_feature_docs: - # Return entity_df with NULL feature columns for fv_name, features in fv_to_features.items(): for feat in features: col_name = f"{fv_name}__{feat}" if full_feature_names else feat result[col_name] = None - result = result.drop(columns=["_entity_id"]) - return pyarrow.Table.from_pandas(result, preserve_index=False) + return result.drop(columns=["_entity_id"]) # Convert to DataFrame and flatten features subdoc feature_df = pd.DataFrame(all_feature_docs) - - # Rename entity_id to _entity_id to match result DataFrame feature_df = feature_df.rename(columns={"entity_id": "_entity_id"}) - # Flatten nested 'features' dict into top-level columns if "features" in feature_df.columns: features_expanded = pd.json_normalize(feature_df["features"]) feature_df = pd.concat( [feature_df.drop(columns=["features"]), features_expanded], axis=1 ) - # Ensure timestamps are tz-aware if feature_df["event_timestamp"].dt.tz is None: feature_df["event_timestamp"] = pd.to_datetime( feature_df["event_timestamp"], utc=True ) - # Split by feature_view and perform PIT join for each + # Sort result for merge_asof result = 
result.sort_values(event_timestamp_col).reset_index(drop=True) + # Perform PIT join for each feature view for fv_name, features in fv_to_features.items(): fv = fv_by_name.get(fv_name) - - # Filter to this feature_view's data fv_df = feature_df[feature_df["feature_view"] == fv_name].copy() if fv_df.empty: - # No data for this FV - fill with NULLs for feat in features: col_name = f"{fv_name}__{feat}" if full_feature_names else feat result[col_name] = None continue - # Sort by timestamp for merge_asof fv_df = fv_df.sort_values("event_timestamp").reset_index(drop=True) - # Select columns for merge merge_cols = ["_entity_id", "event_timestamp"] + [ f for f in features if f in fv_df.columns ] fv_df_subset = fv_df[ [c for c in merge_cols if c in fv_df.columns] ].copy() - - # Rename to avoid conflicts fv_df_subset = fv_df_subset.rename( columns={"event_timestamp": "_fv_ts"} ) - # Point-in-time join using merge_asof result = pd.merge_asof( result, fv_df_subset, @@ -730,7 +744,7 @@ def _run() -> pyarrow.Table: direction="backward", ) - # Apply TTL: null out stale features + # Apply TTL if fv and fv.ttl: cutoff = result[event_timestamp_col] - fv.ttl stale_mask = result["_fv_ts"] < cutoff @@ -743,25 +757,41 @@ def _run() -> pyarrow.Table: if feat in result.columns and full_feature_names: result = result.rename(columns={feat: f"{fv_name}__{feat}"}) elif feat not in result.columns: - # Feature wasn't in the data - add NULL column col_name = f"{fv_name}__{feat}" if full_feature_names else feat result[col_name] = None - # Drop temporary column result = result.drop(columns=["_fv_ts"], errors="ignore") - # Remove internal entity_id column and restore original order - result = result.drop(columns=["_entity_id"], errors="ignore") - result = result.sort_index().reset_index(drop=True) + return result.drop(columns=["_entity_id"], errors="ignore") - # Ensure timestamp column is still tz-aware - if not result.empty and event_timestamp_col in result.columns: - if result[event_timestamp_col].dt.tz is None: - result[event_timestamp_col] = pd.to_datetime( - result[event_timestamp_col], utc=True + def _run() -> pyarrow.Table: + # Add row index to preserve original ordering + working_df = entity_df.copy() + working_df["_row_idx"] = range(len(working_df)) + + if len(working_df) <= CHUNK_SIZE: + # Small workload: process in single pass + result_df = _run_single(working_df) + else: + # Large workload: process in chunks + chunk_results = [] + for chunk in _chunk_dataframe(working_df, CHUNK_SIZE): + chunk_results.append(_run_single(chunk)) + + result_df = pd.concat(chunk_results, ignore_index=True) + + # Restore original ordering and remove index column + result_df = result_df.sort_values("_row_idx").reset_index(drop=True) + result_df = result_df.drop(columns=["_row_idx"], errors="ignore") + + # Ensure timestamp column is tz-aware + if not result_df.empty and event_timestamp_col in result_df.columns: + if result_df[event_timestamp_col].dt.tz is None: + result_df[event_timestamp_col] = pd.to_datetime( + result_df[event_timestamp_col], utc=True ) - return pyarrow.Table.from_pandas(result, preserve_index=False) + return pyarrow.Table.from_pandas(result_df, preserve_index=False) return MongoDBNativeRetrievalJob( query_fn=_run, From c7281fb06705d7bd74343dcf67811bb4f75abf56 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Mar 2026 09:58:50 -0400 Subject: [PATCH 16/30] Optimize Native get_historical_features: reuse client, increase batch sizes Performance optimizations: - Reuse MongoClient across chunks (was creating new 
client per chunk) - Increase CHUNK_SIZE from 5,000 to 50,000 rows - Increase MONGO_BATCH_SIZE from 1,000 to 10,000 entity_ids - Pass collection to _run_single instead of creating client each time - Make index creation idempotent (check for existing index) Results (100k rows): - Before: 21.7s - After: 5.2s (4.2x faster) Results (1M rows): - Before: 1664s (28 min) - After: 212s (3.5 min) (7.8x faster) Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_native.py | 84 +++++++++++-------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py index 8c7822bca4..aa0c88f033 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py @@ -400,15 +400,24 @@ class MongoDBOfflineStoreNative(OfflineStore): @staticmethod def _ensure_indexes(client: Any, db_name: str, collection_name: str) -> None: - """Create recommended indexes on the feature_history collection.""" + """Create recommended indexes on the feature_history collection. + + Uses create_index with background=True. If index already exists + (with same or different name), this is a no-op. + """ collection = client[db_name][collection_name] + # Check if an equivalent index already exists + existing_indexes = collection.index_information() + target_key = [("entity_id", 1), ("feature_view", 1), ("event_timestamp", -1)] + + for idx_info in existing_indexes.values(): + if idx_info.get("key") == target_key: + return # Index already exists + collection.create_index( - [ - ("entity_id", 1), - ("feature_view", 1), - ("event_timestamp", -1), - ], + target_key, name="entity_fv_ts_idx", + background=True, ) @classmethod @@ -626,9 +635,9 @@ def get_historical_features( fv_by_name = {fv.name: fv for fv in feature_views} # Chunk size for entity_df processing (bounds memory usage) - CHUNK_SIZE = 5000 + CHUNK_SIZE = 50_000 # Batch size for MongoDB $in queries - MONGO_BATCH_SIZE = 1000 + MONGO_BATCH_SIZE = 10_000 def _chunk_dataframe( df: pd.DataFrame, size: int @@ -637,8 +646,13 @@ def _chunk_dataframe( for i in range(0, len(df), size): yield df.iloc[i : i + size] - def _run_single(entity_subset_df: pd.DataFrame) -> pd.DataFrame: - """Process a single chunk of entity_df and return joined features.""" + def _run_single(entity_subset_df: pd.DataFrame, coll: Any) -> pd.DataFrame: + """Process a single chunk of entity_df and return joined features. 
+ + Args: + entity_subset_df: Chunk of entity DataFrame to process + coll: MongoDB collection object (reused across chunks) + """ # Prepare entity_df: ensure timestamps are UTC result = entity_subset_df.copy() if result[event_timestamp_col].dt.tz is None: @@ -668,23 +682,16 @@ def _run_single(entity_subset_df: pd.DataFrame) -> pd.DataFrame: # Fetch feature data in batches all_feature_docs: List[Dict] = [] - client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) - try: - coll = client[db_name][feature_collection] - - for i in range(0, len(unique_entity_ids), MONGO_BATCH_SIZE): - batch_ids = unique_entity_ids[i : i + MONGO_BATCH_SIZE] - - query = { - "entity_id": {"$in": batch_ids}, - "feature_view": {"$in": fv_names}, - "event_timestamp": {"$lte": max_ts}, - } - docs = list(coll.find(query, {"_id": 0})) - all_feature_docs.extend(docs) + for i in range(0, len(unique_entity_ids), MONGO_BATCH_SIZE): + batch_ids = unique_entity_ids[i : i + MONGO_BATCH_SIZE] - finally: - client.close() + query = { + "entity_id": {"$in": batch_ids}, + "feature_view": {"$in": fv_names}, + "event_timestamp": {"$lte": max_ts}, + } + docs = list(coll.find(query, {"_id": 0})) + all_feature_docs.extend(docs) # Handle empty result if not all_feature_docs: @@ -769,16 +776,23 @@ def _run() -> pyarrow.Table: working_df = entity_df.copy() working_df["_row_idx"] = range(len(working_df)) - if len(working_df) <= CHUNK_SIZE: - # Small workload: process in single pass - result_df = _run_single(working_df) - else: - # Large workload: process in chunks - chunk_results = [] - for chunk in _chunk_dataframe(working_df, CHUNK_SIZE): - chunk_results.append(_run_single(chunk)) + # Create client once for all chunks + client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) + try: + coll = client[db_name][feature_collection] + + if len(working_df) <= CHUNK_SIZE: + # Small workload: process in single pass + result_df = _run_single(working_df, coll) + else: + # Large workload: process in chunks + chunk_results = [] + for chunk in _chunk_dataframe(working_df, CHUNK_SIZE): + chunk_results.append(_run_single(chunk, coll)) - result_df = pd.concat(chunk_results, ignore_index=True) + result_df = pd.concat(chunk_results, ignore_index=True) + finally: + client.close() # Restore original ordering and remove index column result_df = result_df.sort_values("_row_idx").reset_index(drop=True) From 18bb99946cf93730cac6326137f9e425096e049b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Mar 2026 13:46:30 -0400 Subject: [PATCH 17/30] Remove duplicate MongoDBOfflineStoreNative from mongodb.py The Native implementation now lives exclusively in mongodb_native.py with the single-collection schema. This removes the confusing duplicate that used the Ibis collection-per-FV schema. Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/mongodb.py | 416 +----------------- 1 file changed, 2 insertions(+), 414 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index 51100ef827..10ffd6c533 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -13,12 +13,11 @@ # limitations under the License. 
import warnings -from datetime import datetime, timezone -from typing import Any, Callable, Dict, List, Optional, Union +from datetime import datetime +from typing import Any, Callable, List, Optional, Union import ibis import pandas as pd -import pyarrow from ibis.expr.types import Table from pydantic import StrictStr @@ -45,14 +44,9 @@ from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, - RetrievalMetadata, -) -from feast.infra.offline_stores.offline_utils import ( - infer_event_timestamp_from_entity_df, ) from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.saved_dataset import SavedDatasetStorage class MongoDBOfflineStoreIbisConfig(FeastConfigBaseModel): @@ -247,409 +241,3 @@ def writer( client.close() return writer - - -# --------------------------------------------------------------------------- -# Native MQL implementation -# --------------------------------------------------------------------------- - - -class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): - """Configuration for the MongoDB native-MQL offline store.""" - - type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStoreNative" - """Offline store type selector""" - - connection_string: StrictStr = "mongodb://localhost:27017" - """MongoDB connection URI""" - - database: StrictStr = "feast" - """Default MongoDB database name""" - - -def _fetch_collection_as_arrow( - connection_string: str, - db_name: str, - collection: str, - pipeline: Optional[List[Dict]] = None, -) -> pyarrow.Table: - """Run an aggregation pipeline (or full scan) via PyMongo and return a pyarrow Table. - - If *pipeline* is None the entire collection is scanned (``_id`` excluded). - The ``_id`` field is stripped from every result document before conversion. - """ - if MongoClient is None: - raise FeastExtrasDependencyImportError("mongodb", "pymongo is not installed.") - client: Any = MongoClient(connection_string, driver=DRIVER_METADATA, tz_aware=True) - try: - if pipeline is not None: - docs = list(client[db_name][collection].aggregate(pipeline)) - else: - docs = list(client[db_name][collection].find({}, {"_id": 0})) - finally: - client.close() - - if not docs: - return pyarrow.table({}) - - for doc in docs: - doc.pop("_id", None) - - return pyarrow.Table.from_pylist(docs) - - -class MongoDBNativeRetrievalJob(RetrievalJob): - """A RetrievalJob whose results come from a lazy PyMongo query callable. - - The callable is only executed when the caller materialises the job (e.g. - ``to_df()``, ``to_arrow()``, ``persist()``). 
- """ - - def __init__( - self, - query_fn: Callable[[], pyarrow.Table], - full_feature_names: bool, - on_demand_feature_views: List, - metadata: Optional[RetrievalMetadata], - config: RepoConfig, - ) -> None: - super().__init__() - self._query_fn = query_fn - self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views or [] - self._metadata = metadata - self._config = config - - def _to_arrow_internal(self, timeout: Optional[int] = None) -> pyarrow.Table: - return self._query_fn() - - def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: - return self._to_arrow_internal().to_pandas() - - @property - def full_feature_names(self) -> bool: - return self._full_feature_names - - @property - def on_demand_feature_views(self) -> List: - return self._on_demand_feature_views - - @property - def metadata(self) -> Optional[RetrievalMetadata]: - return self._metadata - - def persist( - self, - storage: SavedDatasetStorage, - allow_overwrite: bool = False, - timeout: Optional[int] = None, - ) -> None: - if MongoClient is None: - raise FeastExtrasDependencyImportError( - "mongodb", "pymongo is not installed." - ) - data_source = storage.to_data_source() - if not isinstance(data_source, MongoDBSource): - raise ValueError( - f"MongoDBNativeRetrievalJob.persist expected a MongoDBSource storage, " - f"got {type(data_source).__name__!r}." - ) - table = self._to_arrow_internal() - connection_string = self._config.offline_store.connection_string - db_name = data_source.database or self._config.offline_store.database - location = f"{db_name}.{data_source.collection}" - client: Any = MongoClient( - connection_string, driver=DRIVER_METADATA, tz_aware=True - ) - try: - coll = client[db_name][data_source.collection] - if not allow_overwrite and coll.estimated_document_count() > 0: - raise SavedDatasetLocationAlreadyExists(location=location) - coll.drop() - records = table.to_pylist() - if records: - coll.insert_many(records) - finally: - client.close() - - -class MongoDBOfflineStoreNative(OfflineStore): - """Offline store backed by MongoDB using native MQL aggregation pipelines. - - Compared with :class:`MongoDBOfflineStoreIbis`, this implementation avoids - the Ibis dependency entirely. The three main workflows map to: - - * ``offline_write_batch`` – Arrow → ``insert_many`` - * ``pull_latest_from_table_or_query`` – ``$match`` → ``$sort`` → ``$group`` - * ``pull_all_from_table_or_query`` – ``$match`` → ``$project`` - * ``get_historical_features`` – per-collection fetch + ``merge_asof`` - """ - - @staticmethod - def offline_write_batch( - config: RepoConfig, - feature_view: FeatureView, - table: pyarrow.Table, - progress: Optional[Callable[[int], Any]], - ) -> None: - if MongoClient is None: - raise FeastExtrasDependencyImportError( - "mongodb", "pymongo is not installed." - ) - data_source = feature_view.batch_source - if not isinstance(data_source, MongoDBSource): - raise ValueError( - f"MongoDBOfflineStoreNative.offline_write_batch expected a MongoDBSource, " - f"got {type(data_source).__name__!r}." 
- ) - connection_string = config.offline_store.connection_string - db_name = data_source.database or config.offline_store.database - records = table.to_pylist() - client: Any = MongoClient( - connection_string, driver=DRIVER_METADATA, tz_aware=True - ) - try: - coll = client[db_name][data_source.collection] - if records: - coll.insert_many(records) - if progress: - progress(len(records)) - finally: - client.close() - - @staticmethod - def pull_latest_from_table_or_query( - config: RepoConfig, - data_source: DataSource, - join_key_columns: List[str], - feature_name_columns: List[str], - timestamp_field: str, - created_timestamp_column: Optional[str], - start_date: datetime, - end_date: datetime, - ) -> RetrievalJob: - if not isinstance(data_source, MongoDBSource): - raise ValueError( - f"MongoDBOfflineStoreNative expected a MongoDBSource, " - f"got {type(data_source).__name__!r}." - ) - warnings.warn( - "MongoDB offline store (native) is in preview. API may change without notice.", - RuntimeWarning, - ) - start_utc = start_date.astimezone(tz=timezone.utc) - end_utc = end_date.astimezone(tz=timezone.utc) - connection_string = config.offline_store.connection_string - db_name = data_source.database or config.offline_store.database - collection = data_source.collection - - # Sort by timestamp descending so $first in $group gets the latest document - sort_spec: Dict = {timestamp_field: -1} - if created_timestamp_column: - sort_spec[created_timestamp_column] = -1 - - # Group by entity/join keys. _id becomes a subdocument like {driver_id: 1}. - # $first grabs values from the first document in each group (the latest, - # due to prior $sort). - group_id = {k: f"${k}" for k in join_key_columns} - group_stage: Dict = { - "_id": group_id, - **{f: {"$first": f"${f}"} for f in feature_name_columns}, - timestamp_field: {"$first": f"${timestamp_field}"}, - } - if created_timestamp_column: - group_stage[created_timestamp_column] = { - "$first": f"${created_timestamp_column}" - } - - # Project to flatten the output: extract join keys from _id subdocument, - # include feature columns directly. Excludes the _id field from output. - project_stage: Dict = { - "_id": 0, - **{k: f"$_id.{k}" for k in join_key_columns}, - **{f: 1 for f in feature_name_columns}, - timestamp_field: 1, - } - if created_timestamp_column: - project_stage[created_timestamp_column] = 1 - - pipeline = [ - {"$match": {timestamp_field: {"$gte": start_utc, "$lte": end_utc}}}, - {"$sort": sort_spec}, - {"$group": group_stage}, - {"$project": project_stage}, - ] - - def _run() -> pyarrow.Table: - return _fetch_collection_as_arrow( - connection_string, db_name, collection, pipeline - ) - - return MongoDBNativeRetrievalJob( - query_fn=_run, - full_feature_names=False, - on_demand_feature_views=[], - metadata=None, - config=config, - ) - - @staticmethod - def pull_all_from_table_or_query( - config: RepoConfig, - data_source: DataSource, - join_key_columns: List[str], - feature_name_columns: List[str], - timestamp_field: str, - created_timestamp_column: Optional[str] = None, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, - ) -> RetrievalJob: - if not isinstance(data_source, MongoDBSource): - raise ValueError( - f"MongoDBOfflineStoreNative expected a MongoDBSource, " - f"got {type(data_source).__name__!r}." - ) - warnings.warn( - "MongoDB offline store (native) is in preview. 
API may change without notice.", - RuntimeWarning, - ) - connection_string = config.offline_store.connection_string - db_name = data_source.database or config.offline_store.database - collection = data_source.collection - - fields = join_key_columns + feature_name_columns + [timestamp_field] - if created_timestamp_column: - fields.append(created_timestamp_column) - - match_filter: Dict = {} - if start_date or end_date: - ts_filter: Dict = {} - if start_date: - ts_filter["$gte"] = start_date.astimezone(tz=timezone.utc) - if end_date: - ts_filter["$lte"] = end_date.astimezone(tz=timezone.utc) - match_filter[timestamp_field] = ts_filter - - pipeline = [ - {"$match": match_filter}, - {"$project": {"_id": 0, **{f: 1 for f in fields}}}, - ] - - def _run() -> pyarrow.Table: - return _fetch_collection_as_arrow( - connection_string, db_name, collection, pipeline - ) - - return MongoDBNativeRetrievalJob( - query_fn=_run, - full_feature_names=False, - on_demand_feature_views=[], - metadata=None, - config=config, - ) - - @staticmethod - def get_historical_features( - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pd.DataFrame, str], - registry: BaseRegistry, - project: str, - full_feature_names: bool = False, - ) -> RetrievalJob: - if isinstance(entity_df, str): - raise ValueError( - "MongoDBOfflineStoreNative does not support SQL entity_df strings. " - "Pass a pandas DataFrame instead." - ) - warnings.warn( - "MongoDB offline store (native) is in preview. API may change without notice.", - RuntimeWarning, - ) - connection_string = config.offline_store.connection_string - default_db = config.offline_store.database - - entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)) - event_timestamp_col = infer_event_timestamp_from_entity_df(entity_schema) - - # Map "feature_view:feature" refs → {fv_name: [feature, ...]} - fv_to_features: Dict[str, List[str]] = {} - for ref in feature_refs: - fv_name, feat_name = ref.split(":", 1) - fv_to_features.setdefault(fv_name, []).append(feat_name) - - fv_by_name = {fv.name: fv for fv in feature_views} - - def _run() -> pyarrow.Table: - result = entity_df.copy() - # Ensure the entity timestamp is tz-aware UTC for merge_asof - if result[event_timestamp_col].dt.tz is None: - result[event_timestamp_col] = pd.to_datetime( - result[event_timestamp_col], utc=True - ) - result = result.sort_values(event_timestamp_col) - - for fv_name, features in fv_to_features.items(): - fv = fv_by_name[fv_name] - source = fv.batch_source - if not isinstance(source, MongoDBSource): - raise ValueError( - f"MongoDBOfflineStoreNative: feature view {fv_name!r} has " - f"a non-MongoDBSource batch source ({type(source).__name__!r})." 
- ) - db_name = source.database or default_db - ts_field = source.timestamp_field - join_keys = [e.name for e in fv.entity_columns] - - arrow_table = _fetch_collection_as_arrow( - connection_string, db_name, source.collection - ) - if arrow_table.num_rows == 0: - for f in features: - col = f"{fv_name}__{f}" if full_feature_names else f - result[col] = None - continue - - feature_df = arrow_table.to_pandas() - # Ensure tz-aware UTC - if feature_df[ts_field].dt.tz is None: - feature_df[ts_field] = pd.to_datetime( - feature_df[ts_field], utc=True - ) - feature_df = feature_df.sort_values(ts_field) - - col_rename = { - f: (f"{fv_name}__{f}" if full_feature_names else f) - for f in features - } - cols_to_select = join_keys + features + [ts_field] - feature_df = feature_df[cols_to_select].rename(columns=col_rename) - out_features = list(col_rename.values()) - - merged = pd.merge_asof( - result, - feature_df, - left_on=event_timestamp_col, - right_on=ts_field, - by=join_keys, - direction="backward", - ) - # Apply TTL: null out features whose timestamp is too far in the past - if fv.ttl: - cutoff = merged[event_timestamp_col] - fv.ttl - too_old = merged[ts_field] < cutoff - for col in out_features: - merged.loc[too_old, col] = None - - result = merged.drop(columns=[ts_field], errors="ignore") - - return pyarrow.Table.from_pandas(result, preserve_index=False) - - return MongoDBNativeRetrievalJob( - query_fn=_run, - full_feature_names=full_feature_names, - on_demand_feature_views=[], - metadata=None, - config=config, - ) From 38d40f58ec74c6e04f839f03cbfdc36b542a9d65 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Mar 2026 14:12:59 -0400 Subject: [PATCH 18/30] Consolidate mongodb_source.py into mongodb.py - Move MongoDBSource, MongoDBOptions, SavedDatasetMongoDBStorage into mongodb.py - Move _infer_python_type_str helper into mongodb.py - Update imports in tests and benchmarks - Remove mongodb_source.py This consolidates the collection-per-FV implementation into a single file, making the codebase easier to navigate. Signed-off-by: Casey Clements --- design-notes/CASEY_SESSION_NOTES.md | 109 +++++++ design-notes/design-hybrid-with-batches.md | 239 +++++++++++++++ design-notes/native_implementation_notes.md | 191 ++++++++++++ design-notes/offline_store_design.md | 98 ++++++ ...ompt-mdb-fetch-pandas-join-with-batches.md | 108 +++++++ .../contrib/mongodb_offline_store/mongodb.py | 278 ++++++++++++++++- .../mongodb_offline_store/mongodb_source.py | 283 ------------------ .../benchmark_mongodb_offline_stores.py | 4 +- .../contrib/test_mongodb_offline_retrieval.py | 2 - 9 files changed, 1020 insertions(+), 292 deletions(-) create mode 100644 design-notes/CASEY_SESSION_NOTES.md create mode 100644 design-notes/design-hybrid-with-batches.md create mode 100644 design-notes/native_implementation_notes.md create mode 100644 design-notes/offline_store_design.md create mode 100644 design-notes/prompt-mdb-fetch-pandas-join-with-batches.md delete mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py diff --git a/design-notes/CASEY_SESSION_NOTES.md b/design-notes/CASEY_SESSION_NOTES.md new file mode 100644 index 0000000000..7a0b6f158f --- /dev/null +++ b/design-notes/CASEY_SESSION_NOTES.md @@ -0,0 +1,109 @@ +# MongoDB Feast Integration — Session Notes +_Last updated: 2026-03-16. 
Resume here after OS upgrade._ + +--- + +## Status at a Glance + +| Component | Branch | Status | +|---|---|---| +| **Online Store** | `INTPYTHON-297-MongoDB-Feast-Integration` | ✅ **Merged to upstream/master** | +| **Offline Store** | `FEAST-OfflineStore-INTPYTHON-297` | 🔧 In progress — next focus | + +--- + +## Online Store — COMPLETE ✅ + +### What was done +- Implemented `MongoDBOnlineStore` with full sync + async API +- Refactored write path: extracted `_build_write_ops` static method to eliminate code + duplication between `online_write_batch` and `online_write_batch_async` +- Added Feast driver metadata to MongoDB client instantiations +- Registered MongoDB in the feast-operator (kubebuilder enums, `ValidOnlineStoreDBStorePersistenceTypes`, operator YAMLs) +- Updated online store status from `alpha` → `preview` in docs +- All 5 unit tests pass (including Docker-based testcontainers integration test) + +### Key files +- `sdk/python/feast/infra/online_stores/mongodb_online_store/mongodb.py` — main implementation +- `sdk/python/tests/unit/online_store/test_mongodb_online_retrieval.py` — test suite +- `sdk/python/tests/universal/feature_repos/universal/online_store/mongodb.py` — universal test repo config + +### Git history cleanup (this session) +The PR had two merge commits (`632e103a6`, `26ce79b37`) that blocked squash-and-merge. +Resolution: +1. `git fetch --all` +2. Created clean branch `FEAST-OnlineStore-INTPYTHON-297` from `upstream/master` +3. Cherry-picked all 47 commits (oldest → newest), skipping the two merge commits +4. Resolved conflicts: directory rename (`tests/integration/` → `tests/universal/`), + `pixi.lock` auto-resolved, `detect-secrets` false positives got `# pragma: allowlist secret` +5. Force-pushed to `INTPYTHON-297-MongoDB-Feast-Integration` — maintainer squash-merged ✅ + +### Versioning +Version is derived dynamically via `setuptools_scm` from git tags (no hardcoded version). +Latest tag at time of merge: **`v0.60.0`**. Feature ships in the next release after that. +Update JIRA with the next release tag once the maintainers cut it. + +--- + +## Offline Store — IN PROGRESS 🔧 + +### Branch +``` +FEAST-OfflineStore-INTPYTHON-297 +``` + +### Commits on branch (not yet in upstream/master) +``` +cd3eef677 Started work on full Mongo/MQL implementation. Kept MongoDBOfflineStoreIbis and MongoDBOfflineStoreNative +71469f69a feat: restore test-python-universal-mongodb-online Makefile target +904505244 fix: pass onerror to pkgutil.walk_packages +946d84e4c fix: broaden import exception handling in doctest runner +55de0e9b5 fix: catch FeastExtrasDependencyImportError in doctest runner +157a71d77 refactor: improve MongoDB offline store code quality +67632af2f feat: Add MongoDB offline store (ibis-based PIT join, v1 alpha) +``` + +### Key files +- `sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py` + - Contains **two prototype implementations**: + - `MongoDBOfflineStoreIbis` — uses Ibis for point-in-time joins (delegates to `get_historical_features_ibis`) + - `MongoDBOfflineStoreNative` — native MQL implementation (started in `cd3eef677`, in progress) +- `sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py` — `MongoDBSource` data source + +### Architecture: Ibis vs Native +- **Ibis approach**: delegates PIT join to `feast.infra.offline_stores.ibis` helpers. + Pro: less code, consistency with other ibis-backed stores. + Con: requires ibis-mongodb connector; PIT correctness depends on ibis translation. 
+- **Native approach**: implements PIT join directly in MQL (MongoDB aggregation pipeline). + Pro: no extra dependency, full control. + Con: more complex; MQL aggregation pipelines can be verbose. +- Decision pending benchmarking / correctness validation between the two. + +### Next steps for offline store +1. Finish `MongoDBOfflineStoreNative` MQL implementation (started in latest commit) +2. Validate PIT correctness for both implementations against the Feast universal test suite +3. Run: `make test-python-universal-mongodb-offline` (target may need creating — see `71469f69a`) +4. Choose Ibis vs Native based on results; remove the other +5. Add to operator (same pattern as online store: kubebuilder enums, install.yaml) +6. Open PR — follow same DCO + linear history discipline as online store + +--- + +## Environment Notes + +- **Python env**: always use `uv run pytest ...` (uses `.venv` in repo root, Python 3.11) +- **Do NOT use**: system Python (`/Library/Frameworks/Python.framework/...`) or conda envs +- **Docker**: must be running for the testcontainers integration test +- **Stale container**: `72d14b345b6a` (mongo:latest, port 57120) — leftover from testing, safe to stop +- **DCO**: all commits must be signed: `git commit -s` +- **No push/merge without explicit user approval** + +--- + +## Git Workflow Reminder +To keep history clean (lesson from online store PR): +- Always branch from `upstream/master` (after `git fetch --all`) +- Never merge upstream into a feature branch — rebase or cherry-pick instead +- Before opening a PR, verify with: `git log --merges ^upstream/master --oneline` + (must return empty) + diff --git a/design-notes/design-hybrid-with-batches.md b/design-notes/design-hybrid-with-batches.md new file mode 100644 index 0000000000..080986579e --- /dev/null +++ b/design-notes/design-hybrid-with-batches.md @@ -0,0 +1,239 @@ +Native MongoDB Offline Store (Hybrid Design) + +Design Document + +Overview + +This document describes the design of the Native MongoDB Offline Store for Feast using a hybrid execution model. The system combines MongoDB’s strengths in indexed data retrieval with Python’s strengths in relational and temporal joins. + +The implementation uses a single-collection schema in MongoDB to store feature data across all FeatureViews and performs point-in-time (PIT) joins using a “fetch + pandas join” strategy. This replaces an earlier fully in-database $lookup approach that proved unscalable for large workloads. + +The result is a design that is performant, scalable, and aligned with Feast’s semantics. + +⸻ + +Data Model + +All FeatureViews share a single MongoDB collection (feature_history). Each document represents an observation of a FeatureView for a given entity at a specific timestamp. + +Each document contains: + • A serialized entity identifier (entity_id) + • A FeatureView identifier (feature_view) + • A subdocument of feature values (features) + • An event timestamp (event_timestamp) + • An ingestion timestamp (created_at) + +This schema supports: + • Sparse feature storage (not all features present in every document) + • Flexible schema evolution over time + • Efficient indexing across FeatureViews + +A compound index is maintained on: + • (entity_id, feature_view, event_timestamp DESC) + +This index supports efficient filtering by entity, FeatureView, and time range. + +⸻ + +Execution Model + +High-Level Strategy + +The system implements historical feature retrieval in three stages: + 1. 
Preprocessing (Python)
+ • Normalize timestamps to UTC
+ • Serialize entity keys into entity_id
+ • Partition the input entity_df into manageable chunks
+ 2. Data Fetching (MongoDB)
+ • Query MongoDB using $in on entity IDs
+ • Filter by FeatureView and time bounds
+ • Retrieve matching feature documents in batches
+ 3. Point-in-Time Join (Python)
+ • Convert MongoDB results into pandas DataFrames
+ • Perform per-FeatureView joins using merge_asof
+ • Apply TTL constraints and feature selection
+
+This design avoids per-row database joins and instead performs a small number of efficient indexed scans.
+
+⸻
+
+Chunking and Batching
+
+To ensure scalability, the system separates concerns between:
+ • Chunk size (entity_df)
+Controls memory usage in Python
+Default: ~5,000 rows
+ • Batch size (MongoDB queries)
+Controls query size and index efficiency
+Default: ~1,000 entity IDs per query
+
+Each chunk of entity_df is processed independently:
+ • Entity IDs are extracted and deduplicated
+ • Feature data is fetched in batches
+ • Results are joined and accumulated
+
+This ensures:
+ • Bounded memory usage
+ • Predictable query performance
+ • Compatibility with large workloads
+
+⸻
+
+Point-in-Time Join Semantics
+
+For each FeatureView:
+ • Feature data is sorted by event_timestamp (the merge_asof on key)
+ • The entity dataframe is likewise sorted by its event timestamp
+ • A backward merge_asof is performed, with entity_id as the by key
+
+This ensures:
+ • Only feature values with timestamps ≤ entity timestamp are used
+ • The most recent valid feature value is selected
+
+TTL constraints are applied after the join:
+ • If the matched feature timestamp is older than the allowed TTL window, the value is set to NULL
+
+⸻
+
+Key Improvements in Current Design
+
+1. Projection (Reduced Data Transfer)
+
+The system now explicitly limits fields retrieved from MongoDB to only those required:
+ • entity_id
+ • feature_view
+ • event_timestamp
+ • Requested feature fields within features
+
+This reduces:
+ • Network overhead
+ • BSON decoding cost
+ • Memory usage in pandas
+
+This is especially important for wide FeatureViews or large documents.
+
+⸻
+
+2. Bounded Time Filtering
+
+Queries now include both:
+ • An upper bound (<= max_ts)
+ • A lower bound (>= min_ts)
+
+This significantly reduces the amount of historical data scanned when:
+ • The entity dataframe spans a narrow time window
+ • The feature store contains deep history
+
+This optimization improves:
+ • Query latency
+ • Index selectivity
+ • Memory footprint of retrieved data
+
+Future enhancements may incorporate TTL-aware lower bounds.
+
+⸻
+
+3. Correct Sorting for Temporal Joins
+
+The system ensures proper sorting before merge_asof:
+ • Both dataframes are globally sorted by their timestamp column (pandas merge_asof requires the on key to be monotonically increasing; per-entity alignment is handled by the by key, which needs no pre-sorting)
+
+This is critical for correctness when:
+ • Multiple entities are processed in a single batch
+ • Data is interleaved across entities
+
+Without this, merge_asof either rejects the unsorted input or the join silently produces incorrect results (see the sketch below).
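+
+A minimal, self-contained sketch of this join, using illustrative string entity IDs and a single hypothetical conv_rate column (real entity_ids are serialized bytes, and the real implementation operates on documents fetched from feature_history):
+
+```python
+import pandas as pd
+
+# Entity rows: one observation per (entity, timestamp) pair.
+entity_df = pd.DataFrame(
+    {
+        "_entity_id": ["e1", "e2"],
+        "event_timestamp": pd.to_datetime(
+            ["2026-03-01 12:00", "2026-03-01 12:00"], utc=True
+        ),
+    }
+)
+
+# Feature history for one FeatureView, already flattened.
+feature_df = pd.DataFrame(
+    {
+        "_entity_id": ["e1", "e1", "e2"],
+        "_fv_ts": pd.to_datetime(
+            ["2026-03-01 09:00", "2026-03-01 11:00", "2026-02-01 10:00"], utc=True
+        ),
+        "conv_rate": [0.1, 0.2, 0.3],
+    }
+)
+
+# Both frames must be globally sorted by the time key used as the on key.
+entity_df = entity_df.sort_values("event_timestamp")
+feature_df = feature_df.sort_values("_fv_ts")
+
+joined = pd.merge_asof(
+    entity_df,
+    feature_df,
+    left_on="event_timestamp",
+    right_on="_fv_ts",
+    by="_entity_id",
+    direction="backward",  # latest feature row at or before the entity timestamp
+)
+
+# TTL applied after the join: null out matches older than the window.
+ttl = pd.Timedelta(hours=24)
+stale = joined["_fv_ts"] < joined["event_timestamp"] - ttl
+joined.loc[stale, "conv_rate"] = None  # e2's month-old value is dropped
+print(joined.drop(columns=["_fv_ts"]))
+```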
+ +⸻ + +Tradeoffs + +Advantages + • Scalability: Avoids O(n × m) behavior of correlated joins + • Flexibility: Supports sparse and evolving schemas + • Performance: Leverages MongoDB indexes efficiently + • Simplicity: Uses well-understood pandas join semantics + +Limitations + • Memory-bound joins: Requires chunking for large workloads + • Multiple passes: Each FeatureView requires a separate join + • No server-side joins: MongoDB is used only for filtering, not relational logic + +⸻ + +Comparison to Alternative Designs + +Full MongoDB Join ($lookup) + +Rejected due to: + • Poor scaling with large entity sets + • Repeated execution of correlated subqueries + • High latency (orders of magnitude slower) + +⸻ + +Ibis-Based Design + • Uses one collection per FeatureView + • Loads data into memory and performs joins in Python + +Comparison: + • Similar performance after hybrid redesign + • Simpler query model + • Less flexible schema + +The Native design trades simplicity for: + • Unified storage + • Better alignment with document-based ingestion + • More flexible feature evolution + +⸻ + +Operational Considerations + +Index Management + +Indexes are created lazily at runtime: + • Ensures correctness without manual setup + • Avoids placing responsibility on users + +Future improvements may include: + • Optional strict index validation + • Configuration-driven index management + +⸻ + +MongoDB Client Usage + +Each chunk currently uses a separate MongoDB client instance. + +This is acceptable for moderate workloads but may be optimized in the future by: + • Reusing a shared client per retrieval job + • Leveraging connection pooling more explicitly + +⸻ + +Future Work + +Several enhancements are possible: + 1. Streaming Joins + • Avoid materializing all feature data in memory + • Process data incrementally + 2. Adaptive Chunking + • Dynamically adjust chunk size based on memory pressure + 3. TTL Pushdown + • Incorporate TTL constraints into MongoDB queries + 4. Parallel Execution + • Process chunks concurrently for large workloads + +⸻ + +Conclusion + +The hybrid MongoDB + pandas design represents a significant improvement over the initial fully in-database approach. It aligns system responsibilities with the strengths of each component: + • MongoDB handles indexed filtering and retrieval + • Python handles temporal join logic + +With the addition of projection, bounded time filtering, and correct sorting, the system is now both performant and correct for large-scale historical feature retrieval. + +This design provides a strong foundation for further optimization and production use. + diff --git a/design-notes/native_implementation_notes.md b/design-notes/native_implementation_notes.md new file mode 100644 index 0000000000..891751e56c --- /dev/null +++ b/design-notes/native_implementation_notes.md @@ -0,0 +1,191 @@ +# Native MongoDB Offline Store Implementation Review + +## Overview + +This document reviews the native MongoDB offline store implementation (`mongodb_native.py`) in the context of Feast idioms, the MongoDB online store implementation, and best practices. 
+ +--- + +## Schema Alignment: Online ↔ Offline + +### Online Store Schema (mongodb_online_store/mongodb.py) +```javascript +{ + "_id": bytes, // serialized entity key + "features": { + "": { + "": value + } + }, + "event_timestamps": { "": datetime }, + "created_timestamp": datetime +} +``` + +### Offline Store Schema (Native) +```javascript +{ + "_id": ObjectId(), + "entity_id": bytes, // serialized entity key (same format as online _id) + "feature_view": "driver_stats", // discriminator + "features": { "": value }, + "event_timestamp": datetime, + "created_at": datetime +} +``` + +### ✅ Alignment Strengths +1. **Entity key serialization**: Both use `serialize_entity_key()` from `key_encoding_utils.py` +2. **Nested features**: Both use `features: { ... }` subdocument pattern +3. **Timestamps**: Both track event and created timestamps + +### ⚠️ Alignment Concerns +1. **`_id` usage**: Online uses `_id` = entity_id; Offline uses `_id` = ObjectId() with separate `entity_id` field + - **Recommendation**: Consider using `_id` = `{entity_id, feature_view, event_timestamp}` compound key for offline, eliminating ObjectId overhead + +2. **Feature nesting depth**: Online nests by feature_view then feature; Offline nests only by feature (feature_view is top-level) + - This is intentional (offline is one doc per event; online is one doc per entity with all FVs) + +--- + +## Feast Idioms Compliance + +### ✅ Correctly Followed +1. **RetrievalJob pattern**: Returns `MongoDBNativeRetrievalJob` wrapping a `query_fn` closure +2. **Arrow output**: `_to_arrow_internal()` returns `pyarrow.Table` (hard requirement) +3. **Warnings for preview**: Uses `warnings.warn()` with `RuntimeWarning` +4. **Config inheritance**: `MongoDBOfflineStoreNativeConfig` extends `FeastConfigBaseModel` +5. **DataSource pattern**: `MongoDBSourceNative` extends `DataSource` with `from_proto`/`_to_proto_impl` + +### ⚠️ Missing or Incomplete +1. **`offline_write_batch`**: Not implemented (raises `NotImplementedError` in persist) + - Required for push sources and `feast materialize` reverse path + - Should accept `pyarrow.Table` and insert into `feature_history` collection + +2. **`write_logged_features`**: Not implemented + - Lower priority but needed for feature logging + +3. **`persist()` on RetrievalJob**: Not implemented + - Should write results to a new collection for saved datasets + +--- + +## MQL Pipeline Quality + +### ✅ Well Implemented +1. **`pull_all_from_table_or_query`**: Clean range scan with `$project` flattening features server-side +2. **`pull_latest_from_table_or_query`**: Proper `$sort` → `$group` → `$project` pattern +3. **`get_historical_features`**: Uses `$lookup` with correlated subpipeline for server-side PIT join +4. **Per-FV TTL via `$switch`**: Elegant solution for different TTLs per feature view + +### ⚠️ Potential Improvements +1. **Index usage in `$lookup`**: The `$expr` in `$match` may not use indexes efficiently + - MongoDB 5.0+ has better support for `$expr` index usage + - Consider adding `hint` option if performance is critical + +2. **Temp collection cleanup**: Currently uses `try/finally` but could benefit from context manager pattern + +3. **Connection pooling**: Each method creates a new `MongoClient`. 
The online store caches `_client` and `_collection` + - **Recommendation**: Add `_client` caching to the offline store class or use connection pooling + +--- + +## Comparison with Online Store Patterns + +| Aspect | Online Store | Offline Store (Native) | +|--------|--------------|------------------------| +| Client caching | `_client`, `_collection` instance vars | New client per operation | +| Async support | Yes (`AsyncMongoClient`) | No | +| Batch operations | `bulk_write` with `UpdateOne` | `insert_many` | +| Error handling | Raises `RuntimeError` for config mismatch | Raises `ValueError` | +| DriverInfo | ✅ Yes | ✅ Yes | + +### Recommendations +1. **Add client caching** to avoid connection overhead per query +2. **Consider async support** for large entity_df scenarios +3. **Standardize error types** (use `RuntimeError` or `FeastError` subclasses) + +--- + +## Missing Features for Production Readiness + +### High Priority +1. **`offline_write_batch`**: Insert Arrow table into feature_history + ```python + @staticmethod + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + table: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): + # Convert Arrow → docs with schema: + # { entity_id, feature_view, features: {...}, event_timestamp, created_at } + # Then insert_many() + ``` + +2. **Index creation helper**: Document or auto-create the compound index + ```javascript + db.feature_history.createIndex({ + entity_id: 1, + feature_view: 1, + event_timestamp: -1 + }) + ``` + +3. **Connection pooling / client reuse** + +### Medium Priority +4. **`persist()` for saved datasets**: Write retrieval results to a collection +5. **`write_logged_features`**: For feature logging support +6. **Async operations**: Mirror online store's async pattern + +### Lower Priority +7. **Streaming cursor support**: For very large result sets +8. **Explain plan logging**: Debug mode to show MQL execution plan + +--- + +## Code Quality Observations + +### ✅ Good +- Clear docstrings explaining schema and index requirements +- Type hints throughout +- Helper functions extracted (`_ttl_to_ms`, `_build_ttl_gte_expr`, `_serialize_entity_key_from_row`) +- Proper cleanup of temp collections in `finally` block + +### ⚠️ Could Improve +- Some duplication in timestamp timezone handling (could extract helper) +- Magic strings like `"event_timestamp"`, `"created_at"` could be constants +- The `_run()` closures are large — consider extracting to separate methods + +--- + +## Test Coverage Assessment + +Current tests cover: +- ✅ `pull_latest_from_table_or_query` +- ✅ `pull_all_from_table_or_query` +- ✅ `get_historical_features` (PIT join) +- ✅ TTL filtering +- ✅ Multiple feature views +- ✅ Compound join keys + +Missing tests: +- ❌ `offline_write_batch` (not implemented) +- ❌ Empty result handling edge cases +- ❌ Very large entity_df (performance/memory) +- ❌ Concurrent access to temp collections +- ❌ Index usage verification (explain plans) + +--- + +## Summary + +The native implementation is a solid foundation with proper use of MQL aggregation pipelines. Key next steps: + +1. **Implement `offline_write_batch`** — Required for push sources +2. **Add client caching** — Match online store pattern +3. **Document/automate index creation** — Critical for performance +4. 
**Consider `_id` schema optimization** — Use compound `_id` instead of ObjectId + entity_id + diff --git a/design-notes/offline_store_design.md b/design-notes/offline_store_design.md new file mode 100644 index 0000000000..fbe7120a3c --- /dev/null +++ b/design-notes/offline_store_design.md @@ -0,0 +1,98 @@ +# Corrected MongoDB OfflineStore Design + +## What the interface actually requires + +`RetrievalJob._to_arrow_internal` must return a `pyarrow.Table`. This is non-negotiable +because the compute engines call `retrieval_job.to_arrow()` directly: + +```python +# sdk/python/feast/infra/compute_engines/local/nodes.py +retrieval_job = create_offline_store_retrieval_job(...) +arrow_table = retrieval_job.to_arrow() # ← hard requirement +``` + +The compute engine then converts Arrow → proto tuples itself before calling +`OnlineStore.online_write_batch(data: List[Tuple[EntityKeyProto, ...]])`. +The offline store never sees the proto tuple format. + +`OfflineStore.offline_write_batch` (the push-source write path) takes a `pyarrow.Table` +— so Arrow is also the *input* format for writes. + +## The right approach — native aggregation, then Arrow + +The Couchbase offline store is the correct reference. It: +1. Expresses computation natively in the database (SQL++ window functions). +2. Iterates the cursor in Python. +3. Converts directly: `pa.Table.from_pylist(processed_rows)` — **no pandas intermediate**. + +MongoDB should follow the same pattern using its aggregation pipeline. + +## pull_latest_from_table_or_query + +The `$group` + `$sort` aggregation is the natural MongoDB equivalent of +`ROW_NUMBER() OVER(PARTITION BY entity ORDER BY timestamp DESC) = 1`: + +```python +pipeline = [ + {"$match": { + timestamp_field: {"$gte": start_date, "$lte": end_date} + }}, + {"$sort": {timestamp_field: -1, created_timestamp_column: -1}}, + {"$group": { + "_id": {k: f"${k}" for k in join_key_columns}, + **{f: {"$first": f"${f}"} for f in feature_name_columns}, + timestamp_field: {"$first": f"${timestamp_field}"}, + }}, +] +# cursor → pa.Table.from_pylist([doc for doc in collection.aggregate(pipeline)]) +``` + +No pandas. No Feast join utilities. The database does the work. + +## get_historical_features + +This is harder. The point-in-time join requires: for each (entity, entity_timestamp) row, +find the feature row with the latest `event_timestamp <= entity_timestamp`. + +MongoDB has no SQL window functions, but the aggregation pipeline can express this: + +``` +For each feature view: + $match: entity_ids in entity_df AND event_timestamp <= max(entity_timestamps) + $sort: entity_id, event_timestamp DESC + $lookup or unwind against entity_df rows + $match: event_timestamp <= entity_row.entity_timestamp (and TTL if set) + $group by (entity_id, entity_row_id): $first of features +``` + +This is complex but keeps computation in MongoDB and avoids loading the full history +into Python memory. The result cursor is then converted via `pa.Table.from_pylist()`. + +For an initial implementation it is acceptable to pull the filtered documents into +memory and do the join in Python (like the Dask store) — but this should be noted +as a known limitation, not the target design. + +## offline_write_batch + +Receives a `pyarrow.Table` from Feast (push-source path). Convert with +`table.to_pylist()` and `insert_many()` into the collection. 
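+
+A minimal sketch of that write path, assuming the `MongoDBSource` attributes (`database`, `collection`) and offline-store config fields (`connection_string`, `database`) used elsewhere in these notes; type validation and error handling are elided:
+
+```python
+from typing import Any, Callable, Optional
+
+import pyarrow
+from pymongo import MongoClient
+
+
+def offline_write_batch(
+    config,
+    feature_view,
+    table: pyarrow.Table,
+    progress: Optional[Callable[[int], Any]] = None,
+) -> None:
+    source = feature_view.batch_source  # expected to be a MongoDBSource
+    db_name = source.database or config.offline_store.database
+    client = MongoClient(config.offline_store.connection_string)
+    try:
+        records = table.to_pylist()  # Arrow rows -> plain dicts, one per document
+        if records:
+            client[db_name][source.collection].insert_many(records)
+        if progress:
+            progress(len(records))
+    finally:
+        client.close()
+```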
+
+## What changes from the previous design
+
+| Previous (incorrect)                         | Corrected                                   |
+|---------------------------------------------|---------------------------------------------|
+| Pull docs into pandas, use offline_utils    | Use MongoDB aggregation pipeline            |
+| pandas is the intermediate format           | MongoDB cursor → `pa.Table.from_pylist()`   |
+| Arrow is an afterthought                    | Arrow is the required output of the job     |
+| Claimed online_write_batch takes Arrow      | It takes proto tuples; compute engine converts |
+
+## Implementation order (unchanged)
+
+1. `MongoDBSource` — DataSource subclass (connection_string, database, collection, timestamp_field).
+2. `MongoDBOfflineStoreConfig` — pydantic config.
+3. `MongoDBRetrievalJob` — wraps aggregation pipeline, implements `_to_arrow_internal`.
+4. `offline_write_batch` — `pyarrow.Table` → `insert_many`.
+5. `pull_latest_from_table_or_query` — `$sort` + `$group` aggregation.
+6. `pull_all_from_table_or_query` — `$match` time-range scan.
+7. `get_historical_features` — aggregation pipeline PIT join (or in-memory fallback).
+
diff --git a/design-notes/prompt-mdb-fetch-pandas-join-with-batches.md b/design-notes/prompt-mdb-fetch-pandas-join-with-batches.md
new file mode 100644
index 0000000000..9bd8fb437c
--- /dev/null
+++ b/design-notes/prompt-mdb-fetch-pandas-join-with-batches.md
@@ -0,0 +1,108 @@
+Enhance MongoDBOfflineStoreNative.get_historical_features to support chunked execution for large entity_df, while preserving the existing fetch + pandas PIT join logic.
+
+Goals
+ • Prevent memory blowups for large entity_df
+ • Reuse the current implementation as much as possible
+ • Keep the code clean and idiomatic to Feast
+
+⸻
+
+Requirements
+
+1. Add chunking based on entity_df size
+ • Introduce a constant:
+``` python
+CHUNK_SIZE = 5000 # make this configurable
+```
+ • If len(entity_df) <= CHUNK_SIZE:
+ • Run the existing _run() logic unchanged
+ • Else:
+ • Split entity_df into chunks of size CHUNK_SIZE
+
+⸻
+
+2. Extract existing logic into reusable function
+Refactor the current _run() implementation into a helper:
+``` python
+def _run_single(entity_subset_df: pd.DataFrame) -> pd.DataFrame:
+    ...
+```
+This function should:
+ • Perform:
+ • entity_id serialization
+ • MongoDB fetch ($in query)
+ • pandas normalization
+ • per-feature-view merge_asof
+ • Return a pandas DataFrame (not Arrow)
+3. Implement chunked execution
+In _run():
+``` python
+if len(entity_df) <= CHUNK_SIZE:
+    df = _run_single(entity_df)
+else:
+    dfs = []
+    for chunk in chunk_dataframe(entity_df, CHUNK_SIZE):
+        dfs.append(_run_single(chunk))
+    df = pd.concat(dfs, ignore_index=True)
+```
+4. Implement chunk helper
+Add:
+```
+def chunk_dataframe(df: pd.DataFrame, size: int):
+    for i in range(0, len(df), size):
+        yield df.iloc[i:i+size]
+```
+5. Preserve ordering
+ • Ensure final DataFrame preserves original row order
+ • Use a _row_idx column if necessary
+6. Handle edge cases
+Ensure the chunked version correctly handles:
+ • Empty MongoDB results
+ • Missing feature_views
+ • Missing features inside documents
+ • TTL filtering (already implemented in pandas)
+
+⸻
+
+7. 
Return Arrow table +Final _run() must still return: +``` +pyarrow.Table.from_pandas(df, preserve_index=False) +``` +Constraints + • Do NOT reintroduce $lookup + • Do NOT use temp collections + • Do NOT duplicate large blocks of logic + • Keep code readable and maintainable + +⸻ + +Optional (nice-to-have) + • Add logging or debug print: + • number of chunks processed + • rows per chunk + +⸻ + +Outcome + • Small workloads behave exactly as before + • Large workloads are processed safely in chunks + • Performance remains close to Ibis for moderate sizes + • Memory usage is bounded + +⸻ + +🧠 Why this design is the right one + +This keeps your system: + +✅ Fast + • still uses vectorized joins + +✅ Scalable + • bounded memory + +✅ Clean + • no duplication + • no branching chaos + diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py index 10ffd6c533..241e69cbb4 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import warnings from datetime import datetime -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast import ibis import pandas as pd @@ -28,14 +29,12 @@ from feast.data_source import DataSource from feast.errors import ( + DataSourceNoNameException, FeastExtrasDependencyImportError, SavedDatasetLocationAlreadyExists, ) from feast.feature_view import FeatureView from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( - MongoDBSource, -) from feast.infra.offline_stores.ibis import ( get_historical_features_ibis, pull_all_from_table_or_query_ibis, @@ -46,7 +45,278 @@ RetrievalJob, ) from feast.infra.registry.base_registry import BaseRegistry +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import mongodb_to_feast_value_type +from feast.value_type import ValueType + +# --------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------- + + +def _infer_python_type_str(value: Any) -> Optional[str]: + """Infer a Feast-compatible type string from a Python value returned by pymongo.""" + if value is None: + return None + if isinstance(value, bool): + return "bool" + if isinstance(value, int): + return "int" + if isinstance(value, float): + return "float" + if isinstance(value, str): + return "str" + if isinstance(value, bytes): + return "bytes" + if isinstance(value, datetime): + return "datetime" + if isinstance(value, list): + if not value: + return "list[str]" + elem_type = _infer_python_type_str(value[0]) + if elem_type: + return f"list[{elem_type}]" + return "list[str]" + return None + + +# --------------------------------------------------------------------------- +# MongoDBSource and related classes (collection-per-FeatureView schema) +# 
--------------------------------------------------------------------------- + + +class MongoDBOptions: + """Options for a MongoDB data source (database + collection).""" + + def __init__(self, database: str, collection: str): + self._database = database + self._collection = collection + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + """Serialize database and collection names as JSON into a CustomSourceOptions proto.""" + return DataSourceProto.CustomSourceOptions( + configuration=json.dumps( + {"database": self._database, "collection": self._collection} + ).encode() + ) + + @classmethod + def from_proto( + cls, options_proto: DataSourceProto.CustomSourceOptions + ) -> "MongoDBOptions": + """Deserialize a CustomSourceOptions proto back into a MongoDBOptions instance.""" + config = json.loads(options_proto.configuration.decode("utf8")) + return cls(database=config["database"], collection=config["collection"]) + + +class MongoDBSource(DataSource): + """A MongoDB collection used as a Feast offline data source. + + ``name`` is the logical Feast name for this source. If omitted, it defaults + to the value of ``collection``. At least one of ``name`` or ``collection`` + must be supplied. + + ``database`` is the MongoDB database that contains the collection. When + omitted it falls back to ``MongoDBOfflineStoreConfig.database`` at query + time, so a single store-level default can be shared across many sources. + + ``schema_sample_size`` controls how many documents are randomly sampled + when Feast infers the collection schema (used by ``feast apply`` and + ``get_table_column_names_and_types``). Increase it for collections with + highly variable document shapes; decrease it to speed up ``feast apply`` + at the cost of schema coverage. + """ + + def source_type(self) -> DataSourceProto.SourceType.ValueType: + return DataSourceProto.CUSTOM_SOURCE + + def __init__( + self, + name: Optional[str] = None, + database: Optional[str] = None, + collection: Optional[str] = None, + timestamp_field: Optional[str] = "", + created_timestamp_column: Optional[str] = "", + field_mapping: Optional[Dict[str, str]] = None, + description: Optional[str] = "", + tags: Optional[Dict[str, str]] = None, + owner: Optional[str] = "", + schema_sample_size: int = 100, + ): + if name is None and collection is None: + raise DataSourceNoNameException() + # At least one of name / collection is non-None; cast to satisfy the type checker. + name = cast(str, name or collection) + + self._mongodb_options = MongoDBOptions( + database=database or "", + collection=collection or name, + ) + self._schema_sample_size = schema_sample_size + + super().__init__( + name=name, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + description=description, + tags=tags, + owner=owner, + ) + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, MongoDBSource): + raise TypeError( + "Comparisons should only involve MongoDBSource class objects." 
+ ) + return ( + super().__eq__(other) + and self._mongodb_options._database == other._mongodb_options._database + and self._mongodb_options._collection == other._mongodb_options._collection + and self.timestamp_field == other.timestamp_field + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + @property + def database(self) -> str: + return self._mongodb_options._database + + @property + def collection(self) -> str: + return self._mongodb_options._collection + + @staticmethod + def from_proto(data_source: DataSourceProto) -> "MongoDBSource": + assert data_source.HasField("custom_options") + options = json.loads(data_source.custom_options.configuration) + return MongoDBSource( + name=data_source.name, + database=options["database"], + collection=options["collection"], + field_mapping=dict(data_source.field_mapping), + timestamp_field=data_source.timestamp_field, + created_timestamp_column=data_source.created_timestamp_column, + description=data_source.description, + tags=dict(data_source.tags), + owner=data_source.owner, + ) + + def _to_proto_impl(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + name=self.name, + type=DataSourceProto.CUSTOM_SOURCE, + data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBSource", + field_mapping=self.field_mapping, + custom_options=self._mongodb_options.to_proto(), + description=self.description, + tags=self.tags, + owner=self.owner, + ) + data_source_proto.timestamp_field = self.timestamp_field + data_source_proto.created_timestamp_column = self.created_timestamp_column + return data_source_proto + + def validate(self, config: RepoConfig): + # No upfront schema validation is required for MongoDB; the connection + # is exercised lazily when features are actually retrieved. + pass + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return mongodb_to_feast_value_type + + def get_table_query_string(self) -> str: + return f"{self._mongodb_options._database}.{self._mongodb_options._collection}" + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + """Sample documents from the collection to infer field names and their Feast type strings. + + Uses ``$sample`` to fetch up to ``schema_sample_size`` documents, then + picks the most-frequent Python type observed per field. The ``_id`` + field is always excluded. + """ + if MongoClient is None: + raise FeastExtrasDependencyImportError( + "mongodb", "pymongo is not installed." 
+ ) + connection_string = config.offline_store.connection_string + db_name = self.database or config.offline_store.database + client: Any = MongoClient(connection_string, tz_aware=True) + try: + docs = list( + client[db_name][self.collection].aggregate( + [{"$sample": {"size": self._schema_sample_size}}] + ) + ) + finally: + client.close() + + field_type_counts: Dict[str, Dict[str, int]] = {} + for doc in docs: + for field, value in doc.items(): + if field == "_id": + continue + type_str = _infer_python_type_str(value) + if type_str is None: + continue + field_type_counts.setdefault(field, {}) + field_type_counts[field][type_str] = ( + field_type_counts[field].get(type_str, 0) + 1 + ) + + return [ + (field, max(counts, key=lambda t: counts[t])) + for field, counts in field_type_counts.items() + ] + + +class SavedDatasetMongoDBStorage(SavedDatasetStorage): + """Persists a Feast SavedDataset into a MongoDB collection.""" + + _proto_attr_name = "custom_storage" + + mongodb_options: MongoDBOptions + + def __init__(self, database: str, collection: str): + self.mongodb_options = MongoDBOptions( + database=database, + collection=collection, + ) + + @staticmethod + def from_proto( + storage_proto: SavedDatasetStorageProto, + ) -> "SavedDatasetMongoDBStorage": + options = json.loads(storage_proto.custom_storage.configuration) + return SavedDatasetMongoDBStorage( + database=options["database"], + collection=options["collection"], + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto(custom_storage=self.mongodb_options.to_proto()) + + def to_data_source(self) -> DataSource: + return MongoDBSource( + database=self.mongodb_options._database, + collection=self.mongodb_options._collection, + ) + + +# --------------------------------------------------------------------------- +# Offline store configuration and implementation +# --------------------------------------------------------------------------- class MongoDBOfflineStoreIbisConfig(FeastConfigBaseModel): diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py deleted file mode 100644 index ee55fe24e6..0000000000 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_source.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2026 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
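The `$sample`-based schema inference above is easier to follow in isolation. A minimal, self-contained sketch of the same sampling-and-vote idea (database, collection, and field names are illustrative; the real helper maps values to Feast type strings via `_infer_python_type_str` rather than raw Python type names):

```python
from collections import Counter
from typing import Dict

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017", tz_aware=True)
try:
    # $sample draws up to N random documents; a small N keeps `feast apply` fast.
    docs = list(
        client["feast"]["driver_stats"].aggregate([{"$sample": {"size": 100}}])
    )
finally:
    client.close()

# Count the Python types observed per field; the most frequent type wins.
type_counts: Dict[str, Counter] = {}
for doc in docs:
    for field, value in doc.items():
        if field == "_id":  # _id is always excluded from the inferred schema
            continue
        type_counts.setdefault(field, Counter())[type(value).__name__] += 1

schema = {field: counts.most_common(1)[0][0] for field, counts in type_counts.items()}
# e.g. {"driver_id": "int", "conv_rate": "float", "event_timestamp": "datetime"}
```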
- -import json -from datetime import datetime -from typing import Any, Callable, Dict, Iterable, Optional, Tuple, cast - -try: - from pymongo import MongoClient -except ImportError: - MongoClient = None # type: ignore[assignment,misc] - -from feast.data_source import DataSource -from feast.errors import DataSourceNoNameException, FeastExtrasDependencyImportError -from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto -from feast.protos.feast.core.SavedDataset_pb2 import ( - SavedDatasetStorage as SavedDatasetStorageProto, -) -from feast.repo_config import RepoConfig -from feast.saved_dataset import SavedDatasetStorage -from feast.type_map import mongodb_to_feast_value_type -from feast.value_type import ValueType - - -def _infer_python_type_str(value: Any) -> Optional[str]: - """Infer a Feast-compatible type string from a Python value returned by pymongo.""" - if value is None: - return None - if isinstance(value, bool): - return "bool" - if isinstance(value, int): - return "int" - if isinstance(value, float): - return "float" - if isinstance(value, str): - return "str" - if isinstance(value, bytes): - return "bytes" - if isinstance(value, datetime): - return "datetime" - if isinstance(value, list): - if not value: - return "list[str]" - elem_type = _infer_python_type_str(value[0]) - if elem_type: - return f"list[{elem_type}]" - return "list[str]" - return None - - -class MongoDBOptions: - """Options for a MongoDB data source (database + collection).""" - - def __init__(self, database: str, collection: str): - self._database = database - self._collection = collection - - def to_proto(self) -> DataSourceProto.CustomSourceOptions: - """Serialize database and collection names as JSON into a CustomSourceOptions proto.""" - return DataSourceProto.CustomSourceOptions( - configuration=json.dumps( - {"database": self._database, "collection": self._collection} - ).encode() - ) - - @classmethod - def from_proto( - cls, options_proto: DataSourceProto.CustomSourceOptions - ) -> "MongoDBOptions": - """Deserialize a CustomSourceOptions proto back into a MongoDBOptions instance.""" - config = json.loads(options_proto.configuration.decode("utf8")) - return cls(database=config["database"], collection=config["collection"]) - - -class MongoDBSource(DataSource): - """A MongoDB collection used as a Feast offline data source. - - ``name`` is the logical Feast name for this source. If omitted, it defaults - to the value of ``collection``. At least one of ``name`` or ``collection`` - must be supplied. - - ``database`` is the MongoDB database that contains the collection. When - omitted it falls back to ``MongoDBOfflineStoreConfig.database`` at query - time, so a single store-level default can be shared across many sources. - - ``schema_sample_size`` controls how many documents are randomly sampled - when Feast infers the collection schema (used by ``feast apply`` and - ``get_table_column_names_and_types``). Increase it for collections with - highly variable document shapes; decrease it to speed up ``feast apply`` - at the cost of schema coverage. 
- """ - - def source_type(self) -> DataSourceProto.SourceType.ValueType: - return DataSourceProto.CUSTOM_SOURCE - - def __init__( - self, - name: Optional[str] = None, - database: Optional[str] = None, - collection: Optional[str] = None, - timestamp_field: Optional[str] = "", - created_timestamp_column: Optional[str] = "", - field_mapping: Optional[Dict[str, str]] = None, - description: Optional[str] = "", - tags: Optional[Dict[str, str]] = None, - owner: Optional[str] = "", - schema_sample_size: int = 100, - ): - if name is None and collection is None: - raise DataSourceNoNameException() - # At least one of name / collection is non-None; cast to satisfy the type checker. - name = cast(str, name or collection) - - self._mongodb_options = MongoDBOptions( - database=database or "", - collection=collection or name, - ) - self._schema_sample_size = schema_sample_size - - super().__init__( - name=name, - timestamp_field=timestamp_field, - created_timestamp_column=created_timestamp_column, - field_mapping=field_mapping, - description=description, - tags=tags, - owner=owner, - ) - - def __hash__(self): - return super().__hash__() - - def __eq__(self, other): - if not isinstance(other, MongoDBSource): - raise TypeError( - "Comparisons should only involve MongoDBSource class objects." - ) - return ( - super().__eq__(other) - and self._mongodb_options._database == other._mongodb_options._database - and self._mongodb_options._collection == other._mongodb_options._collection - and self.timestamp_field == other.timestamp_field - and self.created_timestamp_column == other.created_timestamp_column - and self.field_mapping == other.field_mapping - ) - - @property - def database(self) -> str: - return self._mongodb_options._database - - @property - def collection(self) -> str: - return self._mongodb_options._collection - - @staticmethod - def from_proto(data_source: DataSourceProto) -> "MongoDBSource": - assert data_source.HasField("custom_options") - options = json.loads(data_source.custom_options.configuration) - return MongoDBSource( - name=data_source.name, - database=options["database"], - collection=options["collection"], - field_mapping=dict(data_source.field_mapping), - timestamp_field=data_source.timestamp_field, - created_timestamp_column=data_source.created_timestamp_column, - description=data_source.description, - tags=dict(data_source.tags), - owner=data_source.owner, - ) - - def _to_proto_impl(self) -> DataSourceProto: - data_source_proto = DataSourceProto( - name=self.name, - type=DataSourceProto.CUSTOM_SOURCE, - data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source.MongoDBSource", - field_mapping=self.field_mapping, - custom_options=self._mongodb_options.to_proto(), - description=self.description, - tags=self.tags, - owner=self.owner, - ) - data_source_proto.timestamp_field = self.timestamp_field - data_source_proto.created_timestamp_column = self.created_timestamp_column - return data_source_proto - - def validate(self, config: RepoConfig): - # No upfront schema validation is required for MongoDB; the connection - # is exercised lazily when features are actually retrieved. 
- pass - - @staticmethod - def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: - return mongodb_to_feast_value_type - - def get_table_query_string(self) -> str: - return f"{self._mongodb_options._database}.{self._mongodb_options._collection}" - - def get_table_column_names_and_types( - self, config: RepoConfig - ) -> Iterable[Tuple[str, str]]: - """Sample documents from the collection to infer field names and their Feast type strings. - - Uses ``$sample`` to fetch up to ``schema_sample_size`` documents, then - picks the most-frequent Python type observed per field. The ``_id`` - field is always excluded. - """ - if MongoClient is None: - raise FeastExtrasDependencyImportError( - "mongodb", "pymongo is not installed." - ) - connection_string = config.offline_store.connection_string - db_name = self.database or config.offline_store.database - client: Any = MongoClient(connection_string, tz_aware=True) - try: - docs = list( - client[db_name][self.collection].aggregate( - [{"$sample": {"size": self._schema_sample_size}}] - ) - ) - finally: - client.close() - - field_type_counts: Dict[str, Dict[str, int]] = {} - for doc in docs: - for field, value in doc.items(): - if field == "_id": - continue - type_str = _infer_python_type_str(value) - if type_str is None: - continue - field_type_counts.setdefault(field, {}) - field_type_counts[field][type_str] = ( - field_type_counts[field].get(type_str, 0) + 1 - ) - - return [ - (field, max(counts, key=lambda t: counts[t])) - for field, counts in field_type_counts.items() - ] - - -class SavedDatasetMongoDBStorage(SavedDatasetStorage): - """Persists a Feast SavedDataset into a MongoDB collection.""" - - _proto_attr_name = "custom_storage" - - mongodb_options: MongoDBOptions - - def __init__(self, database: str, collection: str): - self.mongodb_options = MongoDBOptions( - database=database, - collection=collection, - ) - - @staticmethod - def from_proto( - storage_proto: SavedDatasetStorageProto, - ) -> "SavedDatasetMongoDBStorage": - options = json.loads(storage_proto.custom_storage.configuration) - return SavedDatasetMongoDBStorage( - database=options["database"], - collection=options["collection"], - ) - - def to_proto(self) -> SavedDatasetStorageProto: - return SavedDatasetStorageProto(custom_storage=self.mongodb_options.to_proto()) - - def to_data_source(self) -> DataSource: - return MongoDBSource( - database=self.mongodb_options._database, - collection=self.mongodb_options._collection, - ) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py index 177023dd6f..27c6a6a35a 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py @@ -37,15 +37,13 @@ from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( MongoDBOfflineStoreIbis, MongoDBOfflineStoreIbisConfig, + MongoDBSource, ) from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native import ( MongoDBOfflineStoreNative, MongoDBOfflineStoreNativeConfig, MongoDBSourceNative, ) -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( - MongoDBSource, -) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import RepoConfig diff --git 
a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py index 225d18d3e9..3acd93c288 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py @@ -22,8 +22,6 @@ from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( MongoDBOfflineStoreIbis, MongoDBOfflineStoreIbisConfig, -) -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_source import ( MongoDBSource, ) from feast.repo_config import RepoConfig From 9bd0c1a36a9411df07bda47c799495cc0b471f64 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Mar 2026 14:27:34 -0400 Subject: [PATCH 19/30] Rename mongodb_offline_store to mongodb, use One/Many naming convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename module: mongodb_offline_store/ → mongodb/ - Rename files: mongodb.py → mongodb_many.py, mongodb_native.py → mongodb_one.py Class renames: - MongoDBSource → MongoDBSourceMany - MongoDBOptions → MongoDBOptionsMany - SavedDatasetMongoDBStorage → SavedDatasetMongoDBStorageMany - MongoDBOfflineStoreIbis → MongoDBOfflineStoreMany - MongoDBOfflineStoreIbisConfig → MongoDBOfflineStoreManyConfig - MongoDBSourceNative → MongoDBSourceOne - MongoDBOfflineStoreNative → MongoDBOfflineStoreOne - MongoDBOfflineStoreNativeConfig → MongoDBOfflineStoreOneConfig - MongoDBNativeRetrievalJob → MongoDBOneRetrievalJob The One/Many naming reflects the core architectural difference: - One: Single shared collection for all FeatureViews - Many: One collection per FeatureView Signed-off-by: Casey Clements --- .../__init__.py | 0 .../mongodb.py => mongodb/mongodb_many.py} | 72 ++++++++++--------- .../mongodb_one.py} | 56 ++++++++------- .../benchmark_mongodb_offline_stores.py | 40 +++++------ .../contrib/test_mongodb_offline_retrieval.py | 46 ++++++------ .../test_mongodb_offline_retrieval_native.py | 44 ++++++------ 6 files changed, 132 insertions(+), 126 deletions(-) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb_offline_store => mongodb}/__init__.py (100%) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb_offline_store/mongodb.py => mongodb/mongodb_many.py} (89%) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb_offline_store/mongodb_native.py => mongodb/mongodb_one.py} (94%) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb/__init__.py similarity index 100% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb/__init__.py diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py similarity index 89% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py index 241e69cbb4..7dac38af02 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py @@ -34,7 +34,7 @@ SavedDatasetLocationAlreadyExists, ) from feast.feature_view import FeatureView -from 
feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA +from feast.infra.offline_stores.contrib.mongodb import DRIVER_METADATA from feast.infra.offline_stores.ibis import ( get_historical_features_ibis, pull_all_from_table_or_query_ibis, @@ -86,11 +86,11 @@ def _infer_python_type_str(value: Any) -> Optional[str]: # --------------------------------------------------------------------------- -# MongoDBSource and related classes (collection-per-FeatureView schema) +# MongoDBSourceMany and related classes (one collection per FeatureView) # --------------------------------------------------------------------------- -class MongoDBOptions: +class MongoDBOptionsMany: """Options for a MongoDB data source (database + collection).""" def __init__(self, database: str, collection: str): @@ -108,21 +108,21 @@ def to_proto(self) -> DataSourceProto.CustomSourceOptions: @classmethod def from_proto( cls, options_proto: DataSourceProto.CustomSourceOptions - ) -> "MongoDBOptions": - """Deserialize a CustomSourceOptions proto back into a MongoDBOptions instance.""" + ) -> "MongoDBOptionsMany": + """Deserialize a CustomSourceOptions proto back into a MongoDBOptionsMany instance.""" config = json.loads(options_proto.configuration.decode("utf8")) return cls(database=config["database"], collection=config["collection"]) -class MongoDBSource(DataSource): - """A MongoDB collection used as a Feast offline data source. +class MongoDBSourceMany(DataSource): + """A MongoDB collection used as a Feast offline data source (one collection per FeatureView). ``name`` is the logical Feast name for this source. If omitted, it defaults to the value of ``collection``. At least one of ``name`` or ``collection`` must be supplied. ``database`` is the MongoDB database that contains the collection. When - omitted it falls back to ``MongoDBOfflineStoreConfig.database`` at query + omitted it falls back to ``MongoDBOfflineStoreManyConfig.database`` at query time, so a single store-level default can be shared across many sources. ``schema_sample_size`` controls how many documents are randomly sampled @@ -153,7 +153,7 @@ def __init__( # At least one of name / collection is non-None; cast to satisfy the type checker. name = cast(str, name or collection) - self._mongodb_options = MongoDBOptions( + self._mongodb_options = MongoDBOptionsMany( database=database or "", collection=collection or name, ) @@ -173,9 +173,9 @@ def __hash__(self): return super().__hash__() def __eq__(self, other): - if not isinstance(other, MongoDBSource): + if not isinstance(other, MongoDBSourceMany): raise TypeError( - "Comparisons should only involve MongoDBSource class objects." + "Comparisons should only involve MongoDBSourceMany class objects." 
) return ( super().__eq__(other) @@ -195,10 +195,10 @@ def collection(self) -> str: return self._mongodb_options._collection @staticmethod - def from_proto(data_source: DataSourceProto) -> "MongoDBSource": + def from_proto(data_source: DataSourceProto) -> "MongoDBSourceMany": assert data_source.HasField("custom_options") options = json.loads(data_source.custom_options.configuration) - return MongoDBSource( + return MongoDBSourceMany( name=data_source.name, database=options["database"], collection=options["collection"], @@ -214,7 +214,7 @@ def _to_proto_impl(self) -> DataSourceProto: data_source_proto = DataSourceProto( name=self.name, type=DataSourceProto.CUSTOM_SOURCE, - data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBSource", + data_source_class_type="feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBSourceMany", field_mapping=self.field_mapping, custom_options=self._mongodb_options.to_proto(), description=self.description, @@ -281,15 +281,15 @@ def get_table_column_names_and_types( ] -class SavedDatasetMongoDBStorage(SavedDatasetStorage): - """Persists a Feast SavedDataset into a MongoDB collection.""" +class SavedDatasetMongoDBStorageMany(SavedDatasetStorage): + """Persists a Feast SavedDataset into a MongoDB collection (many-collection schema).""" _proto_attr_name = "custom_storage" - mongodb_options: MongoDBOptions + mongodb_options: MongoDBOptionsMany def __init__(self, database: str, collection: str): - self.mongodb_options = MongoDBOptions( + self.mongodb_options = MongoDBOptionsMany( database=database, collection=collection, ) @@ -297,9 +297,9 @@ def __init__(self, database: str, collection: str): @staticmethod def from_proto( storage_proto: SavedDatasetStorageProto, - ) -> "SavedDatasetMongoDBStorage": + ) -> "SavedDatasetMongoDBStorageMany": options = json.loads(storage_proto.custom_storage.configuration) - return SavedDatasetMongoDBStorage( + return SavedDatasetMongoDBStorageMany( database=options["database"], collection=options["collection"], ) @@ -308,7 +308,7 @@ def to_proto(self) -> SavedDatasetStorageProto: return SavedDatasetStorageProto(custom_storage=self.mongodb_options.to_proto()) def to_data_source(self) -> DataSource: - return MongoDBSource( + return MongoDBSourceMany( database=self.mongodb_options._database, collection=self.mongodb_options._collection, ) @@ -319,10 +319,10 @@ def to_data_source(self) -> DataSource: # --------------------------------------------------------------------------- -class MongoDBOfflineStoreIbisConfig(FeastConfigBaseModel): - """Configuration for the MongoDB Ibis-backed offline store.""" +class MongoDBOfflineStoreManyConfig(FeastConfigBaseModel): + """Configuration for the MongoDB offline store (one collection per FeatureView).""" - type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb.MongoDBOfflineStoreIbis" + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBOfflineStoreMany" """Offline store type selector""" connection_string: StrictStr = "mongodb://localhost:27017" @@ -332,8 +332,12 @@ class MongoDBOfflineStoreIbisConfig(FeastConfigBaseModel): """Default MongoDB database name""" -class MongoDBOfflineStoreIbis(OfflineStore): - """Offline store backed by MongoDB, using Ibis for point-in-time joins.""" +class MongoDBOfflineStoreMany(OfflineStore): + """Offline store backed by MongoDB (one collection per FeatureView). + + Uses Ibis memtables for point-in-time joins. 
Each FeatureView's data is stored + in a separate MongoDB collection, with the collection name matching the source name. + """ @staticmethod def pull_latest_from_table_or_query( @@ -346,9 +350,9 @@ def pull_latest_from_table_or_query( start_date: datetime, end_date: datetime, ) -> RetrievalJob: - if not isinstance(data_source, MongoDBSource): + if not isinstance(data_source, MongoDBSourceMany): raise ValueError( - f"MongoDBOfflineStore expected a MongoDBSource, " + f"MongoDBOfflineStoreMany expected a MongoDBSourceMany, " f"got {type(data_source).__name__!r}." ) warnings.warn( @@ -405,9 +409,9 @@ def pull_all_from_table_or_query( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, ) -> RetrievalJob: - if not isinstance(data_source, MongoDBSource): + if not isinstance(data_source, MongoDBSourceMany): raise ValueError( - f"MongoDBOfflineStore expected a MongoDBSource, " + f"MongoDBOfflineStoreMany expected a MongoDBSourceMany, " f"got {type(data_source).__name__!r}." ) warnings.warn( @@ -436,9 +440,9 @@ def reader(data_source: DataSource, repo_path: str) -> Table: raise FeastExtrasDependencyImportError( "mongodb", "pymongo is not installed." ) - if not isinstance(data_source, MongoDBSource): + if not isinstance(data_source, MongoDBSourceMany): raise ValueError( - f"MongoDBOfflineStore reader expected a MongoDBSource, " + f"MongoDBOfflineStoreMany reader expected a MongoDBSourceMany, " f"got {type(data_source).__name__!r}." ) connection_string = config.offline_store.connection_string @@ -487,9 +491,9 @@ def writer( raise FeastExtrasDependencyImportError( "mongodb", "pymongo is not installed." ) - if not isinstance(data_source, MongoDBSource): + if not isinstance(data_source, MongoDBSourceMany): raise ValueError( - f"MongoDBOfflineStore writer expected a MongoDBSource, " + f"MongoDBOfflineStoreMany writer expected a MongoDBSourceMany, " f"got {type(data_source).__name__!r}." 
) connection_string = config.offline_store.connection_string diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py similarity index 94% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py index aa0c88f033..293b785c86 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_native.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py @@ -109,7 +109,7 @@ from feast.errors import DataSourceNoNameException, FeastExtrasDependencyImportError from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA +from feast.infra.offline_stores.contrib.mongodb import DRIVER_METADATA from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, @@ -127,10 +127,12 @@ from feast.value_type import ValueType -class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): - """Configuration for the Native MongoDB offline store.""" +class MongoDBOfflineStoreOneConfig(FeastConfigBaseModel): + """Configuration for the MongoDB offline store (single shared collection).""" - type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBOfflineStoreNative" + type: StrictStr = ( + "feast.infra.offline_stores.contrib.mongodb.mongodb_one.MongoDBOfflineStoreOne" + ) """Offline store type selector""" connection_string: StrictStr = "mongodb://localhost:27017" @@ -143,12 +145,12 @@ class MongoDBOfflineStoreNativeConfig(FeastConfigBaseModel): """Single collection name for all feature views""" -class MongoDBSourceNative(DataSource): - """A MongoDB data source for the native offline store. +class MongoDBSourceOne(DataSource): + """A MongoDB data source for the single-collection offline store. - Unlike many data source implementations, this source does not map each - FeatureView to its own table or collection. Instead, all FeatureViews - share a single MongoDB collection (configured at the store level). + Unlike MongoDBSourceMany, this source does not map each FeatureView to + its own collection. Instead, all FeatureViews share a single MongoDB + collection (configured at the store level). Each document in that collection includes a ``feature_view`` field that identifies which FeatureView it belongs to. The ``name`` of this data @@ -183,9 +185,9 @@ def __hash__(self): return super().__hash__() def __eq__(self, other): - if not isinstance(other, MongoDBSourceNative): + if not isinstance(other, MongoDBSourceOne): raise TypeError( - "Comparisons should only involve MongoDBSourceNative class objects." + "Comparisons should only involve MongoDBSourceOne class objects." 
             )
         return (
             super().__eq__(other)
@@ -203,9 +205,9 @@ def source_type(self) -> DataSourceProto.SourceType.ValueType:
         return DataSourceProto.CUSTOM_SOURCE
 
     @staticmethod
-    def from_proto(data_source: DataSourceProto) -> "MongoDBSourceNative":
+    def from_proto(data_source: DataSourceProto) -> "MongoDBSourceOne":
         assert data_source.HasField("custom_options")
-        return MongoDBSourceNative(
+        return MongoDBSourceOne(
             name=data_source.name,
             timestamp_field=data_source.timestamp_field,
             created_timestamp_column=data_source.created_timestamp_column,
@@ -219,7 +221,7 @@ def _to_proto_impl(self) -> DataSourceProto:
         return DataSourceProto(
             name=self.name,
             type=DataSourceProto.CUSTOM_SOURCE,
-            data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBSourceNative",
+            data_source_class_type="feast.infra.offline_stores.contrib.mongodb.mongodb_one.MongoDBSourceOne",
             field_mapping=self.field_mapping,
             custom_options=DataSourceProto.CustomSourceOptions(
                 configuration=json.dumps({"feature_view": self.name}).encode()
@@ -320,7 +322,7 @@ def _fetch_documents(
     return list(client[database][collection].aggregate(pipeline))
 
 
-class MongoDBNativeRetrievalJob(RetrievalJob):
+class MongoDBOneRetrievalJob(RetrievalJob):
     """Retrieval job for native MongoDB offline store queries."""
 
     def __init__(
@@ -384,7 +386,7 @@ def _serialize_entity_key_from_row(
     return serialize_entity_key(entity_key, entity_key_serialization_version)
 
 
-class MongoDBOfflineStoreNative(OfflineStore):
+class MongoDBOfflineStoreOne(OfflineStore):
     """Native MongoDB offline store using single-collection schema.
 
     All feature views share one collection (``feature_history``), with documents
@@ -452,9 +454,9 @@ def pull_latest_from_table_or_query(
         start_date: datetime,
         end_date: datetime,
     ) -> RetrievalJob:
-        if not isinstance(data_source, MongoDBSourceNative):
+        if not isinstance(data_source, MongoDBSourceOne):
             raise ValueError(
-                f"MongoDBOfflineStoreNative expected MongoDBSourceNative, "
+                f"MongoDBOfflineStoreOne expected MongoDBSourceOne, "
                 f"got {type(data_source).__name__!r}."
             )
         warnings.warn(
@@ -499,7 +501,7 @@ def pull_latest_from_table_or_query(
         ]
 
         def _run() -> pyarrow.Table:
-            client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config)
+            client = MongoDBOfflineStoreOne._get_client_and_ensure_indexes(config)
             try:
                 docs = _fetch_documents(client, db_name, collection, pipeline)
                 if not docs:
@@ -515,7 +517,7 @@ def _run() -> pyarrow.Table:
             finally:
                 client.close()
 
-        return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False)
+        return MongoDBOneRetrievalJob(query_fn=_run, full_feature_names=False)
 
     @staticmethod
     def pull_all_from_table_or_query(
@@ -528,9 +530,9 @@ def pull_all_from_table_or_query(
         start_date: Optional[datetime] = None,
         end_date: Optional[datetime] = None,
     ) -> RetrievalJob:
-        if not isinstance(data_source, MongoDBSourceNative):
+        if not isinstance(data_source, MongoDBSourceOne):
             raise ValueError(
-                f"MongoDBOfflineStoreNative expected MongoDBSourceNative, "
+                f"MongoDBOfflineStoreOne expected MongoDBSourceOne, "
                 f"got {type(data_source).__name__!r}."
) warnings.warn( @@ -571,7 +573,7 @@ def pull_all_from_table_or_query( ] def _run() -> pyarrow.Table: - client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) + client = MongoDBOfflineStoreOne._get_client_and_ensure_indexes(config) try: docs = _fetch_documents(client, db_name, collection, pipeline) if not docs: @@ -587,7 +589,7 @@ def _run() -> pyarrow.Table: finally: client.close() - return MongoDBNativeRetrievalJob(query_fn=_run, full_feature_names=False) + return MongoDBOneRetrievalJob(query_fn=_run, full_feature_names=False) @staticmethod def get_historical_features( @@ -610,7 +612,7 @@ def get_historical_features( """ if isinstance(entity_df, str): raise ValueError( - "MongoDBOfflineStoreNative does not support SQL entity_df strings. " + "MongoDBOfflineStoreOne does not support SQL entity_df strings. " "Pass a pandas DataFrame instead." ) warnings.warn( @@ -777,7 +779,7 @@ def _run() -> pyarrow.Table: working_df["_row_idx"] = range(len(working_df)) # Create client once for all chunks - client = MongoDBOfflineStoreNative._get_client_and_ensure_indexes(config) + client = MongoDBOfflineStoreOne._get_client_and_ensure_indexes(config) try: coll = client[db_name][feature_collection] @@ -807,7 +809,7 @@ def _run() -> pyarrow.Table: return pyarrow.Table.from_pandas(result_df, preserve_index=False) - return MongoDBNativeRetrievalJob( + return MongoDBOneRetrievalJob( query_fn=_run, full_feature_names=full_feature_names, ) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py index 27c6a6a35a..3b663b150c 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py @@ -34,15 +34,15 @@ from feast import Entity, FeatureView, Field from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( - MongoDBOfflineStoreIbis, - MongoDBOfflineStoreIbisConfig, - MongoDBSource, +from feast.infra.offline_stores.contrib.mongodb.mongodb_many import ( + MongoDBOfflineStoreMany, + MongoDBOfflineStoreManyConfig, + MongoDBSourceMany, ) -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native import ( - MongoDBOfflineStoreNative, - MongoDBOfflineStoreNativeConfig, - MongoDBSourceNative, +from feast.infra.offline_stores.contrib.mongodb.mongodb_one import ( + MongoDBOfflineStoreOne, + MongoDBOfflineStoreOneConfig, + MongoDBSourceOne, ) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto @@ -149,7 +149,7 @@ def ibis_config(mongodb_connection_string: str) -> RepoConfig: project="benchmark", registry="memory://", provider="local", - offline_store=MongoDBOfflineStoreIbisConfig( + offline_store=MongoDBOfflineStoreManyConfig( connection_string=mongodb_connection_string, database="benchmark_db", ), @@ -165,7 +165,7 @@ def native_config(mongodb_connection_string: str) -> RepoConfig: project="benchmark", registry="memory://", provider="local", - offline_store=MongoDBOfflineStoreNativeConfig( + offline_store=MongoDBOfflineStoreOneConfig( connection_string=mongodb_connection_string, database="benchmark_db", collection="feature_history", @@ -241,7 +241,7 @@ def _generate_native_data( def _create_ibis_fv(num_features: int) -> tuple: """Create Ibis source and 
FeatureView.""" - source = MongoDBSource( + source = MongoDBSourceMany( name="driver_benchmark", database="benchmark_db", collection="driver_benchmark", @@ -267,7 +267,7 @@ def _create_ibis_fv(num_features: int) -> tuple: def _create_native_fv(num_features: int) -> tuple: """Create Native source and FeatureView.""" - source = MongoDBSourceNative( + source = MongoDBSourceOne( name="driver_benchmark", timestamp_field="event_timestamp", ) @@ -406,7 +406,7 @@ def test_scale_rows_ibis( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=ibis_config, feature_views=[fv], feature_refs=feature_refs, @@ -461,7 +461,7 @@ def test_scale_rows_native( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=native_config, feature_views=[fv], feature_refs=feature_refs, @@ -517,7 +517,7 @@ def test_wide_features_ibis( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=ibis_config, feature_views=[fv], feature_refs=feature_refs, @@ -570,7 +570,7 @@ def test_wide_features_native( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=native_config, feature_views=[fv], feature_refs=feature_refs, @@ -637,7 +637,7 @@ def test_entity_skew_ibis( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=ibis_config, feature_views=[fv], feature_refs=feature_refs, @@ -703,7 +703,7 @@ def test_entity_skew_native( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] def run_query(): - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=native_config, feature_views=[fv], feature_refs=feature_refs, @@ -776,7 +776,7 @@ def test_summary_comparison( _, ibis_fv = _create_ibis_fv(num_features) def run_ibis(): - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=ibis_config, feature_views=[ibis_fv], feature_refs=feature_refs, @@ -793,7 +793,7 @@ def run_ibis(): _, native_fv = _create_native_fv(num_features) def run_native(): - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=native_config, feature_views=[native_fv], feature_refs=feature_refs, diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py index 3acd93c288..1c9882900d 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py @@ -19,10 +19,10 @@ from testcontainers.mongodb import MongoDbContainer from feast import Entity, FeatureView, Field -from 
feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb import ( - MongoDBOfflineStoreIbis, - MongoDBOfflineStoreIbisConfig, - MongoDBSource, +from feast.infra.offline_stores.contrib.mongodb.mongodb_many import ( + MongoDBOfflineStoreMany, + MongoDBOfflineStoreManyConfig, + MongoDBSourceMany, ) from feast.repo_config import RepoConfig from feast.types import Float64, Int64, String @@ -75,7 +75,7 @@ def repo_config(mongodb_connection_string: str) -> RepoConfig: project="test_project", registry="memory://", provider="local", - offline_store=MongoDBOfflineStoreIbisConfig( + offline_store=MongoDBOfflineStoreManyConfig( connection_string=mongodb_connection_string, database="feast_test", ), @@ -129,9 +129,9 @@ def sample_data(mongodb_connection_string: str) -> datetime: @pytest.fixture -def driver_source() -> MongoDBSource: - """Create a MongoDBSource for driver stats.""" - return MongoDBSource( +def driver_source() -> MongoDBSourceMany: + """Create a MongoDBSourceMany for driver stats.""" + return MongoDBSourceMany( name="driver_stats", database="feast_test", collection="driver_stats", @@ -140,7 +140,7 @@ def driver_source() -> MongoDBSource: @pytest.fixture -def driver_fv(driver_source: MongoDBSource) -> FeatureView: +def driver_fv(driver_source: MongoDBSourceMany) -> FeatureView: """Create a FeatureView for driver stats. The ttl (time-to-live) parameter defines how far back in time Feast will look @@ -170,7 +170,7 @@ def driver_fv(driver_source: MongoDBSource) -> FeatureView: @_requires_docker def test_pull_latest_from_table_or_query( - repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSource + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceMany ) -> None: """Test pulling latest features per entity from MongoDB. @@ -180,7 +180,7 @@ def test_pull_latest_from_table_or_query( is from 2 hours ago. """ now = sample_data - job = MongoDBOfflineStoreIbis.pull_latest_from_table_or_query( + job = MongoDBOfflineStoreMany.pull_latest_from_table_or_query( config=repo_config, data_source=driver_source, join_key_columns=["driver_id"], @@ -256,7 +256,7 @@ def test_get_historical_features_pit_join( } ) - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=repo_config, feature_views=[driver_fv], feature_refs=["driver_stats:conv_rate", "driver_stats:acc_rate"], @@ -287,11 +287,11 @@ def test_get_historical_features_pit_join( @_requires_docker def test_pull_all_from_table_or_query( - repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSource + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceMany ) -> None: """Test pulling all features within a time range (no deduplication).""" now = sample_data - job = MongoDBOfflineStoreIbis.pull_all_from_table_or_query( + job = MongoDBOfflineStoreMany.pull_all_from_table_or_query( config=repo_config, data_source=driver_source, join_key_columns=["driver_id"], @@ -314,7 +314,7 @@ def test_pull_all_from_table_or_query( def test_ttl_excludes_stale_features( repo_config: RepoConfig, mongodb_connection_string: str, - driver_source: MongoDBSource, + driver_source: MongoDBSourceMany, ) -> None: """Test that TTL causes stale feature values to be returned as NULL. 
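The TTL test above is easier to reason about with the window predicate made explicit. A tiny pandas illustration (values are made up; the test itself uses a 1-day TTL):

```python
from datetime import datetime, timedelta, timezone

import pandas as pd

ttl = timedelta(days=1)
entity_ts = datetime(2024, 1, 15, 12, 0, tzinfo=timezone.utc)

features = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": [
            entity_ts - timedelta(hours=2),  # fresh: inside the TTL window
            entity_ts - timedelta(days=3),   # stale: older than entity_ts - ttl
        ],
        "conv_rate": [0.95, 0.50],
    }
)

# A feature row is joinable iff entity_ts - ttl <= feature_ts <= entity_ts.
eligible = features[
    (features["event_timestamp"] <= entity_ts)
    & (features["event_timestamp"] >= entity_ts - ttl)
]
assert len(eligible) == 1  # only the fresh row; otherwise the feature is NULL
```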
@@ -339,7 +339,7 @@ def test_ttl_excludes_stale_features( client.close() # Create source and feature view with 1-day TTL - ttl_source = MongoDBSource( + ttl_source = MongoDBSourceMany( name="driver_stats_ttl_test", database="feast_test", collection="driver_stats_ttl_test", @@ -367,7 +367,7 @@ def test_ttl_excludes_stale_features( } ) - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=repo_config, feature_views=[ttl_fv], feature_refs=["driver_stats_ttl_test:conv_rate"], @@ -429,13 +429,13 @@ def test_multiple_feature_views( client.close() # Create sources for each collection - driver_source = MongoDBSource( + driver_source = MongoDBSourceMany( name="driver_stats_multi", database="feast_test", collection="driver_stats_multi", timestamp_field="event_timestamp", ) - vehicle_source = MongoDBSource( + vehicle_source = MongoDBSourceMany( name="vehicle_stats_multi", database="feast_test", collection="vehicle_stats_multi", @@ -481,7 +481,7 @@ def test_multiple_feature_views( ) # Request features from BOTH feature views - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=repo_config, feature_views=[driver_fv, vehicle_fv], feature_refs=[ @@ -566,7 +566,7 @@ def test_compound_join_keys( client.close() # Create source - source = MongoDBSource( + source = MongoDBSourceMany( name="user_device_features", database="feast_test", collection="user_device_features", @@ -594,7 +594,7 @@ def test_compound_join_keys( ) # Test pull_latest: should get one row per unique (user_id, device_id) combination - job = MongoDBOfflineStoreIbis.pull_latest_from_table_or_query( + job = MongoDBOfflineStoreMany.pull_latest_from_table_or_query( config=repo_config, data_source=source, join_key_columns=["user_id", "device_id"], @@ -622,7 +622,7 @@ def test_compound_join_keys( } ) - job = MongoDBOfflineStoreIbis.get_historical_features( + job = MongoDBOfflineStoreMany.get_historical_features( config=repo_config, feature_views=[fv], feature_refs=["user_device_features:app_opens"], diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py index 5c02299254..f18d2d15af 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py @@ -32,10 +32,10 @@ from feast import Entity, FeatureView, Field from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native import ( - MongoDBOfflineStoreNative, - MongoDBOfflineStoreNativeConfig, - MongoDBSourceNative, +from feast.infra.offline_stores.contrib.mongodb.mongodb_one import ( + MongoDBOfflineStoreOne, + MongoDBOfflineStoreOneConfig, + MongoDBSourceOne, ) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import Value as ValueProto @@ -109,7 +109,7 @@ def repo_config(mongodb_connection_string: str) -> RepoConfig: project="test_project", registry="memory://", provider="local", - offline_store=MongoDBOfflineStoreNativeConfig( + offline_store=MongoDBOfflineStoreOneConfig( connection_string=mongodb_connection_string, database="feast_test", collection="feature_history", @@ -170,9 +170,9 @@ def sample_data(mongodb_connection_string: str) -> 
datetime: @pytest.fixture -def driver_source() -> MongoDBSourceNative: - """Create a MongoDBSourceNative for driver stats.""" - return MongoDBSourceNative( +def driver_source() -> MongoDBSourceOne: + """Create a MongoDBSourceOne for driver stats.""" + return MongoDBSourceOne( name="driver_stats", timestamp_field="event_timestamp", created_timestamp_column="created_at", @@ -180,7 +180,7 @@ def driver_source() -> MongoDBSourceNative: @pytest.fixture -def driver_fv(driver_source: MongoDBSourceNative) -> FeatureView: +def driver_fv(driver_source: MongoDBSourceOne) -> FeatureView: """Create a FeatureView for driver stats.""" driver_entity = Entity( name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 @@ -200,11 +200,11 @@ def driver_fv(driver_source: MongoDBSourceNative) -> FeatureView: @_requires_docker def test_pull_latest_from_table_or_query( - repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceNative + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceOne ) -> None: """Test pulling latest features per entity from the single collection.""" now = sample_data - job = MongoDBOfflineStoreNative.pull_latest_from_table_or_query( + job = MongoDBOfflineStoreOne.pull_latest_from_table_or_query( config=repo_config, data_source=driver_source, join_key_columns=["driver_id"], @@ -246,7 +246,7 @@ def test_get_historical_features_pit_join( } ) - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=repo_config, feature_views=[driver_fv], feature_refs=["driver_stats:conv_rate", "driver_stats:acc_rate"], @@ -277,11 +277,11 @@ def test_get_historical_features_pit_join( @_requires_docker def test_pull_all_from_table_or_query( - repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceNative + repo_config: RepoConfig, sample_data: datetime, driver_source: MongoDBSourceOne ) -> None: """Test pulling all features within a time range (no deduplication).""" now = sample_data - job = MongoDBOfflineStoreNative.pull_all_from_table_or_query( + job = MongoDBOfflineStoreOne.pull_all_from_table_or_query( config=repo_config, data_source=driver_source, join_key_columns=["driver_id"], @@ -330,7 +330,7 @@ def test_ttl_excludes_stale_features( collection.insert_many(ttl_docs) client.close() - ttl_source = MongoDBSourceNative( + ttl_source = MongoDBSourceOne( name="driver_stats_ttl", timestamp_field="event_timestamp", ) @@ -355,7 +355,7 @@ def test_ttl_excludes_stale_features( } ) - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=repo_config, feature_views=[ttl_fv], feature_refs=["driver_stats_ttl:conv_rate"], @@ -422,8 +422,8 @@ def test_multiple_feature_views( client.close() # Create sources and feature views - driver_source = MongoDBSourceNative(name="driver_stats_multi") - vehicle_source = MongoDBSourceNative(name="vehicle_stats_multi") + driver_source = MongoDBSourceOne(name="driver_stats_multi") + vehicle_source = MongoDBSourceOne(name="vehicle_stats_multi") driver_entity = Entity( name="driver_id", join_keys=["driver_id"], value_type=ValueType.INT64 @@ -459,7 +459,7 @@ def test_multiple_feature_views( } ) - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=repo_config, feature_views=[driver_fv, vehicle_fv], feature_refs=[ @@ -534,7 +534,7 @@ def test_compound_join_keys( collection.insert_many(compound_docs) client.close() - source 
= MongoDBSourceNative(name="user_device_features") + source = MongoDBSourceOne(name="user_device_features") user_entity = Entity( name="user_id", join_keys=["user_id"], value_type=ValueType.INT64 @@ -556,7 +556,7 @@ def test_compound_join_keys( ) # Test pull_latest: should get one row per unique (user_id, device_id) - job = MongoDBOfflineStoreNative.pull_latest_from_table_or_query( + job = MongoDBOfflineStoreOne.pull_latest_from_table_or_query( config=repo_config, data_source=source, join_key_columns=["user_id", "device_id"], @@ -585,7 +585,7 @@ def test_compound_join_keys( } ) - job = MongoDBOfflineStoreNative.get_historical_features( + job = MongoDBOfflineStoreOne.get_historical_features( config=repo_config, feature_views=[fv], feature_refs=["user_device_features:app_opens"], From 2c2549474fa7ba5fba781946d92c4809024e3e82 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:14:49 -0400 Subject: [PATCH 20/30] Add README.md documenting MongoDB offline store implementations Signed-off-by: Casey Clements --- .../offline_stores/contrib/mongodb/README.md | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md b/sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md new file mode 100644 index 0000000000..44983940ff --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md @@ -0,0 +1,143 @@ +# MongoDB Offline Store + +Two MongoDB offline store implementations optimized for different use cases. + +## Overview + +| Aspect | `MongoDBOfflineStoreMany` | `MongoDBOfflineStoreOne` | +|--------|---------------------------|--------------------------| +| Collections | One per FeatureView | Single shared collection | +| Schema | Flat documents | Nested `features` subdoc | +| Entity ID | Separate columns | Serialized bytes | +| Best for | Small-medium feature stores | Large feature stores | + +## MongoDBOfflineStoreMany (mongodb_many.py) + +**One collection per FeatureView** — each FeatureView maps to its own MongoDB collection. + +### Schema + +```javascript +// Collection: driver_stats +{ + "driver_id": 1001, + "event_timestamp": ISODate("2024-01-15T10:00:00Z"), + "trips_today": 5, + "rating": 4.8 +} +``` + +### Configuration + +```yaml +offline_store: + type: feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBOfflineStoreMany + connection_string: mongodb://localhost:27017 + database: feast +``` + +### When to Use + +✅ **Small to medium feature stores** — loads entire collection into memory +✅ **Fast PIT joins** — Ibis memtables are highly optimized +✅ **Simple schema** — flat documents, easy to query directly +✅ **Per-collection indexes** — each FV can have tailored indexes + +⚠️ **Caution**: Loads ALL documents from each collection. May OOM on very large collections. + +## MongoDBOfflineStoreOne (mongodb_one.py) + +**Single shared collection** — all FeatureViews store data in one collection with a discriminator field. 
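+The `entity_id` bytes are produced by serializing a Feast entity key. A
+minimal sketch using the same helpers the tests import (the serialization
+version shown is illustrative and may differ in your deployment):
+
+```python
+from feast.infra.key_encoding_utils import serialize_entity_key
+from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
+from feast.protos.feast.types.Value_pb2 import Value as ValueProto
+
+entity_key = EntityKeyProto(
+    join_keys=["driver_id"],
+    entity_values=[ValueProto(int64_val=1001)],
+)
+# These bytes go into the `entity_id` field of feature_history documents.
+entity_id = serialize_entity_key(entity_key, entity_key_serialization_version=2)
+```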
+
+### Schema
+
+```javascript
+// Collection: feature_history (shared by all FVs)
+{
+  "entity_id": Binary("..."),          // Serialized entity key
+  "feature_view": "driver_stats",      // Discriminator
+  "features": {                        // Nested subdocument
+    "trips_today": 5,
+    "rating": 4.8
+  },
+  "event_timestamp": ISODate("2024-01-15T10:00:00Z"),
+  "created_at": ISODate("2024-01-15T10:00:01Z")
+}
+```
+
+### Configuration
+
+```yaml
+offline_store:
+  type: feast.infra.offline_stores.contrib.mongodb.mongodb_one.MongoDBOfflineStoreOne
+  connection_string: mongodb://localhost:27017
+  database: feast
+  collection: feature_history
+```
+
+### When to Use
+
+✅ **Large feature stores** — filters by entity_id, doesn't load entire collection
+✅ **Memory-safe** — processes in chunks, bounded memory usage
+✅ **Schema consistency** — matches online store pattern
+✅ **Efficient materialization** — MQL aggregation pipeline
+
+⚠️ **Trade-off**: Slightly slower than Many for small workloads due to serialization overhead.
+
+## Performance Comparison
+
+Benchmarks with 10 features, 3 historical rows per entity:
+
+| Entity Rows | Many (time) | One (time) | Winner |
+|-------------|-------------|------------|--------|
+| 1,000       | 0.30s       | 0.06s      | One    |
+| 10,000      | 0.20s       | 0.31s      | Many   |
+| 100,000     | 1.51s       | 5.22s      | Many   |
+| 1,000,000   | 16.08s      | 212s       | Many   |
+
+### Memory Behavior
+
+| Scenario | Many | One |
+|----------|------|-----|
+| Large feature collection, small entity_df | ❌ Loads all | ✅ Filters |
+| Small feature collection, large entity_df | ✅ Fast | ⚠️ Slower |
+
+## Choosing an Implementation
+
+```
+        ┌─────────────────────────────┐
+        │ Is your feature collection  │
+        │ larger than available RAM?  │
+        └─────────────────────────────┘
+                      │
+           ┌──────────┴──────────┐
+           ▼                     ▼
+          YES                   NO
+           │                     │
+           ▼                     ▼
+   ┌───────────────┐     ┌───────────────┐
+   │   Use ONE     │     │   Use MANY    │
+   │ (memory-safe) │     │   (faster)    │
+   └───────────────┘     └───────────────┘
+```
+
+## Index Recommendations
+
+### Many (per-collection)
+
+```javascript
+db.driver_stats.createIndex({ "driver_id": 1, "event_timestamp": -1 })
+```
+
+### One (shared collection)
+
+```javascript
+db.feature_history.createIndex({
+  "entity_id": 1,
+  "feature_view": 1,
+  "event_timestamp": -1
+})
+```
+
+The One implementation creates this index automatically on first use.
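+### How the chunked join stays memory-safe
+
+A sketch of the chunked `_run()` loop described above (helper names such as
+`process_chunk` are illustrative, not the actual implementation):
+
+```python
+import pandas as pd
+import pyarrow
+
+CHUNK_SIZE = 10_000  # illustrative bound on entity rows processed at once
+
+def run_chunked(entity_df: pd.DataFrame, process_chunk) -> pyarrow.Table:
+    working_df = entity_df.copy()
+    working_df["_row_idx"] = range(len(working_df))  # to restore input order
+
+    parts = []
+    for start in range(0, len(working_df), CHUNK_SIZE):
+        chunk = working_df.iloc[start : start + CHUNK_SIZE]
+        # The real store fetches only this chunk's entity keys from MongoDB
+        # and merges them with a vectorized, point-in-time-correct join.
+        parts.append(process_chunk(chunk))
+
+    result_df = (
+        pd.concat(parts, ignore_index=True)
+        .sort_values("_row_idx")
+        .drop(columns=["_row_idx"])
+    )
+    # Small inputs take a single pass, so behavior is unchanged for them.
+    return pyarrow.Table.from_pandas(result_df, preserve_index=False)
+```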
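+### Quick start (Many)
+
+A usage sketch mirroring the test fixtures in this series; it assumes a local
+`mongod` with a populated `driver_stats` collection:
+
+```python
+from datetime import datetime, timedelta, timezone
+
+from feast.infra.offline_stores.contrib.mongodb.mongodb_many import (
+    MongoDBOfflineStoreMany,
+    MongoDBOfflineStoreManyConfig,
+    MongoDBSourceMany,
+)
+from feast.repo_config import RepoConfig
+
+config = RepoConfig(
+    project="demo",
+    registry="memory://",
+    provider="local",
+    offline_store=MongoDBOfflineStoreManyConfig(
+        connection_string="mongodb://localhost:27017",
+        database="feast",
+    ),
+)
+source = MongoDBSourceMany(
+    name="driver_stats",
+    database="feast",
+    collection="driver_stats",
+    timestamp_field="event_timestamp",
+)
+
+now = datetime.now(timezone.utc)
+# Latest value per driver within the last day (deduplicated by timestamp).
+job = MongoDBOfflineStoreMany.pull_latest_from_table_or_query(
+    config=config,
+    data_source=source,
+    join_key_columns=["driver_id"],
+    feature_name_columns=["conv_rate"],
+    timestamp_field="event_timestamp",
+    created_timestamp_column=None,
+    start_date=now - timedelta(days=1),
+    end_date=now,
+)
+df = job.to_df()
+```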
+ From b50e22fc47230dfbb370d45af2e46796dce2e87f Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:25:44 -0400 Subject: [PATCH 21/30] Rename mongodb/ to mongodb_offline_store/, organize tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename module: mongodb/ → mongodb_offline_store/ (follows naming convention) - Move tests to mongodb_offline_store/ subdirectory: - test_mongodb_offline_retrieval.py → mongodb_offline_store/test_many.py - test_mongodb_offline_retrieval_native.py → mongodb_offline_store/test_one.py - benchmark_mongodb_offline_stores.py → mongodb_offline_store/benchmark.py - Update all imports to use mongodb_offline_store path Signed-off-by: Casey Clements --- .../contrib/{mongodb => mongodb_offline_store}/README.md | 4 ++-- .../contrib/{mongodb => mongodb_offline_store}/__init__.py | 0 .../{mongodb => mongodb_offline_store}/mongodb_many.py | 6 +++--- .../{mongodb => mongodb_offline_store}/mongodb_one.py | 6 ++---- .../contrib/mongodb_offline_store/__init__.py | 0 .../benchmark.py} | 4 ++-- .../test_many.py} | 2 +- .../test_one.py} | 2 +- 8 files changed, 11 insertions(+), 13 deletions(-) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb => mongodb_offline_store}/README.md (95%) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb => mongodb_offline_store}/__init__.py (100%) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb => mongodb_offline_store}/mongodb_many.py (98%) rename sdk/python/feast/infra/offline_stores/contrib/{mongodb => mongodb_offline_store}/mongodb_one.py (99%) create mode 100644 sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/__init__.py rename sdk/python/tests/unit/infra/offline_stores/contrib/{benchmark_mongodb_offline_stores.py => mongodb_offline_store/benchmark.py} (99%) rename sdk/python/tests/unit/infra/offline_stores/contrib/{test_mongodb_offline_retrieval.py => mongodb_offline_store/test_many.py} (99%) rename sdk/python/tests/unit/infra/offline_stores/contrib/{test_mongodb_offline_retrieval_native.py => mongodb_offline_store/test_one.py} (99%) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md similarity index 95% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md rename to sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md index 44983940ff..23afc6f2f5 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb/README.md +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md @@ -31,7 +31,7 @@ Two MongoDB offline store implementations optimized for different use cases. 
```yaml offline_store: - type: feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBOfflineStoreMany + type: feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many.MongoDBOfflineStoreMany connection_string: mongodb://localhost:27017 database: feast ``` @@ -69,7 +69,7 @@ offline_store: ```yaml offline_store: - type: feast.infra.offline_stores.contrib.mongodb.mongodb_one.MongoDBOfflineStoreOne + type: feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one.MongoDBOfflineStoreOne connection_string: mongodb://localhost:27017 database: feast collection: feature_history diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py similarity index 100% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb/__init__.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/__init__.py diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py similarity index 98% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py index 7dac38af02..5c5bf0b0ba 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_many.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py @@ -34,7 +34,7 @@ SavedDatasetLocationAlreadyExists, ) from feast.feature_view import FeatureView -from feast.infra.offline_stores.contrib.mongodb import DRIVER_METADATA +from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA from feast.infra.offline_stores.ibis import ( get_historical_features_ibis, pull_all_from_table_or_query_ibis, @@ -214,7 +214,7 @@ def _to_proto_impl(self) -> DataSourceProto: data_source_proto = DataSourceProto( name=self.name, type=DataSourceProto.CUSTOM_SOURCE, - data_source_class_type="feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBSourceMany", + data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many.MongoDBSourceMany", field_mapping=self.field_mapping, custom_options=self._mongodb_options.to_proto(), description=self.description, @@ -322,7 +322,7 @@ def to_data_source(self) -> DataSource: class MongoDBOfflineStoreManyConfig(FeastConfigBaseModel): """Configuration for the MongoDB offline store (one collection per FeatureView).""" - type: StrictStr = "feast.infra.offline_stores.contrib.mongodb.mongodb_many.MongoDBOfflineStoreMany" + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many.MongoDBOfflineStoreMany" """Offline store type selector""" connection_string: StrictStr = "mongodb://localhost:27017" diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py similarity index 99% rename from sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py rename to sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py index 293b785c86..f40f30df83 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb/mongodb_one.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py @@ -109,7 +109,7 @@ from feast.errors import DataSourceNoNameException, 
FeastExtrasDependencyImportError from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb import DRIVER_METADATA +from feast.infra.offline_stores.contrib.mongodb_offline_store import DRIVER_METADATA from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, @@ -130,9 +130,7 @@ class MongoDBOfflineStoreOneConfig(FeastConfigBaseModel): """Configuration for the MongoDB offline store (single shared collection).""" - type: StrictStr = ( - "feast.infra.offline_stores.contrib.mongodb.mongodb_one.MongoDBOfflineStoreOne" - ) + type: StrictStr = "feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one.MongoDBOfflineStoreOne" """Offline store type selector""" connection_string: StrictStr = "mongodb://localhost:27017" diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/__init__.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py similarity index 99% rename from sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py rename to sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py index 3b663b150c..f0d03e32e4 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/benchmark_mongodb_offline_stores.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py @@ -34,12 +34,12 @@ from feast import Entity, FeatureView, Field from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb.mongodb_many import ( +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many import ( MongoDBOfflineStoreMany, MongoDBOfflineStoreManyConfig, MongoDBSourceMany, ) -from feast.infra.offline_stores.contrib.mongodb.mongodb_one import ( +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one import ( MongoDBOfflineStoreOne, MongoDBOfflineStoreOneConfig, MongoDBSourceOne, diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_many.py similarity index 99% rename from sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py rename to sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_many.py index 1c9882900d..cbd43ea8d1 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_many.py @@ -19,7 +19,7 @@ from testcontainers.mongodb import MongoDbContainer from feast import Entity, FeatureView, Field -from feast.infra.offline_stores.contrib.mongodb.mongodb_many import ( +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many import ( MongoDBOfflineStoreMany, MongoDBOfflineStoreManyConfig, MongoDBSourceMany, diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_one.py similarity index 99% rename from 
sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py rename to sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_one.py index f18d2d15af..689fef915e 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/test_mongodb_offline_retrieval_native.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_one.py @@ -32,7 +32,7 @@ from feast import Entity, FeatureView, Field from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.offline_stores.contrib.mongodb.mongodb_one import ( +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one import ( MongoDBOfflineStoreOne, MongoDBOfflineStoreOneConfig, MongoDBSourceOne, From bae2648a7583af52128a19866e71fd31e92b3a72 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:30:40 -0400 Subject: [PATCH 22/30] Update docstring in benchmark.py Signed-off-by: Casey Clements --- .../offline_stores/contrib/mongodb_offline_store/benchmark.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py index f0d03e32e4..49d8b8aeb6 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py @@ -1,5 +1,7 @@ """ -Performance benchmarks comparing Ibis vs Native MongoDB offline store implementations. +Performance benchmarks comparing the two MongoDB offline store implementations - +one Collection with all feature views +vs. a schema of N collections for N features views. These tests measure performance across different scaling dimensions: 1. Row count scaling (entity_df size) From e4c79bf8cda64078e9d43f1b86469a65ed33d89c Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:38:59 -0400 Subject: [PATCH 23/30] Update README to show created_at tie-breaker in Many schema Signed-off-by: Casey Clements --- .../offline_stores/contrib/mongodb_offline_store/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md index 23afc6f2f5..30ea64af2a 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md @@ -22,11 +22,14 @@ Two MongoDB offline store implementations optimized for different use cases. { "driver_id": 1001, "event_timestamp": ISODate("2024-01-15T10:00:00Z"), + "created_at": ISODate("2024-01-15T10:00:01Z"), // Optional tie-breaker "trips_today": 5, "rating": 4.8 } ``` +Ties (same `event_timestamp`) are broken by `created_timestamp_column` if configured. 
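A toy pandas rendering of the tie-breaking rule just documented (illustrative only, not the store's actual code path):

```python
# Toy illustration of latest-row selection with the created_at tie-breaker.
import pandas as pd

df = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": pd.to_datetime(["2024-01-15T10:00:00Z"] * 2),
        "created_at": pd.to_datetime(
            ["2024-01-15T10:00:01Z", "2024-01-15T10:00:02Z"]
        ),
        "trips_today": [5, 6],
    }
)

# Equal event_timestamp values: created_at decides, so trips_today == 6 wins.
latest = df.sort_values(
    ["event_timestamp", "created_at"], ascending=False
).drop_duplicates(subset=["driver_id"], keep="first")
```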
+ ### Configuration ```yaml From 548698b590a3f49a374b501e575f641760822424 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:43:44 -0400 Subject: [PATCH 24/30] Update README index recommendations for Many implementation - Clarify that indexes should be on join keys + timestamp - Show example for compound join keys - Note that Many does not auto-create indexes Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md index 30ea64af2a..561eb70ed7 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md @@ -128,16 +128,31 @@ Benchmarks with 10 features, 3 historical rows per entity: ### Many (per-collection) +Each collection should have an index on the join keys + timestamp: + ```javascript -db.driver_stats.createIndex({ "driver_id": 1, "event_timestamp": -1 }) +// For a FeatureView with join key "driver_id" +db.driver_stats.createIndex({ + "driver_id": 1, // Join key(s) + "event_timestamp": -1 +}) + +// For a FeatureView with compound join keys +db.order_stats.createIndex({ + "customer_id": 1, + "order_id": 1, + "event_timestamp": -1 +}) ``` +**Note**: The Many implementation does not auto-create indexes. Create them manually or via a migration script. + ### One (shared collection) ```javascript db.feature_history.createIndex({ "entity_id": 1, - "feature_view": 1, + "feature_view": 1, "event_timestamp": -1 }) ``` From 1597264f0338efa21b54b48f742df38adc8bd7c1 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 14:47:36 -0400 Subject: [PATCH 25/30] Add auto-create index to MongoDBOfflineStoreMany - Add _ensure_index_many() function with module-level cache - Call during pull_latest_from_table_or_query (materialization) - Creates index on join_keys + timestamp + created_timestamp - Checks for existing index before creating - Update README to reflect auto-create behavior Signed-off-by: Casey Clements --- .../contrib/mongodb_offline_store/README.md | 2 +- .../mongodb_offline_store/mongodb_many.py | 61 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md index 561eb70ed7..db6318ee17 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/README.md @@ -145,7 +145,7 @@ db.order_stats.createIndex({ }) ``` -**Note**: The Many implementation does not auto-create indexes. Create them manually or via a migration script. +**Note**: The Many implementation auto-creates indexes during `pull_latest_from_table_or_query` (materialization). 
### One (shared collection) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py index 5c5bf0b0ba..3faec603a3 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py @@ -359,6 +359,26 @@ def pull_latest_from_table_or_query( "MongoDB offline store is in preview. API may change without notice.", RuntimeWarning, ) + + # Ensure index exists for efficient queries + if MongoClient is not None: + connection_string = config.offline_store.connection_string + db_name = data_source.database or config.offline_store.database + client: Any = MongoClient( + connection_string, driver=DRIVER_METADATA, tz_aware=True + ) + try: + _ensure_index_many( + client=client, + db_name=db_name, + collection_name=data_source.collection, + join_keys=join_key_columns, + timestamp_field=timestamp_field, + created_timestamp_column=created_timestamp_column, + ) + finally: + client.close() + return pull_latest_from_table_or_query_ibis( config=config, data_source=data_source, @@ -475,6 +495,47 @@ def reader(data_source: DataSource, repo_path: str) -> Table: return reader +# Track which collections have had indexes ensured (module-level cache) +_indexes_ensured: set = set() + + +def _ensure_index_many( + client: Any, + db_name: str, + collection_name: str, + join_keys: List[str], + timestamp_field: str, + created_timestamp_column: Optional[str] = None, +) -> None: + """Create recommended index on a Many-schema collection. + + Index is on: join_keys (ascending) + timestamp (descending) + created_at (descending). + Uses a module-level cache to avoid redundant index creation checks. + """ + cache_key = f"{db_name}.{collection_name}" + if cache_key in _indexes_ensured: + return + + coll = client[db_name][collection_name] + + # Build index key: join_keys (asc) + timestamp (desc) + created_at (desc) + index_keys = [(k, 1) for k in join_keys] + index_keys.append((timestamp_field, -1)) + if created_timestamp_column: + index_keys.append((created_timestamp_column, -1)) + + # Check if equivalent index already exists + existing_indexes = coll.index_information() + for idx_info in existing_indexes.values(): + if idx_info.get("key") == index_keys: + _indexes_ensured.add(cache_key) + return + + # Create the index + coll.create_index(index_keys, background=True) + _indexes_ensured.add(cache_key) + + def _build_data_source_writer( config: RepoConfig, ) -> Callable[[Table, DataSource, str, str, bool], None]: From 39afa9a77274cb2eb644db8776ff7493d95a3ae9 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 15:11:58 -0400 Subject: [PATCH 26/30] Update benchmark.py to use One/Many naming convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename functions: _generate_ibis_data → _generate_many_data, etc. - Rename fixtures: ibis_config → many_config, native_config → one_config - Rename tests: test_scale_rows_ibis → test_scale_rows_many, etc. 
- Update all docstrings and print statements - Update summary comparison output format Signed-off-by: Casey Clements --- .../mongodb_offline_store/benchmark.py | 162 +++++++++--------- 1 file changed, 82 insertions(+), 80 deletions(-) diff --git a/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py index 49d8b8aeb6..fa7f99e06b 100644 --- a/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py +++ b/sdk/python/tests/unit/infra/offline_stores/contrib/mongodb_offline_store/benchmark.py @@ -1,7 +1,8 @@ """ -Performance benchmarks comparing the two MongoDB offline store implementations - -one Collection with all feature views -vs. a schema of N collections for N features views. +Performance benchmarks comparing MongoDB offline store implementations: Many vs One. + +- Many: One collection per FeatureView (MongoDBOfflineStoreMany) +- One: Single shared collection for all FeatureViews (MongoDBOfflineStoreOne) These tests measure performance across different scaling dimensions: 1. Row count scaling (entity_df size) @@ -13,8 +14,8 @@ - Memory (peak Python memory via tracemalloc) - MongoDB server metrics (opcounters, execution stats) -Run with: pytest benchmark_mongodb_offline_stores.py -v -s -Skip slow tests: pytest benchmark_mongodb_offline_stores.py -v -s -m "not slow" +Run with: pytest benchmark.py -v -s +Skip slow tests: pytest benchmark.py -v -s -m "not slow" """ import time @@ -145,8 +146,8 @@ def mongodb_connection_string(mongodb_container: MongoDbContainer) -> str: @pytest.fixture -def ibis_config(mongodb_connection_string: str) -> RepoConfig: - """RepoConfig for Ibis implementation.""" +def many_config(mongodb_connection_string: str) -> RepoConfig: + """RepoConfig for Many implementation (one collection per FeatureView).""" return RepoConfig( project="benchmark", registry="memory://", @@ -161,8 +162,8 @@ def ibis_config(mongodb_connection_string: str) -> RepoConfig: @pytest.fixture -def native_config(mongodb_connection_string: str) -> RepoConfig: - """RepoConfig for Native implementation.""" +def one_config(mongodb_connection_string: str) -> RepoConfig: + """RepoConfig for One implementation (single shared collection).""" return RepoConfig( project="benchmark", registry="memory://", @@ -177,7 +178,7 @@ def native_config(mongodb_connection_string: str) -> RepoConfig: ) -def _generate_ibis_data( +def _generate_many_data( client: MongoClient, db_name: str, collection_name: str, @@ -185,7 +186,7 @@ def _generate_ibis_data( num_features: int, rows_per_entity: int = 5, ) -> datetime: - """Generate test data for Ibis (one collection per FV, flat schema).""" + """Generate test data for Many (one collection per FV, flat schema).""" collection = client[db_name][collection_name] collection.drop() @@ -206,7 +207,7 @@ def _generate_ibis_data( return now -def _generate_native_data( +def _generate_one_data( client: MongoClient, db_name: str, collection_name: str, @@ -215,7 +216,7 @@ def _generate_native_data( num_features: int, rows_per_entity: int = 5, ) -> datetime: - """Generate test data for Native (single collection, nested features).""" + """Generate test data for One (single collection, nested features).""" collection = client[db_name][collection_name] # Don't drop - may have multiple FVs in same collection @@ -241,8 +242,8 @@ def _generate_native_data( return now -def _create_ibis_fv(num_features: int) -> tuple: - """Create Ibis source and 
FeatureView.""" +def _create_many_fv(num_features: int) -> tuple: + """Create Many source and FeatureView.""" source = MongoDBSourceMany( name="driver_benchmark", database="benchmark_db", @@ -267,8 +268,8 @@ def _create_ibis_fv(num_features: int) -> tuple: return source, fv -def _create_native_fv(num_features: int) -> tuple: - """Create Native source and FeatureView.""" +def _create_one_fv(num_features: int) -> tuple: + """Create One source and FeatureView.""" source = MongoDBSourceOne( name="driver_benchmark", timestamp_field="event_timestamp", @@ -375,10 +376,10 @@ def _print_benchmark_result( @_requires_docker @pytest.mark.parametrize("num_rows", ROW_COUNTS) -def test_scale_rows_ibis( - mongodb_connection_string: str, ibis_config: RepoConfig, num_rows: int +def test_scale_rows_many( + mongodb_connection_string: str, many_config: RepoConfig, num_rows: int ) -> None: - """Benchmark Ibis implementation with varying entity_df sizes. + """Benchmark Many implementation with varying entity_df sizes. Measures: runtime, peak memory, MongoDB opcounters. """ @@ -387,7 +388,7 @@ def test_scale_rows_ibis( client = MongoClient(mongodb_connection_string) try: - now = _generate_ibis_data( + now = _generate_many_data( client, "benchmark_db", "driver_benchmark", @@ -396,7 +397,7 @@ def test_scale_rows_ibis( rows_per_entity=3, ) - _, fv = _create_ibis_fv(num_features) + _, fv = _create_many_fv(num_features) entity_df = pd.DataFrame( { @@ -409,7 +410,7 @@ def test_scale_rows_ibis( def run_query(): job = MongoDBOfflineStoreMany.get_historical_features( - config=ibis_config, + config=many_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -428,10 +429,10 @@ def run_query(): @_requires_docker @pytest.mark.parametrize("num_rows", ROW_COUNTS) -def test_scale_rows_native( - mongodb_connection_string: str, native_config: RepoConfig, num_rows: int +def test_scale_rows_one( + mongodb_connection_string: str, one_config: RepoConfig, num_rows: int ) -> None: - """Benchmark Native implementation with varying entity_df sizes. + """Benchmark One implementation with varying entity_df sizes. Measures: runtime, peak memory, MongoDB opcounters. 
""" @@ -441,7 +442,7 @@ def test_scale_rows_native( client = MongoClient(mongodb_connection_string) try: client["benchmark_db"]["feature_history"].drop() - now = _generate_native_data( + now = _generate_one_data( client, "benchmark_db", "feature_history", @@ -451,7 +452,7 @@ def test_scale_rows_native( rows_per_entity=3, ) - _, fv = _create_native_fv(num_features) + _, fv = _create_one_fv(num_features) entity_df = pd.DataFrame( { @@ -464,7 +465,7 @@ def test_scale_rows_native( def run_query(): job = MongoDBOfflineStoreOne.get_historical_features( - config=native_config, + config=one_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -490,15 +491,15 @@ def run_query(): @_requires_docker @pytest.mark.parametrize("num_features", FEATURE_COUNTS) -def test_wide_features_ibis( - mongodb_connection_string: str, ibis_config: RepoConfig, num_features: int +def test_wide_features_many( + mongodb_connection_string: str, many_config: RepoConfig, num_features: int ) -> None: - """Benchmark Ibis with varying feature width.""" + """Benchmark Many with varying feature width.""" num_entities = 1000 client = MongoClient(mongodb_connection_string) try: - now = _generate_ibis_data( + now = _generate_many_data( client, "benchmark_db", "driver_benchmark", @@ -507,7 +508,7 @@ def test_wide_features_ibis( rows_per_entity=3, ) - _, fv = _create_ibis_fv(num_features) + _, fv = _create_many_fv(num_features) entity_df = pd.DataFrame( { @@ -520,7 +521,7 @@ def test_wide_features_ibis( def run_query(): job = MongoDBOfflineStoreMany.get_historical_features( - config=ibis_config, + config=many_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -541,16 +542,16 @@ def run_query(): @_requires_docker @pytest.mark.parametrize("num_features", FEATURE_COUNTS) -def test_wide_features_native( - mongodb_connection_string: str, native_config: RepoConfig, num_features: int +def test_wide_features_one( + mongodb_connection_string: str, one_config: RepoConfig, num_features: int ) -> None: - """Benchmark Native with varying feature width.""" + """Benchmark One with varying feature width.""" num_entities = 1000 client = MongoClient(mongodb_connection_string) try: client["benchmark_db"]["feature_history"].drop() - now = _generate_native_data( + now = _generate_one_data( client, "benchmark_db", "feature_history", @@ -560,7 +561,7 @@ def test_wide_features_native( rows_per_entity=3, ) - _, fv = _create_native_fv(num_features) + _, fv = _create_one_fv(num_features) entity_df = pd.DataFrame( { @@ -573,7 +574,7 @@ def test_wide_features_native( def run_query(): job = MongoDBOfflineStoreOne.get_historical_features( - config=native_config, + config=one_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -599,10 +600,10 @@ def run_query(): @_requires_docker @pytest.mark.parametrize("unique_ratio", [1.0, 0.5, 0.1]) # 100%, 50%, 10% unique -def test_entity_skew_ibis( - mongodb_connection_string: str, ibis_config: RepoConfig, unique_ratio: float +def test_entity_skew_many( + mongodb_connection_string: str, many_config: RepoConfig, unique_ratio: float ) -> None: - """Benchmark Ibis with varying entity uniqueness in entity_df.""" + """Benchmark Many with varying entity uniqueness in entity_df.""" import numpy as np total_rows = 5000 @@ -612,7 +613,7 @@ def test_entity_skew_ibis( client = MongoClient(mongodb_connection_string) try: - now = _generate_ibis_data( + now = _generate_many_data( client, "benchmark_db", "driver_benchmark", @@ -621,7 +622,7 @@ def 
test_entity_skew_ibis( rows_per_entity=5, ) - _, fv = _create_ibis_fv(num_features) + _, fv = _create_many_fv(num_features) # Create entity_df with repeated entity_ids entity_ids = np.random.choice( @@ -640,7 +641,7 @@ def test_entity_skew_ibis( def run_query(): job = MongoDBOfflineStoreMany.get_historical_features( - config=ibis_config, + config=many_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -652,7 +653,7 @@ def run_query(): result = _run_benchmark_full(run_query, mongo_client=client) print( - f"\n[IBIS] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" + f"\n[MANY] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" ) print(f" Time: {result.elapsed_seconds:.3f}s") print(f" Memory: {result.peak_memory_mb:.1f} MB") @@ -664,10 +665,10 @@ def run_query(): @_requires_docker @pytest.mark.parametrize("unique_ratio", [1.0, 0.5, 0.1]) -def test_entity_skew_native( - mongodb_connection_string: str, native_config: RepoConfig, unique_ratio: float +def test_entity_skew_one( + mongodb_connection_string: str, one_config: RepoConfig, unique_ratio: float ) -> None: - """Benchmark Native with varying entity uniqueness in entity_df.""" + """Benchmark One with varying entity uniqueness in entity_df.""" import numpy as np total_rows = 5000 @@ -678,7 +679,7 @@ def test_entity_skew_native( client = MongoClient(mongodb_connection_string) try: client["benchmark_db"]["feature_history"].drop() - now = _generate_native_data( + now = _generate_one_data( client, "benchmark_db", "feature_history", @@ -688,7 +689,7 @@ def test_entity_skew_native( rows_per_entity=5, ) - _, fv = _create_native_fv(num_features) + _, fv = _create_one_fv(num_features) entity_ids = np.random.choice( num_unique_entities, size=total_rows, replace=True @@ -706,7 +707,7 @@ def test_entity_skew_native( def run_query(): job = MongoDBOfflineStoreOne.get_historical_features( - config=native_config, + config=one_config, feature_views=[fv], feature_refs=feature_refs, entity_df=entity_df, @@ -718,7 +719,7 @@ def run_query(): result = _run_benchmark_full(run_query, mongo_client=client) print( - f"\n[NATIVE] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" + f"\n[ONE] Unique ratio: {unique_ratio:.0%} ({num_unique_entities:,} unique / {total_rows:,} rows)" ) print(f" Time: {result.elapsed_seconds:.3f}s") print(f" Memory: {result.peak_memory_mb:.1f} MB") @@ -735,7 +736,7 @@ def run_query(): @_requires_docker def test_summary_comparison( - mongodb_connection_string: str, ibis_config: RepoConfig, native_config: RepoConfig + mongodb_connection_string: str, many_config: RepoConfig, one_config: RepoConfig ) -> None: """Run a standard comparison and print summary with full metrics.""" num_entities = 2000 @@ -743,8 +744,8 @@ def test_summary_comparison( client = MongoClient(mongodb_connection_string) try: - # Setup Ibis data - now = _generate_ibis_data( + # Setup Many data + now = _generate_many_data( client, "benchmark_db", "driver_benchmark", @@ -753,9 +754,9 @@ def test_summary_comparison( rows_per_entity=5, ) - # Setup Native data + # Setup One data client["benchmark_db"]["feature_history"].drop() - _generate_native_data( + _generate_one_data( client, "benchmark_db", "feature_history", @@ -774,13 +775,13 @@ def test_summary_comparison( feature_refs = [f"driver_benchmark:feature_{i}" for i in range(num_features)] - # Ibis benchmark - _, ibis_fv = _create_ibis_fv(num_features) + # Many benchmark + _, many_fv = 
_create_many_fv(num_features) - def run_ibis(): + def run_many(): job = MongoDBOfflineStoreMany.get_historical_features( - config=ibis_config, - feature_views=[ibis_fv], + config=many_config, + feature_views=[many_fv], feature_refs=feature_refs, entity_df=entity_df, registry=MagicMock(), @@ -789,15 +790,15 @@ def run_ibis(): ) return job.to_df() - ibis_result = _run_benchmark_full(run_ibis, mongo_client=client) + many_result = _run_benchmark_full(run_many, mongo_client=client) - # Native benchmark - _, native_fv = _create_native_fv(num_features) + # One benchmark + _, one_fv = _create_one_fv(num_features) - def run_native(): + def run_one(): job = MongoDBOfflineStoreOne.get_historical_features( - config=native_config, - feature_views=[native_fv], + config=one_config, + feature_views=[one_fv], feature_refs=feature_refs, entity_df=entity_df, registry=MagicMock(), @@ -806,30 +807,31 @@ def run_native(): ) return job.to_df() - native_result = _run_benchmark_full(run_native, mongo_client=client) + one_result = _run_benchmark_full(run_one, mongo_client=client) # Print summary print("\n" + "=" * 70) - print("SUMMARY COMPARISON") + print("SUMMARY COMPARISON: Many vs One") print("=" * 70) print(f"Entities: {num_entities:,} | Features: {num_features}") print("-" * 70) - print(f"{'Metric':<20} {'Ibis':>20} {'Native':>20}") + print(f"{'Metric':<20} {'Many':>20} {'One':>20}") print("-" * 70) print( - f"{'Time (s)':<20} {ibis_result.elapsed_seconds:>20.3f} {native_result.elapsed_seconds:>20.3f}" + f"{'Time (s)':<20} {many_result.elapsed_seconds:>20.3f} {one_result.elapsed_seconds:>20.3f}" ) print( - f"{'Memory (MB)':<20} {ibis_result.peak_memory_mb:>20.1f} {native_result.peak_memory_mb:>20.1f}" + f"{'Memory (MB)':<20} {many_result.peak_memory_mb:>20.1f} {one_result.peak_memory_mb:>20.1f}" ) print( - f"{'Rows/sec':<20} {num_entities / ibis_result.elapsed_seconds:>20,.0f} {num_entities / native_result.elapsed_seconds:>20,.0f}" + f"{'Rows/sec':<20} {num_entities / many_result.elapsed_seconds:>20,.0f} {num_entities / one_result.elapsed_seconds:>20,.0f}" ) print("-" * 70) - if native_result.elapsed_seconds > 0: - ratio = native_result.elapsed_seconds / ibis_result.elapsed_seconds - print(f"Ibis is {ratio:.1f}x faster than Native") + if one_result.elapsed_seconds > 0: + ratio = one_result.elapsed_seconds / many_result.elapsed_seconds + faster = "Many" if ratio > 1 else "One" + print(f"{faster} is {max(ratio, 1 / ratio):.1f}x faster") print("=" * 70) finally: From 5146c4e81b5057626e747e31c4f1f269304ba921 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 15:15:54 -0400 Subject: [PATCH 27/30] Add comprehensive module docstring to mongodb_many.py Documents: - Collection structure (one per FeatureView) - Index creation (auto-created during materialization) - Document schema (flat, top-level features) - Point-in-time join strategy (Ibis memtables) - Performance characteristics and memory considerations - When to use vs MongoDBOfflineStoreOne - Comparison table with One implementation Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_many.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py index 3faec603a3..2fdf67200d 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py +++ 
b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py @@ -12,6 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +MongoDB Offline Store Implementation (Many Collections). + +This module implements a MongoDB offline store using a many-collection schema +where each FeatureView maps to its own dedicated MongoDB collection. It uses +Ibis for point-in-time joins, loading collection data into in-memory tables. + +Collection Structure: + Each FeatureView has its own collection named after the source: + - driver_stats FeatureView → db.driver_stats collection + - vehicle_stats FeatureView → db.vehicle_stats collection + +Collection Index (auto-created during materialization): + db..createIndex({ + "": 1, + "": 1, // if compound key + "event_timestamp": -1, + "created_at": -1 // if created_timestamp_column is set + }) + +Document Schema (example for driver_stats): + { + "_id": ObjectId(), + "driver_id": 1001, + "event_timestamp": ISODate("2026-01-20T12:00:00Z"), + "created_at": ISODate("2026-01-20T12:00:05Z"), + "rating": 4.91, + "trips_last_7d": 132 + } + + Note: Features are stored as top-level fields (flat schema), not nested + in a subdocument. This differs from the "One" implementation. + +Point-in-Time Join Strategy: + 1. Load entire collection into an Ibis memtable + 2. Load entity_df into an Ibis memtable + 3. Use Ibis/pandas merge_asof for point-in-time correctness + 4. Apply TTL filtering per FeatureView + +Performance Characteristics: + - Fast for small to medium collections (fits in memory) + - Optimized Ibis memtable operations for joins + - ⚠️ Loads ENTIRE collection into memory - may OOM on large collections + +When to Use: + ✅ Small to medium feature stores where collections fit in memory + ✅ When query performance is the priority + ✅ When you want simple, flat document schemas + ✅ When each FeatureView has independent scaling needs + + ❌ Avoid when collections are very large (use MongoDBOfflineStoreOne instead) + ❌ Avoid in memory-constrained environments + +Comparison with MongoDBOfflineStoreOne: + | Aspect | Many (this module) | One | + |-----------------|----------------------|------------------------| + | Collections | N (one per FV) | 1 (shared) | + | Schema | Flat top-level | Nested features{} | + | Memory | Loads all docs | Filters by entity | + | Performance | Faster at scale | Memory-efficient | + | Entity ID | Native columns | Serialized bytes | +""" + import json import warnings from datetime import datetime From 612d05ab1c99ca79c67254f0b8c8f568bbd0bbe5 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 15:20:02 -0400 Subject: [PATCH 28/30] Add Feature Freshness and Schema Evolution docs to mongodb_many.py Add missing documentation sections: - Feature Freshness Semantics: document-level freshness, not per-feature - Schema Evolution ('Feature Creep'): flexible schema implications - Notes: entity keys as native types, PIT correctness, TTL constraints Signed-off-by: Casey Clements --- .../mongodb_offline_store/mongodb_many.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py index 2fdf67200d..b1112552f3 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py +++ 
b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_many.py @@ -45,6 +45,43 @@ Note: Features are stored as top-level fields (flat schema), not nested in a subdocument. This differs from the "One" implementation. +Feature Freshness Semantics: + This implementation operates at *document-level freshness*, not + per-feature freshness. During retrieval (e.g. point-in-time joins), + the system selects the most recent document for a given entity that + satisfies time constraints, and then extracts all requested features + from that document. + + As a result, if a newer document contains only a subset of features, + missing features will be returned as NULL—even if older documents + contained values for those features. The system does not backfill + individual feature values from earlier events. + + This behavior matches common Feast offline store semantics, but may + differ from systems that compute "latest value per feature". + +Schema Evolution ("Feature Creep"): + Because documents can have varying fields over time, different documents + in the same collection may contain different sets of feature fields. + This supports: + - Adding new features without backfilling historical data + - Partial writes or sparse feature computation + + However, it also implies: + - Newly added features will be NULL for older events + - Partially populated documents may lead to NULL values even + when older data contained those features + + Users should ensure that feature computation pipelines write complete + feature sets when consistent availability is required. + +Notes: + - Entity keys are stored as native MongoDB types (not serialized), + which differs from the "One" implementation. + - Point-in-time correctness is enforced per FeatureView. + - TTL (time-to-live) constraints are applied per FeatureView during + historical retrieval. + Point-in-Time Join Strategy: 1. Load entire collection into an Ibis memtable 2. Load entity_df into an Ibis memtable From 970ec797c1b0f570d50527aaea91472c15d8b9c6 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 16:27:35 -0400 Subject: [PATCH 29/30] Add MongoDB DataSourceCreators for universal Feast tests Add DataSourceCreator implementations for MongoDB offline stores: - MongoDBManyDataSourceCreator: Fully functional, passes universal tests. Creates one collection per FeatureView with flat document schema. - MongoDBOneDataSourceCreator: Implementation exists but NOT registered. The One schema requires knowing join keys vs features at data creation time, but DataSourceCreator.create_data_source() doesn't receive entity definitions. See TODO in mongodb.py for details on required interface changes. 
Other changes: - Fix data_source_class_type path in mongodb_one.py (mongodb_native -> mongodb_one) - Improve datetime handling in mongodb_one.py for non-datetime columns - Add 'mongodb' marker to pytest.ini - Register MongoDBManyDataSourceCreator in repo_configuration.py Signed-off-by: Casey Clements --- .secrets.baseline | 6 +- .../mongodb_offline_store/mongodb_one.py | 9 +- sdk/python/pytest.ini | 1 + .../feature_repos/repo_configuration.py | 27 ++ .../universal/data_sources/mongodb.py | 316 ++++++++++++++++++ 5 files changed, 354 insertions(+), 5 deletions(-) create mode 100644 sdk/python/tests/universal/feature_repos/universal/data_sources/mongodb.py diff --git a/.secrets.baseline b/.secrets.baseline index 9d27a7b000..0391444334 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1460,14 +1460,14 @@ "filename": "sdk/python/tests/universal/feature_repos/repo_configuration.py", "hashed_secret": "d90e76ef629fb00c95f4e84fec29fbda111e2392", "is_verified": false, - "line_number": 459 + "line_number": 479 }, { "type": "Secret Keyword", "filename": "sdk/python/tests/universal/feature_repos/repo_configuration.py", "hashed_secret": "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8", "is_verified": false, - "line_number": 461 + "line_number": 481 } ], "sdk/python/tests/universal/feature_repos/universal/data_sources/file.py": [ @@ -1539,5 +1539,5 @@ } ] }, - "generated_at": "2026-03-18T08:09:25Z" + "generated_at": "2026-03-20T20:27:19Z" } diff --git a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py index f40f30df83..3c1aeaf708 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py +++ b/sdk/python/feast/infra/offline_stores/contrib/mongodb_offline_store/mongodb_one.py @@ -219,7 +219,7 @@ def _to_proto_impl(self) -> DataSourceProto: return DataSourceProto( name=self.name, type=DataSourceProto.CUSTOM_SOURCE, - data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_native.MongoDBSourceOne", + data_source_class_type="feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one.MongoDBSourceOne", field_mapping=self.field_mapping, custom_options=DataSourceProto.CustomSourceOptions( configuration=json.dumps({"feature_view": self.name}).encode() @@ -655,7 +655,12 @@ def _run_single(entity_subset_df: pd.DataFrame, coll: Any) -> pd.DataFrame: """ # Prepare entity_df: ensure timestamps are UTC result = entity_subset_df.copy() - if result[event_timestamp_col].dt.tz is None: + # Convert timestamp column to datetime if needed + if not pd.api.types.is_datetime64_any_dtype(result[event_timestamp_col]): + result[event_timestamp_col] = pd.to_datetime( + result[event_timestamp_col], utc=True + ) + elif result[event_timestamp_col].dt.tz is None: result[event_timestamp_col] = pd.to_datetime( result[event_timestamp_col], utc=True ) diff --git a/sdk/python/pytest.ini b/sdk/python/pytest.ini index 1ad76b978e..d5ad19660b 100644 --- a/sdk/python/pytest.ini +++ b/sdk/python/pytest.ini @@ -21,6 +21,7 @@ markers = cloud: Tests requiring cloud credentials local_only: Tests that run entirely locally xdist_group: Group tests to run in the same xdist worker + mongodb: Tests requiring MongoDB timeout = 300 timeout_method = thread diff --git a/sdk/python/tests/universal/feature_repos/repo_configuration.py b/sdk/python/tests/universal/feature_repos/repo_configuration.py index ddd952f71d..2033d41603 100644 --- 
a/sdk/python/tests/universal/feature_repos/repo_configuration.py +++ b/sdk/python/tests/universal/feature_repos/repo_configuration.py @@ -108,6 +108,33 @@ ] ) +# MongoDB offline stores (require testcontainers and pymongo) +if os.getenv("FEAST_LOCAL_ONLINE_CONTAINER", "False") == "True": + try: + from tests.universal.feature_repos.universal.data_sources.mongodb import ( + MongoDBManyDataSourceCreator, + # MongoDBOneDataSourceCreator, # TODO: Not registered - see TODO in mongodb.py + ) + + AVAILABLE_OFFLINE_STORES.extend( + [ + ("local", MongoDBManyDataSourceCreator), + # TODO: MongoDBOneDataSourceCreator requires DataSourceCreator interface + # changes to pass entity/join key info. See mongodb.py for details. + # ("local", MongoDBOneDataSourceCreator), + ] + ) + OFFLINE_STORE_TO_PROVIDER_CONFIG["mongodb_many"] = ( + "local", + MongoDBManyDataSourceCreator, + ) + # OFFLINE_STORE_TO_PROVIDER_CONFIG["mongodb_one"] = ( + # "local", + # MongoDBOneDataSourceCreator, + # ) + except ImportError: + pass # pymongo or testcontainers not installed + AVAILABLE_ONLINE_STORES: Dict[ str, Tuple[Union[str, Dict[Any, Any]], Optional[Type[OnlineStoreCreator]]] ] = {"sqlite": ({"type": "sqlite"}, None)} diff --git a/sdk/python/tests/universal/feature_repos/universal/data_sources/mongodb.py b/sdk/python/tests/universal/feature_repos/universal/data_sources/mongodb.py new file mode 100644 index 0000000000..8eedc3b695 --- /dev/null +++ b/sdk/python/tests/universal/feature_repos/universal/data_sources/mongodb.py @@ -0,0 +1,316 @@ +""" +MongoDB DataSourceCreator implementations for universal Feast tests. + +Provides two implementations matching the two offline store schemas: +- MongoDBManyDataSourceCreator: One collection per FeatureView (Many) +- MongoDBOneDataSourceCreator: Single shared collection (One) +""" + +from typing import Any, Dict, Optional + +import pandas as pd +import pytest +from testcontainers.mongodb import MongoDbContainer + +from feast.data_source import DataSource +from feast.feature_logging import LoggingDestination +from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_many import ( + MongoDBOfflineStoreManyConfig, + MongoDBSourceMany, + SavedDatasetMongoDBStorageMany, +) +from feast.infra.offline_stores.contrib.mongodb_offline_store.mongodb_one import ( + MongoDBOfflineStoreOneConfig, + MongoDBSourceOne, +) +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import FeastConfigBaseModel +from feast.saved_dataset import SavedDatasetStorage +from tests.universal.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + +# Import pymongo - will be available since we're testing MongoDB +try: + from pymongo import MongoClient +except ImportError: + MongoClient = None # type: ignore + + +class MongoDBManyDataSourceCreator(DataSourceCreator): + """DataSourceCreator for MongoDBOfflineStoreMany (one collection per FeatureView).""" + + def __init__(self, project_name: str, *args, **kwargs): + super().__init__(project_name) + self.container = MongoDbContainer( + "mongo:7.0", + username="test", + password="test", # pragma: allowlist secret + ).with_exposed_ports(27017) + self.container.start() + self.port = self.container.get_exposed_port(27017) + self.connection_string = ( + f"mongodb://test:test@localhost:{self.port}" # pragma: allowlist secret + ) + self.database = 
f"feast_test_{project_name}" + self.collections_created: list[str] = [] + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + created_timestamp_column: str = "created_ts", + field_mapping: Optional[Dict[str, str]] = None, + timestamp_field: Optional[str] = "ts", + ) -> DataSource: + """Create a MongoDB data source by inserting df into a collection.""" + collection_name = self.get_prefixed_table_name(destination_name) + + # Insert data into MongoDB + client: Any = MongoClient(self.connection_string, tz_aware=True) + try: + coll = client[self.database][collection_name] + coll.drop() # Clean slate + records = df.to_dict("records") + if records: + coll.insert_many(records) + self.collections_created.append(collection_name) + finally: + client.close() + + return MongoDBSourceMany( + name=destination_name, + database=self.database, + collection=collection_name, + timestamp_field=timestamp_field or "ts", + created_timestamp_column=created_timestamp_column, + field_mapping=field_mapping, + ) + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" + + def create_offline_store_config(self) -> FeastConfigBaseModel: + return MongoDBOfflineStoreManyConfig( + connection_string=self.connection_string, + database=self.database, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetStorage: + return SavedDatasetMongoDBStorageMany( + database=self.database, + collection=f"{self.project_name}_saved_dataset", + ) + + def create_logged_features_destination(self) -> LoggingDestination: + # MongoDB doesn't have a native LoggingDestination yet + # Return None or raise NotImplementedError for now + raise NotImplementedError( + "MongoDB LoggingDestination not implemented. " + "Tests requiring logging features will be skipped." + ) + + def teardown(self): + """Clean up: drop collections and stop container.""" + try: + client: Any = MongoClient(self.connection_string, tz_aware=True) + try: + db = client[self.database] + for coll_name in self.collections_created: + db[coll_name].drop() + finally: + client.close() + except Exception: + pass # Container may already be stopped + self.container.stop() + + @staticmethod + def test_markers() -> list: + """Mark tests as requiring MongoDB.""" + return [pytest.mark.mongodb] + + +class MongoDBOneDataSourceCreator(DataSourceCreator): + """DataSourceCreator for MongoDBOfflineStoreOne (single shared collection). + + This implementation uses the nested features schema where all FeatureViews + share a single collection with a discriminator field. + + TODO: This DataSourceCreator has a fundamental limitation. The One schema + requires knowing which columns are join keys vs features to properly + serialize entity_id and nest features. However, create_data_source() only + receives a DataFrame and column names - it doesn't have access to Entity + definitions that specify join keys. + + Current workaround uses heuristics (columns ending in '_id' with int/string + dtype), which is fragile. A proper fix would require modifying the + DataSourceCreator interface to pass entity/join key information to + create_data_source(), which is a Feast core change. + + For now, universal tests may fail for FeatureViews where the heuristic + doesn't correctly identify join keys. Use unit tests in + tests/unit/infra/offline_stores/contrib/mongodb_offline_store/test_one.py + for comprehensive testing of the One implementation. 
+ """ + + ENTITY_KEY_VERSION = 3 + + def __init__(self, project_name: str, *args, **kwargs): + super().__init__(project_name) + self.container = MongoDbContainer( + "mongo:7.0", + username="test", + password="test", # pragma: allowlist secret + ).with_exposed_ports(27017) + self.container.start() + self.port = self.container.get_exposed_port(27017) + self.connection_string = ( + f"mongodb://test:test@localhost:{self.port}" # pragma: allowlist secret + ) + self.database = f"feast_test_{project_name}" + self.collection = "feature_history" + self.feature_views_created: list[str] = [] + # Track entity key columns per feature view for serialization + self._entity_key_columns: Dict[str, list[str]] = {} + + def _serialize_entity_key(self, row: pd.Series, join_keys: list[str]) -> bytes: + """Serialize entity key columns to bytes.""" + entity_key = EntityKeyProto() + for key in join_keys: + entity_key.join_keys.append(key) + val = ValueProto() + value = row[key] + if isinstance(value, int): + val.int64_val = value + elif isinstance(value, str): + val.string_val = value + elif isinstance(value, float): + val.double_val = value + elif isinstance(value, bool): + val.bool_val = value + else: + val.string_val = str(value) + entity_key.entity_values.append(val) + return serialize_entity_key(entity_key, self.ENTITY_KEY_VERSION) + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + created_timestamp_column: str = "created_ts", + field_mapping: Optional[Dict[str, str]] = None, + timestamp_field: Optional[str] = "ts", + ) -> DataSource: + """Create a MongoDB data source by inserting df into the shared collection. + + The data is transformed into the One schema: + - entity_id: serialized entity key + - feature_view: destination_name + - features: nested dict of feature values + - event_timestamp: from timestamp_field + - created_at: from created_timestamp_column + """ + # Determine which columns are join keys vs features + # Join keys must be integer or string types (serializable as entity keys) + timestamp_cols = {timestamp_field, created_timestamp_column} + all_cols = set(df.columns) - timestamp_cols - {None} + + # Heuristic: identify join keys + # 1. Must end with "_id" or be a known key name + # 2. 
Must be integer or string type (not float) + join_keys = [] + for c in all_cols: + if c.endswith("_id") or c in {"driver", "customer", "entity"}: + dtype = df[c].dtype + # Only integer or string types can be join keys + if dtype in ("int64", "int32", "object") or str(dtype).startswith( + "int" + ): + join_keys.append(c) + + if not join_keys: + # Fallback: first integer column + for c in all_cols: + if df[c].dtype in ("int64", "int32") or str(df[c].dtype).startswith( + "int" + ): + join_keys = [c] + break + + feature_cols = [c for c in all_cols if c not in join_keys] + + # Store for later use + self._entity_key_columns[destination_name] = join_keys + + # Transform to One schema + docs = [] + for _, row in df.iterrows(): + entity_id = self._serialize_entity_key(row, join_keys) + features = {col: row[col] for col in feature_cols if pd.notna(row.get(col))} + + doc = { + "entity_id": entity_id, + "feature_view": destination_name, + "features": features, + } + if timestamp_field and timestamp_field in row: + doc["event_timestamp"] = row[timestamp_field] + if created_timestamp_column and created_timestamp_column in row: + doc["created_at"] = row[created_timestamp_column] + + docs.append(doc) + + # Insert into MongoDB + client: Any = MongoClient(self.connection_string, tz_aware=True) + try: + coll = client[self.database][self.collection] + if docs: + coll.insert_many(docs) + self.feature_views_created.append(destination_name) + finally: + client.close() + + return MongoDBSourceOne( + name=destination_name, + timestamp_field="event_timestamp", + created_timestamp_column="created_at" if created_timestamp_column else None, + field_mapping=field_mapping, + ) + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" + + def create_offline_store_config(self) -> FeastConfigBaseModel: + return MongoDBOfflineStoreOneConfig( + connection_string=self.connection_string, + database=self.database, + collection=self.collection, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetStorage: + # One implementation doesn't have SavedDatasetStorage yet + raise NotImplementedError( + "MongoDBOfflineStoreOne SavedDatasetStorage not implemented." 
+ ) + + def create_logged_features_destination(self) -> LoggingDestination: + raise NotImplementedError("MongoDB LoggingDestination not implemented.") + + def teardown(self): + """Clean up: drop the collection and stop container.""" + try: + client: Any = MongoClient(self.connection_string, tz_aware=True) + try: + client[self.database][self.collection].drop() + finally: + client.close() + except Exception: + pass + self.container.stop() + + @staticmethod + def test_markers() -> list: + """Mark tests as requiring MongoDB.""" + return [pytest.mark.mongodb] From 9dc9162a428f4d2afcce6f9edb21636c79029915 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Mar 2026 16:57:17 -0400 Subject: [PATCH 30/30] Add .secrets.baseline Signed-off-by: Casey Clements --- .secrets.baseline | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 0391444334..260d37dfee 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1460,14 +1460,14 @@ "filename": "sdk/python/tests/universal/feature_repos/repo_configuration.py", "hashed_secret": "d90e76ef629fb00c95f4e84fec29fbda111e2392", "is_verified": false, - "line_number": 479 + "line_number": 486 }, { "type": "Secret Keyword", "filename": "sdk/python/tests/universal/feature_repos/repo_configuration.py", "hashed_secret": "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8", "is_verified": false, - "line_number": 481 + "line_number": 488 } ], "sdk/python/tests/universal/feature_repos/universal/data_sources/file.py": [ @@ -1539,5 +1539,5 @@ } ] }, - "generated_at": "2026-03-20T20:27:19Z" + "generated_at": "2026-03-20T20:55:36Z" }
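To exercise the new suite locally, one option is the sketch below. It assumes Docker is available and that the `mongodb` pytest marker and the `FEAST_LOCAL_ONLINE_CONTAINER` gate land as shown in the patches above; the test path follows this series' layout.

```python
# Sketch: run the MongoDB-marked tests with the container gate enabled.
# Requires Docker plus the pymongo/testcontainers extras.
import os
import subprocess

env = dict(os.environ, FEAST_LOCAL_ONLINE_CONTAINER="True")
subprocess.run(
    ["pytest", "-v", "-s", "-m", "mongodb", "sdk/python/tests"],
    env=env,
    check=False,
)
```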