From cdd1b0734fcb20c258cf6ee4c067f23d2fff81e0 Mon Sep 17 00:00:00 2001 From: Dan Herrera Date: Thu, 18 Aug 2022 09:33:14 -0700 Subject: [PATCH] feat: Custom Docker image for Bytewax batch materialization (#3099) Dockerfile and instructions for building a custom Bytewax image. Signed-off-by: Dan Herrera Signed-off-by: Dan Herrera --- .../batch-materialization/bytewax.md | 17 ++++++++++- .../contrib/bytewax/Dockerfile | 29 +++++++++++++++++++ .../bytewax/bytewax_materialization_engine.py | 4 +-- .../contrib/bytewax/dataflow.py | 22 ++++++++++++++ .../contrib/bytewax/entrypoint.sh | 4 +++ 5 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile create mode 100644 sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py create mode 100644 sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh diff --git a/docs/reference/batch-materialization/bytewax.md b/docs/reference/batch-materialization/bytewax.md index db2d79ddbf..bd98a4dc6e 100644 --- a/docs/reference/batch-materialization/bytewax.md +++ b/docs/reference/batch-materialization/bytewax.md @@ -55,5 +55,20 @@ batch_engine: The `namespace` configuration directive specifies which Kubernetes [namespace](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/) jobs, services and configuration maps will be created in. -The `image` parameter specifies which container image to use when running the materialization job. To create a custom image based on this container, please see the [GitHub repository](https://github.com/bytewax/bytewax-feast) for this image. +#### Building a custom Bytewax Docker image + +The `image` configuration directive specifies which container image to use when running the materialization job. To create a custom image based on this container, run the following command: + +``` shell +DOCKER_BUILDKIT=1 docker build . 
-f ./sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile -t <image tag> +``` + +Once that image is built and pushed to a registry, it can be specified as a part of the batch engine configuration: + +``` yaml +batch_engine: + type: bytewax + namespace: bytewax + image: <image tag> +``` diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile b/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile new file mode 100644 index 0000000000..963924f38d --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.9-slim-bullseye AS build + +RUN apt-get update && \ + apt-get install --no-install-suggests --no-install-recommends --yes git + +WORKDIR /bytewax + +# Copy dataflow code +COPY sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_dataflow.py /bytewax +COPY sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py /bytewax + +# Copy entrypoint +COPY sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh /bytewax + +# Copy necessary parts of the Feast codebase +COPY sdk/python sdk/python +COPY protos protos +COPY go go +COPY setup.py setup.py +COPY pyproject.toml pyproject.toml +COPY README.md README.md + +# Install Feast with the AWS, GCP and Bytewax extras +# The bind mount of .git is required because setuptools_scm needs access to the +# git dir to infer the version of feast we're installing. +# https://github.com/pypa/setuptools_scm#usage-from-docker +# This also assumes that the image is being built from the root of the repository. 
+RUN --mount=source=.git,target=.git,type=bind pip3 install --no-cache-dir -e '.[aws,gcp,bytewax]' + diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py index d7d8301a55..c0218a9e4b 100644 --- a/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/bytewax_materialization_engine.py @@ -20,7 +20,7 @@ ) from feast.infra.offline_stores.offline_store import OfflineStore from feast.infra.online_stores.online_store import OnlineStore -from feast.registry import BaseRegistry +from feast.infra.registry.base_registry import BaseRegistry from feast.repo_config import FeastConfigBaseModel from feast.stream_feature_view import StreamFeatureView from feast.utils import _get_column_names @@ -341,7 +341,7 @@ def _create_job_definition(self, job_id, namespace, pods, env): { "command": ["sh", "-c", "sh ./entrypoint.sh"], "env": job_env, - "image": "bytewax/bytewax-feast:latest", + "image": self.batch_engine_config.image, "imagePullPolicy": "Always", "name": "process", "ports": [ diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py b/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py new file mode 100644 index 0000000000..e3d95e2a75 --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/dataflow.py @@ -0,0 +1,22 @@ +import yaml + +from feast import FeatureStore, RepoConfig +from feast.infra.materialization.contrib.bytewax.bytewax_materialization_dataflow import ( + BytewaxMaterializationDataflow, +) + +if __name__ == "__main__": + with open("/var/feast/feature_store.yaml") as f: + feast_config = yaml.safe_load(f) + + with open("/var/feast/bytewax_materialization_config.yaml") as b: + bytewax_config = yaml.safe_load(b) + + config = RepoConfig(**feast_config) + store = 
FeatureStore(config=config) + + job = BytewaxMaterializationDataflow( + config, + store.get_feature_view(bytewax_config["feature_view"]), + bytewax_config["paths"], + ) diff --git a/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh b/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh new file mode 100644 index 0000000000..0179e5481f --- /dev/null +++ b/sdk/python/feast/infra/materialization/contrib/bytewax/entrypoint.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd /bytewax +python dataflow.py