diff --git a/.prow/scripts/publish-java-sdk.sh b/.prow/scripts/publish-java-sdk.sh index 17513d0eb0d..91123c8d4ee 100755 --- a/.prow/scripts/publish-java-sdk.sh +++ b/.prow/scripts/publish-java-sdk.sh @@ -69,4 +69,4 @@ gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/private-key echo "============================================================" echo "Deploying Java SDK with revision: $REVISION" echo "============================================================" -mvn --projects sdk/java -Drevision=$REVISION --batch-mode clean deploy +mvn --projects datatypes/java,sdk/java -Drevision=$REVISION --batch-mode clean deploy diff --git a/CHANGELOG.md b/CHANGELOG.md index f6ad89e0c0d..cc969c34b2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,164 @@ # Changelog +## [v0.4.4](https://github.com/gojek/feast/tree/v0.4.4) (2020-01-28) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.4.3...v0.4.4) + +**Merged pull requests:** + +- Change RedisBackedJobService to use a connection pool [\#439](https://github.com/gojek/feast/pull/439) ([zhilingc](https://github.com/zhilingc)) +- Update protos with Tensorflow data validation schema [\#438](https://github.com/gojek/feast/pull/438) ([davidheryanto](https://github.com/davidheryanto)) +- Update GKE installation and chart values to work with 0.4.3 [\#434](https://github.com/gojek/feast/pull/434) ([lgvital](https://github.com/lgvital)) +- Parameterize end-to-end test scripts [\#433](https://github.com/gojek/feast/pull/433) ([Yanson](https://github.com/Yanson)) +- Remove "resource" concept and the need to specify a kind in feature sets [\#432](https://github.com/gojek/feast/pull/432) ([woop](https://github.com/woop)) +- Add retry options to BigQuery [\#431](https://github.com/gojek/feast/pull/431) ([Yanson](https://github.com/Yanson)) +- Fix logging [\#430](https://github.com/gojek/feast/pull/430) ([Yanson](https://github.com/Yanson)) +- Add documentation for bigquery batch retrieval [\#428](https://github.com/gojek/feast/pull/428) ([zhilingc](https://github.com/zhilingc)) +- Publish datatypes/java along with sdk/java [\#426](https://github.com/gojek/feast/pull/426) ([ches](https://github.com/ches)) +- Update basic Feast example to Feast 0.4 [\#424](https://github.com/gojek/feast/pull/424) ([woop](https://github.com/woop)) +- Unserializable FluentBackoff cause null pointer exception in Dataflow Runner [\#417](https://github.com/gojek/feast/pull/417) ([khorshuheng](https://github.com/khorshuheng)) +- Introduce datatypes/java module for proto generation [\#391](https://github.com/gojek/feast/pull/391) ([ches](https://github.com/ches)) +- Allow user to override job options [\#377](https://github.com/gojek/feast/pull/377) ([khorshuheng](https://github.com/khorshuheng)) + +## [v0.4.3](https://github.com/gojek/feast/tree/v0.4.3) (2020-01-08) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.4.2...v0.4.3) + +**Fixed bugs:** + +- Bugfix for redis ingestion retries throwing NullPointerException on remote runners [\#417](https://github.com/gojek/feast/pull/417) ([khorshuheng](https://github.com/khorshuheng)) + +## [v0.4.2](https://github.com/gojek/feast/tree/v0.4.2) (2020-01-07) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.4.1...v0.4.2) + +**Fixed bugs:** + +- Missing argument in error string in ValidateFeatureRowDoFn [\#401](https://github.com/gojek/feast/issues/401) + +**Merged pull requests:** + +- Define maven revision property when packaging jars in Dockerfile so the images are built successfully [\#410](https://github.com/gojek/feast/pull/410) ([davidheryanto](https://github.com/davidheryanto)) +- Deduplicate rows in subquery [\#409](https://github.com/gojek/feast/pull/409) ([zhilingc](https://github.com/zhilingc)) +- Filter out extra fields, deduplicate fields in ingestion [\#404](https://github.com/gojek/feast/pull/404) ([zhilingc](https://github.com/zhilingc)) +- Automatic documentation generation for gRPC API [\#403](https://github.com/gojek/feast/pull/403) ([woop](https://github.com/woop)) +- Update feast core default values to include hibernate merge strategy [\#400](https://github.com/gojek/feast/pull/400) ([zhilingc](https://github.com/zhilingc)) +- Move cli into feast package [\#398](https://github.com/gojek/feast/pull/398) ([zhilingc](https://github.com/zhilingc)) +- Use Nexus staging plugin for deployment [\#394](https://github.com/gojek/feast/pull/394) ([khorshuheng](https://github.com/khorshuheng)) +- Handle retry for redis io flow [\#274](https://github.com/gojek/feast/pull/274) ([khorshuheng](https://github.com/khorshuheng)) + +## [v0.4.1](https://github.com/gojek/feast/tree/v0.4.1) (2019-12-30) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.4.0...v0.4.1) + +**Merged pull requests:** + +- Add project-related commands to CLI [\#397](https://github.com/gojek/feast/pull/397) ([zhilingc](https://github.com/zhilingc)) + +## [v0.4.0](https://github.com/gojek/feast/tree/v0.4.0) (2019-12-28) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.5...v0.4.0) + +**Implemented enhancements:** + +- Edit description in feature specification to also reflect in BigQuery schema description. [\#239](https://github.com/gojek/feast/issues/239) +- Allow for disabling of metrics pushing [\#57](https://github.com/gojek/feast/issues/57) + +**Merged pull requests:** + +- Java SDK release script [\#406](https://github.com/gojek/feast/pull/406) ([davidheryanto](https://github.com/davidheryanto)) +- Use fixed 'dev' revision for test-e2e-batch [\#395](https://github.com/gojek/feast/pull/395) ([davidheryanto](https://github.com/davidheryanto)) +- Project Namespacing [\#393](https://github.com/gojek/feast/pull/393) ([woop](https://github.com/woop)) +- \\(concepts\): change data types to upper case because lower case … [\#389](https://github.com/gojek/feast/pull/389) ([david30907d](https://github.com/david30907d)) +- Remove alpha v1 from java package name [\#387](https://github.com/gojek/feast/pull/387) ([khorshuheng](https://github.com/khorshuheng)) +- Minor bug fixes for Python SDK [\#383](https://github.com/gojek/feast/pull/383) ([voonhous](https://github.com/voonhous)) +- Allow user to override job options [\#377](https://github.com/gojek/feast/pull/377) ([khorshuheng](https://github.com/khorshuheng)) +- Add documentation to default values.yaml in Feast chart [\#376](https://github.com/gojek/feast/pull/376) ([davidheryanto](https://github.com/davidheryanto)) +- Add support for file paths for providing entity rows during batch retrieval [\#375](https://github.com/gojek/feast/pull/375) ([voonhous](https://github.com/voonhous)) +- Update sync helm chart script to ensure requirements.lock in in sync with requirements.yaml [\#373](https://github.com/gojek/feast/pull/373) ([davidheryanto](https://github.com/davidheryanto)) +- Catch errors thrown by BQ during entity table loading [\#371](https://github.com/gojek/feast/pull/371) ([zhilingc](https://github.com/zhilingc)) +- Async job management [\#361](https://github.com/gojek/feast/pull/361) ([zhilingc](https://github.com/zhilingc)) +- Infer schema of PyArrow table directly [\#355](https://github.com/gojek/feast/pull/355) ([voonhous](https://github.com/voonhous)) +- Add readiness checks for Feast services in end to end test [\#337](https://github.com/gojek/feast/pull/337) ([davidheryanto](https://github.com/davidheryanto)) +- Create CHANGELOG.md [\#321](https://github.com/gojek/feast/pull/321) ([woop](https://github.com/woop)) + +## [v0.3.6](https://github.com/gojek/feast/tree/v0.3.6) (2020-01-03) + +**Merged pull requests:** + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.5...v0.3.6) + +- Add support for file paths for providing entity rows during batch retrieval [\#375](https://github.com/gojek/feast/pull/376) ([voonhous](https://github.com/voonhous)) + +## [v0.3.5](https://github.com/gojek/feast/tree/v0.3.5) (2019-12-26) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.4...v0.3.5) + +**Merged pull requests:** + +- Always set destination table in BigQuery query config in Feast Batch Serving so it can handle large results [\#392](https://github.com/gojek/feast/pull/392) ([davidheryanto](https://github.com/davidheryanto)) + +## [v0.3.4](https://github.com/gojek/feast/tree/v0.3.4) (2019-12-23) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.3...v0.3.4) + +**Merged pull requests:** + +- Make redis key creation more determinisitic [\#380](https://github.com/gojek/feast/pull/380) ([zhilingc](https://github.com/zhilingc)) + +## [v0.3.3](https://github.com/gojek/feast/tree/v0.3.3) (2019-12-18) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.2...v0.3.3) + +**Implemented enhancements:** + +- Added Docker Compose for Feast [\#272](https://github.com/gojek/feast/issues/272) +- Added ability to check import job status and cancel job through Python SDK [\#194](https://github.com/gojek/feast/issues/194) +- Added basic customer transactions example [\#354](https://github.com/gojek/feast/pull/354) ([woop](https://github.com/woop)) + +**Merged pull requests:** + +- Added Prow jobs to automate the release of Docker images and Python SDK [\#369](https://github.com/gojek/feast/pull/369) ([davidheryanto](https://github.com/davidheryanto)) +- Fixed installation link in README.md [\#368](https://github.com/gojek/feast/pull/368) ([Jeffwan](https://github.com/Jeffwan)) +- Fixed Java SDK tests not actually running \(missing dependencies\) [\#366](https://github.com/gojek/feast/pull/366) ([woop](https://github.com/woop)) +- Added more batch retrieval tests [\#357](https://github.com/gojek/feast/pull/357) ([zhilingc](https://github.com/zhilingc)) +- Python SDK and Feast Core Bug Fixes [\#353](https://github.com/gojek/feast/pull/353) ([woop](https://github.com/woop)) +- Updated buildFeatureSets method in Golang SDK [\#351](https://github.com/gojek/feast/pull/351) ([davidheryanto](https://github.com/davidheryanto)) +- Python SDK cleanup [\#348](https://github.com/gojek/feast/pull/348) ([woop](https://github.com/woop)) +- Broke up queries for point in time correctness joins [\#347](https://github.com/gojek/feast/pull/347) ([zhilingc](https://github.com/zhilingc)) +- Exports gRPC call metrics and Feast resource metrics in Core [\#345](https://github.com/gojek/feast/pull/345) ([davidheryanto](https://github.com/davidheryanto)) +- Fixed broken Google Group link on Community page [\#343](https://github.com/gojek/feast/pull/343) ([ches](https://github.com/ches)) +- Ensured ImportJobTest is not flaky by checking WriteToStore metric and requesting adequate resources for testing [\#332](https://github.com/gojek/feast/pull/332) ([davidheryanto](https://github.com/davidheryanto)) +- Added docker-compose file with Jupyter notebook [\#328](https://github.com/gojek/feast/pull/328) ([khorshuheng](https://github.com/khorshuheng)) +- Added minimal implementation of ingesting Parquet and CSV files [\#327](https://github.com/gojek/feast/pull/327) ([voonhous](https://github.com/voonhous)) + +## [v0.3.2](https://github.com/gojek/feast/tree/v0.3.2) (2019-11-29) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.1...v0.3.2) + +**Merged pull requests:** + +- Fixed incorrect BigQuery schema creation from FeatureSetSpec [\#340](https://github.com/gojek/feast/pull/340) ([davidheryanto](https://github.com/davidheryanto)) +- Filtered out feature sets that dont share the same source [\#339](https://github.com/gojek/feast/pull/339) ([zhilingc](https://github.com/zhilingc)) +- Changed latency calculation method to not use Timer [\#338](https://github.com/gojek/feast/pull/338) ([zhilingc](https://github.com/zhilingc)) +- Moved Prometheus annotations to pod template for serving [\#336](https://github.com/gojek/feast/pull/336) ([zhilingc](https://github.com/zhilingc)) +- Removed metrics windowing, cleaned up step names for metrics writing [\#334](https://github.com/gojek/feast/pull/334) ([zhilingc](https://github.com/zhilingc)) +- Set BigQuery table time partition inside get table function [\#333](https://github.com/gojek/feast/pull/333) ([zhilingc](https://github.com/zhilingc)) +- Added unit test in Redis to return values with no max age set [\#329](https://github.com/gojek/feast/pull/329) ([smadarasmi](https://github.com/smadarasmi)) +- Consolidated jobs into single steps instead of branching out [\#326](https://github.com/gojek/feast/pull/326) ([zhilingc](https://github.com/zhilingc)) +- Pinned Python SDK to minor versions for dependencies [\#322](https://github.com/gojek/feast/pull/322) ([woop](https://github.com/woop)) +- Added Auto format to Google style with Spotless [\#317](https://github.com/gojek/feast/pull/317) ([ches](https://github.com/ches)) + +## [v0.3.1](https://github.com/gojek/feast/tree/v0.3.1) (2019-11-25) + +[Full Changelog](https://github.com/gojek/feast/compare/v0.3.0...v0.3.1) + +**Merged pull requests:** + +- Added Prometheus metrics to serving [\#316](https://github.com/gojek/feast/pull/316) ([zhilingc](https://github.com/zhilingc)) +- Changed default job metrics sink to Statsd [\#315](https://github.com/gojek/feast/pull/315) ([zhilingc](https://github.com/zhilingc)) +- Fixed module import error in Feast CLI [\#314](https://github.com/gojek/feast/pull/314) ([davidheryanto](https://github.com/davidheryanto)) + ## [v0.3.0](https://github.com/gojek/feast/tree/v0.3.0) (2019-11-19) [Full Changelog](https://github.com/gojek/feast/compare/v0.1.8...v0.3.0) diff --git a/core/pom.xml b/core/pom.xml index 954c7c00185..e1567ae8fe3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -39,10 +39,6 @@ false - - org.xolstice.maven.plugins - protobuf-maven-plugin - diff --git a/core/src/main/proto/feast b/core/src/main/proto/feast deleted file mode 120000 index d520da9126b..00000000000 --- a/core/src/main/proto/feast +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/feast \ No newline at end of file diff --git a/core/src/main/proto/third_party b/core/src/main/proto/third_party deleted file mode 120000 index 363d20598e6..00000000000 --- a/core/src/main/proto/third_party +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/third_party \ No newline at end of file diff --git a/core/src/main/resources/log4j2.xml b/core/src/main/resources/log4j2.xml index 65b3c5aa4bb..efbf7d1f624 100644 --- a/core/src/main/resources/log4j2.xml +++ b/core/src/main/resources/log4j2.xml @@ -22,9 +22,10 @@ %d{yyyy-MM-dd HH:mm:ss.SSS} %5p ${hostName} --- [%15.15t] %-40.40c{1.} : %m%n%ex ${env:LOG_TYPE:-Console} + ${env:LOG_LEVEL:-info} - + @@ -35,8 +36,11 @@ - - - + + + + + + diff --git a/datatypes/java/README.md b/datatypes/java/README.md new file mode 100644 index 00000000000..535fac73d2e --- /dev/null +++ b/datatypes/java/README.md @@ -0,0 +1,55 @@ +Feast Data Types for Java +========================= + +This module produces Java class files for Feast's data type and gRPC service +definitions, from Protobuf IDL. These are used across Feast components for wire +interchange, contracts, etc. + +End users of Feast will be best served by our Java SDK which adds higher-level +conveniences, but the data types are published independently for custom needs, +without any additional dependencies the SDK may add. + +Dependency Coordinates +---------------------- + +```xml + + dev.feast + datatypes-java + 0.4.0-SNAPSHOT + +``` + +Use the version corresponding to the Feast release you have deployed in your +environment—see the [Feast release notes] for details. + +[Feast release notes]: ../../CHANGELOG.md + +Using the `.proto` Definitions +------------------------------ + +The `.proto` definitions are packaged as resources within the Maven artifact, +which may be useful to `include` them in dependent Protobuf definitions in a +downstream project, or for other JVM languages to consume from their builds to +generate more idiomatic bindings. + +Google's Gradle plugin, for instance, [can use protos in dependencies][Gradle] +either for `include` or to compile with a different `protoc` plugin than Java. + +[sbt-protoc] offers similar functionality for sbt/Scala. + +[Gradle]: https://github.com/google/protobuf-gradle-plugin#protos-in-dependencies +[sbt-protoc]: https://github.com/thesamet/sbt-protoc + +Releases +-------- + +The module is published to Maven Central upon each release of Feast (since +v0.3.7). + +For developers, the publishing process is automated along with the Java SDK by +[the `publish-java-sdk` build task in Prow][prow task], where you can see how +it works. Artifacts are staged to Sonatype where a maintainer needs to take a +release action for them to go live on Maven Central. + +[prow task]: https://github.com/gojek/feast/blob/17e7dca8238aae4dcbf0ff9f0db5d80ef8e035cf/.prow/config.yaml#L166-L192 diff --git a/datatypes/java/pom.xml b/datatypes/java/pom.xml new file mode 100644 index 00000000000..a6dfa8e345a --- /dev/null +++ b/datatypes/java/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + Feast Data Types for Java + + Data types and service contracts used throughout Feast components and + their interchanges. These are generated from Protocol Buffers and gRPC + definitions included in the package. + + datatypes-java + + + dev.feast + feast-parent + ${revision} + ../.. + + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + + true + + com.google.protobuf:protoc:${protocVersion}:exe:${os.detected.classifier} + + grpc-java + + io.grpc:protoc-gen-grpc-java:${grpcVersion}:exe:${os.detected.classifier} + + + + + + compile + compile-custom + test-compile + + + + + + + + + + io.grpc + grpc-services + + + diff --git a/sdk/java/src/main/proto/feast b/datatypes/java/src/main/proto/feast similarity index 100% rename from sdk/java/src/main/proto/feast rename to datatypes/java/src/main/proto/feast diff --git a/datatypes/java/src/main/proto/third_party b/datatypes/java/src/main/proto/third_party new file mode 120000 index 00000000000..f015f8477d1 --- /dev/null +++ b/datatypes/java/src/main/proto/third_party @@ -0,0 +1 @@ +../../../../../protos/third_party \ No newline at end of file diff --git a/docs/getting-started/installing-feast.md b/docs/getting-started/installing-feast.md index 0b8b42f26f6..699dd5fa8f1 100644 --- a/docs/getting-started/installing-feast.md +++ b/docs/getting-started/installing-feast.md @@ -268,7 +268,7 @@ bq mk ${FEAST_BIGQUERY_DATASET_ID} Create the service account that Feast will run as: ```bash -gcloud iam service-accounts create ${FEAST_SERVICE_ACCOUNT_NAME} +gcloud iam service-accounts create ${FEAST_S_ACCOUNT_NAME} gcloud projects add-iam-policy-binding ${FEAST_GCP_PROJECT_ID} \ --member serviceAccount:${FEAST_S_ACCOUNT_NAME}@${FEAST_GCP_PROJECT_ID}.iam.gserviceaccount.com \ @@ -324,6 +324,15 @@ PING 10.123.114.11 (10.203.164.22) 56(84) bytes of data. 64 bytes from 10.123.114.11: icmp_seq=2 ttl=63 time=51.2 ms ``` +Add firewall rules in gcloud to open up ports: +```bash +gcloud compute firewall-rules create feast-core-port --allow tcp:32090 +gcloud compute firewall-rules create feast-online-port --allow tcp:32091 +gcloud compute firewall-rules create feast-batch-port --allow tcp:32092 +gcloud compute firewall-rules create feast-redis-port --allow tcp:32101 +gcloud compute firewall-rules create feast-kafka-ports --allow tcp:31090-31095 +``` + ### 3. Set up Helm Run the following command to provide Tiller with authorization to install Feast: @@ -377,7 +386,8 @@ cp values.yaml my-feast-values.yaml Update `my-feast-values.yaml` based on your GCP and GKE environment. * Required fields are paired with comments which indicate whether they need to be replaced. -* All occurrences of `feast.example.com` should be replaced with either your domain name or the IP stored in `$FEAST_IP`. +* All occurrences of `EXTERNAL_IP` should be replaced with either your domain name or the IP stored in `$FEAST_IP`. +* Replace all occurrences of `YOUR_BUCKET_NAME` with your bucket name stored in `$FEAST_GCS_BUCKET` Install the Feast Helm chart: @@ -421,4 +431,3 @@ feast config set serving_url ${FEAST_ONLINE_SERVING_URL} ``` That's it! You can now start to use Feast! - diff --git a/examples/basic/basic.ipynb b/examples/basic/basic.ipynb index 6a83e6a08b5..49658b42357 100644 --- a/examples/basic/basic.ipynb +++ b/examples/basic/basic.ipynb @@ -2,12 +2,10 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Feast Basic Customer Transactions Example" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", @@ -48,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -84,11 +82,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "client = Client(core_url=CORE_URL, serving_url=BATCH_SERVING_URL) # Connect to Feast Core" + "client = Client(core_url=CORE_URL, serving_url=BATCH_SERVING_URL) # Connect to Feast Core\n", + "client.create_project('customer_project')\n", + "client.set_project('customer_project')" ] }, { @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -174,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -197,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -213,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -230,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -255,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -280,14 +280,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "job = client.get_batch_features(\n", - " feature_ids=[\n", - " f\"customer_transactions:{customer_fs.version}:daily_transactions\", \n", - " f\"customer_transactions:{customer_fs.version}:total_transactions\", \n", + " feature_refs=[\n", + " f\"daily_transactions\", \n", + " f\"total_transactions\", \n", " ],\n", " entity_rows=entity_rows\n", " )\n", @@ -311,11 +311,12 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "online_client = Client(core_url=CORE_URL, serving_url=ONLINE_SERVING_URL)" + "online_client = Client(core_url=CORE_URL, serving_url=ONLINE_SERVING_URL)\n", + "online_client.set_project(\"customer_project\")" ] }, { @@ -327,14 +328,14 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "online_features = online_client.get_online_features(\n", - " feature_ids=[\n", - " f\"customer_transactions:{customer_fs.version}:daily_transactions\",\n", - " f\"customer_transactions:{customer_fs.version}:total_transactions\",\n", + " feature_refs=[\n", + " f\"daily_transactions\",\n", + " f\"total_transactions\",\n", " ],\n", " entity_rows=[\n", " GetOnlineFeaturesRequest.EntityRow(\n", @@ -373,18 +374,18 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.3" }, "pycharm": { "stem_cell": { "cell_type": "raw", - "source": [], "metadata": { "collapsed": false - } + }, + "source": [] } } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/infra/charts/feast/Chart.yaml b/infra/charts/feast/Chart.yaml index 80d17aeb6ba..c8f328548a9 100644 --- a/infra/charts/feast/Chart.yaml +++ b/infra/charts/feast/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v1 description: A Helm chart to install Feast on kubernetes name: feast -version: 0.3.2 +version: 0.4.4 diff --git a/infra/charts/feast/charts/feast-core/Chart.yaml b/infra/charts/feast/charts/feast-core/Chart.yaml index efda76d3026..86d0699b9ac 100644 --- a/infra/charts/feast/charts/feast-core/Chart.yaml +++ b/infra/charts/feast/charts/feast-core/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v1 description: A Helm chart for core component of Feast name: feast-core -version: 0.3.2 +version: 0.4.4 diff --git a/infra/charts/feast/charts/feast-serving/Chart.yaml b/infra/charts/feast/charts/feast-serving/Chart.yaml index 8486275d94f..2e9cf89243d 100644 --- a/infra/charts/feast/charts/feast-serving/Chart.yaml +++ b/infra/charts/feast/charts/feast-serving/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v1 description: A Helm chart for serving component of Feast name: feast-serving -version: 0.3.2 +version: 0.4.4 diff --git a/infra/charts/feast/values.yaml b/infra/charts/feast/values.yaml index ebc8c802a16..db3ec44f330 100644 --- a/infra/charts/feast/values.yaml +++ b/infra/charts/feast/values.yaml @@ -2,29 +2,29 @@ # - Feast Core # - Feast Serving Online # - Feast Serving Batch -# +# # The configuration for different components can be referenced from: # - charts/feast-core/values.yaml # - charts/feast-serving/values.yaml # # Note that "feast-serving-online" and "feast-serving-batch" are # aliases to "feast-serving" chart since in typical scenario two instances -# of Feast Serving: online and batch will be deployed. Both described +# of Feast Serving: online and batch will be deployed. Both described # using the same chart "feast-serving". # # The following are default values for typical Feast deployment, but not # for production setting. Refer to "values-production.yaml" for recommended # values in production environment. -# -# Note that the import job by default uses DirectRunner +# +# Note that the import job by default uses DirectRunner # https://beam.apache.org/documentation/runners/direct/ # in this configuration since it allows Feast to run in more environments # (unlike DataflowRunner which requires Google Cloud services). -# -# A secret containing Google Cloud service account JSON key is required -# in this configuration. +# +# A secret containing Google Cloud service account JSON key is required +# in this configuration. # https://cloud.google.com/iam/docs/creating-managing-service-accounts -# +# # The Google Cloud service account must have the following roles: # - bigquery.dataEditor # - bigquery.jobUser @@ -32,12 +32,13 @@ # Assuming a service account JSON key file has been downloaded to # (please name the file key.json): # /home/user/key.json -# +# # Run the following command to create the secret in your Kubernetes cluster: # # kubectl create secret generic feast-gcp-service-account \ # --from-file=/home/user/key.json # +# Replace every instance of EXTERNAL_IP with the external IP of your GKE cluster # ============================================================ # Feast Core @@ -51,12 +52,15 @@ feast-core: # to the client. These instances of Feast Serving however can still use # the same shared Feast Core. enabled: true - # jvmOptions are options that will be passed to the Java Virtual Machine (JVM) + # Specify what image tag to use. Keep this consistent for all components + image: + tag: "0.4.4" + # jvmOptions are options that will be passed to the Java Virtual Machine (JVM) # running Feast Core. # # For example, it is good practice to set min and max heap size in JVM. # https://stackoverflow.com/questions/6902135/side-effect-for-increasing-maxpermsize-and-max-heap-size - jvmOptions: + jvmOptions: - -Xms1024m - -Xmx1024m # resources that should be allocated to Feast Core. @@ -68,18 +72,43 @@ feast-core: memory: 2048Mi # gcpServiceAccount is the Google service account that Feast Core will use. gcpServiceAccount: - # useExistingSecret specifies Feast to use an existing secret containing + # useExistingSecret specifies Feast to use an existing secret containing # Google Cloud service account JSON key file. - # + # # This is the only supported option for now to use a service account JSON. # Feast admin is expected to create this secret before deploying Feast. useExistingSecret: true existingSecret: # name is the secret name of the existing secret for the service account. - name: feast-gcp-service-account + name: feast-gcp-service-account # key is the secret key of the existing secret for the service account. # key is normally derived from the file name of the JSON key file. key: key.json + # Setting service.type to NodePort exposes feast-core service at a static port + service: + type: NodePort + grpc: + # this is the port that is exposed outside of the cluster + nodePort: 32090 + # Make kafka externally accessible using NodePort + # Please set EXTERNAL_IP to your cluster's external IP + kafka: + external: + enabled: true + type: NodePort + domain: EXTERNAL_IP + configurationOverrides: + "advertised.listeners": |- + EXTERNAL://EXTERNAL_IP:$((31090 + ${KAFKA_BROKER_ID})) + "listener.security.protocol.map": |- + PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + application.yaml: + feast: + stream: + options: + # Point to one of your Kafka brokers + # Please set EXTERNAL_IP to your cluster's external IP + bootstrapServers: EXTERNAL_IP:31090 # ============================================================ # Feast Serving Online @@ -88,14 +117,22 @@ feast-core: feast-serving-online: # enabled specifies whether to install Feast Serving Online component. enabled: true + # Specify what image tag to use. Keep this consistent for all components + image: + tag: "0.4.4" # redis.enabled specifies whether Redis should be installed as part of Feast Serving. - # + # # If enabled is set to "false", Feast admin has to ensure there is an # existing Redis running outside Feast, that Feast Serving can connect to. + # master.service.type set to NodePort exposes Redis to outside of the cluster redis: enabled: true + master: + service: + nodePort: 32101 + type: NodePort # jvmOptions are options that will be passed to the Feast Serving JVM. - jvmOptions: + jvmOptions: - -Xms1024m - -Xmx1024m # resources that should be allocated to Feast Serving. @@ -105,23 +142,28 @@ feast-serving-online: memory: 1024Mi limits: memory: 2048Mi + # Make service accessible to outside of cluster using NodePort + service: + type: NodePort + grpc: + nodePort: 32091 # store.yaml is the configuration for Feast Store. - # + # # Refer to this link for more description: # https://github.com/gojek/feast/blob/79eb4ab5fa3d37102c1dca9968162a98690526ba/protos/feast/core/Store.proto store.yaml: name: redis type: REDIS redis_config: - # If redis.enabled is set to false, Feast admin should uncomment and - # set the host value to an "existing" Redis instance Feast will use as - # online Store. - # - # Else, if redis.enabled is set to true, no additional configuration is - # required. + # If redis.enabled is set to false, Feast admin should uncomment and + # set the host value to an "existing" Redis instance Feast will use as + # online Store. Also use the correct port for that existing instance. # + # Else, if redis.enabled is set to true, replace EXTERNAL_IP with your + # cluster's external IP. # host: redis-host - port: 6379 + host: EXTERNAL_IP + port: 32101 subscriptions: - name: "*" project: "*" @@ -134,14 +176,17 @@ feast-serving-online: feast-serving-batch: # enabled specifies whether to install Feast Serving Batch component. enabled: true + # Specify what image tag to use. Keep this consistent for all components + image: + tag: "0.4.4" # redis.enabled specifies whether Redis should be installed as part of Feast Serving. - # + # # This is usually set to "false" for Feast Serving Batch because the default # store is BigQuery. redis: enabled: false # jvmOptions are options that will be passed to the Feast Serving JVM. - jvmOptions: + jvmOptions: - -Xms1024m - -Xmx1024m # resources that should be allocated to Feast Serving. @@ -151,17 +196,22 @@ feast-serving-batch: memory: 1024Mi limits: memory: 2048Mi + # Make service accessible to outside of cluster using NodePort + service: + type: NodePort + grpc: + nodePort: 32092 # gcpServiceAccount is the service account that Feast Serving will use. gcpServiceAccount: - # useExistingSecret specifies Feast to use an existing secret containing + # useExistingSecret specifies Feast to use an existing secret containing # Google Cloud service account JSON key file. - # + # # This is the only supported option for now to use a service account JSON. # Feast admin is expected to create this secret before deploying Feast. useExistingSecret: true existingSecret: # name is the secret name of the existing secret for the service account. - name: feast-gcp-service-account + name: feast-gcp-service-account # key is the secret key of the existing secret for the service account. # key is normally derived from the file name of the JSON key file. key: key.json @@ -172,28 +222,33 @@ feast-serving-batch: # for a complete list and description of the configuration. application.yaml: feast: - jobs: - # staging-location specifies the URI to store intermediate files for + jobs: + # staging-location specifies the URI to store intermediate files for # batch serving (required if using BigQuery as Store). - # - # Please set the value to an "existing" Google Cloud Storage URI that + # + # Please set the value to an "existing" Google Cloud Storage URI that # Feast serving has write access to. - staging-location: gs://bucket/path - # Type of store to store job metadata. + staging-location: gs://YOUR_BUCKET_NAME/serving/batch + # Type of store to store job metadata. # - # This default configuration assumes that Feast Serving Online is + # This default configuration assumes that Feast Serving Online is # enabled as well. So Feast Serving Batch will share the same # Redis instance to store job statuses. store-type: REDIS + store-options: + # Use the externally exposed redis instance deployed by Online service + # Please set EXTERNAL_IP to your cluster's external IP + host: EXTERNAL_IP + port: 32101 # store.yaml is the configuration for Feast Store. - # + # # Refer to this link for more description: # https://github.com/gojek/feast/blob/79eb4ab5fa3d37102c1dca9968162a98690526ba/protos/feast/core/Store.proto store.yaml: name: bigquery type: BIGQUERY bigquery_config: - # project_id specifies the Google Cloud Project. Please set this to the + # project_id specifies the Google Cloud Project. Please set this to the # project id you are using BigQuery in. project_id: PROJECT_ID # dataset_id specifies an "existing" BigQuery dataset Feast Serving Batch diff --git a/infra/docker-compose/docker-compose.yml b/infra/docker-compose/docker-compose.yml index a224500ca0a..44750650cec 100644 --- a/infra/docker-compose/docker-compose.yml +++ b/infra/docker-compose/docker-compose.yml @@ -59,6 +59,8 @@ services: redis: image: redis:5-alpine + ports: + - "6379:6379" kafka: image: confluentinc/cp-kafka:5.2.1 @@ -70,7 +72,8 @@ services: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:PLAINTEXT,OUTSIDE:PLAINTEXT KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE ports: - - 9094:9092 + - "9092:9092" + - "9094:9094" depends_on: - zookeeper diff --git a/ingestion/pom.xml b/ingestion/pom.xml index 4908b546985..c829674a64d 100644 --- a/ingestion/pom.xml +++ b/ingestion/pom.xml @@ -31,10 +31,6 @@ - - org.xolstice.maven.plugins - protobuf-maven-plugin - org.apache.maven.plugins maven-shade-plugin @@ -90,6 +86,12 @@ + + dev.feast + datatypes-java + ${project.version} + + org.glassfish javax.el @@ -105,7 +107,7 @@ org.hibernate.validator hibernate-validator - 6.0.13.Final + 6.1.0.Final @@ -223,15 +225,6 @@ slf4j-api - - - - ch.qos.logback - logback-classic - 1.2.3 - runtime - - com.github.kstyrc diff --git a/ingestion/src/main/proto/feast b/ingestion/src/main/proto/feast deleted file mode 120000 index d520da9126b..00000000000 --- a/ingestion/src/main/proto/feast +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/feast \ No newline at end of file diff --git a/ingestion/src/main/proto/feast_ingestion/types/CoalesceAccum.proto b/ingestion/src/main/proto/feast_ingestion/types/CoalesceAccum.proto deleted file mode 100644 index cb64dd715f6..00000000000 --- a/ingestion/src/main/proto/feast_ingestion/types/CoalesceAccum.proto +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -import "google/protobuf/timestamp.proto"; -import "feast/types/Field.proto"; - -option java_package = "feast_ingestion.types"; -option java_outer_classname = "CoalesceAccumProto"; - -// Accumlator for merging feature rows. -message CoalesceAccum { - string entityKey = 1; - google.protobuf.Timestamp eventTimestamp = 3; - string entityName = 4; - - map features = 6; - // map of features to their counter values when they were last added to accumulator - map featureMarks = 7; - int64 counter = 8; -} \ No newline at end of file diff --git a/ingestion/src/main/proto/feast_ingestion/types/CoalesceKey.proto b/ingestion/src/main/proto/feast_ingestion/types/CoalesceKey.proto deleted file mode 100644 index 9730b49ec3b..00000000000 --- a/ingestion/src/main/proto/feast_ingestion/types/CoalesceKey.proto +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -option java_package = "feast_ingestion.types"; -option java_outer_classname = "CoalesceKeyProto"; - -message CoalesceKey { - string entityName = 1; - string entityKey = 2; -} \ No newline at end of file diff --git a/ingestion/src/main/proto/third_party b/ingestion/src/main/proto/third_party deleted file mode 120000 index 363d20598e6..00000000000 --- a/ingestion/src/main/proto/third_party +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/third_party \ No newline at end of file diff --git a/ingestion/src/test/proto/DriverArea.proto b/ingestion/src/test/proto/DriverArea.proto deleted file mode 100644 index fee838b9e17..00000000000 --- a/ingestion/src/test/proto/DriverArea.proto +++ /dev/null @@ -1,10 +0,0 @@ -syntax = "proto3"; - -package feast; - -option java_outer_classname = "DriverAreaProto"; - -message DriverArea { - int32 driverId = 1; - int32 areaId = 2; -} \ No newline at end of file diff --git a/ingestion/src/test/proto/Ping.proto b/ingestion/src/test/proto/Ping.proto deleted file mode 100644 index b1069afa5bd..00000000000 --- a/ingestion/src/test/proto/Ping.proto +++ /dev/null @@ -1,12 +0,0 @@ -syntax = "proto3"; - -package feast; -import "google/protobuf/timestamp.proto"; - -option java_outer_classname = "PingProto"; - -message Ping { - double lat = 1; - double lng = 2; - google.protobuf.Timestamp timestamp = 3; -} diff --git a/pom.xml b/pom.xml index 05fb701ac44..821d3b72321 100644 --- a/pom.xml +++ b/pom.xml @@ -28,6 +28,7 @@ pom + datatypes/java ingestion core serving @@ -54,6 +55,8 @@ 2.28.2 0.21.0 + + 2.12.1 @@ -260,6 +263,26 @@ + + org.apache.logging.log4j + log4j-api + ${log4jVersion} + + + org.apache.logging.log4j + log4j-core + ${log4jVersion} + + + org.apache.logging.log4j + log4j-jul + ${log4jVersion} + + + org.apache.logging.log4j + log4j-slf4j-impl + ${log4jVersion} + io.grpc @@ -79,10 +85,6 @@ - - org.xolstice.maven.plugins - protobuf-maven-plugin - org.apache.maven.plugins diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index 601f41d4b11..8e8f185d038 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -17,7 +17,6 @@ import click from feast import config as feast_config from feast.client import Client -from feast.resource import ResourceFactory from feast.feature_set import FeatureSet import toml import pkg_resources @@ -147,17 +146,25 @@ def feature_set_list(): print(tabulate(table, headers=["NAME", "VERSION"], tablefmt="plain")) -@feature_set.command("create") -@click.argument("name") -def feature_set_create(name): +@feature_set.command("apply") +@click.option( + "--filename", + "-f", + help="Path to a feature set configuration file that will be applied", + type=click.Path(exists=True), +) +def feature_set_create(filename): """ - Create a feature set + Create or update a feature set """ + + feature_sets = [FeatureSet.from_dict(fs_dict) for fs_dict in yaml_loader(filename)] + feast_client = Client( core_url=feast_config.get_config_property_or_fail("core_url") ) # type: Client - feast_client.apply(FeatureSet(name=name)) + feast_client.apply(feature_sets) @feature_set.command("describe") @@ -264,29 +271,5 @@ def ingest(name, version, filename, file_type): feature_set.ingest_file(file_path=filename) -@cli.command() -@click.option( - "--filename", - "-f", - help="Path to the configuration file that will be applied", - type=click.Path(exists=True), -) -def apply(filename): - """ - Apply a configuration to a resource by filename or stdin - """ - - resources = [ - ResourceFactory.get_resource(res_dict["kind"]).from_dict(res_dict) - for res_dict in yaml_loader(filename) - ] - - feast_client = Client( - core_url=feast_config.get_config_property_or_fail("core_url") - ) # type: Client - - feast_client.apply(resources) - - if __name__ == "__main__": cli() diff --git a/sdk/python/feast/feature_set.py b/sdk/python/feast/feature_set.py index d5576607513..c47c51e5a21 100644 --- a/sdk/python/feast/feature_set.py +++ b/sdk/python/feast/feature_set.py @@ -689,8 +689,6 @@ def from_dict(cls, fs_dict): Returns a FeatureSet object based on the feature set dict """ - if ("kind" not in fs_dict) and (fs_dict["kind"].strip() != "feature_set"): - raise Exception(f"Resource kind is not a feature set {str(fs_dict)}") feature_set_proto = json_format.ParseDict( fs_dict, FeatureSetProto(), ignore_unknown_fields=True ) diff --git a/sdk/python/feast/loaders/yaml.py b/sdk/python/feast/loaders/yaml.py index 4cbe15dfaaf..130a71a3d02 100644 --- a/sdk/python/feast/loaders/yaml.py +++ b/sdk/python/feast/loaders/yaml.py @@ -53,7 +53,7 @@ def _get_yaml_contents(yml: str) -> str: with open(yml, "r") as f: yml_content = f.read() - elif isinstance(yml, str) and "kind" in yml.lower(): + elif isinstance(yml, str): yml_content = yml else: raise Exception( @@ -73,7 +73,4 @@ def _yaml_to_dict(yaml_string): Dictionary containing the same object """ - yaml_dict = yaml.safe_load(yaml_string) - if not isinstance(yaml_dict, dict) or not "kind" in yaml_dict: - raise Exception(f"Could not detect YAML kind from resource: ${yaml_string}") - return yaml_dict + return yaml.safe_load(yaml_string) diff --git a/sdk/python/feast/resource.py b/sdk/python/feast/resource.py deleted file mode 100644 index 17a65291667..00000000000 --- a/sdk/python/feast/resource.py +++ /dev/null @@ -1,10 +0,0 @@ -from feast.feature_set import FeatureSet - -# TODO: This factory adds no value. It should be removed asap. -class ResourceFactory: - @staticmethod - def get_resource(kind): - if kind == "feature_set": - return FeatureSet - else: - raise ValueError(kind) diff --git a/serving/pom.xml b/serving/pom.xml index dc3391df62f..c15881030e2 100644 --- a/serving/pom.xml +++ b/serving/pom.xml @@ -47,10 +47,6 @@ false - - org.xolstice.maven.plugins - protobuf-maven-plugin - org.apache.maven.plugins maven-failsafe-plugin @@ -74,6 +70,12 @@ + + dev.feast + datatypes-java + ${project.version} + + org.slf4j diff --git a/serving/src/main/java/feast/serving/store/bigquery/BatchRetrievalQueryRunnable.java b/serving/src/main/java/feast/serving/store/bigquery/BatchRetrievalQueryRunnable.java index d437294dfc3..e875de35a80 100644 --- a/serving/src/main/java/feast/serving/store/bigquery/BatchRetrievalQueryRunnable.java +++ b/serving/src/main/java/feast/serving/store/bigquery/BatchRetrievalQueryRunnable.java @@ -52,6 +52,27 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +/** + * BatchRetrievalQueryRunnable is a Runnable for running a BigQuery Feast batch retrieval job async. + * + *

It does the following, in sequence: + * + *

1. Retrieve the temporal bounds of the entity dataset provided. This will be used to filter + * the feature set tables when performing the feature retrieval. + * + *

2. For each of the feature sets requested, generate the subquery for doing a point-in-time + * correctness join of the features in the feature set to the entity table. + * + *

3. Run each of the subqueries in parallel and wait for them to complete. If any of the jobs + * are unsuccessful, the thread running the BatchRetrievalQueryRunnable catches the error and + * updates the job database. + * + *

4. When all the subquery jobs are complete, join the outputs of all the subqueries into a + * single table. + * + *

5. Extract the output of the join to a remote file, and write the location of the remote file + * to the job database, and mark the retrieval job as successful. + */ @AutoValue public abstract class BatchRetrievalQueryRunnable implements Runnable { @@ -109,18 +130,22 @@ public abstract static class Builder { @Override public void run() { + // 1. Retrieve the temporal bounds of the entity dataset provided FieldValueList timestampLimits = getTimestampLimits(entityTableName()); + // 2. Generate the subqueries List featureSetQueries = generateQueries(timestampLimits); QueryJobConfiguration queryConfig; try { + // 3 & 4. Run the subqueries in parallel then collect the outputs Job queryJob = runBatchQuery(featureSetQueries); queryConfig = queryJob.getConfiguration(); String exportTableDestinationUri = String.format("%s/%s/*.avro", jobStagingLocation(), feastJobId()); + // 5. Export the table // Hardcode the format to Avro for now ExtractJobConfiguration extractConfig = ExtractJobConfiguration.of( @@ -141,6 +166,7 @@ public void run() { List fileUris = parseOutputFileURIs(); + // 5. Update the job database jobService() .upsert( ServingAPIProto.Job.newBuilder() @@ -181,6 +207,8 @@ Job runBatchQuery(List featureSetQueries) List featureSetInfos = new ArrayList<>(); + // For each of the feature sets requested, start an async job joining the features in that + // feature set to the provided entity table for (int i = 0; i < featureSetQueries.size(); i++) { QueryJobConfiguration queryJobConfig = QueryJobConfiguration.newBuilder(featureSetQueries.get(i)) @@ -197,6 +225,8 @@ Job runBatchQuery(List featureSetQueries) for (int i = 0; i < featureSetQueries.size(); i++) { try { + // Try to retrieve the outputs of all the jobs. The timeout here is a formality; + // a stricter timeout is implemented in the actual SubqueryCallable. FeatureSetInfo featureSetInfo = executorCompletionService.take().get(SUBQUERY_TIMEOUT_SECS, TimeUnit.SECONDS); featureSetInfos.add(featureSetInfo); @@ -218,6 +248,8 @@ Job runBatchQuery(List featureSetQueries) } } + // Generate and run a join query to collect the outputs of all the + // subqueries into a single table. String joinQuery = QueryTemplater.createJoinQuery( featureSetInfos, entityTableColumnNames(), entityTableName()); diff --git a/serving/src/main/java/feast/serving/store/bigquery/SubqueryCallable.java b/serving/src/main/java/feast/serving/store/bigquery/SubqueryCallable.java index e0b8f457986..14026030b42 100644 --- a/serving/src/main/java/feast/serving/store/bigquery/SubqueryCallable.java +++ b/serving/src/main/java/feast/serving/store/bigquery/SubqueryCallable.java @@ -30,8 +30,8 @@ import java.util.concurrent.Callable; /** - * Waits for a bigquery job to complete; when complete, it updates the feature set info with the - * output table name, as well as increments the completed jobs counter in the query job listener. + * Waits for a point-in-time correctness join to complete. On completion, returns a featureSetInfo + * updated with the reference to the table containing the results of the query. */ @AutoValue public abstract class SubqueryCallable implements Callable { diff --git a/serving/src/main/proto/feast b/serving/src/main/proto/feast deleted file mode 120000 index d520da9126b..00000000000 --- a/serving/src/main/proto/feast +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/feast \ No newline at end of file diff --git a/serving/src/main/proto/third_party b/serving/src/main/proto/third_party deleted file mode 120000 index 363d20598e6..00000000000 --- a/serving/src/main/proto/third_party +++ /dev/null @@ -1 +0,0 @@ -../../../../protos/third_party \ No newline at end of file diff --git a/serving/src/main/resources/log4j2.xml b/serving/src/main/resources/log4j2.xml index 02520cb36c2..661c8e5061c 100644 --- a/serving/src/main/resources/log4j2.xml +++ b/serving/src/main/resources/log4j2.xml @@ -21,19 +21,23 @@ %d{yyyy-MM-dd HH:mm:ss.SSS} %5p ${hostName} --- [%15.15t] %-40.40c{1.} : %m%n%ex + ${env:LOG_TYPE:-Console} + ${env:LOG_LEVEL:-info} + + + - - + + - - - + + \ No newline at end of file diff --git a/serving/src/main/resources/templates/join_featuresets.sql b/serving/src/main/resources/templates/join_featuresets.sql index e57b0c10314..60b7c7d7a12 100644 --- a/serving/src/main/resources/templates/join_featuresets.sql +++ b/serving/src/main/resources/templates/join_featuresets.sql @@ -1,3 +1,6 @@ +/* + Joins the outputs of multiple point-in-time-correctness joins to a single table. + */ WITH joined as ( SELECT * FROM `{{ leftTableName }}` {% for featureSet in featureSets %} diff --git a/serving/src/main/resources/templates/single_featureset_pit_join.sql b/serving/src/main/resources/templates/single_featureset_pit_join.sql index f6678421851..1f4612b3503 100644 --- a/serving/src/main/resources/templates/single_featureset_pit_join.sql +++ b/serving/src/main/resources/templates/single_featureset_pit_join.sql @@ -1,9 +1,24 @@ -WITH union_features AS (SELECT +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + + 1. Concatenate the timestamp and entities from the feature set table with the entity dataset. + Feature values are joined to this table later for improved efficiency. + featureset_timestamp is equal to null in rows from the entity dataset. + */ +WITH union_features AS ( +SELECT + -- uuid is a unique identifier for each row in the entity dataset. Generated by `QueryTemplater.createEntityTableUUIDQuery` uuid, + -- event_timestamp contains the timestamps to join onto event_timestamp, + -- the feature_timestamp, i.e. the latest occurrence of the requested feature relative to the entity_dataset timestamp NULL as {{ featureSet.project }}_{{ featureSet.name }}_v{{ featureSet.version }}_feature_timestamp, + -- created timestamp of the feature at the corresponding feature_timestamp NULL as created_timestamp, + -- select only entities belonging to this feature set {{ featureSet.entities | join(', ')}}, + -- boolean for filtering the dataset later true AS is_entity_table FROM `{{leftTableName}}` UNION ALL @@ -15,7 +30,18 @@ SELECT {{ featureSet.entities | join(', ')}}, false AS is_entity_table FROM `{{projectId}}.{{datasetId}}.{{ featureSet.project }}_{{ featureSet.name }}_v{{ featureSet.version }}` WHERE event_timestamp <= '{{maxTimestamp}}' AND event_timestamp >= Timestamp_sub(TIMESTAMP '{{ minTimestamp }}', interval {{ featureSet.maxAge }} second) -), joined AS ( +), +/* + 2. Window the data in the unioned dataset, partitioning by entity and ordering by event_timestamp, as + well as is_entity_table. + Within each window, back-fill the feature_timestamp - as a result of this, the null feature_timestamps + in the rows from the entity table should now contain the latest timestamps relative to the row's + event_timestamp. + + For rows where event_timestamp(provided datetime) - feature_timestamp > max age, set the + feature_timestamp to null. + */ +joined AS ( SELECT uuid, event_timestamp, @@ -34,6 +60,10 @@ SELECT FROM union_features WINDOW w AS (PARTITION BY {{ featureSet.entities | join(', ') }} ORDER BY event_timestamp DESC, is_entity_table DESC, created_timestamp DESC ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) ) +/* + 3. Select only the rows from the entity table, and join the features from the original feature set table + to the dataset using the entity values, feature_timestamp, and created_timestamps. + */ LEFT JOIN ( SELECT event_timestamp as {{ featureSet.project }}_{{ featureSet.name }}_v{{ featureSet.version }}_feature_timestamp, @@ -46,6 +76,9 @@ FROM `{{projectId}}.{{datasetId}}.{{ featureSet.project }}_{{ featureSet.name }} ) USING ({{ featureSet.project }}_{{ featureSet.name }}_v{{ featureSet.version }}_feature_timestamp, created_timestamp, {{ featureSet.entities | join(', ')}}) WHERE is_entity_table ) +/* + 4. Finally, deduplicate the rows by selecting the first occurrence of each entity table row UUID. + */ SELECT k.* FROM (