diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index 77667815e..778fa05b4 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -22,7 +22,7 @@ on: - "docs/**" - "ui/**" - "**/README.md" - + schedule: # Runs daily at 1 PM UTC (9 PM CST), will send notification to TEAMS_WEBHOOK - cron: '00 13 * * *' @@ -127,7 +127,7 @@ jobs: SQL1_USER: ${{secrets.SQL1_USER}} SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | - # run only test with databricks. run in 4 parallel jobs + # run only test with databricks. run in 6 parallel jobs pytest -n 6 --cov-report term-missing --cov=feathr_project/feathr feathr_project/test --cov-config=.github/workflows/.coveragerc_db azure_synapse_test: # might be a bit duplication to setup both the azure_synapse test and databricks test, but for now we will keep those to accelerate the test speed @@ -195,7 +195,7 @@ jobs: SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}} run: | # skip databricks related test as we just ran the test; also seperate databricks and synapse test to make sure there's no write conflict - # run in 4 parallel jobs to make the time shorter + # run in 6 parallel jobs to make the time shorter pytest -n 6 --cov-report term-missing --cov=feathr_project/feathr feathr_project/test --cov-config=.github/workflows/.coveragerc_sy local_spark_test: diff --git a/.gitignore b/.gitignore index 976c0b239..4fe490c96 100644 --- a/.gitignore +++ b/.gitignore @@ -213,3 +213,6 @@ null/* project/.bloop metals.sbt .bsp/sbt.json + +# Feathr output debug folder +**/debug/ diff --git a/docs/dev_guide/new_contributor_guide.md b/docs/dev_guide/new_contributor_guide.md index 1856ffd84..223b7d91b 100644 --- a/docs/dev_guide/new_contributor_guide.md +++ b/docs/dev_guide/new_contributor_guide.md @@ -6,11 +6,11 @@ parent: Feathr Developer Guides # What can I contribute? All forms of contributions are welcome, including and not limited to: -* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/feathr_project/feathrcli/data/feathr_user_workspace) +* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/docs/samples) * Add tutorial, blog posts, tech talks etc * Increase media coverage and exposure * Improve user-facing documentation or developer-facing documentation -* Add testing code +* Add testing code * Add new features * Refactor and improve architecture * For any other forms of contribution and collaboration, don't hesitate to reach out to us. @@ -18,7 +18,7 @@ All forms of contributions are welcome, including and not limited to: # I am interested, how can I start? If you are new to this project, we recommend start with [`good-first-issue`](https://github.com/feathr-ai/feathr/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). -The issues are also labled with what types of programming language the task need. +The issues are also labled with what types of programming language the task need. 
* [`good-first-issue` and `Python`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Apython) * [`good-first-issue` and `Scala`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ascala) * [`good-first-issue` and `Java`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ajava) diff --git a/docs/quickstart_synapse.md b/docs/quickstart_synapse.md index d07198d92..c310dd789 100644 --- a/docs/quickstart_synapse.md +++ b/docs/quickstart_synapse.md @@ -24,7 +24,7 @@ Feathr has native cloud integration. Here are the steps to use Feathr on Azure: 1. Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. This allows you to quickly get started with automated deployment using Azure Resource Manager template. Alternatively, if you want to set up everything manually, you can checkout the [Feathr CLI deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) to run Feathr on Azure. This allows you to understand what is going on and set up one resource at a time. -2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=feathr_project%2Ffeathrcli%2Fdata%2Ffeathr_user_workspace%2Fnyc_driver_demo.ipynb). +2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=docs%2Fsamples%2Fnyc_taxi_demo.ipynb). 3. You only need to change the specified `Resource Prefix`. ## Step 2: Install Feathr diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb old mode 100755 new mode 100644 index aaefdfbdc..7d41696e8 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb @@ -1,1383 +1,1216 @@ { - "cells":[ - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"843d3142-24ca-4bd1-9e31-b55163804fe3", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n", - "dbutils.widgets.text(\"REDIS_KEY\", \"\")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"384e5e16-7213-4186-9d04-09d03b155534", - "showTitle":false, - "title":"" - } - }, - "source":[ - "# Feathr Feature Store on Databricks Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", - "\n", - "This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n", - "- This notebook skips feature registry which requires running Azure Purview. \n", - "- To make the online feature query work, you will need to configure the Redis endpoint. 
\n", - "\n", - "The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)." - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"c2ce58c7-9263-469a-bbb7-43364ddb07b8", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## Prerequisite\n", - "\n", - "To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n", - "\n", - "To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"4609d7ad-ad74-40fc-b97e-f440a0fa0737", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "!pip install feathr" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"c81fa80c-bca6-4ae5-84ad-659a036977bd", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install Feathr and necessary dependencies.\n", - "1. Create shareable features with Feathr feature definition configs.\n", - "1. Create training data using point-in-time correct feature join\n", - "1. Train and evaluate a prediction model.\n", - "1. 
Materialize feature values for online scoring.\n", - "\n", - "The overall data flow is as follows:\n", - "\n", - "" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "from datetime import datetime, timedelta\n", - "import glob\n", - "import json\n", - "from math import sqrt\n", - "import os\n", - "from pathlib import Path\n", - "import requests\n", - "from tempfile import TemporaryDirectory\n", - "\n", - "from azure.identity import AzureCliCredential, DefaultAzureCredential \n", - "from azure.keyvault.secrets import SecretClient\n", - "import pandas as pd\n", - "from pyspark.ml import Pipeline\n", - "from pyspark.ml.evaluation import RegressionEvaluator\n", - "from pyspark.ml.feature import VectorAssembler\n", - "from pyspark.ml.regression import GBTRegressor\n", - "from pyspark.sql import DataFrame, SparkSession\n", - "import pyspark.sql.functions as F\n", - "\n", - "import feathr\n", - "from feathr import (\n", - " FeathrClient,\n", - " # Feature data types\n", - " BOOLEAN, FLOAT, INT32, ValueType,\n", - " # Feature data sources\n", - " INPUT_CONTEXT, HdfsSource,\n", - " # Feature aggregations\n", - " TypedKey, WindowAggTransformation,\n", - " # Feature types and anchor\n", - " DerivedFeature, Feature, FeatureAnchor,\n", - " # Materialization\n", - " BackfillTime, MaterializationSettings, RedisSink,\n", - " # Offline feature computation\n", - " FeatureQuery, ObservationSettings,\n", - ")\n", - "from feathr.datasets import nyc_taxi\n", - "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", - "from feathr.utils.config import generate_config\n", - "from feathr.utils.job_utils import get_result_df\n", - "\n", - "\n", - "print(f\"\"\"Feathr version: {feathr.__version__}\n", - "Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"ab35fa01-b392-457e-8fde-7e445a3c39b5", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", - "\n", - "In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n", - "Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details." 
- ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"09f93a9f-7b33-4d91-8f31-ee3b20991696", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n", - "PROJECT_NAME = \"feathr_getting_started\"\n", - "\n", - "REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n", - "\n", - "# Use a databricks cluster\n", - "SPARK_CLUSTER = \"databricks\"\n", - "\n", - "# Databricks file system path\n", - "DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\"" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle":false, - "title":"" - } - }, - "source":[ - "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec.\n", - "\n", - "Note: When submitting jobs, Databricks recommend to use new clusters for greater reliability. If you want to use an existing all-purpose cluster, you may set\n", - "`existing_cluster_id': ctx.tags().get('clusterId').get()` to the `databricks_config`, replacing `new_cluster` config values." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Redis credential\n", - "os.environ['REDIS_PASSWORD'] = REDIS_KEY\n", - "\n", - "# Setup databricks env configs\n", - "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - "databricks_config = {\n", - " 'run_name': \"FEATHR_FILL_IN\",\n", - " # To use an existing all-purpose cluster:\n", - " # 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n", - " # To use a new job cluster:\n", - " 'new_cluster': {\n", - " 'spark_version': \"11.2.x-scala2.12\",\n", - " 'node_type_id': \"Standard_D3_v2\",\n", - " 'num_workers':1,\n", - " 'spark_conf': {\n", - " 'FEATHR_FILL_IN': \"FEATHR_FILL_IN\",\n", - " # Exclude conflicting packages if use feathr <= v0.8.0:\n", - " 'spark.jars.excludes': \"commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api\",\n", - " },\n", - " },\n", - " 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n", - " 'spark_jar_task': {\n", - " 'main_class_name': \"FEATHR_FILL_IN\",\n", - " 'parameters': [\"FEATHR_FILL_IN\"],\n", - " },\n", - "}\n", - "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n", - "os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n", - "os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n", - "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Configurations\n", - "\n", - "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field." 
- ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n", - "\n", - "with open(config_path, 'r') as f: \n", - " print(f.read())" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"58d22dc1-7590-494d-94ca-3e2488c31c8e", - "showTitle":false, - "title":"" - } - }, - "source":[ - "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Initialize Feathr Client" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "client = FeathrClient(config_path=config_path)" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### View the NYC taxi fare dataset" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", - "\n", - "# Download the data file\n", - "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", - "df_raw.limit(5).toPandas()" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Defining features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", - "\n", - "* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n", - "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", - "\n", - "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. 
*the day of week of the request timestamp*.\n", - "\n", - "There are two types of features -- anchored features and derivated features:\n", - "\n", - "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", - "* **Derived features**: Features that are computed on top of other features.\n", - "\n", - "#### Define anchored features\n", - "\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"75b8d2ed-84df-4446-ae07-5f715434f3ea", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", - "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"93abbcc2-562b-47e4-ad4c-1fedd7cc64df", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# We define f_trip_distance and f_trip_time_duration features separately\n", - "# so that we can reuse them later for the derived features.\n", - "f_trip_distance = Feature(\n", - " name=\"f_trip_distance\",\n", - " feature_type=FLOAT,\n", - " transform=\"trip_distance\",\n", - ")\n", - "f_trip_time_duration = Feature(\n", - " name=\"f_trip_time_duration\",\n", - " feature_type=FLOAT,\n", - " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", - ")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " Feature(\n", - " name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"trip_distance > 30.0\",\n", - " ),\n", - " Feature(\n", - " name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", - " ),\n", - " Feature(\n", - " name=\"f_day_of_month\",\n", - " feature_type=INT32,\n", - " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", - " ),\n", - " Feature(\n", - " name=\"f_hour_of_day\",\n", - " feature_type=INT32,\n", - " transform=\"hour(lpep_dropoff_datetime)\",\n", - " ),\n", - "]\n", - "\n", - "# After you have defined features, bring them together to build the anchor to the source.\n", - "feature_anchor = FeatureAnchor(\n", - " name=\"feature_anchor\",\n", - " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", - " features=features,\n", - ")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1", - "showTitle":false, - "title":"" - } - }, - "source":[ - "We can define the source with a preprocessing python function." 
- ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "def preprocessing(df: DataFrame) -> DataFrame:\n", - " import pyspark.sql.functions as F\n", - " df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n", - " return df\n", - "\n", - "batch_source = HdfsSource(\n", - " name=\"nycTaxiBatchSource\",\n", - " path=DATA_FILE_PATH,\n", - " event_timestamp_column=TIMESTAMP_COL,\n", - " preprocessing=preprocessing,\n", - " timestamp_format=TIMESTAMP_FORMAT,\n", - ")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"46f863c4-bb81-434a-a448-6b585031a221", - "showTitle":false, - "title":"" - } - }, - "source":[ - "For the features with aggregation, the supported functions are as follows:\n", - "\n", - "| Aggregation Function | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "agg_key = TypedKey(\n", - " key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\",\n", - ")\n", - "\n", - "agg_window = \"90d\"\n", - "\n", - "# Anchored features with aggregations\n", - "agg_features = [\n", - " Feature(\n", - " name=\"f_location_avg_fare\",\n", - " key=agg_key,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(\n", - " agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"AVG\",\n", - " window=agg_window,\n", - " ),\n", - " ),\n", - " Feature(\n", - " name=\"f_location_max_fare\",\n", - " key=agg_key,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(\n", - " agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"MAX\",\n", - " window=agg_window,\n", - " ),\n", - " ),\n", - "]\n", - "\n", - "agg_feature_anchor = FeatureAnchor(\n", - " name=\"agg_feature_anchor\",\n", - " source=batch_source, # External data source for feature. 
Typically a data table.\n", - " features=agg_features,\n", - ")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d", - "showTitle":false, - "title":"" - } - }, - "source":[ - "#### Define derived features\n", - "\n", - "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"05633bc3-9118-449b-9562-45fc437576c2", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "derived_features = [\n", - " DerivedFeature(\n", - " name=\"f_trip_time_distance\",\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " ],\n", - " transform=\"f_trip_distance / f_trip_time_duration\",\n", - " )\n", - "]" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Build features\n", - "\n", - "Finally, we build the features." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "client.build_features(\n", - " anchor_list=[feature_anchor, agg_feature_anchor],\n", - " derived_feature_list=derived_features,\n", - ")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", - "\n", - "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. 
\n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"02feabc9-2f2f-43e8-898d-b28082798e98", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", - "feature_names" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "DATA_FORMAT = \"parquet\"\n", - "offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"67e81466-c736-47ba-b122-e640642c01cf", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Features that we want to request. Can use a subset of features\n", - "query = FeatureQuery(\n", - " feature_list=feature_names,\n", - " key=agg_key,\n", - ")\n", - "settings = ObservationSettings(\n", - " observation_path=DATA_FILE_PATH,\n", - " event_timestamp_column=TIMESTAMP_COL,\n", - " timestamp_format=TIMESTAMP_FORMAT,\n", - ")\n", - "client.get_offline_features(\n", - " observation_settings=settings,\n", - " feature_query=query,\n", - " # Note, execution_configurations argument only works when using a new job cluster\n", - " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", - " execution_configurations=SparkExecutionConfiguration({\n", - " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", - " }),\n", - " output_path=offline_features_path,\n", - ")\n", - "\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"9871af55-25eb-41ee-a58a-fda74b1a174e", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Show feature results\n", - "df = get_result_df(\n", - " spark=spark,\n", - " client=client,\n", - " data_format=\"parquet\",\n", - " res_url=offline_features_path,\n", - ")\n", - "df.select(feature_names).limit(5).toPandas()" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## 4. Train and Evaluate a Prediction Model\n", - "\n", - "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", - "\n", - "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." 
- ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Load Train and Test Data from the Offline Feature Values" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"bd2cdc83-0920-46e8-9454-e5e6e7832ce0", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Train / test split\n", - "train_df, test_df = (\n", - " df # Dataframe that we generated from get_offline_features call.\n", - " .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n", - " .where(F.col(\"f_trip_time_duration\") > 0)\n", - " .fillna(0)\n", - " .randomSplit([0.8, 0.2])\n", - ")\n", - "\n", - "print(f\"Num train samples: {train_df.count()}\")\n", - "print(f\"Num test samples: {test_df.count()}\")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Build a ML Pipeline\n", - "\n", - "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"2a254361-63e9-45b2-8c19-40549762eacb", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Generate a feature vector column for SparkML\n", - "vector_assembler = VectorAssembler(\n", - " inputCols=[x for x in df.columns if x in feature_names],\n", - " outputCol=\"features\",\n", - ")\n", - "\n", - "# Define a model\n", - "gbt = GBTRegressor(\n", - " featuresCol=\"features\",\n", - " maxIter=100,\n", - " maxDepth=5,\n", - " maxBins=16,\n", - ")\n", - "\n", - "# Create a ML pipeline\n", - "ml_pipeline = Pipeline(stages=[\n", - " vector_assembler,\n", - " gbt,\n", - "])" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"bef93538-9591-4247-97b6-289d2055b7b1", - "showTitle":false, - "title":"" - } - }, - "source":[ - "### Train and Evaluate the Model" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"0c3d5f35-11a3-4644-9992-5860169d8302", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Train a model\n", - "model = ml_pipeline.fit(train_df)\n", - "\n", - "# Make predictions\n", - "predictions = model.transform(test_df)" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"1f9b584c-6228-4a02-a6c3-9b8dd2b78091", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Evaluate\n", - "evaluator = RegressionEvaluator(\n", - " labelCol=\"label\",\n", - " predictionCol=\"prediction\",\n", - ")\n", - "\n", - "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", - "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", - "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - 
"nuid":"25c33abd-6e87-437d-a6a1-86435f065a1e", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", - "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", - "\n", - "predictions_pdf.plot(\n", - " x=\"index\",\n", - " y=[\"label\", \"prediction\"],\n", - " style=['-', ':'],\n", - " figsize=(20, 10),\n", - ")" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"664d78cc-4a92-430c-9e05-565ba904558e", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "predictions_pdf.plot.scatter(\n", - " x=\"label\",\n", - " y=\"prediction\",\n", - " xlim=(0, 100),\n", - " ylim=(0, 100),\n", - " figsize=(10, 10),\n", - ")" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"8a56d165-c813-4ce0-8ae6-9f4d313c463d", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## 5. Materialize Feature Values for Online Scoring\n", - "\n", - "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", - "\n", - "Note, only the features anchored to offline data source can be materialized." - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"751fa72e-8f94-40a1-994e-3e8315b51d37", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "materialized_feature_names = [feature.name for feature in agg_features]\n", - "materialized_feature_names" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "if REDIS_KEY and RESOURCE_PREFIX:\n", - " FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", - "\n", - " # Get the last date from the dataset\n", - " backfill_timestamp = (\n", - " df_raw\n", - " .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n", - " .agg({TIMESTAMP_COL: \"max\"})\n", - " .collect()[0][0]\n", - " )\n", - "\n", - " # Time range to materialize\n", - " backfill_time = BackfillTime(\n", - " start=backfill_timestamp,\n", - " end=backfill_timestamp,\n", - " step=timedelta(days=1),\n", - " )\n", - "\n", - " # Destinations:\n", - " # For online store,\n", - " redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", - "\n", - " # For offline store,\n", - " # adls_sink = HdfsSink(output_path=)\n", - "\n", - " settings = MaterializationSettings(\n", - " name=FEATURE_TABLE_NAME + \".job\", # job name\n", - " backfill_time=backfill_time,\n", - " sinks=[redis_sink], # or adls_sink\n", - " feature_names=materialized_feature_names,\n", - " )\n", - "\n", - " client.materialize_features(\n", - " settings=settings,\n", - " # Note, execution_configurations argument only works when using a new job cluster\n", - " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", - " )\n", - "\n", - " client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - 
"nuid":"5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9", - "showTitle":false, - "title":"" - } - }, - "source":[ - "Now, you can retrieve features for online scoring as follows:" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"424bc9eb-a47f-4b46-be69-8218d55e66ad", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "if REDIS_KEY and RESOURCE_PREFIX:\n", - " # Note, to get a single key, you may use client.get_online_features instead\n", - " materialized_feature_values = client.multi_get_online_features(\n", - " feature_table=FEATURE_TABLE_NAME,\n", - " keys=[\"239\", \"265\"],\n", - " feature_names=materialized_feature_names,\n", - " )\n", - " materialized_feature_values" - ] - }, - { - "cell_type":"markdown", - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"3596dc71-a363-4b6a-a169-215c89978558", - "showTitle":false, - "title":"" - } - }, - "source":[ - "## Cleanup" - ] - }, - { - "cell_type":"code", - "execution_count":null, - "metadata":{ - "application/vnd.databricks.v1+cell":{ - "inputWidgets":{ - - }, - "nuid":"b5fb292e-bbb6-4dd7-8e79-c62d9533e820", - "showTitle":false, - "title":"" - } - }, - "outputs":[ - - ], - "source":[ - "# Remove temporary files\n", - "dbutils.fs.rm(\"dbfs:/tmp/\", recurse=True)" - ] - } - ], - "metadata":{ - "application/vnd.databricks.v1+notebook":{ - "dashboards":[ - - ], - "language":"python", - "notebookMetadata":{ - "pythonIndentUnit":4 - }, - "notebookName":"databricks_quickstart_nyc_taxi_demo", - "notebookOrigID":2365994027381987, - "widgets":{ - "REDIS_KEY":{ - "currentValue":"", - "nuid":"d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca", - "widgetInfo":{ - "defaultValue":"", - "label":null, - "name":"REDIS_KEY", - "options":{ - "validationRegex":null, - "widgetType":"text" - }, - "widgetType":"text" - } - }, - "RESOURCE_PREFIX":{ - "currentValue":"", - "nuid":"87a26035-86fc-4dbd-8dd0-dc546c1c63c1", - "widgetInfo":{ - "defaultValue":"", - "label":null, - "name":"RESOURCE_PREFIX", - "options":{ - "validationRegex":null, - "widgetType":"text" - }, - "widgetType":"text" - } - } - } - }, - "kernelspec":{ - "display_name":"Python 3.10.8 64-bit", - "language":"python", - "name":"python3" + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "843d3142-24ca-4bd1-9e31-b55163804fe3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"RESOURCE_PREFIX\", \"\")\n", + "dbutils.widgets.text(\"REDIS_KEY\", \"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Feature Store on Databricks Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", + "\n", + "This notebook is specifically written for Databricks and is relying on some of the Databricks packages such as `dbutils`. The intention here is to provide a \"one click run\" example with minimum configuration. For example:\n", + "- This notebook skips feature registry which requires running Azure Purview. 
\n", + "- To make the online feature query work, you will need to configure the Redis endpoint. \n", + "\n", + "The full-fledged notebook can be found from [here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c2ce58c7-9263-469a-bbb7-43364ddb07b8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prerequisite\n", + "\n", + "To use feathr materialization for online scoring with Redis cache, you may deploy a Redis cluster and set `RESOURCE_PREFIX` and `REDIS_KEY` via Databricks widgets. Note that the deployed Redis host address should be `{RESOURCE_PREFIX}redis.redis.cache.windows.net`. More details about how to deploy the Redis cluster can be found [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html#configurure-redis-cluster).\n", + "\n", + "To run this notebook, you'll need to install `feathr` pip package. Here, we install notebook-scoped library. For details, please see [Azure Databricks dependency management document](https://learn.microsoft.com/en-us/azure/databricks/libraries/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4609d7ad-ad74-40fc-b97e-f440a0fa0737", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n", + "!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c81fa80c-bca6-4ae5-84ad-659a036977bd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Notebook Steps\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install Feathr and necessary dependencies.\n", + "1. Create shareable features with Feathr feature definition configs.\n", + "1. Create training data using point-in-time correct feature join\n", + "1. Train and evaluate a prediction model.\n", + "1. 
Materialize feature values for online scoring.\n", + "\n", + "The overall data flow is as follows:\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import RegressionEvaluator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.regression import GBTRegressor\n", + "from pyspark.sql import DataFrame\n", + "import pyspark.sql.functions as F\n", + "\n", + "import feathr\n", + "from feathr import (\n", + " FeathrClient,\n", + " # Feature data types\n", + " BOOLEAN,\n", + " FLOAT,\n", + " INT32,\n", + " ValueType,\n", + " # Feature data sources\n", + " INPUT_CONTEXT,\n", + " HdfsSource,\n", + " # Feature aggregations\n", + " TypedKey,\n", + " WindowAggTransformation,\n", + " # Feature types and anchor\n", + " DerivedFeature,\n", + " Feature,\n", + " FeatureAnchor,\n", + " # Materialization\n", + " BackfillTime,\n", + " MaterializationSettings,\n", + " RedisSink,\n", + " # Offline feature computation\n", + " FeatureQuery,\n", + " ObservationSettings,\n", + ")\n", + "from feathr.datasets import nyc_taxi\n", + "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", + "from feathr.utils.config import generate_config\n", + "from feathr.utils.job_utils import get_result_df\n", + "\n", + "\n", + "print(\n", + " f\"\"\"Feathr version: {feathr.__version__}\n", + "Databricks runtime version: {spark.conf.get(\"spark.databricks.clusterUsageTags.sparkVersion\")}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ab35fa01-b392-457e-8fde-7e445a3c39b5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", + "\n", + "In this notebook, we define all the necessary resource key values for authentication. We use the values passed by the databricks widgets at the top of this notebook. Instead of manually entering the values to the widgets, we can also use [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) to retrieve them.\n", + "Please refer to [how-to guide documents for granting key-vault access](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html#3-grant-key-vault-and-synapse-access-to-selected-users-optional) and [Databricks' Azure Key Vault-backed scopes](https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes) for more details." 
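For instance, a minimal sketch of the Key Vault route, assuming an Azure Key Vault-backed secret scope has already been created (the scope name `feathr-secrets` and the secret names below are placeholders, not part of this notebook):

```python
# Hypothetical sketch: read the two values from an Azure Key Vault-backed
# Databricks secret scope instead of typing them into the widgets.
# The scope name and secret names are placeholders -- use the ones you created.
RESOURCE_PREFIX = dbutils.secrets.get(scope="feathr-secrets", key="resource-prefix")
REDIS_KEY = dbutils.secrets.get(scope="feathr-secrets", key="redis-key")
```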
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "09f93a9f-7b33-4d91-8f31-ee3b20991696", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "RESOURCE_PREFIX = dbutils.widgets.get(\"RESOURCE_PREFIX\")\n", + "PROJECT_NAME = \"feathr_getting_started\"\n", + "\n", + "REDIS_KEY = dbutils.widgets.get(\"REDIS_KEY\")\n", + "\n", + "# Use a databricks cluster\n", + "SPARK_CLUSTER = \"databricks\"\n", + "\n", + "# Databricks file system path\n", + "DATA_STORE_PATH = f\"dbfs:/{PROJECT_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Redis credential\n", + "os.environ[\"REDIS_PASSWORD\"] = REDIS_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Configurations\n", + "\n", + "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com//feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field.\n", + "\n", + "In the following cell, we set required databricks credentials automatically by using a databricks notebook context object as well as new job cluster spec." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", + " # You may set an existing cluster id here, but Databricks recommend to use new clusters for greater reliability.\n", + " databricks_cluster_id=None, # Set None to create a new job cluster\n", + " databricks_workspace_token_value=ctx.apiToken().get(),\n", + " spark_config__databricks__workspace_instance_url=f\"https://{ctx.tags().get('browserHostName').get()}\",\n", + ")\n", + "\n", + "with open(config_path, \"r\") as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "58d22dc1-7590-494d-94ca-3e2488c31c8e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." 
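For example, a minimal sketch of such an override (the jar path is a placeholder; set the variable before creating the `FeathrClient` so it is picked up):

```python
import os

# Placeholder path -- point this at the Feathr runtime jar the Databricks job should use.
os.environ["spark_config__databricks__feathr_runtime_location"] = "dbfs:/feathr/feathr-runtime.jar"
```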
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### View the NYC taxi fare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", + "\n", + "# Download the data file\n", + "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", + "df_raw.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Defining features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", + "\n", + "* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n", + "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", + "\n", + "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n", + "\n", + "There are two types of features -- anchored features and derivated features:\n", + "\n", + "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", + "* **Derived features**: Features that are computed on top of other features.\n", + "\n", + "#### Define anchored features\n", + "\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "75b8d2ed-84df-4446-ae07-5f715434f3ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", + "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "93abbcc2-562b-47e4-ad4c-1fedd7cc64df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We define f_trip_distance and f_trip_time_duration features separately\n", + "# so that we can reuse them later for the derived features.\n", + "f_trip_distance = Feature(\n", + " name=\"f_trip_distance\",\n", + " feature_type=FLOAT,\n", + " transform=\"trip_distance\",\n", + ")\n", + "f_trip_time_duration = Feature(\n", + " name=\"f_trip_time_duration\",\n", + " feature_type=FLOAT,\n", + " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", + ")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(\n", + " name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"trip_distance > 30.0\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_month\",\n", + " feature_type=INT32,\n", + " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_hour_of_day\",\n", + " feature_type=INT32,\n", + " transform=\"hour(lpep_dropoff_datetime)\",\n", + " ),\n", + "]\n", + "\n", + "# After you have defined features, bring them together to build the anchor to the source.\n", + "feature_anchor = FeatureAnchor(\n", + " name=\"feature_anchor\",\n", + " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", + " features=features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can define the source with a preprocessing python function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def preprocessing(df: DataFrame) -> DataFrame:\n", + " import pyspark.sql.functions as F\n", + "\n", + " df = df.withColumn(\n", + " \"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\")\n", + " )\n", + " return df\n", + "\n", + "\n", + "batch_source = HdfsSource(\n", + " name=\"nycTaxiBatchSource\",\n", + " path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " preprocessing=preprocessing,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "46f863c4-bb81-434a-a448-6b585031a221", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For the features with aggregation, the supported functions are as follows:\n", + "\n", + "| Aggregation Function | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "agg_key = TypedKey(\n", + " key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\",\n", + ")\n", + "\n", + "agg_window = \"90d\"\n", + "\n", + "# Anchored features with aggregations\n", + "agg_features = [\n", + " Feature(\n", + " name=\"f_location_avg_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"AVG\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + " Feature(\n", + " name=\"f_location_max_fare\",\n", + " key=agg_key,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(\n", + " agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"MAX\",\n", + " window=agg_window,\n", + " ),\n", + " ),\n", + "]\n", + "\n", + "agg_feature_anchor = FeatureAnchor(\n", + " name=\"agg_feature_anchor\",\n", + " source=batch_source, # External data source for feature. 
Typically a data table.\n", + " features=agg_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Define derived features\n", + "\n", + "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "05633bc3-9118-449b-9562-45fc437576c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "derived_features = [\n", + " DerivedFeature(\n", + " name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " ],\n", + " transform=\"f_trip_distance / f_trip_time_duration\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build features\n", + "\n", + "Finally, we build the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(\n", + " anchor_list=[feature_anchor, agg_feature_anchor],\n", + " derived_feature_list=derived_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", + "\n", + "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com//feathr-ai/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "02feabc9-2f2f-43e8-898d-b28082798e98", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "DATA_FORMAT = \"parquet\"\n", + "offline_features_path = str(\n", + " Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "67e81466-c736-47ba-b122-e640642c01cf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Features that we want to request. Can use a subset of features\n", + "query = FeatureQuery(\n", + " feature_list=feature_names,\n", + " key=agg_key,\n", + ")\n", + "settings = ObservationSettings(\n", + " observation_path=DATA_FILE_PATH,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")\n", + "client.get_offline_features(\n", + " observation_settings=settings,\n", + " feature_query=query,\n", + " # Note, execution_configurations argument only works when using a new job cluster\n", + " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", + " execution_configurations=SparkExecutionConfiguration(\n", + " {\n", + " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", + " }\n", + " ),\n", + " output_path=offline_features_path,\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9871af55-25eb-41ee-a58a-fda74b1a174e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Show feature results\n", + "df = get_result_df(\n", + " spark=spark,\n", + " client=client,\n", + " data_format=\"parquet\",\n", + " res_url=offline_features_path,\n", + ")\n", + "df.select(feature_names).limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 4. Train and Evaluate a Prediction Model\n", + "\n", + "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", + "\n", + "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Load Train and Test Data from the Offline Feature Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bd2cdc83-0920-46e8-9454-e5e6e7832ce0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train / test split\n", + "train_df, test_df = (\n", + " df.withColumn( # Dataframe that we generated from get_offline_features call.\n", + " \"label\", F.col(\"fare_amount\").cast(\"double\")\n", + " )\n", + " .where(F.col(\"f_trip_time_duration\") > 0)\n", + " .fillna(0)\n", + " .randomSplit([0.8, 0.2])\n", + ")\n", + "\n", + "print(f\"Num train samples: {train_df.count()}\")\n", + "print(f\"Num test samples: {test_df.count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build a ML Pipeline\n", + "\n", + "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "2a254361-63e9-45b2-8c19-40549762eacb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Generate a feature vector column for SparkML\n", + "vector_assembler = VectorAssembler(\n", + " inputCols=[x for x in df.columns if x in feature_names],\n", + " outputCol=\"features\",\n", + ")\n", + "\n", + "# Define a model\n", + "gbt = GBTRegressor(\n", + " featuresCol=\"features\",\n", + " maxIter=100,\n", + " maxDepth=5,\n", + " maxBins=16,\n", + ")\n", + "\n", + "# Create a ML pipeline\n", + "ml_pipeline = Pipeline(\n", + " stages=[\n", + " vector_assembler,\n", + " gbt,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Train and Evaluate the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Train a model\n", + "model = ml_pipeline.fit(train_df)\n", + "\n", + "# Make predictions\n", + "predictions = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "1f9b584c-6228-4a02-a6c3-9b8dd2b78091", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Evaluate\n", + "evaluator = RegressionEvaluator(\n", + " labelCol=\"label\",\n", + " predictionCol=\"prediction\",\n", + ")\n", + "\n", + "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", + "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", + "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"inputWidgets": {}, + "nuid": "25c33abd-6e87-437d-a6a1-86435f065a1e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", + "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", + "\n", + "predictions_pdf.plot(\n", + " x=\"index\",\n", + " y=[\"label\", \"prediction\"],\n", + " style=[\"-\", \":\"],\n", + " figsize=(20, 10),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "664d78cc-4a92-430c-9e05-565ba904558e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "predictions_pdf.plot.scatter(\n", + " x=\"label\",\n", + " y=\"prediction\",\n", + " xlim=(0, 100),\n", + " ylim=(0, 100),\n", + " figsize=(10, 10),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8a56d165-c813-4ce0-8ae6-9f4d313c463d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 5. Materialize Feature Values for Online Scoring\n", + "\n", + "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", + "\n", + "Note, only the features anchored to offline data source can be materialized." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "751fa72e-8f94-40a1-994e-3e8315b51d37", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "materialized_feature_names = [feature.name for feature in agg_features]\n", + "materialized_feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", + "\n", + " # Get the last date from the dataset\n", + " backfill_timestamp = (\n", + " df_raw.select(\n", + " F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL)\n", + " )\n", + " .agg({TIMESTAMP_COL: \"max\"})\n", + " .collect()[0][0]\n", + " )\n", + "\n", + " # Time range to materialize\n", + " backfill_time = BackfillTime(\n", + " start=backfill_timestamp,\n", + " end=backfill_timestamp,\n", + " step=timedelta(days=1),\n", + " )\n", + "\n", + " # Destinations:\n", + " # For online store,\n", + " redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", + "\n", + " # For offline store,\n", + " # adls_sink = HdfsSink(output_path=)\n", + "\n", + " settings = MaterializationSettings(\n", + " name=FEATURE_TABLE_NAME + \".job\", # job name\n", + " backfill_time=backfill_time,\n", + " sinks=[redis_sink], # or adls_sink\n", + " feature_names=materialized_feature_names,\n", + " )\n", + "\n", + " client.materialize_features(\n", + " settings=settings,\n", + " # Note, execution_configurations argument only works when using a new job cluster\n", + " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", + " )\n", + "\n", + " client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5aa13acd-58ec-4fc2-86bb-dc1d9951ebb9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Now, you can retrieve features for online scoring as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "424bc9eb-a47f-4b46-be69-8218d55e66ad", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if REDIS_KEY and RESOURCE_PREFIX:\n", + " # Note, to get a single key, you may use client.get_online_features instead\n", + " materialized_feature_values = client.multi_get_online_features(\n", + " feature_table=FEATURE_TABLE_NAME,\n", + " keys=[\"239\", \"265\"],\n", + " feature_names=materialized_feature_names,\n", + " )\n", + " materialized_feature_values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3596dc71-a363-4b6a-a169-215c89978558", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b5fb292e-bbb6-4dd7-8e79-c62d9533e820", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Remove temporary files\n", + "dbutils.fs.rm(\"dbfs:/tmp/\", recurse=True)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "databricks_quickstart_nyc_taxi_demo", + "notebookOrigID": 2365994027381987, + "widgets": { + "REDIS_KEY": { + "currentValue": "", + "nuid": "d39ce0d5-bcfe-47ef-b3d9-eff67e5cdeca", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "REDIS_KEY", + "options": { + "validationRegex": null, + "widgetType": "text" }, - "language_info":{ - "codemirror_mode":{ - "name":"ipython", - "version":3 - }, - "file_extension":".py", - "mimetype":"text/x-python", - "name":"python", - "nbconvert_exporter":"python", - "pygments_lexer":"ipython3", - "version":"3.10.8" + "widgetType": "text" + } + }, + "RESOURCE_PREFIX": { + "currentValue": "", + "nuid": "87a26035-86fc-4dbd-8dd0-dc546c1c63c1", + "widgetInfo": { + "defaultValue": "", + "label": null, + "name": "RESOURCE_PREFIX", + "options": { + "validationRegex": null, + "widgetType": "text" }, - "vscode":{ - "interpreter":{ - "hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } - } + "widgetType": "text" + } + } + } + }, + "kernelspec": { + "display_name": "Python 3.10.4 ('feathr')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "nbformat":4, - "nbformat_minor":0 -} \ No newline at end of file + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb deleted file mode 100644 index 19e13395c..000000000 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb +++ /dev/null @@ 
-1,1445 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "384e5e16-7213-4186-9d04-09d03b155534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Feathr Feature Store on Databricks Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n", - "\n", - "The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n", - "\n", - "- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n", - "- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n", - "\n", - "However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/feathr-ai/feathr/blob/main/docs/samples/nyc_taxi_demo.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/feathr-ai/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f00b9d0b-94d1-418f-89b9-25bbacb8b068", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "! 
pip install feathr pandavro scikit-learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n", - "import json\n", - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Get the required databricks credentials automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Get current databricks notebook context\n", - "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - "host_name = ctx.tags().get(\"browserHostName\").get()\n", - "host_token = ctx.apiToken().get()\n", - "cluster_id = ctx.tags().get(\"clusterId\").get()\n", - "\n", - "\n", - "\n", - "# databricks_config = {'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n", - "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n", - "os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n", - "# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n", - 
"os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n", - "os.environ['project_config__project_name']='feathr_getting_started'\n", - "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings, allow_materialize_non_agg_feature =True)` API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=\"\"\n", - "redis_host=\"\"\n", - "redis_password=\"\"\n", - "redis_ssl=\"\"\n", - "\n", - "# Set the resource link\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Configure required credentials (skip if you don't use those):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started2'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: ''\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: ''\n", - " jdbc_table: ''\n", - " snowflake:\n", - " snowflake_enabled: false\n", - " url: \".snowflakecomputing.com\"\n", - " user: \"\"\n", - " role: \"\"\n", - " warehouse: \"\"\n", - "spark_config:\n", - " # choice for spark runtime. 
Currently support: azure_synapse, databricks\n", - " # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n", - " spark_cluster: \"databricks\"\n", - " spark_result_output_parts: \"1\"\n", - "\n", - "online_store:\n", - " redis:\n", - " host: '.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https://.azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "16420730-582e-4e11-a343-efc0ddd35108", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "46f863c4-bb81-434a-a448-6b585031a221", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "05633bc3-9118-449b-9562-45fc437576c2", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_distance],\n", - " transform=\"f_trip_distance * 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_distance_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "output_path = 'dbfs:/feathrazure_test.avro'\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path\n", - " )\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "51f078e3-3f8f-4f10-b7f1-499ac8a9ff07", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "23c797b2-ac1a-4cf3-b0ed-c05216de3f37", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - 
"errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "from feathr.utils.job_utils import get_result_df\n", - "df_res = get_result_df(client, format=\"avro\", res_url = output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b9be042e-eb12-46b9-9d91-a0e5dd0c704f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "84745f36-5bac-49c0-903b-38828b923c7c", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and 
materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings, allow_materialize_non_agg_feature =True)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can then get the features from the online store (Redis):" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "nyc_driver_demo", - "notebookOrigID": 930353059183053, - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3.9.14 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.14" - }, - "vscode": { - "interpreter": { - "hash": "a665b5d41d17b532ea9890333293a1b812fa0b73c9c25c950b3cedf1bebd0438" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb index fbc349f4c..31754950e 100644 --- a/docs/samples/nyc_taxi_demo.ipynb +++ b/docs/samples/nyc_taxi_demo.ipynb @@ -1,721 +1,1134 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feathr Feature Store on Azure Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. It includes these steps:\n", - "\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). 
The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/feathr-ai/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", - "First step is to provision required cloud resources if you want to use Feathr. Feathr provides a python based client to interact with cloud resources.\n", - "\n", - "Please follow the steps [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to provision required cloud resources. Due to the complexity of the possible cloud environment, it is almost impossible to create a script that works for all the use cases. Because of this, [azure_resource_provision.sh](https://github.com/feathr-ai/feathr/blob/main/docs/how-to-guides/azure_resource_provision.sh) is a full end to end command line to create all the required resources, and you can tailor the script as needed, while [the companion documentation](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) can be used as a complete guide for using that shell script.\n", - "\n", - "\n", - "![Architecture](https://github.com/feathr-ai/feathr/blob/main/docs/images/architecture.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Install Feathr \n", - "\n", - "Install Feathr using pip:\n", - "\n", - "`pip install -U feathr pandavro scikit-learn`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resource_prefix = \"feathr_resource_prefix\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install feathr azure-cli pandavro scikit-learn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Login to Azure with a device code (You will see instructions in the output):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! 
az login --use-device-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all the required credentials from Azure KeyVault" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "key_vault_name=resource_prefix+\"kv\"\n", - "synapse_workspace_url=resource_prefix+\"syws\"\n", - "adls_account=resource_prefix+\"dls\"\n", - "adls_fs_name=resource_prefix+\"fs\"\n", - "purview_name=resource_prefix+\"purview\"\n", - "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", - "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False, additionally_allowed_tenants=['*'])\n", - "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", - "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", - "retrieved_secret = client.get_secret(secretName).value\n", - "\n", - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", - "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", - "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", - "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", - "\n", - "# Set the resource link\n", - "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", - "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", - "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password\n", - "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. 
Otherwise, update the configuration below.\n", - "\n", - "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - " - 'AZURE_CLIENT_ID'\n", - " - 'AZURE_TENANT_ID'\n", - " - 'AZURE_CLIENT_SECRET'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: 's3.amazonaws.com'\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: 'feathrtestdb'\n", - " jdbc_table: 'feathrtesttable'\n", - " snowflake:\n", - " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", - " user: \"feathrintegration\"\n", - " role: \"ACCOUNTADMIN\"\n", - " warehouse: \"COMPUTE_WH\"\n", - "spark_config:\n", - " spark_cluster: 'azure_synapse'\n", - " spark_result_output_parts: '1'\n", - " azure_synapse:\n", - " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", - " pool_name: 'spark3'\n", - " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", - " executor_size: 'Small'\n", - " executor_num: 1\n", - " databricks:\n", - " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", - " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", - " work_dir: 'dbfs:/feathr_getting_started'\n", - "online_store:\n", - " redis:\n", - " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", - "\n", - "You should setup the environment variables in order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", - "\n", - "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", - "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession, DataFrame\n", - "def feathr_udf_day_calc(df: DataFrame) -> DataFrame:\n", - " from pyspark.sql.functions import dayofweek, dayofyear, col\n", - " df = df.withColumn(\"fare_amount_cents\", col(\"fare_amount\")*100)\n", - " return df\n", - "\n", - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " preprocessing=feathr_udf_day_calc,\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "f_trip_time_duration = Feature(name=\"f_trip_time_duration\",\n", - " feature_type=INT32,\n", - " transform=\"(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_total_fare_cents\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"SUM\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_time_distance = DerivedFeature(name=\"f_trip_time_distance\",\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " f_trip_distance, f_trip_time_duration],\n", - " transform=\"f_trip_distance * f_trip_time_duration\")\n", - "\n", - "f_trip_time_rounded = DerivedFeature(name=\"f_trip_time_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_time_duration],\n", - " transform=\"f_trip_time_duration % 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_time_distance, f_trip_time_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if client.spark_runtime == 'databricks':\n", - " output_path = 'dbfs:/feathrazure_test.avro'\n", - "else:\n", - " output_path = feathr_output_path\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_time_rounded\", \"f_is_long_trip_distance\", \"f_location_total_fare_cents\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path)\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", - " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", - " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", - " tmp_dir = tempfile.TemporaryDirectory()\n", - " client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", - " dataframe_list = []\n", - " # assuming the result are in avro format\n", - " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", - " 
dataframe_list.append(pdx.read_avro(file))\n", - " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", - " tmp_dir.cleanup()\n", - " return vertical_concat_df\n", - "\n", - "df_res = get_result_df(client)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and materialize the feature value to offline and/or online storage. 
\n", - "\n", - "We can push the generated features to the online store like below:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings, allow_materialize_non_agg_feature =True)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then get the features from the online store (Redis):\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Registering and Fetching features\n", - "\n", - "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.register_features()\n", - "client.list_registered_features(project_name=\"feathr_getting_started\")" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.14 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.14" + }, + "source": [ + "# Feathr Quick Start Notebook\n", + "\n", + "This notebook illustrates the use of Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", + "\n", + "The major problems Feathr solves are:\n", + "\n", + "1. Create, share and manage useful features from raw source data.\n", + "2. Provide Point-in-time feature join to create training dataset to ensure no data leakage.\n", + "3. Deploy the same feature data to online store to eliminate training and inference data skew." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "Feathr has native cloud integration. 
The first step is to provision the required cloud resources if you want to use Feathr.\n",
+ "\n",
+ "Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. This allows you to quickly get started with automated deployment using an Azure Resource Manager template. For more details, please refer to [README.md](https://github.com/feathr-ai/feathr#%EF%B8%8F-running-feathr-on-cloud-with-a-few-simple-steps).\n",
+ "\n",
+ "Additionally, to run this notebook, you'll need to install the `feathr` pip package. For local Spark, simply run `pip install feathr` on the machine that runs this notebook. To use Databricks or Azure Synapse Analytics, please see the dependency management documents:\n",
+ "- [Azure Databricks dependency management](https://learn.microsoft.com/en-us/azure/databricks/libraries/)\n",
+ "- [Azure Synapse Analytics dependency management](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Notebook Steps\n",
+ "\n",
+ "This tutorial demonstrates the key capabilities of Feathr, including:\n",
+ "\n",
+ "1. Install Feathr and necessary dependencies\n",
+ "2. Create shareable features with Feathr feature definition configs\n",
+ "3. Create training data using point-in-time correct feature join\n",
+ "4. Train a prediction model and evaluate the model and features\n",
+ "5. Register the features to share across teams\n",
+ "6. Materialize feature values for online scoring\n",
+ "\n",
+ "The overall data flow is as follows:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Install Feathr and Necessary Dependencies\n",
+ "\n",
+ "Install feathr and necessary packages by running `pip install feathr[notebook]` if you haven't installed them already."
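For example, in a notebook cell you could run the following (a minimal sketch; the `notebook` extra is the one named above, adjust it to your environment if needed):

```python
# Install Feathr with the notebook extras into the current kernel, if it is not installed yet.
%pip install "feathr[notebook]"
```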
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "from math import sqrt\n", + "import os\n", + "from pathlib import Path\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import RegressionEvaluator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.regression import GBTRegressor\n", + "from pyspark.sql import DataFrame, SparkSession\n", + "import pyspark.sql.functions as F\n", + "\n", + "import feathr\n", + "from feathr import (\n", + " FeathrClient,\n", + " # Feature data types\n", + " BOOLEAN, FLOAT, INT32, ValueType,\n", + " # Feature data sources\n", + " INPUT_CONTEXT, HdfsSource,\n", + " # Feature aggregations\n", + " TypedKey, WindowAggTransformation,\n", + " # Feature types and anchor\n", + " DerivedFeature, Feature, FeatureAnchor,\n", + " # Materialization\n", + " BackfillTime, MaterializationSettings, RedisSink,\n", + " # Offline feature computation\n", + " FeatureQuery, ObservationSettings,\n", + ")\n", + "from feathr.datasets import nyc_taxi\n", + "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", + "from feathr.utils.config import generate_config\n", + "from feathr.utils.job_utils import get_result_df\n", + "from feathr.utils.platform import is_databricks, is_jupyter\n", + "\n", + "print(f\"Feathr version: {feathr.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", + "\n", + "First, we define all the necessary resource key values for authentication. These values are retrieved by using [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) cloud key value store. For authentication, we use Azure CLI credential in this notebook, but you may add secrets' list and get permission for the necessary service principal instead of running `az login --use-device-code`.\n", + "\n", + "Please refer to [A note on using azure key vault to store credentials](https://github.com/feathr-ai/feathr/blob/41e7496b38c43af6d7f8f1de842f657b27840f6d/docs/how-to-guides/feathr-configuration-and-env.md#a-note-on-using-azure-key-vault-to-store-credentials) for more details." 
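If you would rather authenticate with a service principal than run `az login --use-device-code`, a hedged sketch of building the credential could look like this (the three environment variable names are assumptions for illustration, not values defined elsewhere in this notebook):

```python
import os
from azure.identity import ClientSecretCredential

# Assumes a service principal that has been granted 'get' permission on the Key Vault secrets.
credential = ClientSecretCredential(
    tenant_id=os.environ["AZURE_TENANT_ID"],
    client_id=os.environ["AZURE_CLIENT_ID"],
    client_secret=os.environ["AZURE_CLIENT_SECRET"],
)
```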
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "RESOURCE_PREFIX = None # TODO fill the value used to deploy the resources via ARM template\n", + "PROJECT_NAME = \"feathr_getting_started\"\n", + "\n", + "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", + "SPARK_CLUSTER = \"local\"\n", + "\n", + "# TODO fill values to use databricks cluster:\n", + "DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n", + "DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n", + "\n", + "# TODO fill values to use Azure Synapse cluster:\n", + "AZURE_SYNAPSE_SPARK_POOL = None # Set Azure Synapse Spark pool name\n", + "AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n", + "\n", + "# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n", + "DATA_STORE_PATH = TemporaryDirectory().name\n", + "\n", + "# Feathr config file path to use an existing file\n", + "FEATHR_CONFIG_PATH = None\n", + "\n", + "# If set True, use an interactive browser authentication to get the redis password.\n", + "USE_CLI_AUTH = False\n", + "\n", + "REGISTER_FEATURES = False\n", + "\n", + "# (For the notebook test pipeline) If true, use ScrapBook package to collect the results.\n", + "SCRAP_RESULTS = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use Databricks as the feathr client's target platform, you may need to set a databricks token to an environment variable like:\n", + "\n", + "`export DATABRICKS_WORKSPACE_TOKEN_VALUE=your-token`\n", + "\n", + "or in the notebook cell,\n", + "\n", + "`os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = your-token`\n", + "\n", + "If you are running this notebook on Databricks, the token will be automatically retrieved by using the current Databricks notebook context.\n", + "\n", + "On the other hand, to use Azure Synapse cluster, you have to specify the synapse workspace storage key:\n", + "\n", + "`export ADLS_KEY=your-key`\n", + "\n", + "or in the notebook cell,\n", + "\n", + "`os.environ[\"ADLS_KEY\"] = your-key`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SPARK_CLUSTER == \"azure_synapse\" and not os.environ.get(\"ADLS_KEY\"):\n", + " os.environ[\"ADLS_KEY\"] = add_your_key_here\n", + "elif SPARK_CLUSTER == \"databricks\" and not os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"):\n", + " os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = add_your_token_here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Force to use dbfs if the notebook is running on Databricks\n", + "if is_databricks() and not DATA_STORE_PATH.startswith(\"dbfs:\"):\n", + " DATA_STORE_PATH = f\"dbfs:/{DATA_STORE_PATH.lstrip('/')}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if USE_CLI_AUTH:\n", + " !az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Redis password\n", + "if 'REDIS_PASSWORD' not in os.environ:\n", + " # Try to get all the required credentials from Azure Key Vault\n", + " from azure.identity import AzureCliCredential, DefaultAzureCredential \n", + " from azure.keyvault.secrets import SecretClient\n", + "\n", + " 
vault_url = f\"https://{RESOURCE_PREFIX}kv.vault.azure.net\"\n", + " if USE_CLI_AUTH:\n", + " credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n", + " else:\n", + " credential = DefaultAzureCredential(\n", + " exclude_interactive_browser_credential=False,\n", + " additionally_allowed_tenants=['*'],\n", + " )\n", + " secret_client = SecretClient(vault_url=vault_url, credential=credential)\n", + " retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value\n", + " os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Configurations\n", + "\n", + "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com//feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if FEATHR_CONFIG_PATH:\n", + " config_path = FEATHR_CONFIG_PATH\n", + "else:\n", + " config_path = generate_config(\n", + " resource_prefix=RESOURCE_PREFIX,\n", + " project_name=PROJECT_NAME,\n", + " spark_config__spark_cluster=SPARK_CLUSTER,\n", + " spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,\n", + " spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,\n", + " spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n", + " databricks_cluster_id=DATABRICKS_CLUSTER_ID,\n", + " )\n", + "\n", + "with open(config_path, 'r') as f: \n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." 
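For instance, a minimal sketch of one such override (the jar location below is a placeholder, not a real artifact path):

```python
import os

# Overrides spark_config.databricks.feathr_runtime_location from the generated yaml config.
os.environ["spark_config__databricks__feathr_runtime_location"] = "dbfs:/placeholder/feathr-assembly.jar"
```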
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "inputWidgets": {},
+ "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "### Initialize Feathr client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "inputWidgets": {},
+ "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "client = FeathrClient(config_path=config_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "inputWidgets": {},
+ "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "### Prepare the NYC taxi fare dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# If the notebook is running on Jupyter, start a spark session:\n",
+ "if is_jupyter():\n",
+ " spark = (\n",
+ " SparkSession\n",
+ " .builder\n",
+ " .appName(\"feathr\")\n",
+ " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0,io.delta:delta-core_2.12:2.1.1\")\n",
+ " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n",
+ " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
+ " .config(\"spark.ui.port\", \"8080\") # Set ui port other than the default one (4040) so that feathr spark job doesn't fail. \n",
+ " .getOrCreate()\n",
+ " )\n",
+ "\n",
+ "# Else, you must already have a spark session object available in databricks or synapse notebooks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "inputWidgets": {},
+ "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n",
+ "\n",
+ "# Download the data file\n",
+ "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n",
+ "df_raw.limit(5).toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "inputWidgets": {},
+ "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "### Defining features with Feathr\n",
+ "\n",
+ "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n",
+ "\n",
+ "* The feature key (a.k.a. entity id) identifies the subject of the feature, e.g. a user_id or location_id.\n",
+ "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n",
+ "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n",
+ "\n",
+ "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n",
+ "\n",
+ "There are two types of features -- anchored features and derived features:\n",
+ "\n",
+ "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation.
\n", + "* **Derived features**: Features that are computed on top of other features.\n", + "\n", + "#### Define anchored features\n", + "\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", + "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# We define f_trip_distance and f_trip_time_duration features separately\n", + "# so that we can reuse them later for the derived features.\n", + "f_trip_distance = Feature(\n", + " name=\"f_trip_distance\",\n", + " feature_type=FLOAT,\n", + " transform=\"trip_distance\",\n", + ")\n", + "f_trip_time_duration = Feature(\n", + " name=\"f_trip_time_duration\",\n", + " feature_type=FLOAT,\n", + " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", + ")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(\n", + " name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"trip_distance > 30.0\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_day_of_month\",\n", + " feature_type=INT32,\n", + " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", + " ),\n", + " Feature(\n", + " name=\"f_hour_of_day\",\n", + " feature_type=INT32,\n", + " transform=\"hour(lpep_dropoff_datetime)\",\n", + " ),\n", + "]\n", + "\n", + "# After you have defined features, bring them together to build the anchor to the source.\n", + "feature_anchor = FeatureAnchor(\n", + " name=\"feature_anchor\",\n", + " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", + " features=features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can define the source with a preprocessing python function. In order to make the source data accessible from the target spark cluster, we upload the data file into either DBFS or Azure Blob Storage if needed." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define data source path\n",
+ "if client.spark_runtime == \"local\" or (client.spark_runtime == \"databricks\" and is_databricks()):\n",
+ " # In local mode, we can use the same data path as the source.\n",
+ " # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n",
+ " data_source_path = DATA_FILE_PATH\n",
+ "else:\n",
+ " # Otherwise, upload the local file to the cloud storage (either dbfs or adls).\n",
+ " data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocessing(df: DataFrame) -> DataFrame:\n",
+ " import pyspark.sql.functions as F\n",
+ " df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n",
+ " return df\n",
+ "\n",
+ "batch_source = HdfsSource(\n",
+ " name=\"nycTaxiBatchSource\",\n",
+ " path=data_source_path,\n",
+ " event_timestamp_column=TIMESTAMP_COL,\n",
+ " preprocessing=preprocessing,\n",
+ " timestamp_format=TIMESTAMP_FORMAT,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the features with aggregation, the supported functions are as follows:\n",
+ "\n",
+ "| Aggregation Function | Input Type | Description |\n",
+ "| --- | --- | --- |\n",
+ "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the numerical operation on the numeric inputs. |\n",
+ "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers.|\n",
+ "|LATEST| Any |Returns the latest non-null values from within the defined time window |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agg_key = TypedKey(\n",
+ " key_column=\"DOLocationID\",\n",
+ " key_column_type=ValueType.INT32,\n",
+ " description=\"location id in NYC\",\n",
+ " full_name=\"nyc_taxi.location_id\",\n",
+ ")\n",
+ "\n",
+ "agg_window = \"90d\"\n",
+ "\n",
+ "# Anchored features with aggregations\n",
+ "agg_features = [\n",
+ " Feature(\n",
+ " name=\"f_location_avg_fare\",\n",
+ " key=agg_key,\n",
+ " feature_type=FLOAT,\n",
+ " transform=WindowAggTransformation(\n",
+ " agg_expr=\"fare_amount_cents\",\n",
+ " agg_func=\"AVG\",\n",
+ " window=agg_window,\n",
+ " ),\n",
+ " ),\n",
+ " Feature(\n",
+ " name=\"f_location_max_fare\",\n",
+ " key=agg_key,\n",
+ " feature_type=FLOAT,\n",
+ " transform=WindowAggTransformation(\n",
+ " agg_expr=\"fare_amount_cents\",\n",
+ " agg_func=\"MAX\",\n",
+ " window=agg_window,\n",
+ " ),\n",
+ " ),\n",
+ "]\n",
+ "\n",
+ "agg_feature_anchor = FeatureAnchor(\n",
+ " name=\"agg_feature_anchor\",\n",
+ " source=batch_source, # External data source for feature.
Typically a data table.\n", + " features=agg_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Define derived features\n", + "\n", + "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "derived_features = [\n", + " DerivedFeature(\n", + " name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " ],\n", + " transform=\"f_trip_distance / f_trip_time_duration\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Build features\n", + "\n", + "Finally, we build the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(\n", + " anchor_list=[feature_anchor, agg_feature_anchor],\n", + " derived_feature_list=derived_features,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 3. Create Training Data Using Point-in-Time Correct Feature Join\n", + "\n", + "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com//feathr-ai/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", + "feature_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_FORMAT = \"parquet\"\n", + "offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" }, - "vscode": { - "interpreter": { - "hash": "a665b5d41d17b532ea9890333293a1b812fa0b73c9c25c950b3cedf1bebd0438" - } + "scrolled": false + }, + "outputs": [], + "source": [ + "# Features that we want to request. Can use a subset of features\n", + "query = FeatureQuery(\n", + " feature_list=feature_names,\n", + " key=agg_key,\n", + ")\n", + "settings = ObservationSettings(\n", + " observation_path=data_source_path,\n", + " event_timestamp_column=TIMESTAMP_COL,\n", + " timestamp_format=TIMESTAMP_FORMAT,\n", + ")\n", + "client.get_offline_features(\n", + " observation_settings=settings,\n", + " feature_query=query,\n", + " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", + " execution_configurations=SparkExecutionConfiguration({\n", + " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", + " }),\n", + " output_path=offline_features_path,\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show feature results\n", + "df = get_result_df(\n", + " spark=spark,\n", + " client=client,\n", + " data_format=DATA_FORMAT,\n", + " res_url=offline_features_path,\n", + ")\n", + "df.select(feature_names).limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## 4. Train a Prediction Model and Evaluate the Features\n", + "\n", + "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", + "\n", + "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Train and Test Data from the Offline Feature Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train / test split\n", + "train_df, test_df = (\n", + " df # Dataframe that we generated from get_offline_features call.\n", + " .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n", + " .where(F.col(\"f_trip_time_duration\") > 0)\n", + " .fillna(0)\n", + " .randomSplit([0.8, 0.2])\n", + ")\n", + "\n", + "print(f\"Num train samples: {train_df.count()}\")\n", + "print(f\"Num test samples: {test_df.count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build a ML Pipeline\n", + "\n", + "Here, we use Spark ML Pipeline to aggregate feature vectors and feed them to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate a feature vector column for SparkML\n", + "vector_assembler = VectorAssembler(\n", + " inputCols=[x for x in df.columns if x in feature_names],\n", + " outputCol=\"features\",\n", + ")\n", + "\n", + "# Define a model\n", + "gbt = GBTRegressor(\n", + " featuresCol=\"features\",\n", + " maxIter=100,\n", + " maxDepth=5,\n", + " maxBins=16,\n", + ")\n", + "\n", + "# Create a ML pipeline\n", + "ml_pipeline = Pipeline(stages=[\n", + " vector_assembler,\n", + " gbt,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and Evaluate the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train a model\n", + "model = ml_pipeline.fit(train_df)\n", + "\n", + "# Make predictions\n", + "predictions = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate\n", + "evaluator = RegressionEvaluator(\n", + " labelCol=\"label\",\n", + " predictionCol=\"prediction\",\n", + ")\n", + "\n", + "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n", + "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n", + "print(f\"RMSE: {rmse}\\nMAE: {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# predicted fare vs actual fare plots -- will this work for databricks / synapse / local ?\n", + "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n", + "\n", + "predictions_pdf.plot(\n", + " x=\"index\",\n", + " y=[\"label\", \"prediction\"],\n", + " style=['-', ':'],\n", + " figsize=(20, 10),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictions_pdf.plot.scatter(\n", + " x=\"label\",\n", + " y=\"prediction\",\n", + " xlim=(0, 100),\n", + " ylim=(0, 100),\n", + " figsize=(10, 10),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Register the Features to Share Across Teams\n", + "\n", + "You can register your features in the centralized registry and share the corresponding project with other team members who want to consume those features and for further use." 
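Before registering, you may also want to check which features actually contributed to the model trained above; a minimal sketch (assuming `model` and `vector_assembler` are the fitted pipeline and assembler from the previous cells):

```python
# Map each input feature to its importance score in the fitted GBT model.
gbt_model = model.stages[-1]
importances = dict(zip(vector_assembler.getInputCols(), gbt_model.featureImportances.toArray()))
for name, score in sorted(importances.items(), key=lambda kv: -kv[1]):
    print(f"{name}: {score:.3f}")
```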
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if REGISTER_FEATURES:\n", + " try:\n", + " client.register_features()\n", + " except KeyError:\n", + " # TODO temporarily go around the \"Already exists\" error\n", + " pass \n", + " print(client.list_registered_features(project_name=PROJECT_NAME))\n", + " # You can get the actual features too by calling client.get_features_from_registry(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" } + }, + "source": [ + "## 6. Materialize Feature Values for Online Scoring\n", + "\n", + "While we computed feature values on-the-fly at request time via Feathr, we can pre-compute the feature values and materialize them to offline or online storages such as Redis.\n", + "\n", + "Note, only the features anchored to offline data source can be materialized." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the last date from the dataset\n", + "backfill_timestamp = (\n", + " df_raw\n", + " .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n", + " .agg({TIMESTAMP_COL: \"max\"})\n", + " .collect()[0][0]\n", + ")\n", + "backfill_timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", + "showTitle": false, + "title": "" + }, + "scrolled": false + }, + "outputs": [], + "source": [ + "FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n", + "\n", + "# Time range to materialize\n", + "backfill_time = BackfillTime(\n", + " start=backfill_timestamp,\n", + " end=backfill_timestamp,\n", + " step=timedelta(days=1),\n", + ")\n", + "\n", + "# Destinations:\n", + "# For online store,\n", + "redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n", + "\n", + "# For offline store,\n", + "# adls_sink = HdfsSink(output_path=)\n", + "\n", + "settings = MaterializationSettings(\n", + " name=FEATURE_TABLE_NAME + \".job\", # job name\n", + " backfill_time=backfill_time,\n", + " sinks=[redis_sink], # or adls_sink\n", + " feature_names=[feature.name for feature in agg_features],\n", + ")\n", + "\n", + "client.materialize_features(\n", + " settings=settings,\n", + " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n", + ")\n", + "\n", + "client.wait_job_to_finish(timeout_sec=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, you can retrieve features for online scoring as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Note, to get a single key, you may use client.get_online_features instead\n", + "materialized_feature_values = client.multi_get_online_features(\n", + " feature_table=FEATURE_TABLE_NAME,\n", + " keys=[\"239\", \"265\"],\n", + " feature_names=[feature.name for feature in agg_features],\n", + ")\n", + "materialized_feature_values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Unregister, delete cached files or do any other cleanups." 
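A possible cleanup sketch for the TODO above (an assumption rather than part of the original notebook: it only removes the local temporary data folder created earlier):

```python
import shutil

# Remove the locally cached dataset and offline feature output; skip on Databricks where dbfs paths are used.
if not is_databricks():
    shutil.rmtree(DATA_STORE_PATH, ignore_errors=True)
```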
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the spark session if it is a local session.\n", + "if is_jupyter():\n", + " spark.stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scrap Variables for Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SCRAP_RESULTS:\n", + " # Record results for test pipelines\n", + " import scrapbook as sb\n", + " sb.glue(\"materialized_feature_values\", materialized_feature_values)\n", + " sb.glue(\"rmse\", rmse)\n", + " sb.glue(\"mae\", mae)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "nyc_driver_demo", + "notebookOrigID": 930353059183053, + "widgets": {} + }, + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" }, - "nbformat": 4, - "nbformat_minor": 2 + "vscode": { + "interpreter": { + "hash": "e34a1a57d2e174682770a82d94a178aa36d3ccfaa21227c5d2308e319b7ae532" + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index 2cacdcabd..52c7f1a8f 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -451,7 +451,6 @@ def get_offline_features(self, output_path: Union[str, Sink], execution_configurations: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, config_file_name:str = "feature_join_conf/feature_join.conf", - udf_files = None, verbose: bool = False ): """ @@ -522,7 +521,9 @@ def _get_offline_features_with_config(self, job_tags = {OUTPUT_PATH_TAG:feature_join_job_params.job_output_path} # set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations: - job_tags[OUTPUT_FORMAT]= execution_configurations[OUTPUT_FORMAT] + job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT] + else: + job_tags[OUTPUT_FORMAT] = "avro" ''' - Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself. - Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. 
They are usually not spark specific (for example if we want to specify the location of the feature files, or want to diff --git a/feathr_project/feathr/datasets/__init__.py b/feathr_project/feathr/datasets/__init__.py new file mode 100644 index 000000000..a1e2e5bf3 --- /dev/null +++ b/feathr_project/feathr/datasets/__init__.py @@ -0,0 +1,9 @@ +"""Utilities for downloading sample datasets""" + +from feathr.datasets.constants import ( + NYC_TAXI_SMALL_URL +) + +__all__ = [ + "NYC_TAXI_SMALL_URL", +] diff --git a/feathr_project/feathr/datasets/constants.py b/feathr_project/feathr/datasets/constants.py new file mode 100644 index 000000000..849865570 --- /dev/null +++ b/feathr_project/feathr/datasets/constants.py @@ -0,0 +1,3 @@ +NYC_TAXI_SMALL_URL = ( + "https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv" +) diff --git a/feathr_project/feathr/datasets/nyc_taxi.py b/feathr_project/feathr/datasets/nyc_taxi.py new file mode 100644 index 000000000..e00fa7150 --- /dev/null +++ b/feathr_project/feathr/datasets/nyc_taxi.py @@ -0,0 +1,87 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from threading import local +from urllib.parse import urlparse + +import pandas as pd +from pyspark.sql import DataFrame, SparkSession + +from feathr.datasets import NYC_TAXI_SMALL_URL +from feathr.datasets.utils import maybe_download +from feathr.utils.platform import is_databricks + + +def get_pandas_df( + local_cache_path: str = None, +) -> pd.DataFrame: + """Get NYC taxi fare prediction data samples as a pandas DataFrame. + + Refs: + https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + + Args: + local_cache_path (optional): Local cache file path to download the data set. + If local_cache_path is a directory, the source file name will be added. + + Returns: + pandas DataFrame + """ + # if local_cache_path params is not provided then create a temporary folder + if local_cache_path is None: + local_cache_path = TemporaryDirectory().name + + # If local_cache_path is a directory, add the source file name. + src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) + dst_path = Path(local_cache_path) + if dst_path.suffix != src_filepath.suffix: + local_cache_path = str(dst_path.joinpath(src_filepath.name)) + + maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=local_cache_path) + + pdf = pd.read_csv(local_cache_path) + + return pdf + + +def get_spark_df( + spark: SparkSession, + local_cache_path: str, +) -> DataFrame: + """Get NYC taxi fare prediction data samples as a spark DataFrame. + + Refs: + https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + + Args: + spark: Spark session. + local_cache_path: Local cache file path to download the data set. + If local_cache_path is a directory, the source file name will be added. + + Returns: + Spark DataFrame + """ + # In spark, local_cache_path should be a persist directory or file path + if local_cache_path is None: + raise ValueError("In spark, `local_cache_path` should be a persist directory or file path.") + + # If local_cache_path is a directory, add the source file name. 
+ src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) + dst_path = Path(local_cache_path) + if dst_path.suffix != src_filepath.suffix: + local_cache_path = str(dst_path.joinpath(src_filepath.name)) + + if is_databricks(): + # Databricks uses "dbfs:/" prefix for spark paths + if not local_cache_path.startswith("dbfs:"): + local_cache_path = f"dbfs:/{local_cache_path.lstrip('/')}" + # Databricks uses "/dbfs/" prefix for python paths + python_local_cache_path = local_cache_path.replace("dbfs:", "/dbfs") + # TODO add "if is_synapse()" + else: + python_local_cache_path = local_cache_path + + maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=python_local_cache_path) + + df = spark.read.option("header", True).csv(local_cache_path) + + return df diff --git a/feathr_project/feathr/datasets/utils.py b/feathr_project/feathr/datasets/utils.py new file mode 100644 index 000000000..5dcfb6e87 --- /dev/null +++ b/feathr_project/feathr/datasets/utils.py @@ -0,0 +1,64 @@ +"""Dataset utilities +""" +import logging +import math +from pathlib import Path +import requests +from urllib.parse import urlparse + +from tqdm import tqdm + + +log = logging.getLogger(__name__) + + +def maybe_download(src_url: str, dst_filepath: str, expected_bytes=None) -> bool: + """Check if file exists. If not, download and return True. Else, return False. + + Refs: + https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/download_utils.py + + Args: + src_url: Source file URL. + dst_filepath: Destination file path. + expected_bytes (optional): Expected bytes of the file to verify. + + Returns: + bool: Whether the file was downloaded or not + """ + dst_filepath = Path(dst_filepath) + + if dst_filepath.is_file(): + log.info(f"File {str(dst_filepath)} already exists") + return False + + # Check dir if exists. If not, create one + dst_filepath.parent.mkdir(parents=True, exist_ok=True) + + response = requests.get(src_url, stream=True) + if response.status_code == 200: + log.info(f"Downloading {src_url}") + total_size = int(response.headers.get("content-length", 0)) + block_size = 1024 + num_iterables = math.ceil(total_size / block_size) + with open(str(dst_filepath.resolve()), "wb") as file: + for data in tqdm( + response.iter_content(block_size), + total=num_iterables, + unit="KB", + unit_scale=True, + ): + file.write(data) + + # Verify the file size + if expected_bytes is not None and expected_bytes != dst_filepath.stat().st_size: + # Delete the file since the size is not the same as the expected one. + dst_filepath.unlink() + raise IOError(f"Failed to verify {str(dst_filepath)}. 
Maybe interrupted while downloading?") + else: + return True + + else: + response.raise_for_status() + # If not HTTPError yet still cannot download + raise Exception(f"Problem downloading {src_url}") diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py index 6f3aa5112..a10f30818 100644 --- a/feathr_project/feathr/spark_provider/_databricks_submission.py +++ b/feathr_project/feathr/spark_provider/_databricks_submission.py @@ -1,68 +1,66 @@ -from ast import Raise +from collections import namedtuple import copy import json import os -import time -from collections import namedtuple from os.path import basename from pathlib import Path -from typing import Any, Dict, List, Optional, Union +import time +from typing import Dict, List, Optional, Union from urllib.parse import urlparse from urllib.request import urlopen -import requests from databricks_cli.dbfs.api import DbfsApi from databricks_cli.runs.api import RunsApi from databricks_cli.sdk.api_client import ApiClient +from loguru import logger +import requests +from requests.structures import CaseInsensitiveDict + from feathr.constants import * from feathr.version import get_maven_artifact_fullname from feathr.spark_provider._abc import SparkJobLauncher -from loguru import logger -from requests.structures import CaseInsensitiveDict class _FeathrDatabricksJobLauncher(SparkJobLauncher): """Class to interact with Databricks Spark cluster - This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. - For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. - This runner will only fill in necessary arguments in the JSON template. - - This class will read from the provided configs string, and do the following steps. - This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: - 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details - 2. Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) - 3. Only supports `new_cluster` type for now - 4. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field - 5. will override the name of this job + This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. + For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. + This runner will only fill in necessary arguments in the JSON template. + + This class will read from the provided configs string, and do the following steps. + This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: + 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details + 2. 
Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) + 3. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field + 4. will override the name of this job + + Args: + workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url + token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication + config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. + databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. + """ - Args: - workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url - token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. - databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. - """ def __init__( - self, - workspace_instance_url: str, - token_value: str, - config_template: Union[str,Dict], - databricks_work_dir: str = 'dbfs:/feathr_jobs', + self, + workspace_instance_url: str, + token_value: str, + config_template: Union[str, Dict], + databricks_work_dir: str = "dbfs:/feathr_jobs", ): - - # Below we will use Databricks job APIs (as well as many other APIs) to submit jobs or transfer files # For Job APIs, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs self.config_template = config_template # remove possible trailing '/' due to wrong input format - self.workspace_instance_url = workspace_instance_url.rstrip('/') + self.workspace_instance_url = workspace_instance_url.rstrip("/") self.auth_headers = CaseInsensitiveDict() # Authenticate the REST APIs. 
Documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - self.auth_headers['Accept'] = 'application/json' - self.auth_headers['Authorization'] = f'Bearer {token_value}' + self.auth_headers["Accept"] = "application/json" + self.auth_headers["Authorization"] = f"Bearer {token_value}" self.databricks_work_dir = databricks_work_dir - self.api_client = ApiClient(host=self.workspace_instance_url,token=token_value) + self.api_client = ApiClient(host=self.workspace_instance_url, token=token_value) def upload_or_get_cloud_path(self, local_path_or_http_path: str): """ @@ -78,7 +76,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): with urlopen(local_path_or_http_path) as f: # use REST API to avoid local temp file data = f.read() - files = {'file': data} + files = {"file": data} # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs r = requests.post(url=self.workspace_instance_url+'/api/2.0/dbfs/put', headers=self.auth_headers, files=files, data={'overwrite': 'true', 'path': cloud_dest_path}) @@ -91,8 +89,12 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): cloud_dest_path = local_path_or_http_path elif src_parse_result.scheme.startswith(('wasb','s3','gs')): # if the path starts with a location that's not a local path - logger.error("File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path) - raise RuntimeError(f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually.") + logger.error( + "File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path + ) + raise RuntimeError( + f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually." + ) else: # else it should be a local file path or dir if os.path.isdir(local_path_or_http_path): @@ -123,7 +125,18 @@ def _upload_local_file_to_workspace(self, local_path: str) -> str: raise RuntimeError(f"The source path: {local_path}, or the destination path: {cloud_dest_path}, is/are not valid.") from e return cloud_dest_path - def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}): + def submit_feathr_job( + self, + job_name: str, + main_jar_path: str, + main_class_name: str, + arguments: List[str], + python_files: List[str], + reference_files_path: List[str] = [], + job_tags: Dict[str, str] = None, + configuration: Dict[str, str] = {}, + properties: Dict[str, str] = {}, + ): """ submit the feathr job to databricks Refer to the databricks doc for more details on the meaning of the parameters: @@ -147,72 +160,93 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: # otherwise users might have missed the quotes in the config. 
Treat them as dict # Note that we need to use deep copy here, in order to make `self.config_template` immutable # Otherwise, since we need to change submission_params later, which will modify `self.config_template` and cause unexpected behaviors - submission_params = copy.deepcopy(self.config_template) - - submission_params['run_name'] = job_name - if 'existing_cluster_id' not in submission_params: + submission_params = copy.deepcopy(self.config_template) + + submission_params["run_name"] = job_name + cfg = configuration.copy() + if "existing_cluster_id" in submission_params: + logger.info("Using an existing general purpose cluster to run the feathr job...") + if cfg: + logger.warning( + "Spark execution configuration will be ignored. To use job-specific spark configs, please use a new job cluster or set the configs via Databricks UI." + ) + if job_tags: + logger.warning( + "Job tags will be ignored. To assign job tags to the cluster, please use a new job cluster." + ) + elif "new_cluster" in submission_params: + logger.info("Using a new job cluster to run the feathr job...") # if users don't specify existing_cluster_id # Solving this issue: Handshake fails trying to connect from Azure Databricks to Azure PostgreSQL with SSL # https://docs.microsoft.com/en-us/answers/questions/170730/handshake-fails-trying-to-connect-from-azure-datab.html - configuration['spark.executor.extraJavaOptions'] = '-Djava.security.properties=' - configuration['spark.driver.extraJavaOptions'] = '-Djava.security.properties=' - submission_params['new_cluster']['spark_conf'] = configuration + cfg["spark.executor.extraJavaOptions"] = "-Djava.security.properties=" + cfg["spark.driver.extraJavaOptions"] = "-Djava.security.properties=" + submission_params["new_cluster"]["spark_conf"] = cfg if job_tags: - custom_tags = submission_params['new_cluster'].get('custom_tags', {}) + custom_tags = submission_params["new_cluster"].get("custom_tags", {}) for tag, value in job_tags.items(): custom_tags[tag] = value - submission_params['new_cluster']['custom_tags'] = custom_tags + submission_params["new_cluster"]["custom_tags"] = custom_tags + else: + # TODO we should fail fast -- maybe check this in config verification while initializing the client. + raise ValueError( + "No cluster specifications are found. Either 'existing_cluster_id' or 'new_cluster' should be configured via feathr config." + ) # the feathr main jar file is anyway needed regardless it's pyspark or scala spark if not main_jar_path: logger.info(f"Main JAR file is not set, using default package '{get_maven_artifact_fullname()}' from Maven") submission_params['libraries'][0]['maven'] = { "coordinates": get_maven_artifact_fullname() } else: - submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) + submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(main_jar_path) # see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6 if python_files: # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask # the first file is the pyspark driver code. 
we only need the driver code to execute pyspark - param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])} + param_and_file_dict = { + "parameters": arguments, + "python_file": self.upload_or_get_cloud_path(python_files[0]), + } # indicates this is a pyspark job # `setdefault` method will get the value of the "spark_python_task" item, if the "spark_python_task" item does not exist, insert "spark_python_task" with the value "param_and_file_dict": - submission_params.setdefault('spark_python_task',param_and_file_dict) + submission_params.setdefault("spark_python_task", param_and_file_dict) else: # this is a scala spark job - submission_params['spark_jar_task']['parameters'] = arguments - submission_params['spark_jar_task']['main_class_name'] = main_class_name + submission_params["spark_jar_task"]["parameters"] = arguments + submission_params["spark_jar_task"]["main_class_name"] = main_class_name result = RunsApi(self.api_client).submit_run(submission_params) try: # see if we can parse the returned result - self.res_job_id = result['run_id'] + self.res_job_id = result["run_id"] except: - logger.error("Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result) + logger.error( + "Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result + ) exit(1) result = RunsApi(self.api_client).get_run(self.res_job_id) - self.job_url = result['run_page_url'] - logger.info('Feathr job Submitted Successfully. View more details here: {}', self.job_url) + self.job_url = result["run_page_url"] + logger.info("Feathr job Submitted Successfully. View more details here: {}", self.job_url) # return ID as the submission result return self.res_job_id def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: - """ Returns true if the job completed successfully - """ + """Returns true if the job completed successfully""" start_time = time.time() while (timeout_seconds is None) or (time.time() - start_time < timeout_seconds): status = self.get_status() - logger.debug('Current Spark job status: {}', status) + logger.debug("Current Spark job status: {}", status) # see all the status here: # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runlifecyclestate # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runresultstate - if status in {'SUCCESS'}: + if status in {"SUCCESS"}: return True - elif status in {'INTERNAL_ERROR', 'FAILED', 'TIMEDOUT', 'CANCELED'}: + elif status in {"INTERNAL_ERROR", "FAILED", "TIMEDOUT", "CANCELED"}: result = RunsApi(self.api_client).get_run_output(self.res_job_id) # See here for the returned fields: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-8 # print out logs and stack trace if the job has failed @@ -225,14 +259,14 @@ def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: else: time.sleep(30) else: - raise TimeoutError('Timeout waiting for Feathr job to complete') + raise TimeoutError("Timeout waiting for Feathr job to complete") def get_status(self) -> str: assert self.res_job_id is not None result = RunsApi(self.api_client).get_run(self.res_job_id) # first try to get result state. 
it might not be available, and if that's the case, try to get life_cycle_state # see result structure: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 - res_state = result['state'].get('result_state') or result['state']['life_cycle_state'] + res_state = result["state"].get("result_state") or result["state"]["life_cycle_state"] assert res_state is not None return res_state @@ -246,7 +280,6 @@ def get_job_result_uri(self) -> str: # in case users call this API even when there's no tags available return None if custom_tags is None else custom_tags[OUTPUT_PATH_TAG] - def get_job_tags(self) -> Dict[str, str]: """Get job tags @@ -257,21 +290,23 @@ def get_job_tags(self) -> Dict[str, str]: # For result structure, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 result = RunsApi(self.api_client).get_run(self.res_job_id) - if 'new_cluster' in result['cluster_spec']: - custom_tags = result['cluster_spec']['new_cluster']['custom_tags'] + if "new_cluster" in result["cluster_spec"]: + custom_tags = result["cluster_spec"]["new_cluster"]["custom_tags"] return custom_tags else: # this is not a new cluster; it's an existing cluster. - logger.warning("Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration.") + logger.warning( + "Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration." + ) return None - def download_result(self, result_path: str, local_folder: str): """ Supports downloading files from the result folder. Only support paths starts with `dbfs:/` and only support downloading files in one folder (per Spark's design, everything will be in the result folder in a flat manner) """ - if not result_path.startswith('dbfs'): - raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .') + if not result_path.startswith("dbfs"): + raise RuntimeError( + 'Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with "dbfs:" .' 
+ ) DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder) - diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py index a3dd92174..a5ef0e53d 100644 --- a/feathr_project/feathr/spark_provider/_localspark_submission.py +++ b/feathr_project/feathr/spark_provider/_localspark_submission.py @@ -1,3 +1,4 @@ +from copy import deepcopy from datetime import datetime import json import os @@ -10,6 +11,7 @@ from loguru import logger from pyspark import * +from feathr.constants import OUTPUT_PATH_TAG from feathr.version import get_maven_artifact_fullname from feathr.spark_provider._abc import SparkJobLauncher @@ -40,6 +42,7 @@ def __init__( self.retry_sec = retry_sec self.packages = self._get_default_package() self.master = master or "local[*]" + self.job_tags = None def upload_or_get_cloud_path(self, local_path_or_http_path: str): """For Local Spark Case, no need to upload to cloud workspace.""" @@ -52,6 +55,7 @@ def submit_feathr_job( main_class_name: str, arguments: List[str] = None, python_files: List[str] = None, + job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}, **_, @@ -66,9 +70,10 @@ def submit_feathr_job( main_class_name: name of your main class arguments: all the arguments you want to pass into the spark job python_files: required .zip, .egg, or .py files of spark job + job_tags: tags of the job, for example you might want to put your user ID, or a tag with a certain information configuration: Additional configs for the spark job properties: System properties configuration - **_: Not used arguments in local spark mode, such as reference_files_path and job_tags + **_: Not used arguments in local spark mode, such as reference_files_path """ logger.warning( f"Local Spark Mode only support basic params right now and should be used only for testing purpose." 
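For reference, a minimal sketch of the kind of `config_template` the Databricks launcher described above expects; the values are placeholders (the launcher itself fills in `run_name`, the `libraries` entry, and the `spark_jar_task` fields), and the cluster spec mirrors the defaults introduced later in this change.

```python
import json

# Template for a new job cluster; "FEATHR_FILL_IN" marks fields the launcher overrides.
config_template = json.dumps({
    "run_name": "FEATHR_FILL_IN",
    "new_cluster": {
        "spark_version": "11.2.x-scala2.12",
        "node_type_id": "Standard_D3_v2",
        "num_workers": 2,
        "spark_conf": {"FEATHR_FILL_IN": "FEATHR_FILL_IN"},
    },
    "libraries": [{"jar": "FEATHR_FILL_IN"}],
    "spark_jar_task": {
        "main_class_name": "FEATHR_FILL_IN",
        "parameters": ["FEATHR_FILL_IN"],
    },
})

# To reuse an existing all-purpose cluster instead, drop "new_cluster" and add
# {"existing_cluster_id": "<cluster-id>"}; per the launcher, job-specific spark
# configuration and job tags are then ignored.
```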
@@ -125,6 +130,8 @@ def submit_feathr_job( logger.info(f"Local Spark job submit with pid: {proc.pid}.") + self.job_tags = deepcopy(job_tags) + return proc def wait_for_completion(self, timeout_seconds: Optional[float] = 500) -> bool: @@ -198,6 +205,22 @@ def get_status(self) -> str: """Get the status of the job, only a placeholder for local spark""" return self.latest_spark_proc.returncode + def get_job_result_uri(self) -> str: + """Get job output path + + Returns: + str: output_path + """ + return self.job_tags.get(OUTPUT_PATH_TAG, None) if self.job_tags else None + + def get_job_tags(self) -> Dict[str, str]: + """Get job tags + + Returns: + Dict[str, str]: a dict of job tags + """ + return self.job_tags + def _init_args(self, job_name: str, confs: Dict[str, str]) -> List[str]: logger.info(f"Spark job: {job_name} is running on local spark with master: {self.master}.") args = [ diff --git a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py index 55756ba3d..c4f102566 100644 --- a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py +++ b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py @@ -176,6 +176,7 @@ def prepare_pyspark_udf_files(feature_names: List[str], local_workspace_dir): for feature_name in feature_names: if feature_name in features_with_preprocessing: has_py_udf_preprocessing = True + break if has_py_udf_preprocessing: pyspark_driver_path = os.path.join(local_workspace_dir, FEATHR_PYSPARK_DRIVER_FILE_NAME) diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py new file mode 100644 index 000000000..9a5f5fd89 --- /dev/null +++ b/feathr_project/feathr/utils/config.py @@ -0,0 +1,278 @@ +import collections.abc +from copy import deepcopy +import os +import json +from tempfile import NamedTemporaryFile +from typing import Dict +import yaml + +from feathr.utils.platform import is_databricks + + +DEFAULT_FEATHR_CONFIG = { + "api_version": 1, + "project_config": {}, # "project_name" + "feature_registry": {}, # "api_endpoint" + "spark_config": { + "spark_cluster": "local", # Currently support 'azure_synapse', 'databricks', and 'local' + "spark_result_output_parts": "1", + }, + "offline_store": { + "adls": {"adls_enabled": "true"}, + "wasb": {"wasb_enabled": "true"}, + }, + "online_store": { + "redis": { + # "host" + "port": "6380", + "ssl_enabled": "true", + } + } +} + + +# New databricks job cluster config +DEFAULT_DATABRICKS_CLUSTER_CONFIG = { + "spark_version": "11.2.x-scala2.12", + "node_type_id": "Standard_D3_v2", + "num_workers": 2, + "spark_conf": { + "FEATHR_FILL_IN": "FEATHR_FILL_IN", + # Exclude conflicting packages if use feathr <= v0.8.0: + "spark.jars.excludes": "commons-logging:commons-logging,org.slf4j:slf4j-api,com.google.protobuf:protobuf-java,javax.xml.bind:jaxb-api", + }, +} + + +# New Azure Synapse spark pool config +DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG = { + "executor_size": "Small", + "executor_num": 2, +} + + +def generate_config( + resource_prefix: str, + project_name: str, + output_filepath: str = None, + databricks_workspace_token_value: str = None, + databricks_cluster_id: str = None, + redis_password: str = None, + adls_key: str = None, + use_env_vars: bool = True, + **kwargs, +) -> str: + """Generate a feathr config yaml file. + Note, `use_env_vars` argument gives an option to either use environment variables for generating the config file + or not. Feathr client will use environment variables anyway if they are set. 
+ + Keyword arguments follow the same naming convention as the feathr config. E.g. to set Databricks as the target + cluster, use `spark_config__spark_cluster="databricks"`. + See https://feathr-ai.github.io/feathr/quickstart_synapse.html#step-4-update-feathr-config for more details. + + Note: + This utility function assumes Azure resources are deployed using the Azure Resource Manager (ARM) template, + and infers resource names based on the given `resource_prefix`. If you deploy resources manually, you may need + to pass each resource url manually, e.g. `spark_config__azure_synapse__dev_url="your-resource-url"`. + + Args: + resource_prefix: Resource name prefix used when deploying Feathr resources by using ARM template. + project_name: Feathr project name. + cluster_name (optional): Databricks cluster or Azure Synapse spark pool name to use an existing one. + output_filepath (optional): Output filepath. + use_env_vars (optional): Whether to use environment variables if they are set. + databricks_workspace_token_value (optional): Databricks workspace token. If provided, the value will be stored + as the environment variable. + databricks_cluster_id (optional): Databricks cluster id to use an existing cluster. + redis_password (optional): Redis password. If provided, the value will be stored as the environment variable. + adls_key (optional): ADLS key. If provided, the value will be stored as the environment variable. + + Returns: + str: Generated config file path. This will be identical to `output_filepath` if provided. + """ + # Set keys + if databricks_workspace_token_value: + os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = databricks_workspace_token_value + if redis_password: + os.environ["REDIS_PASSWORD"] = redis_password + if adls_key: + os.environ["ADLS_KEY"] = adls_key + + # Set configs + config = deepcopy(DEFAULT_FEATHR_CONFIG) + config["project_config"]["project_name"] = project_name + config["feature_registry"]["api_endpoint"] = f"https://{resource_prefix}webapp.azurewebsites.net/api/v1" + config["online_store"]["redis"]["host"] = f"{resource_prefix}redis.redis.cache.windows.net" + + # Update configs using kwargs + new_config = _config_kwargs_to_dict(**kwargs) + _update_config(config, new_config) + + # Set platform specific configurations + if config["spark_config"]["spark_cluster"] == "local": + _set_local_spark_config() + elif config["spark_config"]["spark_cluster"] == "azure_synapse": + _set_azure_synapse_config( + config=config, + resource_prefix=resource_prefix, + project_name=project_name, + ) + elif config["spark_config"]["spark_cluster"] == "databricks": + _set_databricks_config( + config=config, + project_name=project_name, + cluster_id=databricks_cluster_id, + ) + + # Maybe update configs with environment variables + if use_env_vars: + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__SPARK_CLUSTER") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORK_DIR") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") + _maybe_update_config_with_env_var(config, "SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE") + + # Verify config + _verify_config(config) + + # Write config to file + if not output_filepath: + output_filepath = NamedTemporaryFile(mode="w", 
delete=False).name + + with open(output_filepath, "w") as f: + yaml.dump(config, f, default_flow_style=False) + + return output_filepath + + +def _set_local_spark_config(): + """Set environment variables for local spark cluster.""" + os.environ["SPARK_LOCAL_IP"] = os.getenv( + "SPARK_LOCAL_IP", + "127.0.0.1", + ) + + +def _set_azure_synapse_config( + config: Dict, + resource_prefix: str, + project_name: str, +): + """Set configs for Azure Synapse spark cluster.""" + + config["spark_config"]["azure_synapse"] = config["spark_config"].get("azure_synapse", {}) + + if not config["spark_config"]["azure_synapse"].get("dev_url"): + config["spark_config"]["azure_synapse"]["dev_url"] = f"https://{resource_prefix}syws.dev.azuresynapse.net" + + if not config["spark_config"]["azure_synapse"].get("workspace_dir"): + config["spark_config"]["azure_synapse"]["workspace_dir"] =\ + f"abfss://{resource_prefix}fs@{resource_prefix}dls.dfs.core.windows.net/{project_name}" + + for k, v in DEFAULT_AZURE_SYNAPSE_SPARK_POOL_CONFIG.items(): + if not config["spark_config"]["azure_synapse"].get(k): + config["spark_config"]["azure_synapse"][k] = v + + +def _set_databricks_config( + config: Dict, + project_name: str, + cluster_id: str = None, +): + """Set configs for Databricks spark cluster.""" + + config["spark_config"]["databricks"] = config["spark_config"].get("databricks", {}) + + if not config["spark_config"]["databricks"].get("work_dir"): + config["spark_config"]["databricks"]["work_dir"] = f"dbfs:/{project_name}" + + if not config["spark_config"]["databricks"].get("config_template"): + databricks_config = { + "run_name": "FEATHR_FILL_IN", + "libraries": [{"jar": "FEATHR_FILL_IN"}], + "spark_jar_task": { + "main_class_name": "FEATHR_FILL_IN", + "parameters": ["FEATHR_FILL_IN"], + }, + } + if cluster_id is None: + databricks_config["new_cluster"] = DEFAULT_DATABRICKS_CLUSTER_CONFIG + else: + databricks_config["existing_cluster_id"] = cluster_id + + config["spark_config"]["databricks"]["config_template"] = json.dumps(databricks_config) + + +def _config_kwargs_to_dict(**kwargs) -> Dict: + """Parse config's keyword arguments to dictionary. + e.g. `spark_config__spark_cluster="local"` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. 
+ """ + config = dict() + + for conf_key, conf_value in kwargs.items(): + if conf_value is None: + continue + + conf = config + keys = conf_key.split("__") + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + conf[keys[-1]] = conf_value + + return config + + +def _update_config(config: Dict, new_config: Dict): + """Update config dictionary with the values in `new_config`.""" + for k, v in new_config.items(): + if k in config and isinstance(v, collections.abc.Mapping): + _update_config(config[k], v) + else: + config[k] = v + + +def _verify_config(config: Dict): + """Verify config.""" + if config["spark_config"]["spark_cluster"] == "azure_synapse": + if not os.environ.get("ADLS_KEY"): + raise ValueError("ADLS_KEY must be set in environment variables") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL") and + config["spark_config"]["azure_synapse"].get("dev_url") is None + ): + raise ValueError("Azure Synapse dev endpoint is not provided.") + elif ( + not os.environ.get("SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME") and + config["spark_config"]["azure_synapse"].get("pool_name") is None + ): + raise ValueError("Azure Synapse pool name is not provided.") + + elif config["spark_config"]["spark_cluster"] == "databricks": + if not os.environ.get("DATABRICKS_WORKSPACE_TOKEN_VALUE"): + raise ValueError("Databricks workspace token is not provided.") + elif ( + not os.environ.get("SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL") and + config["spark_config"]["databricks"].get("workspace_instance_url") is None + ): + raise ValueError("Databricks workspace url is not provided.") + + +def _maybe_update_config_with_env_var(config: Dict, env_var_name: str): + """Update config dictionary with the values in environment variables. + e.g. `SPARK_CONFIG__SPARK_CLUSTER` will be parsed to `{"spark_config": {"spark_cluster": "local"}}`. + """ + if not os.environ.get(env_var_name): + return + + keys = env_var_name.lower().split("__") + conf = config + for k in keys[:-1]: + if k not in conf: + conf[k] = dict() + conf = conf[k] + + conf[keys[-1]] = os.environ[env_var_name] diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 6a6bd63c0..d9c73c355 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,77 +1,187 @@ -from feathr.client import FeathrClient -import os -import glob -from feathr.constants import OUTPUT_FORMAT +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Union + from loguru import logger import pandas as pd -import tempfile -from pandas.errors import EmptyDataError +from pyspark.sql import DataFrame, SparkSession + +from feathr.client import FeathrClient +from feathr.constants import OUTPUT_FORMAT +from feathr.utils.platform import is_databricks + + +def get_result_pandas_df( + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, +) -> pd.DataFrame: + """Download the job result dataset from cloud as a Pandas DataFrame. + + Args: + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to use client's job tags if exists. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. 
+ local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + the function will create a temporary directory. + + Returns: + pandas DataFrame + """ + return get_result_df(client, data_format, res_url, local_cache_path) + + +def get_result_spark_df( + spark: SparkSession, + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, +) -> DataFrame: + """Download the job result dataset from cloud as a Spark DataFrame. + + Args: + spark: Spark session + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to use client's job tags if exists. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. + local_cache_path (optional): Specify the absolute download path. if the user does not provide this, + the function will create a temporary directory. + + Returns: + Spark DataFrame + """ + return get_result_df(client, data_format, res_url, local_cache_path, spark=spark) +def get_result_df( + client: FeathrClient, + data_format: str = None, + res_url: str = None, + local_cache_path: str = None, + spark: SparkSession = None, +) -> Union[DataFrame, pd.DataFrame]: + """Download the job result dataset from cloud as a Spark DataFrame or pandas DataFrame. -def get_result_df(client: FeathrClient, format: str = None, res_url: str = None, local_folder: str = None) -> pd.DataFrame: - """Download the job result dataset from cloud as a Pandas dataframe to make it easier for the client to read. + Args: + client: Feathr client + data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. + Default to use client's job tags if exists. + res_url: Result URL to download files from. Note that this will not block the job so you need to make sure + the job is finished and the result URL contains actual data. Default to use client's job tags if exists. + local_cache_path (optional): Specify the absolute download directory. if the user does not provide this, + the function will create a temporary directory. + spark (optional): Spark session. If provided, the function returns spark Dataframe. + Otherwise, it returns pd.DataFrame. - format: format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. Default to `avro` if not specified. - res_url: output URL to download files. Note that this will not block the job so you need to make sure the job is finished and result URL contains actual data. - local_folder: optional parameter to specify the absolute download path. if the user does not provide this, function will create a temporary directory and delete it after reading the dataframe. + Returns: + Either Spark or pandas DataFrame. """ - # use a result url if it's provided by the user, otherwise use the one provided by the job + if data_format is None: + # May use data format from the job tags + if client.get_job_tags() and client.get_job_tags().get(OUTPUT_FORMAT): + data_format = client.get_job_tags().get(OUTPUT_FORMAT) + else: + raise ValueError("Cannot determine the data format. 
Please provide the data_format argument.") + + data_format = data_format.lower() + + if is_databricks() and client.spark_runtime != "databricks": + raise RuntimeError(f"The function is called from Databricks but the client.spark_runtime is {client.spark_runtime}.") + + # TODO Loading Synapse Delta table result into pandas has a bug: https://github.com/delta-io/delta-rs/issues/582 + if not spark and client.spark_runtime == "azure_synapse" and data_format == "delta": + raise RuntimeError(f"Loading Delta table result from Azure Synapse into pandas DataFrame is not supported. You maybe able to use spark DataFrame to load the result instead.") + + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: - raise RuntimeError("res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI.") + raise ValueError( + "`res_url` is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI." + ) - # use user provided format, if there isn't one, then otherwise use the one provided by the job; - # if none of them is available, "avro" is the default format. - format: str = format or client.get_job_tags().get(OUTPUT_FORMAT, "") - if format is None or format == "": - format = "avro" + if client.spark_runtime == "local": + if local_cache_path is not None: + logger.warning( + "In local spark mode, the result files are expected to be stored at a local storage and thus `local_cache_path` argument will be ignored." + ) + local_cache_path = res_url - # if local_folder params is not provided then create a temporary folder - if local_folder is not None: - local_dir_path = local_folder - else: - tmp_dir = tempfile.TemporaryDirectory() - local_dir_path = tmp_dir.name - - client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_dir_path) - dataframe_list = [] - # by default the result are in avro format - if format.casefold()=="parquet": - files = glob.glob(os.path.join(local_dir_path, '*.parquet')) - from pyarrow.parquet import ParquetDataset - ds = ParquetDataset(files) - result_df = ds.read().to_pandas() - elif format.casefold()=="delta": - from deltalake import DeltaTable - delta = DeltaTable(local_dir_path) - if not client.spark_runtime == 'azure_synapse': - # don't detect for synapse result with Delta as there's a problem with underlying system - # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 - result_df = delta.to_pyarrow_table().to_pandas() + elif client.spark_runtime == "databricks": + if not res_url.startswith("dbfs:"): + logger.warning( + f"In Databricks, the result files are expected to be stored in DBFS, but the res_url {res_url} is not a dbfs path. Prefixing it with 'dbfs:/'" + ) + res_url = f"dbfs:/{res_url.lstrip('/')}" + + if is_databricks(): # Check if the function is being called from Databricks + if local_cache_path is not None: + logger.warning( + "Result files are already in DBFS and thus `local_cache_path` will be ignored." 
+ ) + local_cache_path = res_url + + if local_cache_path is None: + local_cache_path = TemporaryDirectory().name + + if local_cache_path != res_url: + logger.info(f"{res_url} files will be downloaded into {local_cache_path}") + client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) + + result_df = None + try: + if spark is not None: + if data_format == "csv": + result_df = spark.read.option("header", True).csv(local_cache_path) + else: + result_df = spark.read.format(data_format).load(local_cache_path) else: - logger.info("Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse. Empty DataFrame is returned.") - result_df = pd.DataFrame() - elif format.casefold()=="avro": + result_df = _load_files_to_pandas_df( + dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. + data_format=data_format, + ) + except Exception as e: + logger.error(f"Failed to load result files from {local_cache_path} with format {data_format}.") + raise e + + return result_df + + +def _load_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.DataFrame: + + if data_format == "parquet": + return pd.read_parquet(dir_path) + + elif data_format == "delta": + from deltalake import DeltaTable + delta = DeltaTable(dir_path) + return delta.to_pyarrow_table().to_pandas() + + elif data_format == "avro": import pandavro as pdx - for file in glob.glob(os.path.join(local_dir_path, '*.avro')): - dataframe_list.append(pdx.read_avro(file)) - result_df = pd.concat(dataframe_list, axis=0) - elif format.casefold()=="csv": - for file in glob.glob(os.path.join(local_dir_path, '*.csv')): + if Path(dir_path).is_file(): + return pdx.read_avro(dir_path) + else: try: - df = pd.read_csv(file, index_col=None, header=None) - except EmptyDataError: - # in case there are empty files - df = pd.DataFrame() - dataframe_list.append(df) - result_df = pd.concat(dataframe_list, axis=0) - # Reset index to avoid duplicated indices - result_df.reset_index(drop=True) - else: - raise RuntimeError(f"{format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result.") + return pd.concat([pdx.read_avro(f) for f in Path(dir_path).glob("*.avro")]).reset_index(drop=True) + except ValueError: # No object to concat when the dir is empty + return pd.DataFrame() - - if local_folder is None: - tmp_dir.cleanup() - return result_df \ No newline at end of file + elif data_format == "csv": + if Path(dir_path).is_file(): + return pd.read_csv(dir_path) + else: + try: + return pd.concat([pd.read_csv(f) for f in Path(dir_path).glob("*.csv")]).reset_index(drop=True) + except ValueError: # No object to concat when the dir is empty + return pd.DataFrame() + + else: + raise ValueError( + f"{data_format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result." + ) diff --git a/feathr_project/feathr/utils/platform.py b/feathr_project/feathr/utils/platform.py new file mode 100644 index 000000000..8f832f22d --- /dev/null +++ b/feathr_project/feathr/utils/platform.py @@ -0,0 +1,45 @@ +"""Platform utilities. 
+Refs: https://github.com/microsoft/recommenders/blob/main/recommenders/utils/notebook_utils.py +""" +from pathlib import Path + + +def is_jupyter() -> bool: + """Check if the module is running on Jupyter notebook/console. + Note - there might be better way to check if the code is running on a jupyter notebook or not, + but this hacky way still works. + + Ref: + https://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook + + Returns: + bool: True if the module is running on Jupyter notebook or Jupyter console, False otherwise. + """ + try: + # Pre-loaded module `get_ipython()` tells you whether you are running inside IPython or not. + shell_name = get_ipython().__class__.__name__ + # `ZMQInteractiveShell` tells you if this is an interactive mode (notebook). + if shell_name == "ZMQInteractiveShell": + return True + else: + return False + except NameError: + return False + + +def is_databricks() -> bool: + """Check if the module is running on Databricks. + + Returns: + bool: True if the module is running on Databricks notebook, False otherwise. + """ + try: + if str(Path(".").resolve()) == "/databricks/driver": + return True + else: + return False + except NameError: + return False + + +# TODO maybe add is_synapse() diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv deleted file mode 100644 index ce34f255a..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv +++ /dev/null @@ -1,14 +0,0 @@ -VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge -2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1,43,151,1,1.01,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,0 -22,2021-01-01 11:25:59,2021-01-01 11:34:44,N,1,166,239,1,2.53,10,0.5,0.5,2.81,0,,0.3,16.86,1,1,2.75 -23,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1,41,42,1,1.12,6,0.5,0.5,1,0,,0.3,8.3,1,1,0 -24,2020-12-31 23:57:51,2021-01-01 23:04:56,N,1,168,75,1,1.99,8,0.5,0.5,0,0,,0.3,9.3,2,1,0 -25,2021-01-01 17:16:36,2021-01-01 17:16:40,N,2,265,265,3,.00,-52,0,-0.5,0,0,,-0.3,-52.8,3,1,0 -12,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2,265,265,3,.00,52,0,0.5,0,0,,0.3,52.8,2,1,0 -42,2021-01-01 05:19:14,2021-01-01 00:19:21,N,5,265,265,1,.00,180,0,0,36.06,0,,0.3,216.36,1,2,0 -52,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1,75,75,6,.45,3.5,0.5,0.5,0.96,0,,0.3,5.76,1,1,0 -2,2021-01-01 00:57:46,2021-01-01 00:57:57,N,1,225,225,1,.00,2.5,0.5,0.5,0,0,,0.3,3.8,2,1,0 -32,2021-01-01 00:58:32,2021-01-01 01:32:34,N,1,225,265,1,12.19,38,0.5,0.5,2.75,0,,0.3,42.05,1,1,0 -2,2021-01-01 18:39:57,2021-01-01 18:55:25,N,1,74,60,1,5.48,18,0.5,0.5,0,0,,0.3,19.3,2,1,0 -15,2021-01-01 00:51:27,2021-01-01 00:57:20,N,1,42,41,2,.90,6,0.5,0.5,0,0,,0.3,7.3,1,1,0 -15,2021-01-01 00:29:05,2021-01-01 00:29:07,N,5,42,264,1,9.00E-02,10,0,0,2.06,0,,0.3,12.36,1,2,0 \ No newline at end of file diff --git 
a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv deleted file mode 100644 index 476ea06f3..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv +++ /dev/null @@ -1,11 +0,0 @@ -product_id,category,price,quantity,recent_sold,made_in_state,discount -1,1,22,100,0,CA,7.5 -2,2,17,300,1,CA,7.5 -3,1,40,0,2,WA,7.5 -4,1,25,100,3,WA,7.5 -5,1,33,0,2,PA,0 -6,2,19,0,2,CA,7.5 -7,2,22,200,1,WA,7.5 -8,2,59,300,0,PA,8.5 -9,0,80,100,1,WA,8.5 -10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv deleted file mode 100644 index 38fe25ceb..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv +++ /dev/null @@ -1,35 +0,0 @@ -user_id,product_id,event_timestamp,product_rating -1,1,2021-04-01,4 -1,2,2021-04-01,4 -1,3,2021-04-01,4 -1,4,2021-04-01,4 -1,5,2021-04-01,4 -2,1,2021-04-01,5 -2,2,2021-04-01,5 -2,3,2021-04-01,5 -2,4,2021-04-01,5 -2,5,2021-04-01,5 -3,1,2021-04-01,5 -3,2,2021-04-01,5 -3,3,2021-04-01,5 -3,4,2021-04-01,5 -3,5,2021-04-01,5 -4,1,2021-04-01,1 -4,2,2021-04-01,1 -4,3,2021-04-01,1 -4,4,2021-04-01,1 -4,5,2021-04-01,1 -5,1,2021-04-01,5 -5,2,2021-04-01,5 -6,1,2021-04-01,2 -7,1,2021-04-01,5 -7,2,2021-04-01,5 -7,3,2021-04-01,5 -8,1,2021-04-01,2 -8,2,2021-04-01,2 -8,3,2021-04-01,2 -9,1,2021-04-01,5 -9,2,2021-04-01,5 -9,3,2021-04-01,5 -9,4,2021-04-01,5 -10,1,2021-04-01,3 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv deleted file mode 100644 index 6c38f51d7..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv +++ /dev/null @@ -1,11 +0,0 @@ -user_id,gender,age,gift_card_balance,number_of_credit_cards,state,tax_rate -1,1,22,100,0,CA,7.5 -2,2,17,300,1,CA,7.5 -3,1,40,0,2,WA,7.5 -4,1,25,100,3,WA,7.5 -5,1,33,0,2,PA,0 -6,2,19,0,2,CA,7.5 -7,2,22,200,1,WA,7.5 -8,2,59,300,0,PA,8.5 -9,0,80,100,1,WA,8.5 -10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv 
b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv deleted file mode 100644 index 8c8481d1f..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv +++ /dev/null @@ -1,31 +0,0 @@ -user_id,purchase_date,purchase_amount -1,2021-01-01,0.33 -1,2021-03-03,574.35 -1,2021-01-03,796.07 -2,2021-01-04,342.15 -2,2021-03-05,280.46 -2,2021-01-06,664.18 -3,2021-01-07,359.02 -3,2021-01-08,357.12 -3,2021-01-09,845.40 -4,2021-01-10,103.92 -4,2021-02-21,670.12 -4,2021-02-12,698.65 -5,2021-01-13,110.52 -5,2021-01-14,931.72 -5,2021-02-15,388.14 -6,2021-01-16,822.96 -6,2021-01-17,292.39 -6,2021-01-18,524.76 -7,2021-01-19,262.00 -7,2021-03-20,715.94 -7,2021-01-21,345.70 -8,2021-01-22,379.00 -8,2021-01-23,194.96 -8,2021-01-24,862.33 -9,2021-01-25,430.41 -9,2021-01-26,398.72 -9,2021-02-27,158.52 -10,2021-01-28,550.01 -10,2021-03-02,157.88 -10,2021-03-03,528.43 \ No newline at end of file diff --git a/feathr_project/setup.py b/feathr_project/setup.py index 8a6b50244..3c3a3f232 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -3,6 +3,7 @@ from setuptools import setup, find_packages from pathlib import Path + # Use the README.md from /docs root_path = Path(__file__).resolve().parent.parent readme_path = root_path / "docs/README.md" @@ -22,7 +23,7 @@ VERSION = "0.9.0" VERSION = __version__ # noqa -os.environ["FEATHR_VERSION]"] = VERSION +os.environ["FEATHR_VERSION"] = VERSION extras_require=dict( dev=[ diff --git a/feathr_project/test/conftest.py b/feathr_project/test/conftest.py new file mode 100644 index 000000000..c2699e871 --- /dev/null +++ b/feathr_project/test/conftest.py @@ -0,0 +1,57 @@ +from pathlib import Path +from pyspark.sql import SparkSession +import pytest + +from feathr import FeathrClient + + +def pytest_addoption(parser): + """Pytest command line argument options. + E.g. + `python -m pytest feathr_project/test/ --resource-prefix your_feathr_resource_prefix` + """ + parser.addoption( + "--config-path", + action="store", + default=str(Path(__file__).parent.resolve().joinpath("test_user_workspace", "feathr_config.yaml")), + help="Test config path", + ) + + +@pytest.fixture +def config_path(request): + return request.config.getoption("--config-path") + + +@pytest.fixture(scope="session") +def workspace_dir() -> str: + """Workspace directory path containing data files and configs for testing.""" + return str(Path(__file__).parent.resolve().joinpath("test_user_workspace")) + + +@pytest.fixture(scope="function") +def feathr_client(workspace_dir) -> FeathrClient: + """Test function-scoped Feathr client. + Note, cluster target (local, databricks, synapse) maybe overriden by the environment variables set at test machine. + """ + return FeathrClient(config_path=str(Path(workspace_dir, "feathr_config.yaml"))) + + +@pytest.fixture(scope="module") +def spark() -> SparkSession: + """Generate a spark session for tests.""" + # Set ui port other than the default one (4040) so that feathr spark job may not fail. 
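A hypothetical test body, just to show how the fixtures above could be combined; the assertions are illustrative rather than part of the test suite.

```python
from pyspark.sql import SparkSession

from feathr import FeathrClient


def test_fixtures_sketch(feathr_client: FeathrClient, spark: SparkSession):
    # `feathr_client` is function-scoped (a fresh client per test), while
    # `spark` is module-scoped and shared across the module's tests.
    assert feathr_client.spark_runtime in ("local", "databricks", "azure_synapse")
    assert spark.sparkContext.appName == "tests"
```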
+ spark_session = ( + SparkSession.builder + .appName("tests") + .config("spark.jars.packages", ",".join([ + "org.apache.spark:spark-avro_2.12:3.3.0", + "io.delta:delta-core_2.12:2.1.1", + ])) + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.ui.port", "8080") + .getOrCreate() + ) + yield spark_session + spark_session.stop() diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py new file mode 100644 index 000000000..c8d1cbefc --- /dev/null +++ b/feathr_project/test/samples/test_notebooks.py @@ -0,0 +1,54 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +try: + import papermill as pm + import scrapbook as sb +except ImportError: + pass # disable error while collecting tests for non-notebook environments + + +SAMPLES_DIR = ( + Path(__file__) + .parent # .../samples + .parent # .../test + .parent # .../feathr_project + .parent # .../feathr (root of the repo) + .joinpath("docs", "samples") +) +NOTEBOOK_PATHS = { + "nyc_taxi_demo": str(SAMPLES_DIR.joinpath("nyc_taxi_demo.ipynb")), +} + + +@pytest.mark.notebooks +def test__nyc_taxi_demo(config_path, tmp_path): + notebook_name = "nyc_taxi_demo" + + output_tmpdir = TemporaryDirectory() + output_notebook_path = str(tmp_path.joinpath(f"{notebook_name}.ipynb")) + + print(f"Running {notebook_name} notebook as {output_notebook_path}") + + pm.execute_notebook( + input_path=NOTEBOOK_PATHS[notebook_name], + output_path=output_notebook_path, + # kernel_name="python3", + parameters=dict( + FEATHR_CONFIG_PATH=config_path, + DATA_STORE_PATH=output_tmpdir.name, + USE_CLI_AUTH=False, + REGISTER_FEATURES=False, + SCRAP_RESULTS=True, + ), + ) + + # Read results from the Scrapbook and assert expected values + nb = sb.read_notebook(output_notebook_path) + outputs = nb.scraps + + assert outputs["materialized_feature_values"].data["239"] == pytest.approx([1480., 5707.], abs=1.) + assert outputs["materialized_feature_values"].data["265"] == pytest.approx([4160., 10000.], abs=1.) + assert outputs["rmse"].data == pytest.approx(5., abs=2.) + assert outputs["mae"].data == pytest.approx(2., abs=1.) 
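The integration tests below exercise the reworked result-download helpers from `feathr/utils/job_utils.py`; here is a minimal sketch of the two entry points, assuming a configured client whose job has already finished.

```python
from pyspark.sql import SparkSession

from feathr import FeathrClient
from feathr.utils.job_utils import get_result_pandas_df, get_result_spark_df

client = FeathrClient(config_path="feathr_config.yaml")  # hypothetical config path

# Pandas: data format and result URL default to the finished job's tags when omitted.
pdf = get_result_pandas_df(client, data_format="parquet")

# Spark: pass a SparkSession to get a Spark DataFrame instead of pandas.
spark = SparkSession.builder.appName("read-feathr-results").getOrCreate()
sdf = get_result_spark_df(spark, client, data_format="parquet")
print(pdf.shape, sdf.count())
```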
diff --git a/feathr_project/test/test_input_output_sources.py b/feathr_project/test/test_input_output_sources.py index f4af85678..ba4b3921a 100644 --- a/feathr_project/test/test_input_output_sources.py +++ b/feathr_project/test/test_input_output_sources.py @@ -10,6 +10,7 @@ from test_fixture import basic_test_setup from test_utils.constants import Constants + # test parquet file read/write without an extension name def test_feathr_get_offline_features_with_parquet(): """ @@ -38,7 +39,7 @@ def test_feathr_get_offline_features_with_parquet(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), ".parquet"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -47,14 +48,12 @@ def test_feathr_get_offline_features_with_parquet(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # download result and just assert the returned result is not empty res_df = get_result_df(client) assert res_df.shape[0] > 0 - - # test delta lake read/write without an extension name def test_feathr_get_offline_features_with_delta_lake(): """ @@ -83,7 +82,7 @@ def test_feathr_get_offline_features_with_delta_lake(): else: output_path = ''.join(['abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/output','_', str(now.minute), '_', str(now.second), "_deltalake"]) - + client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, @@ -92,15 +91,13 @@ def test_feathr_get_offline_features_with_delta_lake(): # assuming the job can successfully run; otherwise it will throw exception client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS) - + # wait for a few secs for the resource to come up in the databricks API time.sleep(5) - # download result and just assert the returned result is not empty - res_df = get_result_df(client) - + # download result and just assert the returned result is not empty + # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 result_format: str = client.get_job_tags().get(OUTPUT_FORMAT, "") if not (client.spark_runtime == 'azure_synapse' and result_format == 'delta'): - # if users are using delta format in synapse, skip this check, due to issue https://github.com/delta-io/delta-rs/issues/582 + res_df = get_result_df(client) assert res_df.shape[0] > 0 - diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index 7d00706fc..87bc2e542 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -86,7 +86,7 @@ spark_config: feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.9.0.jar" databricks: # workspace instance - workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' + workspace_instance_url: 'https://adb-4121774437039026.6.azuredatabricks.net' workspace_token_value: '' # config string including run time information, spark version, machine size, etc. 
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs diff --git a/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json b/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..855c52b51 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output-delta/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"a3a34f62-adf4-428f-9595-dc1a0c1055e7","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"trip_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"VendorID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpep_pickup_datetime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lpep_dropoff_datetime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"store_and_fwd_flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"RatecodeID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"PULocationID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DOLocationID\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"passenger_count\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"trip_distance\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"fare_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"extra\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mta_tax\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tip_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tolls_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ehail_fee\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"improvement_surcharge\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"total_amount\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"payment_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"trip_type\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"congestion_surcharge\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1667325249843}} +{"add":{"path":"part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet","partitionValues":{},"size":6277,"modificationTime":1667325251596,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"trip_id\":\"0\",\"VendorID\":\"2.0\",\"lpep_pickup_datetime\":\"2020-04-01 00:00:23\",\"lpep_dropoff_datetime\":\"2020-04-01 00:16:13\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":\"1.0\",\"PULocationID\":\"244\",\"DOLocationID\":\"169\",\"passenger_count\":\"1.0\",\"trip_distance\":\"1.0\",\"fare_amount\":\"12.0\",\"extra\":\"0.5\",\"mta_tax\":\"0.5\",\"tip_amount\":\"0.0\",\"tolls_amount\":\"0.0\",\"improvement_surcharge\":\"0.3\",\"total_amount\":\"10.3\",\"payment_type\":\"1.0\",\"trip_type\":\"1.0\",\"congestion_surcharge\":\"0.0\"},\"maxValues\":{\"trip_id\":\"4\",\"VendorID\":\"2.0\",\"lpep_pickup_datetime\":\"2020-04-01 00:45:06\",\"lpep_dropoff_datetime\":\"2020-04-01 
01:04:39\",\"store_and_fwd_flag\":\"N\",\"RatecodeID\":\"1.0\",\"PULocationID\":\"75\",\"DOLocationID\":\"41\",\"passenger_count\":\"3.0\",\"trip_distance\":\"6.79\",\"fare_amount\":\"9.0\",\"extra\":\"0.5\",\"mta_tax\":\"0.5\",\"tip_amount\":\"0.0\",\"tolls_amount\":\"0.0\",\"improvement_surcharge\":\"0.3\",\"total_amount\":\"9.3\",\"payment_type\":\"2.0\",\"trip_type\":\"1.0\",\"congestion_surcharge\":\"0.0\"},\"nullCount\":{\"trip_id\":0,\"VendorID\":0,\"lpep_pickup_datetime\":0,\"lpep_dropoff_datetime\":0,\"store_and_fwd_flag\":0,\"RatecodeID\":0,\"PULocationID\":0,\"DOLocationID\":0,\"passenger_count\":0,\"trip_distance\":0,\"fare_amount\":0,\"extra\":0,\"mta_tax\":0,\"tip_amount\":0,\"tolls_amount\":0,\"ehail_fee\":5,\"improvement_surcharge\":0,\"total_amount\":0,\"payment_type\":0,\"trip_type\":0,\"congestion_surcharge\":0}}"}} +{"commitInfo":{"timestamp":1667325251731,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"6277"},"engineInfo":"Apache-Spark/3.2.2 Delta-Lake/2.1.1","txnId":"a5e436e6-dfb6-4956-9e0c-b31b883128a0"}} diff --git a/feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet b/feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet new file mode 100644 index 000000000..1d8214c42 Binary files /dev/null and b/feathr_project/test/test_user_workspace/mock_results/output-delta/part-00000-5020f59b-ee83-45a6-a2cd-4b9a37427f86-c000.snappy.parquet differ diff --git a/feathr_project/test/test_user_workspace/mock_results/output.avro/part-00000-979daf2d-d172-48cc-a65e-87a89526f97a-c000.avro b/feathr_project/test/test_user_workspace/mock_results/output.avro/part-00000-979daf2d-d172-48cc-a65e-87a89526f97a-c000.avro new file mode 100644 index 000000000..c97dec375 Binary files /dev/null and b/feathr_project/test/test_user_workspace/mock_results/output.avro/part-00000-979daf2d-d172-48cc-a65e-87a89526f97a-c000.avro differ diff --git a/feathr_project/test/test_user_workspace/mock_results/output.csv b/feathr_project/test/test_user_workspace/mock_results/output.csv new file mode 100644 index 000000000..0468eb1b6 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output.csv @@ -0,0 +1,6 @@ +trip_id,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge +0,2.0,2020-04-01 00:44:02,2020-04-01 00:52:23,N,1.0,42,41,1.0,1.68,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,1.0,1.0,0.0 +1,2.0,2020-04-01 00:24:39,2020-04-01 00:33:06,N,1.0,244,247,2.0,1.94,9.0,0.5,0.5,0.0,0.0,,0.3,10.3,2.0,1.0,0.0 +2,2.0,2020-04-01 00:45:06,2020-04-01 00:51:13,N,1.0,244,243,3.0,1.0,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0 +3,2.0,2020-04-01 00:45:06,2020-04-01 01:04:39,N,1.0,244,243,2.0,2.81,12.0,0.5,0.5,0.0,0.0,,0.3,13.3,2.0,1.0,0.0 +4,2.0,2020-04-01 00:00:23,2020-04-01 00:16:13,N,1.0,75,169,1.0,6.79,21.0,0.5,0.5,0.0,0.0,,0.3,22.3,1.0,1.0,0.0 diff --git a/feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet 
b/feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet new file mode 100644 index 000000000..0e2f9d13f Binary files /dev/null and b/feathr_project/test/test_user_workspace/mock_results/output.parquet/part-00000-bfa76930-af3c-4d58-a6e6-c1050f57ab99-c000.snappy.parquet differ diff --git a/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv new file mode 100644 index 000000000..b5b08ca83 --- /dev/null +++ b/feathr_project/test/test_user_workspace/mock_results/output_dir.csv/part-00000-06dad06f-1275-434b-8d83-6b9ed6c73eab-c000.csv @@ -0,0 +1,5 @@ +0,2.0,2020-04-01 00:44:02,2020-04-01 00:52:23,N,1.0,42,41,1.0,1.68,8.0,0.5,0.5,0.0,0.0,"",0.3,9.3,1.0,1.0,0.0 +1,2.0,2020-04-01 00:24:39,2020-04-01 00:33:06,N,1.0,244,247,2.0,1.94,9.0,0.5,0.5,0.0,0.0,"",0.3,10.3,2.0,1.0,0.0 +2,2.0,2020-04-01 00:45:06,2020-04-01 00:51:13,N,1.0,244,243,3.0,1.0,6.5,0.5,0.5,0.0,0.0,"",0.3,7.8,2.0,1.0,0.0 +3,2.0,2020-04-01 00:45:06,2020-04-01 01:04:39,N,1.0,244,243,2.0,2.81,12.0,0.5,0.5,0.0,0.0,"",0.3,13.3,2.0,1.0,0.0 +4,2.0,2020-04-01 00:00:23,2020-04-01 00:16:13,N,1.0,75,169,1.0,6.79,21.0,0.5,0.5,0.0,0.0,"",0.3,22.3,1.0,1.0,0.0 diff --git a/feathr_project/test/unit/datasets/test_dataset_utils.py b/feathr_project/test/unit/datasets/test_dataset_utils.py new file mode 100644 index 000000000..2aabaa9a1 --- /dev/null +++ b/feathr_project/test/unit/datasets/test_dataset_utils.py @@ -0,0 +1,53 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from urllib.parse import urlparse + +import pytest + +from feathr.datasets.nyc_taxi import NYC_TAXI_SMALL_URL +from feathr.datasets.utils import maybe_download + + +@pytest.mark.parametrize( + # 3924447 is the nyc_taxi sample data's bytes + "expected_bytes", [3924447, None] +) +def test__maybe_download(expected_bytes: int): + """Test maybe_download utility function w/ nyc_taxi data cached at Azure blob.""" + + tmpdir = TemporaryDirectory() + dst_filepath = Path(tmpdir.name, "data.csv") + + # Assert the data is downloaded + assert maybe_download( + src_url=NYC_TAXI_SMALL_URL, + dst_filepath=str(dst_filepath), + expected_bytes=expected_bytes, + ) + + # Assert the downloaded file exists. 
+    assert dst_filepath.is_file()
+
+    # Assert the data already exists and thus the function does not download again
+    assert not maybe_download(
+        src_url=NYC_TAXI_SMALL_URL,
+        dst_filepath=str(dst_filepath),
+        expected_bytes=expected_bytes,
+    )
+
+    tmpdir.cleanup()
+
+
+def test__maybe_download__raise_exception():
+    """Test maybe_download utility function raises IOError when the expected bytes mismatch."""
+
+    tmpdir = TemporaryDirectory()
+
+    with pytest.raises(IOError):
+        maybe_download(
+            src_url=NYC_TAXI_SMALL_URL,
+            dst_filepath=Path(tmpdir.name, "data.csv").resolve(),
+            expected_bytes=10,
+        )
+
+    tmpdir.cleanup()
diff --git a/feathr_project/test/unit/datasets/test_datasets.py b/feathr_project/test/unit/datasets/test_datasets.py
new file mode 100644
index 000000000..10d89c673
--- /dev/null
+++ b/feathr_project/test/unit/datasets/test_datasets.py
@@ -0,0 +1,97 @@
+from pathlib import Path
+from unittest.mock import MagicMock
+
+from pyspark.sql import SparkSession
+import pytest
+from pytest_mock import MockerFixture
+
+from feathr.datasets import nyc_taxi
+
+
+TEST_DATASET_DIR = Path(__file__).parent.parent.parent.joinpath("test_user_workspace")
+NYC_TAXI_FILE_PATH = str(TEST_DATASET_DIR.joinpath("green_tripdata_2020-04_with_index.csv").resolve())
+
+
+@pytest.mark.parametrize(
+    "local_cache_path",
+    [
+        None,  # default temporary directory
+        NYC_TAXI_FILE_PATH,  # full filepath
+        str(Path(NYC_TAXI_FILE_PATH).parent),  # directory
+    ],
+)
+def test__nyc_taxi__get_pandas_df(
+    mocker: MockerFixture,
+    local_cache_path: str,
+):
+    """Test if nyc_taxi.get_pandas_df returns pd.DataFrame. Also check if the proper modules are being called."""
+    # Mock maybe_download and TempDirectory
+    mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download")
+    mocked_tmpdir = MagicMock()
+    mocked_tmpdir.name = NYC_TAXI_FILE_PATH
+    mocked_TemporaryDirectory = mocker.patch("feathr.datasets.nyc_taxi.TemporaryDirectory", return_value=mocked_tmpdir)
+
+    pdf = nyc_taxi.get_pandas_df(local_cache_path=local_cache_path)
+    assert len(pdf) == 35612
+
+    # Assert mock called
+    if local_cache_path:
+        mocked_TemporaryDirectory.assert_not_called()
+    else:
+        mocked_TemporaryDirectory.assert_called_once()
+
+    # TODO check this is called w/ file extension added
+    mocked_maybe_download.assert_called_once_with(src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH)
+
+
+@pytest.mark.parametrize(
+    "local_cache_path", [
+        NYC_TAXI_FILE_PATH,  # full filepath
+        str(Path(NYC_TAXI_FILE_PATH).parent),  # directory
+    ],
+)
+def test__nyc_taxi__get_spark_df(
+    spark,
+    mocker: MockerFixture,
+    local_cache_path: str,
+):
+    """Test if nyc_taxi.get_spark_df returns spark.sql.DataFrame."""
+    # Mock maybe_download
+    mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download")
+
+    df = nyc_taxi.get_spark_df(spark=spark, local_cache_path=local_cache_path)
+    assert df.count() == 35612
+
+    mocked_maybe_download.assert_called_once_with(
+        src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH
+    )
+
+
+@pytest.mark.parametrize(
+    "local_cache_path", [
+        NYC_TAXI_FILE_PATH,  # full filepath
+        str(Path(NYC_TAXI_FILE_PATH).parent),  # directory
+    ],
+)
+def test__nyc_taxi__get_spark_df__with_databricks(
+    mocker: MockerFixture,
+    local_cache_path: str,
+):
+    # Mock maybe_download and spark session
+    mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download")
+    mocked_is_databricks = mocker.patch("feathr.datasets.nyc_taxi.is_databricks", return_value=True)
+
mocked_spark = MagicMock(spec=SparkSession) + + nyc_taxi.get_spark_df(spark=mocked_spark, local_cache_path=local_cache_path) + + # Assert mock called with databricks paths + mocked_is_databricks.assert_called_once() + + expected_dst_filepath = str(Path("/dbfs", NYC_TAXI_FILE_PATH.lstrip("/"))) + mocked_maybe_download.assert_called_once_with( + src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=expected_dst_filepath + ) + + mocked_spark.read.option.return_value.csv.assert_called_once_with( + str(Path("dbfs:", NYC_TAXI_FILE_PATH.lstrip("/"))) + ) diff --git a/feathr_project/test/unit/spark_provider/test_localspark_submission.py b/feathr_project/test/unit/spark_provider/test_localspark_submission.py index 9a9d7238b..992f2015e 100644 --- a/feathr_project/test/unit/spark_provider/test_localspark_submission.py +++ b/feathr_project/test/unit/spark_provider/test_localspark_submission.py @@ -4,6 +4,7 @@ import pytest from pytest_mock import MockerFixture +from feathr.constants import OUTPUT_PATH_TAG from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher @@ -15,9 +16,17 @@ def local_spark_job_launcher(tmp_path) -> _FeathrLocalSparkJobLauncher: ) +@pytest.mark.parametrize( + "job_tags,expected_result_uri", [ + (None, None), + ({OUTPUT_PATH_TAG: "output"}, "output"), + ] +) def test__local_spark_job_launcher__submit_feathr_job( mocker: MockerFixture, local_spark_job_launcher: _FeathrLocalSparkJobLauncher, + job_tags: Dict[str, str], + expected_result_uri: str, ): # Mock necessary components local_spark_job_launcher._init_args = MagicMock(return_value=[]) @@ -31,11 +40,16 @@ def test__local_spark_job_launcher__submit_feathr_job( job_name="unit-test", main_jar_path="", main_class_name="", + job_tags=job_tags, ) # Assert if the mocked spark process has called once mocked_spark_proc.assert_called_once() + # Assert job tags + assert local_spark_job_launcher.get_job_tags() == job_tags + assert local_spark_job_launcher.get_job_result_uri() == expected_result_uri + @pytest.mark.parametrize( "confs", [{}, {"spark.feathr.outputFormat": "parquet"}] diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py new file mode 100644 index 000000000..770980e12 --- /dev/null +++ b/feathr_project/test/unit/utils/test_config.py @@ -0,0 +1,180 @@ +from copy import deepcopy +import os +from pathlib import Path +from unittest.mock import MagicMock +import yaml + +import pytest +from pytest_mock import MockerFixture + +import feathr.utils.config +from feathr.utils.config import generate_config + + +@pytest.mark.parametrize( + "output_filepath", [None, "config.yml"], +) +def test__generate_config__output_filepath( + output_filepath: str, + tmp_path: Path, +): + resource_prefix = "test_prefix" + project_name = "test_project" + + # Use tmp_path so that the test files get cleaned up after the tests + if output_filepath: + output_filepath = str(tmp_path / output_filepath) + + config_filepath = generate_config( + resource_prefix=resource_prefix, + project_name=project_name, + output_filepath=output_filepath, + use_env_vars=False, + ) + + # Assert if the config file was generated in the specified output path. + if output_filepath: + assert output_filepath == config_filepath + + # Assert the generated config string is correct. 
+    with open(config_filepath, "r") as f:
+        config = yaml.safe_load(f)
+
+    assert config["project_config"]["project_name"] == project_name
+    assert config["feature_registry"]["api_endpoint"] == f"https://{resource_prefix}webapp.azurewebsites.net/api/v1"
+    assert config["spark_config"]["spark_cluster"] == "local"
+    assert config["online_store"]["redis"]["host"] == f"{resource_prefix}redis.redis.cache.windows.net"
+
+
+@pytest.mark.parametrize(
+    "spark_cluster,env_key,kwargs",
+    [
+        ("local", None, dict()),
+        (
+            "databricks",
+            "DATABRICKS_WORKSPACE_TOKEN_VALUE",
+            dict(spark_config__databricks__workspace_instance_url="databricks_url"),
+        ),
+        (
+            "azure_synapse",
+            "ADLS_KEY",
+            dict(
+                spark_config__azure_synapse__dev_url="synapse_url",
+                spark_config__azure_synapse__pool_name="pool_name",
+            ),
+        ),
+    ]
+)
+def test__generate_config__spark_cluster(
+    mocker: MockerFixture,
+    spark_cluster: str,
+    env_key: str,
+    kwargs: str,
+):
+    """Test if spark cluster specific configs are generated without errors.
+    TODO - For now, this test doesn't check if the config values are correctly working with the actual Feathr client.
+    """
+    # Mock the os.environ to return the specified env vars
+    mocker.patch.object(feathr.utils.config.os, "environ", {env_key: "some_value"})
+
+    generate_config(
+        resource_prefix="test_prefix",
+        project_name="test_project",
+        spark_config__spark_cluster=spark_cluster,
+        use_env_vars=False,
+        **kwargs,
+    )
+
+
+@pytest.mark.parametrize(
+    "adls_key,pool_name,expected_error",
+    [
+        ("some_key", "some_name", None),
+        (None, "some_name", ValueError),
+        ("some_key", None, ValueError),
+    ]
+)
+def test__generate_config__azure_synapse_exceptions(
+    mocker: MockerFixture,
+    adls_key: str,
+    pool_name: str,
+    expected_error: Exception,
+):
+    """Test if exceptions are raised when the ADLS key and Synapse pool name are not provided."""
+
+    # Either env vars or argument should yield the same result
+    for environ in [{"ADLS_KEY": adls_key}, {
+        "ADLS_KEY": adls_key,
+        "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME": pool_name,
+    }]:
+        # Mock the os.environ to return the specified env vars
+        mocker.patch.object(feathr.utils.config.os, "environ", environ)
+
+        # Test either using env vars or arguments
+        if "SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME" in environ:
+            kwargs = dict()
+        else:
+            kwargs = dict(spark_config__azure_synapse__pool_name=pool_name)
+
+        if expected_error is None:
+            generate_config(
+                resource_prefix="test_prefix",
+                project_name="test_project",
+                spark_config__spark_cluster="azure_synapse",
+                **kwargs,
+            )
+        else:
+            with pytest.raises(ValueError):
+                generate_config(
+                    resource_prefix="test_prefix",
+                    project_name="test_project",
+                    spark_config__spark_cluster="azure_synapse",
+                    **kwargs,
+                )
+
+
+@pytest.mark.parametrize(
+    "databricks_token,workspace_url,expected_error",
+    [
+        ("some_token", "some_url", None),
+        (None, "some_url", ValueError),
+        ("some_token", None, ValueError),
+    ]
+)
+def test__generate_config__databricks_exceptions(
+    mocker: MockerFixture,
+    databricks_token: str,
+    workspace_url: str,
+    expected_error: Exception,
+):
+    """Test if exceptions are raised when databricks url and token are not provided."""
+
+    # Either env vars or argument should yield the same result
+    for environ in [{"DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token}, {
+        "DATABRICKS_WORKSPACE_TOKEN_VALUE": databricks_token,
+        "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL": workspace_url,
+    }]:
+        # Mock the os.environ to return the specified env vars
+        mocker.patch.object(feathr.utils.config.os, "environ", environ)
"environ", environ) + + # Test either using env vars or arguments + if "SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL" in environ: + kwargs = dict() + else: + kwargs = dict(spark_config__databricks__workspace_instance_url=workspace_url) + + if expected_error is None: + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) + else: + with pytest.raises(ValueError): + generate_config( + resource_prefix="test_prefix", + project_name="test_project", + spark_config__spark_cluster="databricks", + **kwargs, + ) diff --git a/feathr_project/test/unit/utils/test_job_utils.py b/feathr_project/test/unit/utils/test_job_utils.py new file mode 100644 index 000000000..0909fb56e --- /dev/null +++ b/feathr_project/test/unit/utils/test_job_utils.py @@ -0,0 +1,228 @@ +# TODO with, without optional args +# TODO test with no data files exception and unsupported format exception +from pathlib import Path +from typing import Type +from unittest.mock import MagicMock + +import pandas as pd +import pytest +from pytest_mock import MockerFixture +from pyspark.sql import DataFrame, SparkSession + +from feathr import FeathrClient +from feathr.constants import OUTPUT_FORMAT, OUTPUT_PATH_TAG +from feathr.utils.job_utils import ( + get_result_df, + get_result_pandas_df, + get_result_spark_df, +) + + +def test__get_result_pandas_df(mocker: MockerFixture): + """Test if the base function, get_result_df, called w/ proper args""" + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_pandas_df(client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path) + + +def test__get_result_spark_df(mocker: MockerFixture): + """Test if the base function, get_result_df, called w/ proper args""" + mocked_get_result_df = mocker.patch("feathr.utils.job_utils.get_result_df") + client = MagicMock() + spark = MagicMock() + data_format = "some_data_format" + res_url = "some_res_url" + local_cache_path = "some_local_cache_path" + get_result_spark_df(spark, client, data_format, res_url, local_cache_path) + mocked_get_result_df.assert_called_once_with(client, data_format, res_url, local_cache_path, spark=spark) + + +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,local_cache_path,expected_local_cache_path", [ + # For local spark results, res_url must be a local path and local_cache_path will be ignored. + (False, "local", "some_res_url", None, "some_res_url"), + (False, "local", "some_res_url", "some_local_cache_path", "some_res_url"), + # For databricks results, res_url must be a dbfs path. + # If the function is called in databricks, local_cache_path will be ignored. 
+ (True, "databricks", "dbfs:/some_res_url", None, "/dbfs/some_res_url"), + (True, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "/dbfs/some_res_url"), + (False, "databricks", "dbfs:/some_res_url", None, "mocked_temp_path"), + (False, "databricks", "dbfs:/some_res_url", "some_local_cache_path", "some_local_cache_path"), + ] +) +def test__get_result_df__with_local_cache_path( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + local_cache_path: str, + expected_local_cache_path: str, +): + """Test local_cache_path is used if provided""" + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime + client.feathr_spark_launcher.download_result = MagicMock() + mocked_load_files_to_pandas_df = mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + + # Mock is_databricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + + # Mock temporary file module + mocked_named_temporary_dir = MagicMock() + mocked_named_temporary_dir.name = expected_local_cache_path + mocker.patch("feathr.utils.job_utils.TemporaryDirectory", return_value=mocked_named_temporary_dir) + + data_format = "csv" + get_result_df(client, data_format=data_format, res_url=res_url, local_cache_path=local_cache_path) + + mocked_load_files_to_pandas_df.assert_called_once_with( + dir_path=expected_local_cache_path, + data_format=data_format, + ) + + +@pytest.mark.parametrize( + "is_databricks,spark_runtime,res_url,data_format,expected_error", [ + # Test RuntimeError when the function is running at Databricks but client.spark_runtime is not databricks + (True, "local", "some_url", "some_format", RuntimeError), + (True, "azure_synapse", "some_url", "some_format", RuntimeError), + (True, "databricks", "some_url", "some_format", None), + (False, "local", "some_url", "some_format", None), + (False, "azure_synapse", "some_url", "some_format", None), + (False, "databricks", "some_url", "some_format", None), + # Test ValueError when res_url is None + (True, "databricks", None, "some_format", ValueError), + (False, "local", None, "some_format", ValueError), + (False, "azure_synapse", None, "some_format", ValueError), + (False, "databricks", None, "some_format", ValueError), + # Test ValueError when data_format is None + (True, "databricks", "some_url", None, ValueError), + (False, "local", "some_url", None, ValueError), + (False, "azure_synapse", "some_url", None, ValueError), + (False, "databricks", "some_url", None, ValueError), + ] +) +def test__get_result_df__exceptions( + mocker: MockerFixture, + is_databricks: bool, + spark_runtime: str, + res_url: str, + data_format: str, + expected_error: Type[Exception], +): + """Test exceptions""" + + # Mock is_data_bricks + mocker.patch("feathr.utils.job_utils.is_databricks", return_value=is_databricks) + + # Mock _load_files_to_pandas_df + mocker.patch("feathr.utils.job_utils._load_files_to_pandas_df") + + # Either job tags or argument should yield the same result + for job_tag in [None, {OUTPUT_FORMAT: data_format, OUTPUT_PATH_TAG: res_url}]: + # Mock client + client = MagicMock() + client.get_job_result_uri = MagicMock(return_value=res_url) + client.get_job_tags = MagicMock(return_value=job_tag) + client.spark_runtime = spark_runtime + + if expected_error is None: + get_result_df( + client=client, + res_url=None if job_tag else res_url, + data_format=None if job_tag else data_format, + ) + else: + with pytest.raises(expected_error): + get_result_df( + client=client, + res_url=None if 
job_tag else res_url, + data_format=None if job_tag else data_format, + ) + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count to 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df( + workspace_dir: str, + data_format: str, + output_filename: str, + expected_count: int, +): + """Test get_result_df returns pandas DataFrame""" + for spark_runtime in ["local", "databricks", "azure_synapse"]: + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime + + # Mock feathr_spark_launcher.download_result + if client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + if client.spark_runtime == "azure_synapse" and data_format == "delta": + # TODO currently pass the delta table test on Synapse result due to the delta table package bug. + continue + + df = get_result_df( + client=client, + data_format=data_format, + res_url=res_url, + local_cache_path=local_cache_path, + ) + assert isinstance(df, pd.DataFrame) + assert len(df) == expected_count + + +@pytest.mark.parametrize( + "data_format,output_filename,expected_count", [ + ("csv", "output.csv", 5), + ("csv", "output_dir.csv", 4), # TODO add a header to the csv file and change expected_count = 5 after fixing the bug https://github.com/feathr-ai/feathr/issues/811 + ("parquet", "output.parquet", 5), + ("avro", "output.avro", 5), + ("delta", "output-delta", 5), + ] +) +def test__get_result_df__with_spark_session( + workspace_dir: str, + spark: SparkSession, + data_format: str, + output_filename: str, + expected_count: int, +): + """Test get_result_df returns spark DataFrame""" + for spark_runtime in ["local", "databricks", "azure_synapse"]: + # Note: make sure the output file exists in the test_user_workspace + res_url = str(Path(workspace_dir, "mock_results", output_filename)) + local_cache_path = res_url + + # Mock client + client = MagicMock() + client.spark_runtime = spark_runtime + + if client.spark_runtime == "databricks": + res_url = f"dbfs:/{res_url}" + + df = get_result_df( + client=client, + data_format=data_format, + res_url=res_url, + spark=spark, + local_cache_path=local_cache_path, + ) + assert isinstance(df, DataFrame) + assert df.count() == expected_count
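For reference, the helpers exercised above are meant to run against a real client once a Spark job has finished. A minimal usage sketch, using placeholder config and output paths rather than values from this change:

    from feathr import FeathrClient
    from feathr.utils.job_utils import get_result_df, get_result_spark_df

    # Assumes a get_offline_features or materialization job submitted through this
    # client has already completed; the config file and output path are placeholders.
    client = FeathrClient(config_path="feathr_config.yaml")
    pdf = get_result_df(client, data_format="parquet", res_url="dbfs:/example/output.parquet")

    # Or keep the result as a Spark DataFrame, as test__get_result_df__with_spark_session does:
    # sdf = get_result_spark_df(spark, client, "parquet", "dbfs:/example/output.parquet", None)

When res_url and data_format are omitted, get_result_df falls back to the job tags recorded by the launcher (OUTPUT_PATH_TAG and OUTPUT_FORMAT), which is the behavior test__get_result_df__exceptions pins down.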