diff --git a/.github/workflows/pull_request_push_test.yml b/.github/workflows/pull_request_push_test.yml index a5addf9a6..d994dd4e9 100644 --- a/.github/workflows/pull_request_push_test.yml +++ b/.github/workflows/pull_request_push_test.yml @@ -111,6 +111,7 @@ jobs: BLOB_KEY: ${{secrets.BLOB_KEY}} JDBC_SF_PASSWORD: ${{secrets.JDBC_SF_PASSWORD}} KAFKA_SASL_JAAS_CONFIG: ${{secrets.KAFKA_SASL_JAAS_CONFIG}} + MONITORING_DATABASE_SQL_PASSWORD: ${{secrets.MONITORING_DATABASE_SQL_PASSWORD}} SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION: dbfs:/${{ env.CI_SPARK_REMOTE_JAR_FOLDER}}/${{ env.FEATHR_LOCAL_JAR_NAME}} run: | @@ -177,6 +178,7 @@ jobs: JDBC_DRIVER: ${{secrets.JDBC_DRIVER}} JDBC_SF_PASSWORD: ${{secrets.JDBC_SF_PASSWORD}} KAFKA_SASL_JAAS_CONFIG: ${{secrets.KAFKA_SASL_JAAS_CONFIG}} + MONITORING_DATABASE_SQL_PASSWORD: ${{secrets.MONITORING_DATABASE_SQL_PASSWORD}} SPARK_CONFIG__AZURE_SYNAPSE__FEATHR_RUNTIME_LOCATION: "abfss://${{secrets.SPARK_JAR_BLOB_CONTAINER}}@feathrazuretest3storage.dfs.core.windows.net/${{ env.CI_SPARK_REMOTE_JAR_FOLDER}}/${{ env.FEATHR_LOCAL_JAR_NAME}}" run: | # skip databricks related test as we just ran the test; also seperate databricks and synapse test to make sure there's no write conflict diff --git a/README.md b/README.md index 27f65096a..1ff4d80f6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # Feathr – An Enterprise-Grade, High Performance Feature Store +[![License](https://img.shields.io/badge/License-Apache%202.0-blue)](https://github.com/linkedin/feathr/blob/main/LICENSE) +[![GitHub Release](https://img.shields.io/github/v/release/linkedin/feathr.svg?style=flat&sort=semver&color=blue)](https://github.com/linkedin/feathr/releases) +[![Docs Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://linkedin.github.io/feathr/) +[![Python API](https://img.shields.io/readthedocs/feathr?label=Python%20API)](https://feathr.readthedocs.io/en/latest/) + ## What is Feathr? Feathr is the feature store that is used in production in LinkedIn for many years and was open sourced in April 2022. Read our announcement on [Open Sourcing Feathr](https://engineering.linkedin.com/blog/2022/open-sourcing-feathr---linkedin-s-feature-store-for-productive-m) and [Feathr on Azure](https://azure.microsoft.com/en-us/blog/feathr-linkedin-s-feature-store-is-now-available-on-azure/). @@ -44,8 +49,10 @@ pip install git+https://github.com/linkedin/feathr.git#subdirectory=feathr_proje ## ☁️ Running Feathr on Cloud +Feathr has native integrations with Databricks and Azure Synapse: + - Please read the [Quick Start Guide for Feathr on Databricks](./docs/quickstart_databricks.md) to run Feathr with Databricks. -- Please read the [Quick Start Guide for Feathr on Azure Synapse](./docs/quickstart.md) to run Feathr with Azure Synapse. +- Please read the [Quick Start Guide for Feathr on Azure Synapse](./docs/quickstart_synapse.md) to run Feathr with Azure Synapse. ## 🔡 Feathr Examples @@ -116,7 +123,7 @@ Read [Point-in-time Correctness and Point-in-time Join in Feathr](https://linked ### Running Feathr Examples -Follow the [quick start Jupyter Notebook](./feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb) to try it out. There is also a companion [quick start guide](https://linkedin.github.io/feathr/quickstart.html) containing a bit more explanation on the notebook. +Follow the [quick start Jupyter Notebook](./feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb) to try it out. 
There is also a companion [quick start guide](https://linkedin.github.io/feathr/quickstart_synapse.html) containing a bit more explanation on the notebook. ## 🗣️ Tech Talks on Feathr diff --git a/docs/README.md b/docs/README.md index 09379a868..6e26207b7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -33,7 +33,7 @@ Feathr automatically computes your feature values and joins them to your trainin Feathr has native cloud integration. To use Feathr on Azure, you only need three steps: -1. Get the `Principal ID` of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select "Bash" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI. +1. Get the `Principal ID` of your account by running `az ad signed-in-user show --query id -o tsv` in the link below (Select "Bash" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI. [Launch Cloud Shell](https://shell.azure.com/bash) @@ -141,7 +141,7 @@ Feathr has native integration with Azure and other cloud services, and here's th ## Quickstart -- [Quickstart](quickstart.md) +- [Quickstart for Azure Synapse](quickstart_synapse.md) ## Concepts diff --git a/docs/concepts/concepts.md b/docs/concepts/concepts.md index 8d5713e13..d91f4291c 100644 --- a/docs/concepts/concepts.md +++ b/docs/concepts/concepts.md @@ -1,7 +1,6 @@ --- layout: default title: Feathr Concepts -nav_order: 3 has_children: true permalink: docs/concepts --- diff --git a/docs/concepts/feathr-concepts-for-beginners.md b/docs/concepts/feathr-concepts-for-beginners.md index 918b45c14..d095b259b 100644 --- a/docs/concepts/feathr-concepts-for-beginners.md +++ b/docs/concepts/feathr-concepts-for-beginners.md @@ -107,9 +107,9 @@ client.get_offline_features(observation_settings=settings, ## What is "materialization" in Feathr? -You are very likely to train a machine learning model with the features that you just queried (with `get_offline_features()`). After you have trained a machine learning model, say a fraud detection model, you are likely to put the machine learning model into an online envrionment and do online inference. +You are very likely to train a machine learning model with the features that you just queried (with `get_offline_features()`). After you have trained a machine learning model, say a fraud detection model, you are likely to put the machine learning model into an online environment and do online inference. -In that case, you will need to retrieve the features (for example the user historical spending) in real time, since the fraud detection model is very time sensitive. Usually some key-value store is used for that scenario (for example Redis), and Feathr will help you to materialize features in the online environment for faster inference. That is why you will see something like below, where you specify Redis as the online storage you want to use, and retrieve features from online envrionment using `get_online_features()` from there: +In that case, you will need to retrieve the features (for example the user historical spending) in real time, since the fraud detection model is very time sensitive. 
Usually some key-value store is used for that scenario (for example Redis), and Feathr will help you to materialize features in the online environment for faster inference. That is why you will see something like below, where you specify Redis as the online storage you want to use, and retrieve features from online environment using `get_online_features()` from there: ```python redisSink = RedisSink(table_name="nycTaxiDemoFeature") @@ -141,4 +141,4 @@ For more details on how to utilize Feathr to perform point-in-time joins, refer ## Next Steps -After you are familar with the above concepts, please check out the [quick start guide](../quickstart.md) to get your hands dirty. Enjoy! \ No newline at end of file +After you are familar with the above concepts, please check out the [quick start guide](../quickstart_synapse.md) to get your hands dirty. Enjoy! \ No newline at end of file diff --git a/docs/dev_guide/dev-guide.md b/docs/dev_guide/dev-guide.md index 60d03e109..6c1dffb37 100644 --- a/docs/dev_guide/dev-guide.md +++ b/docs/dev_guide/dev-guide.md @@ -1,7 +1,6 @@ --- layout: default title: Feathr Developer Guides -nav_order: 5 has_children: true permalink: docs/dev_guide --- diff --git a/docs/how-to-guides/azure-deployment.md b/docs/how-to-guides/azure-deployment.md index adf5e2a48..4df308cdb 100644 --- a/docs/how-to-guides/azure-deployment.md +++ b/docs/how-to-guides/azure-deployment.md @@ -13,7 +13,7 @@ Due to the complexity of the possible cloud environment, it is almost impossible Feathr has native cloud integration and getting started with Feathr is very straightforward. You only need three steps: -1. Get the principal ID of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select "Bash" if you are asked to choose one), and write down that value (will be something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`) +1. Get the principal ID of your account by running `az ad signed-in-user show --query id -o tsv` in the link below (Select "Bash" if you are asked to choose one), and write down that value (will be something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`) [Launch Cloud Shell](https://shell.azure.com/bash) diff --git a/docs/how-to-guides/azure_resource_provision.json b/docs/how-to-guides/azure_resource_provision.json index 33117f60d..ea7c0ad0e 100644 --- a/docs/how-to-guides/azure_resource_provision.json +++ b/docs/how-to-guides/azure_resource_provision.json @@ -1,355 +1,319 @@ { - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "resourcePrefix": { - "minLength": 3, - "maxLength": 15, - "type": "String", - "metadata": { - "description": "Resource prefix for all the resource provisioned. This should be an alphanumeric string." - } - }, - "principalId": { - "type": "String", - "metadata": { - "description": "Specifies the principal ID assigned to the role. 
You can find it by logging into 'https://shell.azure.com/bash' and run 'az ad signed-in-user show --query objectId -o tsv'" - } - }, - "allowAllConnections": { - "defaultValue": "true", - "allowedValues": [ - "true", - "false" - ], - "type": "String", - "metadata": { - "description": "Specifies whether to allow client IPs to connect to Synapse" - } - }, - "provisionPurview": { - "defaultValue": "true", - "allowedValues": [ - "true", - "false" - ], - "type": "String", - "metadata": { - "description": "Whether or not put purview in the provision script" - } - }, - "provisionEventHub": { - "defaultValue": "true", - "allowedValues": [ - "true", - "false" - ], - "type": "String", - "metadata": { - "description": "Whether or not to deploy eventhub provision script" - } - } + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "resourcePrefix": { + "minLength": 3, + "maxLength": 15, + "type": "String", + "metadata": { + "description": "Resource prefix for all the resource provisioned. This should be an alphanumeric string." + } }, - "variables": { - "location": "[resourceGroup().location]", - "tenantId": "[subscription().tenantId]", - "redisCacheName": "[concat(parameters('resourcePrefix'),'redis' )]", - "keyVaultName": "[concat(parameters('resourcePrefix'),'kv')]", - "eventhubNameSpaceName": "[concat(parameters('resourcePrefix'),'ehns')]", - "eventhubName": "[concat(parameters('resourcePrefix'),'eh')]", - "eventhubSku": "Standard", - "eventhubSkuCapacity": 1, - "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", - "redisCache": "[resourceId('Microsoft.Cache/redis', variables('redisCacheName'))]", - "eventhubNameSpace": "[resourceId('Microsoft.EventHub/namespaces/', variables('eventhubNameSpaceName'))]", - "sparkPoolName": "spark31", - "workspaceName": "[toLower(concat(parameters('resourcePrefix'),'syws'))]", - "dlsName": "[toLower(concat(parameters('resourcePrefix'), 'dls'))]", - "dlsFsName": "[toLower(concat(parameters('resourcePrefix'),'fs'))]", - "dlsAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('dlsName'))]", - "purviewName": "[concat(parameters('resourcePrefix'),'purview' )]", - "roleDefinitionIdForBlobContributor": "ba92f5b4-2d11-453d-a403-e96b0029c9fe", - "roleDefinitionIdForKeyVaultSecretsUser": "4633458b-17de-408a-b874-0445c86b69e6", - "roleAssignmentNameForBlobContributor": "[guid(parameters('principalId'), variables('roleDefinitionIdForBlobContributor'), resourceGroup().id)]", - "roleAssignmentNameForKeyVaultSecretsUser": "[guid(parameters('principalId'), variables('roleDefinitionIdForKeyVaultSecretsUser'), resourceGroup().id)]" + "principalId": { + "type": "String", + "metadata": { + "description": "Specifies the principal ID assigned to the role. 
You can find it by logging into 'https://shell.azure.com/bash' and run 'az ad signed-in-user show --query id -o tsv'" + } }, - "functions": [], - "resources": [ - { - "type": "Microsoft.KeyVault/vaults", - "apiVersion": "2021-10-01", - "name": "[variables('keyVaultName')]", - "location": "[variables('location')]", - "properties": { - "tenantId": "[variables('tenantId')]", - "sku": { - "name": "standard", - "family": "A" - }, - "accessPolicies": [], - "enableSoftDelete": true, - "enableRbacAuthorization": true - }, - "resources": [ - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2021-10-01", - "name": "[concat(variables('keyVaultName'), '/FEATHR-PREFIX')]", - "location": "[resourceGroup().location]", - "dependsOn": [ - "[variables('keyVault')]" - ], - "properties": { - "value": "[parameters('resourcePrefix')]" - } - } - ] + "allowAllConnections": { + "defaultValue": "true", + "allowedValues": ["true", "false"], + "type": "String", + "metadata": { + "description": "Specifies whether to allow client IPs to connect to Synapse" + } + }, + "provisionPurview": { + "defaultValue": "true", + "allowedValues": ["true", "false"], + "type": "String", + "metadata": { + "description": "Whether or not put purview in the provision script" + } + }, + "provisionEventHub": { + "defaultValue": "true", + "allowedValues": ["true", "false"], + "type": "String", + "metadata": { + "description": "Whether or not to deploy eventhub provision script" + } + } + }, + "variables": { + "location": "[resourceGroup().location]", + "tenantId": "[subscription().tenantId]", + "redisCacheName": "[concat(parameters('resourcePrefix'),'redis' )]", + "keyVaultName": "[concat(parameters('resourcePrefix'),'kv')]", + "eventhubNameSpaceName": "[concat(parameters('resourcePrefix'),'ehns')]", + "eventhubName": "[concat(parameters('resourcePrefix'),'eh')]", + "eventhubSku": "Standard", + "eventhubSkuCapacity": 1, + "keyVault": "[resourceId('Microsoft.KeyVault/vaults', variables('keyVaultName'))]", + "redisCache": "[resourceId('Microsoft.Cache/redis', variables('redisCacheName'))]", + "eventhubNameSpace": "[resourceId('Microsoft.EventHub/namespaces/', variables('eventhubNameSpaceName'))]", + "sparkPoolName": "spark31", + "workspaceName": "[toLower(concat(parameters('resourcePrefix'),'syws'))]", + "dlsName": "[toLower(concat(parameters('resourcePrefix'), 'dls'))]", + "dlsFsName": "[toLower(concat(parameters('resourcePrefix'),'fs'))]", + "dlsAccount": "[resourceId('Microsoft.Storage/storageAccounts', variables('dlsName'))]", + "purviewName": "[concat(parameters('resourcePrefix'),'purview' )]", + "roleDefinitionIdForBlobContributor": "ba92f5b4-2d11-453d-a403-e96b0029c9fe", + "roleDefinitionIdForKeyVaultSecretsUser": "4633458b-17de-408a-b874-0445c86b69e6", + "roleAssignmentNameForBlobContributor": "[guid(parameters('principalId'), variables('roleDefinitionIdForBlobContributor'), resourceGroup().id)]", + "roleAssignmentNameForKeyVaultSecretsUser": "[guid(parameters('principalId'), variables('roleDefinitionIdForKeyVaultSecretsUser'), resourceGroup().id)]" + }, + "functions": [], + "resources": [ + { + "type": "Microsoft.KeyVault/vaults", + "apiVersion": "2021-10-01", + "name": "[variables('keyVaultName')]", + "location": "[variables('location')]", + "properties": { + "tenantId": "[variables('tenantId')]", + "sku": { + "name": "standard", + "family": "A" }, + "accessPolicies": [], + "enableSoftDelete": true, + "enableRbacAuthorization": true + }, + "resources": [ { - "type": "Microsoft.Cache/redis", - "apiVersion": 
"2021-06-01", - "name": "[variables('redisCacheName')]", - "location": "[resourceGroup().location]", - "tags": { - "displayName": "Feathr Online Store" - }, - "properties": { - "redisVersion": "6", - "sku": { - "name": "Basic", - "family": "C", - "capacity": 2 - } - }, - "resources": [ - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2021-10-01", - "name": "[concat(variables('keyVaultName'), '/FEATHR-ONLINE-STORE-CONN')]", - "location": "[resourceGroup().location]", - "dependsOn": [ - "[variables('keyVault')]", - "[variables('redisCache')]" - ], - "properties": { - "value": "[concat(variables('redisCacheName'),'.redis.cache.windows.net:6380,password=', listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), '2021-06-01').primaryKey, ',ssl=True')]" - } - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2021-10-01", - "name": "[concat(variables('keyVaultName'), '/REDIS-PASSWORD')]", - "location": "[resourceGroup().location]", - "dependsOn": [ - "[variables('keyVault')]", - "[variables('redisCache')]" - ], - "properties": { - "value": "[listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), '2021-06-01').primaryKey]" - } - } - ] - }, + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2021-10-01", + "name": "[concat(variables('keyVaultName'), '/FEATHR-PREFIX')]", + "location": "[resourceGroup().location]", + "dependsOn": ["[variables('keyVault')]"], + "properties": { + "value": "[parameters('resourcePrefix')]" + } + } + ] + }, + { + "type": "Microsoft.Cache/redis", + "apiVersion": "2021-06-01", + "name": "[variables('redisCacheName')]", + "location": "[resourceGroup().location]", + "tags": { + "displayName": "Feathr Online Store" + }, + "properties": { + "redisVersion": "6", + "sku": { + "name": "Basic", + "family": "C", + "capacity": 2 + } + }, + "resources": [ { - "type": "Microsoft.Storage/storageAccounts", - "apiVersion": "2021-08-01", - "name": "[variables('dlsName')]", - "location": "[variables('location')]", - "sku": { - "name": "Standard_LRS", - "tier": "Standard" - }, - "kind": "StorageV2", - "properties": { - "accessTier": "Hot", - "supportsHttpsTrafficOnly": true, - "isHnsEnabled": true - }, - "resources": [ - { - "type": "blobServices/containers", - "apiVersion": "2021-08-01", - "name": "[concat('default/', variables('dlsFsName'))]", - "dependsOn": [ - "[variables('dlsName')]" - ], - "properties": { - "publicAccess": "None" - } - } - ] + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2021-10-01", + "name": "[concat(variables('keyVaultName'), '/FEATHR-ONLINE-STORE-CONN')]", + "location": "[resourceGroup().location]", + "dependsOn": ["[variables('keyVault')]", "[variables('redisCache')]"], + "properties": { + "value": "[concat(variables('redisCacheName'),'.redis.cache.windows.net:6380,password=', listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), '2021-06-01').primaryKey, ',ssl=True')]" + } }, { - "condition": "[equals(parameters('provisionPurview'),'true')]", - "type": "Microsoft.Purview/accounts", - "apiVersion": "2021-07-01", - "name": "[variables('purviewName')]", - "location": "[variables('location')]", - "sku": { - "name": "Standard", - "capacity": 1 - }, - "identity": { - "type": "SystemAssigned" - }, - "properties": { - "cloudConnectors": {}, - "publicNetworkAccess": "Enabled" - } - }, + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2021-10-01", + "name": "[concat(variables('keyVaultName'), '/REDIS-PASSWORD')]", + "location": 
"[resourceGroup().location]", + "dependsOn": ["[variables('keyVault')]", "[variables('redisCache')]"], + "properties": { + "value": "[listKeys(concat('Microsoft.Cache/redis/', variables('redisCacheName')), '2021-06-01').primaryKey]" + } + } + ] + }, + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-08-01", + "name": "[variables('dlsName')]", + "location": "[variables('location')]", + "sku": { + "name": "Standard_LRS", + "tier": "Standard" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "supportsHttpsTrafficOnly": true, + "isHnsEnabled": true + }, + "resources": [ { - "condition": "[equals(parameters('provisionEventHub'),'true')]", - "type": "Microsoft.EventHub/namespaces", - "apiVersion": "2021-11-01", - "name": "[variables('eventhubNameSpaceName')]", - "location": "[variables('location')]", - "sku": { - "name": "[variables('eventhubSku')]", - "tier": "[variables('eventhubSku')]", - "capacity": "[variables('eventhubSkuCapacity')]" - }, - "properties": {}, - "resources": [ - { - "condition": "[equals(parameters('provisionEventHub'),'true')]", - "type": "eventhubs", - "apiVersion": "2021-11-01", - "name": "[variables('eventhubName')]", - "dependsOn": [ - "[variables('eventhubNameSpace')]" - ], - "properties": {} - }, - { - "condition": "[equals(parameters('provisionEventHub'),'true')]", - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2021-10-01", - "name": "[concat(variables('keyVaultName'), '/EVENTHUB-POLICY-KEY')]", - "condition": "[equals(parameters('provisionEventHub'),'true')]", - "location": "[resourceGroup().location]", - "dependsOn": [ - "[variables('keyVault')]", - "[variables('eventhubNameSpace')]" - ], - "properties": { - "value": "[if(equals(parameters('provisionEventHub'),'true'),listKeys(resourceId('Microsoft.EventHub/namespaces/AuthorizationRules',variables('eventhubNameSpaceName'),'RootManageSharedAccessKey'),'2021-11-01').primaryConnectionString,'null' )]" - } - } - ] + "type": "blobServices/containers", + "apiVersion": "2021-08-01", + "name": "[concat('default/', variables('dlsFsName'))]", + "dependsOn": ["[variables('dlsName')]"], + "properties": { + "publicAccess": "None" + } + } + ] + }, + { + "condition": "[equals(parameters('provisionPurview'),'true')]", + "type": "Microsoft.Purview/accounts", + "apiVersion": "2021-07-01", + "name": "[variables('purviewName')]", + "location": "[variables('location')]", + "sku": { + "name": "Standard", + "capacity": 1 + }, + "identity": { + "type": "SystemAssigned" + }, + "properties": { + "cloudConnectors": {}, + "publicNetworkAccess": "Enabled" + } + }, + { + "condition": "[equals(parameters('provisionEventHub'),'true')]", + "type": "Microsoft.EventHub/namespaces", + "apiVersion": "2021-11-01", + "name": "[variables('eventhubNameSpaceName')]", + "location": "[variables('location')]", + "sku": { + "name": "[variables('eventhubSku')]", + "tier": "[variables('eventhubSku')]", + "capacity": "[variables('eventhubSkuCapacity')]" + }, + "properties": {}, + "resources": [ + { + "condition": "[equals(parameters('provisionEventHub'),'true')]", + "type": "eventhubs", + "apiVersion": "2021-11-01", + "name": "[variables('eventhubName')]", + "dependsOn": ["[variables('eventhubNameSpace')]"], + "properties": {} }, { - "type": "Microsoft.Synapse/workspaces", - "apiVersion": "2021-06-01", - "name": "[variables('workspaceName')]", - "location": "[variables('location')]", - "dependsOn": [ - "[variables('dlsName')]", - "[variables('dlsFsName')]" - ], - "identity": { - "type": "SystemAssigned" 
- }, - "properties": { - "defaultDataLakeStorage": { - "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]", - "filesystem": "[variables('dlsFsName')]" - }, - "managedVirtualNetwork": "default" - }, - "resources": [ - { - "type": "firewallrules", - "apiVersion": "2021-06-01", - "name": "allowAll", - "location": "[variables('location')]", - "dependsOn": [ - "[variables('workspaceName')]" - ], - "properties": { - "startIpAddress": "0.0.0.0", - "endIpAddress": "255.255.255.255" - }, - "condition": "[equals(parameters('allowAllConnections'),'true')]" - }, - { - "type": "firewallrules", - "apiVersion": "2021-06-01", - "name": "AllowAllWindowsAzureIps", - "location": "[variables('location')]", - "dependsOn": [ - "[variables('workspaceName')]" - ], - "properties": { - "startIpAddress": "0.0.0.0", - "endIpAddress": "0.0.0.0" - } - }, - { - "type": "managedIdentitySqlControlSettings", - "apiVersion": "2021-06-01", - "name": "default", - "location": "[variables('location')]", - "dependsOn": [ - "[variables('workspaceName')]" - ], - "properties": { - "grantSqlControlToManagedIdentity": { - "desiredState": "Enabled" - } - } - } - ] + "condition": "[equals(parameters('provisionEventHub'),'true')]", + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2021-10-01", + "name": "[concat(variables('keyVaultName'), '/EVENTHUB-POLICY-KEY')]", + "condition": "[equals(parameters('provisionEventHub'),'true')]", + "location": "[resourceGroup().location]", + "dependsOn": [ + "[variables('keyVault')]", + "[variables('eventhubNameSpace')]" + ], + "properties": { + "value": "[if(equals(parameters('provisionEventHub'),'true'),listKeys(resourceId('Microsoft.EventHub/namespaces/AuthorizationRules',variables('eventhubNameSpaceName'),'RootManageSharedAccessKey'),'2021-11-01').primaryConnectionString,'null' )]" + } + } + ] + }, + { + "type": "Microsoft.Synapse/workspaces", + "apiVersion": "2021-06-01", + "name": "[variables('workspaceName')]", + "location": "[variables('location')]", + "dependsOn": ["[variables('dlsName')]", "[variables('dlsFsName')]"], + "identity": { + "type": "SystemAssigned" + }, + "properties": { + "defaultDataLakeStorage": { + "accountUrl": "[reference(variables('dlsName')).primaryEndpoints.dfs]", + "filesystem": "[variables('dlsFsName')]" }, + "managedVirtualNetwork": "default" + }, + "resources": [ { - "type": "Microsoft.Synapse/workspaces/bigDataPools", - "apiVersion": "2021-06-01", - "name": "[concat(variables('workspaceName'), '/', variables('sparkPoolName'))]", - "location": "[variables('location')]", - "dependsOn": [ - "[variables('workspaceName')]" - ], - "properties": { - "autoPause": { - "delayInMinutes": 30, - "enabled": true - }, - "autoScale": { - "enabled": true, - "minNodeCount": 1, - "maxNodeCount": 3 - }, - "sparkVersion": "3.1", - "nodeCount": 3, - "nodeSizeFamily": "MemoryOptimized", - "nodeSize": "Medium" - } + "type": "firewallrules", + "apiVersion": "2021-06-01", + "name": "allowAll", + "location": "[variables('location')]", + "dependsOn": ["[variables('workspaceName')]"], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "255.255.255.255" + }, + "condition": "[equals(parameters('allowAllConnections'),'true')]" }, { - "type": "Microsoft.Authorization/roleAssignments", - "apiVersion": "2020-10-01-preview", - "name": "[variables('roleAssignmentNameForBlobContributor')]", - "dependsOn": [ - "[variables('dlsAccount')]" - ], - "properties": { - "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', 
variables('roleDefinitionIdForBlobContributor'))]", - "principalId": "[parameters('principalId')]", - "scope": "[resourceGroup().id]" - } + "type": "firewallrules", + "apiVersion": "2021-06-01", + "name": "AllowAllWindowsAzureIps", + "location": "[variables('location')]", + "dependsOn": ["[variables('workspaceName')]"], + "properties": { + "startIpAddress": "0.0.0.0", + "endIpAddress": "0.0.0.0" + } }, { - "type": "Microsoft.Authorization/roleAssignments", - "apiVersion": "2020-10-01-preview", - "name": "[variables('roleAssignmentNameForKeyVaultSecretsUser')]", - "dependsOn": [ - "[variables('keyVault')]" - ], - "properties": { - "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', variables('roleDefinitionIdForKeyVaultSecretsUser'))]", - "principalId": "[parameters('principalId')]", - "scope": "[resourceGroup().id]" + "type": "managedIdentitySqlControlSettings", + "apiVersion": "2021-06-01", + "name": "default", + "location": "[variables('location')]", + "dependsOn": ["[variables('workspaceName')]"], + "properties": { + "grantSqlControlToManagedIdentity": { + "desiredState": "Enabled" } + } } - ], - "outputs": {} -} \ No newline at end of file + ] + }, + { + "type": "Microsoft.Synapse/workspaces/bigDataPools", + "apiVersion": "2021-06-01", + "name": "[concat(variables('workspaceName'), '/', variables('sparkPoolName'))]", + "location": "[variables('location')]", + "dependsOn": ["[variables('workspaceName')]"], + "properties": { + "autoPause": { + "delayInMinutes": 30, + "enabled": true + }, + "autoScale": { + "enabled": true, + "minNodeCount": 1, + "maxNodeCount": 3 + }, + "sparkVersion": "3.1", + "nodeCount": 3, + "nodeSizeFamily": "MemoryOptimized", + "nodeSize": "Medium" + } + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-10-01-preview", + "name": "[variables('roleAssignmentNameForBlobContributor')]", + "dependsOn": ["[variables('dlsAccount')]"], + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', variables('roleDefinitionIdForBlobContributor'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + } + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-10-01-preview", + "name": "[variables('roleAssignmentNameForKeyVaultSecretsUser')]", + "dependsOn": ["[variables('keyVault')]"], + "properties": { + "roleDefinitionId": "[resourceId('Microsoft.Authorization/roleDefinitions', variables('roleDefinitionIdForKeyVaultSecretsUser'))]", + "principalId": "[parameters('principalId')]", + "scope": "[resourceGroup().id]" + } + } + ], + "outputs": {} +} diff --git a/docs/how-to-guides/deployment/deploy.json b/docs/how-to-guides/deployment/deploy.json index 0c8241eef..3e3dd40f5 100644 --- a/docs/how-to-guides/deployment/deploy.json +++ b/docs/how-to-guides/deployment/deploy.json @@ -21,16 +21,13 @@ "principalId": { "type": "string", "metadata": { - "description": "Specifies the principal ID assigned to the role. You can find it by logging into 'https://shell.azure.com/bash' and run 'az ad signed-in-user show --query objectId -o tsv' " + "description": "Specifies the principal ID assigned to the role. 
You can find it by logging into 'https://shell.azure.com/bash' and run 'az ad signed-in-user show --query id -o tsv' "
      }
    },
    "allowAllConnections": {
      "type": "string",
      "defaultValue": "true",
-      "allowedValues": [
-        "true",
-        "false"
-      ],
+      "allowedValues": ["true", "false"],
      "metadata": {
        "description": "Specifies whether to allow client IPs to connect to Synapse"
      }
@@ -635,4 +632,4 @@
        "value": "[parameters('resourcePrefix')]"
      }
    }
-}
\ No newline at end of file
+}
diff --git a/docs/how-to-guides/deployment/main.bicep b/docs/how-to-guides/deployment/main.bicep
index aef63e2b4..566ff6668 100644
--- a/docs/how-to-guides/deployment/main.bicep
+++ b/docs/how-to-guides/deployment/main.bicep
@@ -8,7 +8,7 @@ targetScope = 'subscription'
 @minLength(3)
 param resourcePrefix string = 'feathr${take(newGuid(),5)}'
 
-@description('Specifies the principal ID assigned to the role. You can find it by logging into \'https://shell.azure.com/bash\' and run \'az ad signed-in-user show --query objectId -o tsv\' ')
+@description('Specifies the principal ID assigned to the role. You can find it by logging into \'https://shell.azure.com/bash\' and run \'az ad signed-in-user show --query id -o tsv\' ')
 param principalId string
 
 @description('Specifies whether to allow client IPs to connect to Synapse')
diff --git a/docs/how-to-guides/feathr-configuration-and-env.md b/docs/how-to-guides/feathr-configuration-and-env.md
new file mode 100644
index 000000000..382932ea1
--- /dev/null
+++ b/docs/how-to-guides/feathr-configuration-and-env.md
@@ -0,0 +1,114 @@
+---
+layout: default
+title: Configuration, environment variables, and store secrets in a secure way
+parent: Feathr How-to Guides
+---
+
+# Configuration and environment variables in Feathr
+
+Feathr uses a YAML file and a few environment variables to give end users more flexibility. See an example of these configurations in [this file](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml).
+
+That YAML file contains the configurations used by Feathr. All the configurations can be overwritten by environment variables whose names concatenate the layers of this config file with `__`. For example, `feathr_runtime_location` for Databricks can be overwritten by setting the environment variable `SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION`. You can set it in Python:
+
+```python
+os.environ['SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION'] = "https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar"
+```
+
+or in a shell environment:
+
+```bash
+export SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION=https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar
+```
+
+This allows end users to store the configuration in a secure way, say in Kubernetes secrets, a key vault, etc. All the configurations available for end users to configure are listed below.
+
+
+# Default behaviors
+
+Feathr will get the configurations in the following order:
+
+1. If the key is set as an environment variable, Feathr will use the value of that environment variable.
+2. If it's not set in the environment, then a value is retrieved from the feathr_config.yaml file with the same config key.
+3. If it's not available in the feathr_config.yaml file, Feathr will try to retrieve the value from a key vault service. Currently only Azure Key Vault is supported. 
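+
+To make the order above concrete, here is a minimal sketch of such a lookup. This is not Feathr's actual implementation; the helper name `get_config_value` and the `feathr_config.yaml` path are illustrative only, and it assumes the `pyyaml`, `azure-identity`, and `azure-keyvault-secrets` packages are installed. The underscore-to-dash conversion for Key Vault secret names is explained at the end of this page.
+
+```python
+import os
+import yaml
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
+
+def get_config_value(key: str, akv_name: str = None):
+    """Illustrative lookup order: environment variable -> feathr_config.yaml -> Azure Key Vault."""
+    # 1. An environment variable with the (upper-cased) key wins if it is set.
+    if key.upper() in os.environ:
+        return os.environ[key.upper()]
+
+    # 2. Otherwise fall back to feathr_config.yaml, where `__` separates the layers of the config,
+    #    e.g. SPARK_CONFIG__DATABRICKS__WORK_DIR -> spark_config -> databricks -> work_dir.
+    with open("feathr_config.yaml") as f:
+        config = yaml.safe_load(f)
+    node = config
+    for layer in key.lower().split("__"):
+        if not isinstance(node, dict) or layer not in node:
+            node = None
+            break
+        node = node[layer]
+    if node is not None:
+        return node
+
+    # 3. Finally try Azure Key Vault; secret names use dashes instead of underscores.
+    if akv_name:
+        secret_client = SecretClient(
+            vault_url=f"https://{akv_name}.vault.azure.net",
+            credential=DefaultAzureCredential(),
+        )
+        return secret_client.get_secret(key.upper().replace("_", "-")).value
+    return None
+```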
+
+# A list of environment variables that Feathr uses
+| Environment Variable | Description | Required? |
+| -------------- | -------------- | -------------- |
+| SECRETS__AZURE_KEY_VAULT__NAME | Name of the Azure Key Vault service so that Feathr can get credentials from that service. | Optional |
+| AZURE_CLIENT_ID | Client ID for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using a Service Principal to log in with Feathr. |
+| AZURE_TENANT_ID | Tenant ID for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using a Service Principal to log in with Feathr. |
+| AZURE_CLIENT_SECRET | Client secret for authentication into Azure Services. Read [here](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) for more details. | This is required if you are using a Service Principal to log in with Feathr. |
+| OFFLINE_STORE__ADLS__ADLS_ENABLED | Whether to enable ADLS as offline store or not. | Optional |
+| ADLS_ACCOUNT | ADLS account that you connect to. | Required if using ADLS as an offline store. |
+| ADLS_KEY | ADLS key that you connect to. | Required if using ADLS as an offline store. |
+| OFFLINE_STORE__WASB__WASB_ENABLED | Whether to enable Azure BLOB storage as offline store or not. | Optional |
+| WASB_ACCOUNT | Azure BLOB Storage account that you connect to. | Required if using Azure BLOB Storage as an offline store. |
+| WASB_KEY | Azure BLOB Storage key that you connect to. | Required if using Azure BLOB Storage as an offline store. |
+| S3_ACCESS_KEY | AWS S3 access key for the S3 account. | Required if using AWS S3 Storage as an offline store. |
+| S3_SECRET_KEY | AWS S3 secret key for the S3 account. | Required if using AWS S3 Storage as an offline store. |
+| OFFLINE_STORE__S3__S3_ENABLED | Whether to enable S3 as offline store or not. | Optional |
+| OFFLINE_STORE__S3__S3_ENDPOINT | S3 endpoint. If you use an S3 endpoint, then you need to provide the access key and secret key in environment variables as well. | Required if using AWS S3 Storage as an offline store. |
+| OFFLINE_STORE__JDBC__JDBC_ENABLED | Whether to enable JDBC as offline store or not. | Optional |
+| OFFLINE_STORE__JDBC__JDBC_DATABASE | If using a JDBC endpoint as offline store, this config specifies the JDBC database to read from. | Required if using JDBC sources as offline store |
+| OFFLINE_STORE__JDBC__JDBC_TABLE | If using a JDBC endpoint as offline store, this config specifies the JDBC table to read from. Same as `JDBC_TABLE`. | Required if using JDBC sources as offline store |
+| JDBC_TABLE | If using a JDBC endpoint as offline store, this config specifies the JDBC table to read from. | Required if using JDBC sources as offline store |
+| JDBC_USER | If using a JDBC endpoint as offline store, this config specifies the JDBC user. | Required if using JDBC sources as offline store |
+| JDBC_PASSWORD | If using a JDBC endpoint as offline store, this config specifies the JDBC password. | Required if using JDBC sources as offline store |
+| KAFKA_SASL_JAAS_CONFIG | If using EventHub as a streaming input source, this configures the Kafka stream. If using EventHub, read [here](https://github.com/Azure/azure-event-hubs-for-kafka#updating-your-kafka-client-configuration) for how to get this string from the existing string in Azure Portal. | Required if using Kafka/EventHub as streaming source input. |
+| PROJECT_CONFIG__PROJECT_NAME | Configures the project name. | Required |
+| OFFLINE_STORE__SNOWFLAKE__URL | Configures the Snowflake URL. Usually it's something like `dqllago-ol19457.snowflakecomputing.com`. | Required if using Snowflake as an offline store. |
+| OFFLINE_STORE__SNOWFLAKE__USER | Configures the Snowflake user. | Required if using Snowflake as an offline store. |
+| OFFLINE_STORE__SNOWFLAKE__ROLE | Configures the Snowflake role. Usually it's something like `ACCOUNTADMIN`. | Required if using Snowflake as an offline store. |
+| JDBC_SF_PASSWORD | Configures the Snowflake password. | Required if using Snowflake as an offline store. |
+| SPARK_CONFIG__SPARK_CLUSTER | Choice of Spark runtime. Currently supported: `azure_synapse`, `databricks`. The `databricks` configs will be ignored if `azure_synapse` is set and vice versa. | Required |
+| SPARK_CONFIG__SPARK_RESULT_OUTPUT_PARTS | Configures the number of parts for the Spark output of the feature generation job. | Required |
+| SPARK_CONFIG__AZURE_SYNAPSE__DEV_URL | Dev URL to the Synapse cluster. Usually it's something like `https://yourclustername.dev.azuresynapse.net` | Required if using Azure Synapse |
+| SPARK_CONFIG__AZURE_SYNAPSE__POOL_NAME | Name of the Spark pool that you are going to use. | Required if using Azure Synapse |
+| SPARK_CONFIG__AZURE_SYNAPSE__WORKSPACE_DIR | A location that Synapse has access to. This workspace dir stores all the required configuration files and the jar resources. All the feature definitions will be uploaded here. | Required if using Azure Synapse |
+| SPARK_CONFIG__AZURE_SYNAPSE__EXECUTOR_SIZE | Specifies the executor size for the Azure Synapse cluster. Currently the options are `Small`, `Medium`, `Large`. | Required if using Azure Synapse |
+| SPARK_CONFIG__AZURE_SYNAPSE__EXECUTOR_NUM | Specifies the number of executors for the Azure Synapse cluster. | Required if using Azure Synapse |
+| SPARK_CONFIG__AZURE_SYNAPSE__FEATHR_RUNTIME_LOCATION | Specifies the Feathr runtime location. Supports local paths, paths starting with `http(s)://`, and paths starting with `abfss:/`. If not set, will use the [Feathr package published in Maven](https://search.maven.org/artifact/com.linkedin.feathr/feathr_2.12). | Required if using Azure Synapse |
+| SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL | Workspace instance URL for your Databricks cluster. Will be something like this: `https://adb-6885802458123232.12.azuredatabricks.net/` | Required if using Databricks |
+| SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE | Config string including runtime information, Spark version, machine size, etc. See [below](#sparkconfigdatabricksconfigtemplate) for more details. | Required if using Databricks |
+| SPARK_CONFIG__DATABRICKS__WORK_DIR | Workspace dir for storing all the required configuration files and the jar resources. All the feature definitions will be uploaded here. | Required if using Databricks |
+| SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION | Feathr runtime location. Supports local paths, paths starting with `http(s)://`, and paths starting with `dbfs:/`. If not set, will use the [Feathr package published in Maven](https://search.maven.org/artifact/com.linkedin.feathr/feathr_2.12). | Required if using Databricks |
+| ONLINE_STORE__REDIS__HOST | Redis host name to access Redis cluster. 
| Required if using Redis as online store. |
+| ONLINE_STORE__REDIS__PORT | Redis port number to access Redis cluster. | Required if using Redis as online store. |
+| ONLINE_STORE__REDIS__SSL_ENABLED | Whether SSL is enabled to access Redis cluster. | Required if using Redis as online store. |
+| REDIS_PASSWORD | Password for the Redis cluster. | Required if using Redis as online store. |
+| FEATURE_REGISTRY__PURVIEW__PURVIEW_NAME | Configures the name of the Purview endpoint. | Required if using Purview as the endpoint. |
+| FEATURE_REGISTRY__PURVIEW__DELIMITER | See [here](#featureregistrypurviewdelimiter) for more details. | Required |
+| FEATURE_REGISTRY__PURVIEW__TYPE_SYSTEM_INITIALIZATION | Controls whether the type system (think of this as the "schema" for the registry) will be initialized or not. Usually this only needs to be set to `True` once to initialize the schema, and then you can set it to `False` to shorten the initialization time. | Required |
+
+# Explanation for selected configurations
+## SPARK_CONFIG__DATABRICKS__CONFIG_TEMPLATE
+
+Essentially it's a compact JSON string that represents the important configurations for the Databricks cluster that you use. The parts marked as "FEATHR_FILL_IN" will be filled in by Feathr, but all the other parts are customizable.
+
+Essentially, the config template represents what is going to be submitted to a Databricks cluster, and you can see the structure of this configuration template by visiting the [Databricks job runs API](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit).
+
+The most important and useful part would be the `new_cluster` section. For example, you can change `spark_version`, `node_type_id`, `num_workers`, etc. based on your environment.
+
+```json
+{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","node_type_id":"Standard_D3_v2","num_workers":2,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"}},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
+```
+
+Another use case is to use `instance_pool_id`, where instead of creating the Spark cluster from scratch every time, you can reuse a pool to run the job and make the run time shorter:
+
+```json
+{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":2,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}
+```
+
+Other advanced settings include `idempotency_token`, which guarantees the idempotency of job run requests, etc.
+
+
+## FEATURE_REGISTRY__PURVIEW__DELIMITER
+The delimiter indicates how the project name, feature names, etc. are delimited. By default it will be '__'. This is for global reference (mainly for feature sharing). For example, when we set up a project called 'foo' with an anchor called 'taxi_driver' and a feature called 'f_daily_trips', the feature will have the globally unique name 'foo__taxi_driver__f_daily_trips'.
+
+
+# A note on using Azure Key Vault to store credentials
+
+Feathr has native integration with Azure Key Vault to make it more secure to access resources. However, Azure Key Vault doesn't allow underscores `_` in secret names, so Feathr will automatically convert underscore `_` to dash `-`. 
For example, Feathr will look for `ONLINE-STORE--REDIS--HOST` in Azure Key Vault if the actual environment variable is `ONLINE_STORE__REDIS__HOST`. + +Azure Key Vault is not case sensitive, so `online_store__redis__host` and `ONLINE_STORE__REDIS__HOST` will result in the same request to Azure Key Vault and yield the same result. \ No newline at end of file diff --git a/docs/how-to-guides/how-to-guides.md b/docs/how-to-guides/how-to-guides.md index a68e27077..40a959c72 100644 --- a/docs/how-to-guides/how-to-guides.md +++ b/docs/how-to-guides/how-to-guides.md @@ -1,7 +1,6 @@ --- layout: default title: Feathr How-to Guides -nav_order: 4 has_children: true permalink: docs/how-to-guides --- diff --git a/docs/images/databricks_quickstart1.png b/docs/images/databricks_quickstart1.png new file mode 100644 index 000000000..135e59e8f Binary files /dev/null and b/docs/images/databricks_quickstart1.png differ diff --git a/docs/images/databricks_quickstart2.png b/docs/images/databricks_quickstart2.png new file mode 100644 index 000000000..8b7e911c5 Binary files /dev/null and b/docs/images/databricks_quickstart2.png differ diff --git a/docs/images/product_recommendation.jpg b/docs/images/product_recommendation.jpg new file mode 100644 index 000000000..29b0ad6e7 Binary files /dev/null and b/docs/images/product_recommendation.jpg differ diff --git a/docs/images/product_recommendation_advanced.jpg b/docs/images/product_recommendation_advanced.jpg new file mode 100644 index 000000000..9f7e56fc4 Binary files /dev/null and b/docs/images/product_recommendation_advanced.jpg differ diff --git a/docs/quickstart_databricks.md b/docs/quickstart_databricks.md index 6fd38e3f8..96bf6d508 100644 --- a/docs/quickstart_databricks.md +++ b/docs/quickstart_databricks.md @@ -6,4 +6,15 @@ nav_order: 3 # Feathr Quickstart Guide for Databricks -For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb) to your Databricks cluster and just run it in the databricks cluster. It has been pre-configured to use the current databricks cluster to submit jobs. \ No newline at end of file +For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb) to your Databricks cluster and just run it in the databricks cluster. It has been pre-configured to use the current databricks cluster to submit jobs. + +1. Import Notebooks in your databricks cluster: + +![Import Notebooks](./images/databricks_quickstart1.png) + + +2. Paste the [link to databricks getting started notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb): + +![Import Notebooks](./images/databricks_quickstart2.png) + +3. Run the whole notebook. It will automatically install Feathr in your cluster and run the feature ingestion jobs. \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart_synapse.md similarity index 96% rename from docs/quickstart.md rename to docs/quickstart_synapse.md index 20a75492f..e3aad375f 100644 --- a/docs/quickstart.md +++ b/docs/quickstart_synapse.md @@ -23,7 +23,7 @@ First step is to provision required cloud resources if you want to use Feathr. F Feathr has native cloud integration. To use Feathr on Azure, you only need three steps: -1. Get the `Principal ID` of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select "Bash" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). 
Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI. +1. Get the `Principal ID` of your account by running `az ad signed-in-user show --query id -o tsv` in the link below (Select "Bash" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI. [Launch Cloud Shell](https://shell.azure.com/bash) diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb index 49e0c1897..8a35fa42f 100644 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n","\n","The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n","\n","- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n","- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n","\n","However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)."]},{"cell_type":"markdown","metadata":{},"source":["\n","# Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install and set up Feathr with Azure\n","2. Create shareable features with Feathr feature definition configs.\n","3. Create a training dataset via point-in-time feature join.\n","4. Compute and write features.\n","5. Train a model using these features to predict fares.\n","6. Materialize feature value to online store.\n","7. Fetch feature value in real-time from online store for online scoring.\n","\n","In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n","\n","![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f00b9d0b-94d1-418f-89b9-25bbacb8b068","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["! 
pip install feathr pandavro scikit-learn"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","import pandas as pd\n","import pandavro as pdx\n","from feathr import FeathrClient\n","from feathr import BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.metrics import mean_squared_error\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient\n","import json\n","import requests"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["Get the required databricks credentials automatically:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["# Get current databricks notebook context\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","host_name = ctx.tags().get(\"browserHostName\").get()\n","host_token = ctx.apiToken().get()\n","cluster_id = ctx.tags().get(\"clusterId\").get()\n","\n","\n","feathr_runtime_location = \"https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\"\n","\n","databricks_config = {'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n","# os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n","os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n","os.environ['spark_config__databricks__feathr_runtime_location']=feathr_runtime_location\n","os.environ['project_config__project_name']='feathr_getting_started'\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token"]},{"cell_type":"markdown","metadata":{},"source":["You 
need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Get redis credentials; This is to parse Redis connection string.\n","redis_port=\"\"\n","redis_host=\"\"\n","redis_password=\"\"\n","redis_ssl=\"\"\n","\n","# Set the resource link\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["Configure required credentials (skip if you don't use those):"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'feathr_getting_started2'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n"," - 'AZURE_CLIENT_ID'\n"," - 'AZURE_TENANT_ID'\n"," - 'AZURE_CLIENT_SECRET'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," # choice for spark runtime. 
Currently support: azure_synapse, databricks\n"," # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n"," spark_cluster: \"databricks\"\n"," spark_result_output_parts: \"1\"\n","\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," purview:\n"," type_system_initialization: true\n"," purview_name: 'feathrazuretest3-purview1'\n"," delimiter: '__'\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["# Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["## View the data\n","\n","In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["## Defining Features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n","\n","\n","1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n","2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","3. The feature value is the actual value of that aspect at a particular time, e.g. 
the value is 30 at year 2022."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"16420730-582e-4e11-a343-efc0ddd35108","showTitle":false,"title":""}},"source":["Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n","It is merely a function/transformation executing against request data at runtime.\n","For example, the day of week of the request, which is calculated by converting the request UNIX timestamp."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["### Define Sources Section with UDFs\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n"," event_timestamp_column=\"lpep_dropoff_datetime\",\n"," timestamp_format=\"yyyy-MM-dd HH:mm:ss\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["### Define Anchors and Features\n","A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["f_trip_distance = Feature(name=\"f_trip_distance\",\n"," feature_type=FLOAT, transform=\"trip_distance\")\n","\n","features = [\n"," f_trip_distance,\n"," Feature(name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"cast_float(trip_distance)>30\"),\n"," Feature(name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\"),\n","]\n","\n","request_anchor = FeatureAnchor(name=\"request_features\",\n"," source=INPUT_CONTEXT,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["### Window aggregation features\n","\n","For window aggregation features, see the supported fields below:\n","\n","Note that the `agg_func` should be any of these:\n","\n","| Aggregation Type | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. 
|\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |\n","\n","\n","After you have defined features and sources, bring them together to build an anchor:\n","\n","\n","Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["location_id = TypedKey(key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\")\n","agg_features = [Feature(name=\"f_location_avg_fare\",\n"," key=location_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n"," agg_func=\"AVG\",\n"," window=\"90d\")),\n"," Feature(name=\"f_location_max_fare\",\n"," key=location_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n"," agg_func=\"MAX\",\n"," window=\"90d\")),\n"," ]\n","\n","agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n"," source=batch_source,\n"," features=agg_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d2ecaca9-057e-4b36-811f-320f66f753ed","showTitle":false,"title":""}},"source":["### Derived Features Section\n","Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"270fb11e-8a71-404f-9639-ad29d8e6a2c1","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["\n","f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n"," feature_type=INT32,\n"," input_features=[f_trip_distance],\n"," transform=\"f_trip_distance * 10\")\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["And then we need to build those features so that it can be consumed later. 
Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n"," f_trip_distance_rounded])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## Create training data using point-in-time correct feature join\n","\n","A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["\n","output_path = 'dbfs:/feathrazure_test.avro'\n","\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n","settings = ObservationSettings(\n"," observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n"," event_timestamp_column=\"lpep_dropoff_datetime\",\n"," timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path\n"," )\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"51f078e3-3f8f-4f10-b7f1-499ac8a9ff07","showTitle":false,"title":""}},"source":["## Download the result and show the result\n","\n","Let's use the helper function `get_result_df` to download the result and view it:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"23c797b2-ac1a-4cf3-b0ed-c05216de3f37","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["from feathr.job_utils import get_result_df\n","df_res = get_result_df(client, format=\"avro\", res_url = 
output_path)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9be042e-eb12-46b9-9d91-a0e5dd0c704f","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## Train a machine learning model\n","After getting all the features, let's train a machine learning model with the converted feature by Feathr:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"84745f36-5bac-49c0-903b-38828b923c7c","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["# remove columns\n","from sklearn.ensemble import GradientBoostingRegressor\n","final_df = df_res\n","final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n"," \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n","final_df.fillna(0, inplace=True)\n","final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n","\n","\n","train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n"," final_df[\"fare_amount\"],\n"," test_size=0.2,\n"," random_state=42)\n","model = GradientBoostingRegressor()\n","model.fit(train_x, train_y)\n","\n","y_predict = model.predict(test_x)\n","\n","y_actual = test_y.values.flatten().tolist()\n","rmse = sqrt(mean_squared_error(y_actual, y_predict))\n","\n","sum_actuals = sum_errors = 0\n","\n","for actual_val, predict_val in zip(y_actual, y_predict):\n"," abs_error = actual_val - predict_val\n"," if abs_error < 0:\n"," abs_error = abs_error * -1\n","\n"," sum_errors = sum_errors + abs_error\n"," sum_actuals = sum_actuals + actual_val\n","\n","mean_abs_percent_error = sum_errors / sum_actuals\n","print(\"Model MAPE:\")\n","print(mean_abs_percent_error)\n","print()\n","print(\"Model Accuracy:\")\n","print(1 - mean_abs_percent_error)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["## Materialize feature value into offline/online storage\n","\n","While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n","and materialize the feature value to offline and/or online storage. 
\n","\n","We can push the generated features to the online store like below:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3b924c66-8634-42fe-90f3-c844487d3f75","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n","settings = MaterializationSettings(\"nycTaxiTable\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=500)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["We can then get the features from the online store (Redis):"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["## Fetching feature value for online inference\n","\n","For features that are already materialized by the previous step, their latest value can be queried via the client's\n","`get_online_features` or `multi_get_online_features` API."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["res = client.get_online_features('nycTaxiDemoFeature', '265', [\n"," 'f_location_avg_fare', 'f_location_max_fare'])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n"," 'f_location_avg_fare', 'f_location_max_fare'])"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"nyc_driver_demo","notebookOrigID":930353059183053,"widgets":{}},"interpreter":{"hash":"830c16c5b424e7ff512f67d4056b67cea1a756a7ad6a92c98b9e2b95c5e484ae"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.5"}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"384e5e16-7213-4186-9d04-09d03b155534","showTitle":false,"title":""}},"source":["# Feathr Feature Store 
on Databricks Demo Notebook\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n","\n","The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n","\n","- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n","- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n","\n","However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)."]},{"cell_type":"markdown","metadata":{},"source":["\n","# Notebook Steps\n","\n","This tutorial demonstrates the key capabilities of Feathr, including:\n","\n","1. Install and set up Feathr with Azure\n","2. Create shareable features with Feathr feature definition configs.\n","3. Create a training dataset via point-in-time feature join.\n","4. Compute and write features.\n","5. Train a model using these features to predict fares.\n","6. Materialize feature value to online store.\n","7. Fetch feature value in real-time from online store for online scoring.\n","\n","In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n","\n","![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f00b9d0b-94d1-418f-89b9-25bbacb8b068","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["! 
pip install feathr pandavro scikit-learn"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"80223a02-631c-40c8-91b3-a037249ffff9","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","import pandas as pd\n","import pandavro as pdx\n","from feathr import FeathrClient\n","from feathr import BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.metrics import mean_squared_error\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient\n","import json\n","import requests"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"41d3648a-9bc9-40dc-90da-bc82b21ef9b3","showTitle":false,"title":""}},"source":["Get the required databricks credentials automatically:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"331753d6-1850-47b5-ad97-84b7c01d79d1","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["# Get current databricks notebook context\n","ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n","host_name = ctx.tags().get(\"browserHostName\").get()\n","host_token = ctx.apiToken().get()\n","cluster_id = ctx.tags().get(\"clusterId\").get()\n","\n","\n","feathr_runtime_location = \"https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\"\n","\n","# databricks_config = {'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n","os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n","os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n","# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n","os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n","os.environ['spark_config__databricks__feathr_runtime_location']=feathr_runtime_location\n","os.environ['project_config__project_name']='feathr_getting_started'\n","os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token"]},{"cell_type":"markdown","metadata":{},"source":["You 
need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Get redis credentials; This is to parse Redis connection string.\n","redis_port=\"\"\n","redis_host=\"\"\n","redis_password=\"\"\n","redis_ssl=\"\"\n","\n","# Set the resource link\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee","showTitle":false,"title":""}},"source":["Configure required credentials (skip if you don't use those):"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"8cd64e3a-376c-48e6-ba41-5197f3591d48","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'feathr_getting_started2'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n"," - 'AZURE_CLIENT_ID'\n"," - 'AZURE_TENANT_ID'\n"," - 'AZURE_CLIENT_SECRET'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," # choice for spark runtime. 
Currently support: azure_synapse, databricks\n"," # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n"," spark_cluster: \"databricks\"\n"," spark_result_output_parts: \"1\"\n","\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," purview:\n"," type_system_initialization: true\n"," purview_name: 'feathrazuretest3-purview1'\n"," delimiter: '__'\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3fef7f2f-df19-4f53-90a5-ff7999ed983d","showTitle":false,"title":""}},"source":["# Initialize Feathr Client"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9713a2df-c7b2-4562-88b0-b7acce3cc43a","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c3b64bda-d42c-4a64-b976-0fb604cf38c5","showTitle":false,"title":""}},"source":["## View the data\n","\n","In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c4ccd7b3-298a-4e5a-8eec-b7e309db393e","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["import pandas as pd\n","pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7430c942-64e5-4b70-b823-16ce1d1b3cee","showTitle":false,"title":""}},"source":["## Defining Features with Feathr\n","\n","In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n","\n","\n","1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n","2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n","3. The feature value is the actual value of that aspect at a particular time, e.g. 
the value is 30 at year 2022."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"16420730-582e-4e11-a343-efc0ddd35108","showTitle":false,"title":""}},"source":["Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n","It is merely a function/transformation executing against request data at runtime.\n","For example, the day of week of the request, which is calculated by converting the request UNIX timestamp."]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"728d2d5f-c11f-4941-bdc5-48507f5749f1","showTitle":false,"title":""}},"source":["### Define Sources Section with UDFs\n","A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3cc59a0e-a41b-480e-a84e-ca5443d63143","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n"," path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n"," event_timestamp_column=\"lpep_dropoff_datetime\",\n"," timestamp_format=\"yyyy-MM-dd HH:mm:ss\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"46f863c4-bb81-434a-a448-6b585031a221","showTitle":false,"title":""}},"source":["### Define Anchors and Features\n","A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a373ecbe-a040-4cd3-9d87-0d5f4c5ba553","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["f_trip_distance = Feature(name=\"f_trip_distance\",\n"," feature_type=FLOAT, transform=\"trip_distance\")\n","\n","features = [\n"," f_trip_distance,\n"," Feature(name=\"f_is_long_trip_distance\",\n"," feature_type=BOOLEAN,\n"," transform=\"cast_float(trip_distance)>30\"),\n"," Feature(name=\"f_day_of_week\",\n"," feature_type=INT32,\n"," transform=\"dayofweek(lpep_dropoff_datetime)\"),\n","]\n","\n","request_anchor = FeatureAnchor(name=\"request_features\",\n"," source=INPUT_CONTEXT,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"149f85e2-fa3c-4895-b0c5-de5543ca9b6d","showTitle":false,"title":""}},"source":["### Window aggregation features\n","\n","For window aggregation features, see the supported fields below:\n","\n","Note that the `agg_func` should be any of these:\n","\n","| Aggregation Type | Input Type | Description |\n","| --- | --- | --- |\n","|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. 
|\n","|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n","|LATEST| Any |Returns the latest not-null values from within the defined time window |\n","\n","\n","After you have defined features and sources, bring them together to build an anchor:\n","\n","\n","Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"05633bc3-9118-449b-9562-45fc437576c2","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["location_id = TypedKey(key_column=\"DOLocationID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"location id in NYC\",\n"," full_name=\"nyc_taxi.location_id\")\n","agg_features = [Feature(name=\"f_location_avg_fare\",\n"," key=location_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n"," agg_func=\"AVG\",\n"," window=\"90d\")),\n"," Feature(name=\"f_location_max_fare\",\n"," key=location_id,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n"," agg_func=\"MAX\",\n"," window=\"90d\")),\n"," ]\n","\n","agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n"," source=batch_source,\n"," features=agg_features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d2ecaca9-057e-4b36-811f-320f66f753ed","showTitle":false,"title":""}},"source":["### Derived Features Section\n","Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"270fb11e-8a71-404f-9639-ad29d8e6a2c1","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["\n","f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n"," feature_type=INT32,\n"," input_features=[f_trip_distance],\n"," transform=\"f_trip_distance * 10\")\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"ad102c45-586d-468c-85f0-9454401ef10b","showTitle":false,"title":""}},"source":["And then we need to build those features so that it can be consumed later. 
Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"91bb5ebb-87e4-470b-b8eb-1c89b351740e","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n"," f_trip_distance_rounded])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"632d5f46-f9e2-41a8-aab7-34f75206e2aa","showTitle":false,"title":""}},"source":["## Create training data using point-in-time correct feature join\n","\n","A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n","\n","To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n","what features and how these features should be joined to the observation data. \n","\n","To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["\n","output_path = 'dbfs:/feathrazure_test.avro'\n","\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n","settings = ObservationSettings(\n"," observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n"," event_timestamp_column=\"lpep_dropoff_datetime\",\n"," timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path\n"," )\n","client.wait_job_to_finish(timeout_sec=500)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"51f078e3-3f8f-4f10-b7f1-499ac8a9ff07","showTitle":false,"title":""}},"source":["## Download the result and show the result\n","\n","Let's use the helper function `get_result_df` to download the result and view it:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"23c797b2-ac1a-4cf3-b0ed-c05216de3f37","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["from feathr.job_utils import get_result_df\n","df_res = get_result_df(client, format=\"avro\", res_url = 
output_path)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9be042e-eb12-46b9-9d91-a0e5dd0c704f","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f","showTitle":false,"title":""}},"source":["## Train a machine learning model\n","After getting all the features, let's train a machine learning model with the converted feature by Feathr:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"84745f36-5bac-49c0-903b-38828b923c7c","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["# remove columns\n","from sklearn.ensemble import GradientBoostingRegressor\n","final_df = df_res\n","final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n"," \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n","final_df.fillna(0, inplace=True)\n","final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n","\n","\n","train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n"," final_df[\"fare_amount\"],\n"," test_size=0.2,\n"," random_state=42)\n","model = GradientBoostingRegressor()\n","model.fit(train_x, train_y)\n","\n","y_predict = model.predict(test_x)\n","\n","y_actual = test_y.values.flatten().tolist()\n","rmse = sqrt(mean_squared_error(y_actual, y_predict))\n","\n","sum_actuals = sum_errors = 0\n","\n","for actual_val, predict_val in zip(y_actual, y_predict):\n"," abs_error = actual_val - predict_val\n"," if abs_error < 0:\n"," abs_error = abs_error * -1\n","\n"," sum_errors = sum_errors + abs_error\n"," sum_actuals = sum_actuals + actual_val\n","\n","mean_abs_percent_error = sum_errors / sum_actuals\n","print(\"Model MAPE:\")\n","print(mean_abs_percent_error)\n","print()\n","print(\"Model Accuracy:\")\n","print(1 - mean_abs_percent_error)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5a226026-1c7b-48db-8f91-88d5c2ddf023","showTitle":false,"title":""}},"source":["## Materialize feature value into offline/online storage\n","\n","While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n","and materialize the feature value to offline and/or online storage. 
\n","\n","We can push the generated features to the online store like below:"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3b924c66-8634-42fe-90f3-c844487d3f75","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n","settings = MaterializationSettings(\"nycTaxiTable\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=500)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd","showTitle":false,"title":""}},"source":["We can then get the features from the online store (Redis):"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"bef93538-9591-4247-97b6-289d2055b7b1","showTitle":false,"title":""}},"source":["## Fetching feature value for online inference\n","\n","For features that are already materialized by the previous step, their latest value can be queried via the client's\n","`get_online_features` or `multi_get_online_features` API."]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0c3d5f35-11a3-4644-9992-5860169d8302","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["res = client.get_online_features('nycTaxiDemoFeature', '265', [\n"," 'f_location_avg_fare', 'f_location_max_fare'])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4d4699ed-42e6-408f-903d-2f799284f4b6","showTitle":false,"title":""}},"outputs":[{"data":{"text/html":[""]},"metadata":{"application/vnd.databricks.v1+output":{"arguments":{},"data":"","errorSummary":"","errorTraceType":null,"metadata":{},"type":"ipynbError"}},"output_type":"display_data"}],"source":["client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n"," 'f_location_avg_fare', 'f_location_max_fare'])"]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"nyc_driver_demo","notebookOrigID":930353059183053,"widgets":{}},"interpreter":{"hash":"830c16c5b424e7ff512f67d4056b67cea1a756a7ad6a92c98b9e2b95c5e484ae"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.5"}},"nbformat":4,"nbformat_minor":0} diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigUtils.scala 
b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigUtils.scala index 0c8993e58..a97100e30 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigUtils.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigUtils.scala @@ -35,6 +35,7 @@ object DataSourceConfigUtils { blobConfigStr = cmdParser.extractOptionalValue("blob-config"), sqlConfigStr = cmdParser.extractOptionalValue("sql-config"), snowflakeConfigStr = cmdParser.extractOptionalValue("snowflake-config"), + monitoringConfigStr = cmdParser.extractOptionalValue("monitoring-config"), kafkaConfigStr = cmdParser.extractOptionalValue("kafka-config") ) } @@ -45,6 +46,7 @@ object DataSourceConfigUtils { BlobResourceInfoSetter.setup(ss, configs.blobConfig, resource) S3ResourceInfoSetter.setup(ss, configs.s3Config, resource) SnowflakeResourceInfoSetter.setup(ss, configs.snowflakeConfig, resource) + MonitoringResourceInfoSetter.setup(ss, configs.monitoringConfig, resource) KafkaResourceInfoSetter.setup(ss, configs.kafkaConfig, resource) } diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigs.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigs.scala index a57e84da0..b8156d991 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigs.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/DataSourceConfigs.scala @@ -17,6 +17,7 @@ class DataSourceConfigs( val blobConfigStr: Option[String] = None, val sqlConfigStr: Option[String] = None, val snowflakeConfigStr: Option[String] = None, + val monitoringConfigStr: Option[String] = None, val kafkaConfigStr: Option[String] = None ) { val redisConfig: DataSourceConfig = parseConfigStr(redisConfigStr) @@ -25,6 +26,7 @@ class DataSourceConfigs( val blobConfig: DataSourceConfig = parseConfigStr(blobConfigStr) val sqlConfig: DataSourceConfig = parseConfigStr(sqlConfigStr) val snowflakeConfig: DataSourceConfig = parseConfigStr(snowflakeConfigStr) + val monitoringConfig: DataSourceConfig = parseConfigStr(monitoringConfigStr) val kafkaConfig: DataSourceConfig = parseConfigStr(kafkaConfigStr) def parseConfigStr(configStr: Option[String] = None): DataSourceConfig = { diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/MonitoringResourceInfoSetter.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/MonitoringResourceInfoSetter.scala new file mode 100644 index 000000000..bdb36f3e6 --- /dev/null +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/config/datasource/MonitoringResourceInfoSetter.scala @@ -0,0 +1,26 @@ +package com.linkedin.feathr.offline.config.datasource + +import org.apache.spark.sql.SparkSession + +private[feathr] class MonitoringResourceInfoSetter extends ResourceInfoSetter() { + override val params: List[String] = List() + + override def setupHadoopConfig(ss: SparkSession, context: Option[DataSourceConfig], resource: Option[Resource]): Unit = { + context.foreach(dataSourceConfig => { + ss.conf.set("monitoring_database_url", getAuthFromContext("MONITORING_DATABASE_SQL_URL", dataSourceConfig)) + ss.conf.set("monitoring_database_user", getAuthFromContext("MONITORING_DATABASE_SQL_USER", dataSourceConfig)) + ss.conf.set("monitoring_database_password", getAuthFromContext("MONITORING_DATABASE_SQL_PASSWORD", dataSourceConfig)) + }) + } + 
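+  // Note: monitoring credentials are supplied only through the job context (read via getAuthFromContext above
+  // and pushed into spark conf); the resource-based lookup below is left unimplemented (???).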
+ override def getAuthFromConfig(str: String, resource: Resource): String = ??? +} + + +private[feathr] object MonitoringResourceInfoSetter{ + val monitoringSetter = new MonitoringResourceInfoSetter() + + def setup(ss: SparkSession, config: DataSourceConfig, resource: Resource): Unit ={ + monitoringSetter.setup(ss, config, resource) + } +} \ No newline at end of file diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringProcessor.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringProcessor.scala new file mode 100644 index 000000000..47d08c544 --- /dev/null +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringProcessor.scala @@ -0,0 +1,34 @@ +package com.linkedin.feathr.offline.generation.outputProcessor + +import com.linkedin.feathr.common.Header +import com.linkedin.feathr.common.configObj.generation.OutputProcessorConfig +import com.linkedin.feathr.offline.generation.FeatureGenUtils +import com.linkedin.feathr.offline.generation.outputProcessor.PushToRedisOutputProcessor.TABLE_PARAM_CONFIG_NAME +import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} + +/** + * feature generation output processor used to generate feature monitoring stats and pushed to sink + * @param config config object of output processor, built from the feature generation config + */ + +private[offline] class FeatureMonitoringProcessor(config: OutputProcessorConfig, endTimeOpt: Option[String] = None) extends WriteToHDFSOutputProcessor(config, endTimeOpt, dataLoaderHandlers=List()) { + /** + * process single dataframe, e.g, convert feature data schema + * + * @param ss spark session + * @param df feature dataframe + * @param header meta info of the input dataframe + * @param parentPath path to save feature data + * @return processed dataframe and header + */ + override def processSingle(ss: SparkSession, df: DataFrame, header: Header, parentPath: String): (DataFrame, Header) = { + val keyColumns = FeatureGenUtils.getKeyColumnsFromHeader(header) + + val tableName = config.getParams.getString(TABLE_PARAM_CONFIG_NAME) + val allFeatureCols = header.featureInfoMap.map(x => (x._2.columnName)).toSet + + FeatureMonitoringUtils.writeToRedis(ss, df, tableName, keyColumns, allFeatureCols, SaveMode.Overwrite) + (df, header) + } +} + diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringUtils.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringUtils.scala new file mode 100644 index 000000000..95675e8a4 --- /dev/null +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/generation/outputProcessor/FeatureMonitoringUtils.scala @@ -0,0 +1,126 @@ +package com.linkedin.feathr.offline.generation.outputProcessor + +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} + +object FeatureMonitoringUtils { + def writeToRedis(ss: SparkSession, df: DataFrame, tableName: String, keyColumns: Seq[String], allFeatureCols: Set[String], saveMode: SaveMode): Unit = { + df.show(10) + + val dfSchema = df.schema + dfSchema.indices.foreach(index => { + val field = dfSchema.fields(index) + val fieldName = field.name + if (allFeatureCols.contains(fieldName)) { + field.dataType match { + case DoubleType | FloatType | IntegerType | LongType => + val missing = 
df.filter(col(fieldName).isNull).count() + val total = df.count() +// +------------+------------+----------+----+---+---+---+--------+ +// |feature_name|feature_type| date|mean|avg|min|max|coverage| +// +------------+------------+----------+----+---+---+---+--------+ +// | f_int| integer|2022-06-09| 0.5|0.5| 0| 1| 1.0| +// +------------+------------+----------+----+---+---+---+--------+ +// +// +------------+------------+----------+------------------+------------------+-------------------+------------------+--------+ +// |feature_name|feature_type| date| mean| avg| min| max|coverage| +// +------------+------------+----------+------------------+------------------+-------------------+------------------+--------+ +// | f_double| double|2022-06-09|0.6061345296768118|0.6061345296768118|0.13751738103840128|0.9651418273038033| 1.0| +// +------------+------------+----------+------------------+------------------+-------------------+------------------+--------+ + val stats_df = df.select( + lit(fieldName).name("feature_name"), + lit(field.dataType.typeName).name("feature_type"), + current_date().name("date"), + mean(df(fieldName)).name("mean"), + avg(df(fieldName)).name("avg"), + min(df(fieldName)).name("min"), + max(df(fieldName)).name("max"), + lit((total - missing) * 1.0 / total).name("coverage") + ) + + stats_df.show() + writeToSql(ss, stats_df, fieldName, saveMode) + case StringType | BooleanType => + // Will add support for more stats as we have more user requirements + // The difficulty with term frequency is that it requires a different table other than the scalar stats. +// val frequencyDf = df +// .select( +// lit(fieldName).name("feature_name"), +// lit(field.dataType.typeName).name("feature_type"), +// current_date(), +// col(fieldName), +// ) +// .groupBy(fieldName) +// .count() +// .select( +// col("*"), +// lit(fieldName).name("feature_name"), +// lit(field.dataType.typeName).name("feature_type"), +// current_date() +// ) +// writeToSql(frequencyDf, fieldName + "_frequency") + + val missing = df.filter(col(fieldName).isNull).count() + val total = df.count() + // cardinality is defined as the number of elements in a set or other grouping, as a property of that grouping. 
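+        // Computed here as the number of distinct values in the column (a null group, if present, counts as one).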
+ val cardinality = df.groupBy(fieldName).count().count() + +// +------------+------------+----------+-----+------+--------+-----------+ +// |feature_name|feature_type| date| min| max|coverage|cardinality| +// +------------+------------+----------+-----+------+--------+-----------+ +// | f_string| string|2022-06-09|apple|orange| 0.9| 3| +// +------------+------------+----------+-----+------+--------+-----------+ +// +------------+------------+----------+-----+----+--------+-----------+ +// |feature_name|feature_type| date| min| max|coverage|cardinality| +// +------------+------------+----------+-----+----+--------+-----------+ +// | f_boolean| boolean|2022-06-09|false|true| 1.0| 2| +// +------------+------------+----------+-----+----+--------+-----------+ + val stats_df = df.select( + lit(fieldName).name("feature_name"), + lit(field.dataType.typeName).name("feature_type"), + current_date().name("date"), + min(df(fieldName)).name("min"), + max(df(fieldName)).name("max"), + lit((total - missing) * 1.0 / total).name("coverage"), + lit(cardinality).name("cardinality") + ) + + writeToSql(ss, stats_df, fieldName, saveMode) + case _ => + (rowData: Any) => { + throw new RuntimeException(f"The data type(${field.dataType}) and data (${rowData}) is not supported in monitoring yet.") + } + } + } + }) + } + + /** + * Write the feature monitoring results(usually stats) to SQL database. + */ + private def writeToSql(ss: SparkSession, stats_df: DataFrame, tableName: String, saveMode: SaveMode): Unit = { + if (!ss.sparkContext.isLocal) { + val url = ss.conf.get("monitoring_database_url") + val username = ss.conf.get("monitoring_database_user") + val password = ss.conf.get("monitoring_database_password") + + println("monitoring output:") + println("url: " + url) + println("username: " + username) + + stats_df.write + .format("jdbc") + .option("url", url) + .option("dbtable", tableName) + .option("user", username) + .option("password", password) + .option("ssl", true) + .option("sslmode", "require") + .mode(saveMode) + .save() + } else { + stats_df.show(10) + } + } +} diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenJob.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenJob.scala index 394bd66b3..cb9dda05d 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenJob.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenJob.scala @@ -47,6 +47,7 @@ object FeatureGenJob { "blob-config" -> OptionParam("bc", "Authentication config for Azure Blob Storage (wasb)", "BLOB_CONFIG", ""), "sql-config" -> OptionParam("sqlc", "Authentication config for Azure SQL Database (jdbc)", "SQL_CONFIG", ""), "snowflake-config" -> OptionParam("sfc", "Authentication config for Snowflake Database (jdbc)", "SNOWFLAKE_CONFIG", ""), + "monitoring-config" -> OptionParam("mc", "Feature monitoring related configs", "MONITORING_CONFIG", ""), "kafka-config" -> OptionParam("kc", "Authentication config for Kafka", "KAFKA_CONFIG", "") ) val extraOptions = List(new CmdOption("LOCALMODE", "local-mode", false, "Run in local mode")) @@ -65,6 +66,8 @@ object FeatureGenJob { val dataSourceConfigs = DataSourceConfigUtils.getConfigs(cmdParser) val featureGenJobContext = new FeatureGenJobContext(workDir, paramsOverride, featureConfOverride) + println("dataSourceConfigs: ") + println(dataSourceConfigs) (applicationConfigPath, featureDefinitionsInput, featureGenJobContext, dataSourceConfigs) } @@ -208,7 +211,7 @@ object FeatureGenJob { val 
feathrClient = FeathrClient.builder(sparkSession) .addFeatureDef(featureConfig) - .addLocalOverrideDef(localFeatureConfigWithOverride) + .addLocalOverrideDef(localFeatureConfigWithOverride) .build() val allAnchoredFeatures = feathrClient.allAnchoredFeatures diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenSpec.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenSpec.scala index f974c0aa1..caf6e0e28 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenSpec.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/job/FeatureGenSpec.scala @@ -5,7 +5,7 @@ import com.linkedin.feathr.common.configObj.configbuilder.{FeatureGenConfigBuild import com.linkedin.feathr.common.configObj.generation.{FeatureGenConfig, OfflineOperationalConfig, OutputProcessorConfig} import com.linkedin.feathr.common.exception.{ErrorLabel, FeathrDataOutputException} import com.linkedin.feathr.common.{DateParam, DateTimeParam, DateTimeUtils, RichConfig} -import com.linkedin.feathr.offline.generation.outputProcessor.{PushToRedisOutputProcessor, WriteToHDFSOutputProcessor} +import com.linkedin.feathr.offline.generation.outputProcessor.{FeatureMonitoringProcessor, PushToRedisOutputProcessor, WriteToHDFSOutputProcessor} import com.linkedin.feathr.offline.util.{FeatureGenConstants, IncrementalAggUtils} import com.linkedin.feathr.offline.source.dataloader.DataLoaderHandler import com.linkedin.feathr.sparkcommon.OutputProcessor @@ -55,6 +55,10 @@ class FeatureGenSpec(private val featureGenConfig: FeatureGenConfig, dataLoaderH val params = config.getParams val decoratedConfig = OutputProcessorBuilder.build(config.getName, params) new PushToRedisOutputProcessor(decoratedConfig, None) + case FeatureGenConstants.MONITORING_OUTPUT_PROCESSOR_NAME => + val params = config.getParams + val decoratedConfig = OutputProcessorBuilder.build(config.getName, params) + new FeatureMonitoringProcessor(decoratedConfig, None) case _ => throw new FeathrDataOutputException(ErrorLabel.FEATHR_USER_ERROR, "Custom output processor is not yet supported.") } diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/util/FeatureGenUtils.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/util/FeatureGenUtils.scala index 3ccbe57f7..26339a94f 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/util/FeatureGenUtils.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/util/FeatureGenUtils.scala @@ -16,6 +16,7 @@ import java.util.TimeZone private[offline] object FeatureGenConstants { val HDFS_OUTPUT_PROCESSOR_NAME = "HDFS" val REDIS_OUTPUT_PROCESSOR_NAME = "REDIS" + val MONITORING_OUTPUT_PROCESSOR_NAME = "MONITORING" val OUTPUT_TIME_PATH = "outputTimePath" val SAVE_SCHEMA_META = "saveSchemaMeta" val WORK_DIR = "workDir" diff --git a/feathr-impl/src/test/resources/mockdata/feature_monitoring_mock_data/feature_monitoring_data.csv b/feathr-impl/src/test/resources/mockdata/feature_monitoring_mock_data/feature_monitoring_data.csv new file mode 100644 index 000000000..faf4f804f --- /dev/null +++ b/feathr-impl/src/test/resources/mockdata/feature_monitoring_mock_data/feature_monitoring_data.csv @@ -0,0 +1,11 @@ +user_id,value1,value2,value3,value4,value_string,value_boolean +1,1,2,3,4,apple,true +2,1,2,3,4,apple,false +3,1,2,3,4,,false +4,1,2,3,4,orange,false +5,1,2,3,4,orange,false +6,1,2,3,4,orange,false +7,1,2,3,4,orange,false +8,1,2,3,4,orange,false +9,1,2,3,4,orange,false +10,1,2,3,4,orange,true \ No newline at end of 
file diff --git a/feathr-impl/src/test/scala/com/linkedin/feathr/offline/FeatureMonitoringIntegTest.scala b/feathr-impl/src/test/scala/com/linkedin/feathr/offline/FeatureMonitoringIntegTest.scala new file mode 100644 index 000000000..d901665f2 --- /dev/null +++ b/feathr-impl/src/test/scala/com/linkedin/feathr/offline/FeatureMonitoringIntegTest.scala @@ -0,0 +1,105 @@ +package com.linkedin.feathr.offline + +import org.testng.annotations.Test + +/** + * Integration tests to test feature monitoring APIs in feathr offline. + */ +class FeatureMonitoringIntegTest extends FeathrIntegTest { + /** + * Test scalar features + */ + @Test(enabled = true) + def testFeatureGenWithApplicationConfig(): Unit = { + val applicationConfig = + s""" + | operational: { + | name: generateWithDefaultParams + | endTime: 2021-01-02 + | endTimeFormat: "yyyy-MM-dd" + | resolution: DAILY + | output:[ + | { + | name: MONITORING + | params: { + | table_name: "monitoringFeatures" + | } + | } + | ] + |} + |features: [f_string, f_int, f_null, f_double, f_null, f_boolean + |] + """.stripMargin + val featureDefConfig = + """ + |anchors: { + | anchor: { + | source: featureMonitoringSource + | key: user_id + | features: { + | f_string: { + | def: "value_string" + | type : { + | type: TENSOR + | tensorCategory: DENSE + | dimensionType: [] + | valType: STRING + | } + | } + | f_int: { + | def: "import java.util.Random; Random random = new Random(); random.nextInt(2)" + | type : { + | type: TENSOR + | tensorCategory: DENSE + | dimensionType: [] + | valType: INT + | } + | } + | f_null: { + | def: "null" + | type : { + | type: TENSOR + | tensorCategory: DENSE + | dimensionType: [] + | valType: DOUBLE + | } + | } + | f_double: { + | def: "import java.util.Random; Random random = new Random(); random.nextDouble()" + | type : { + | type: TENSOR + | tensorCategory: DENSE + | dimensionType: [] + | valType: DOUBLE + | } + | } + | f_boolean: { + | def: "Boolean.valueOf(value_boolean)" + | type : { + | type: TENSOR + | tensorCategory: DENSE + | dimensionType: [] + | valType: BOOLEAN + | } + | } + | } + | } + |} + | + |derivations: { + | f_derived: { + | definition: "f_double * f_double" + | type: NUMERIC + | } + |} + |sources: { + | featureMonitoringSource: { + | location: { path: "/feature_monitoring_mock_data/feature_monitoring_data.csv" } + | } + |} + |""".stripMargin + + val res = localFeatureGenerate(applicationConfig, featureDefConfig) + res.head._2.data.show(100) + } +} diff --git a/feathr_project/feathr/__init__.py b/feathr_project/feathr/__init__.py index b62529dd5..bfa4b0895 100644 --- a/feathr_project/feathr/__init__.py +++ b/feathr_project/feathr/__init__.py @@ -9,6 +9,7 @@ from .definition.source import * from .definition.typed_key import * from .definition.materialization_settings import * +from .definition.monitoring_settings import * from .definition.sink import * from .definition.query_feature_list import * from .definition.lookup_feature import * @@ -19,7 +20,7 @@ from .api.app.core.feathr_api_exception import * # skipped class as they are internal methods: -# RepoDefinitions, HoconConvertible, +# RepoDefinitions, HoconConvertible, # expose the modules so docs can build # referencee: https://stackoverflow.com/questions/15115514/how-do-i-document-classes-without-the-module-name/31594545#31594545 @@ -30,45 +31,45 @@ __all__ = [ - 'FeatureJoinJobParams', - 'FeatureGenerationJobParams', - 'FeathrClient', - 'DerivedFeature', - 'FeatureAnchor', - 'Feature', - 'ValueType', - 'WindowAggTransformation', - 'TypedKey', - 'DUMMYKEY', - 
'BackfillTime', - 'MaterializationSettings', - 'RedisSink', - 'FeatureQuery', - 'LookupFeature', - 'Aggregation', - 'get_result_df', - 'AvroJsonSchema', - 'Source', - 'InputContext', - 'HdfsSource', + 'FeatureJoinJobParams', + 'FeatureGenerationJobParams', + 'FeathrClient', + 'DerivedFeature', + 'FeatureAnchor', + 'Feature', + 'ValueType', + 'WindowAggTransformation', + 'TypedKey', + 'DUMMYKEY', + 'BackfillTime', + 'MaterializationSettings', + 'MonitoringSettings', + 'RedisSink', + 'MonitoringSqlSink', + 'FeatureQuery', + 'LookupFeature', + 'Aggregation', + 'get_result_df', + 'AvroJsonSchema', + 'Source', + 'InputContext', + 'HdfsSource', 'KafkaConfig', - 'KafKaSource', - 'ValueType', - 'BooleanFeatureType', - 'Int32FeatureType', - 'Int64FeatureType', - 'FloatFeatureType', - 'DoubleFeatureType', - 'StringFeatureType', + 'KafKaSource', + 'ValueType', + 'BooleanFeatureType', + 'Int32FeatureType', + 'Int64FeatureType', + 'FloatFeatureType', + 'DoubleFeatureType', + 'StringFeatureType', 'BytesFeatureType', - 'FloatVectorFeatureType', - 'Int32VectorFeatureType', - 'Int64VectorFeatureType', - 'DoubleVectorFeatureType', - 'FeatureNameValidationError', - 'ObservationSettings', - 'FeaturePrinter', + 'FloatVectorFeatureType', + 'Int32VectorFeatureType', + 'Int64VectorFeatureType', + 'DoubleVectorFeatureType', + 'FeatureNameValidationError', + 'ObservationSettings', + 'FeaturePrinter', 'SparkExecutionConfiguration', ] - - diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index bd9087861..a4069a9bb 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -22,6 +22,7 @@ from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration from feathr.definition.feature_derivations import DerivedFeature from feathr.definition.materialization_settings import MaterializationSettings +from feathr.definition.monitoring_settings import MonitoringSettings from feathr.protobuf.featureValue_pb2 import FeatureValue from feathr.definition.query_feature_list import FeatureQuery from feathr.definition.settings import ObservationSettings @@ -90,7 +91,7 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir self.logger = logging.getLogger(__name__) # Redis key separator self._KEY_SEPARATOR = ':' - envutils = _EnvVaraibleUtil(config_path) + self.envutils = _EnvVaraibleUtil(config_path) if local_workspace_dir: self.local_workspace_dir = local_workspace_dir else: @@ -98,32 +99,30 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir tem_dir_obj = tempfile.TemporaryDirectory() self.local_workspace_dir = tem_dir_obj.name - self.envutils = envutils - if not os.path.exists(config_path): - self.logger.warning('Configuration path does not exist, you need to set the environment variables explicitly. For all the environment variables, please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml') + self.logger.warning('No Configuration file exist at the user provided config_path or the default config_path (./feathr_config.yaml), you need to set the environment variables explicitly. For all the environment variables that you need to set, please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml') # Load all configs from yaml at initialization # DO NOT load any configs from yaml during runtime. 
- self.project_name = envutils.get_environment_variable_with_default( + self.project_name = self.envutils.get_environment_variable_with_default( 'project_config', 'project_name') # Redis configs - self.redis_host = envutils.get_environment_variable_with_default( + self.redis_host = self.envutils.get_environment_variable_with_default( 'online_store', 'redis', 'host') - self.redis_port = envutils.get_environment_variable_with_default( + self.redis_port = self.envutils.get_environment_variable_with_default( 'online_store', 'redis', 'port') - self.redis_ssl_enabled = envutils.get_environment_variable_with_default( + self.redis_ssl_enabled = self.envutils.get_environment_variable_with_default( 'online_store', 'redis', 'ssl_enabled') # S3 configs - self.s3_endpoint = envutils.get_environment_variable_with_default( + self.s3_endpoint = self.envutils.get_environment_variable_with_default( 'offline_store', 's3', 's3_endpoint') # spark configs - self.output_num_parts = envutils.get_environment_variable_with_default( + self.output_num_parts = self.envutils.get_environment_variable_with_default( 'spark_config', 'spark_result_output_parts') - self.spark_runtime = envutils.get_environment_variable_with_default( + self.spark_runtime = self.envutils.get_environment_variable_with_default( 'spark_config', 'spark_cluster') self.credential = credential @@ -135,22 +134,22 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir # Spark job submission. The feathr jar hosted in cloud saves the time users needed to upload the jar from # their local env. self._FEATHR_JOB_JAR_PATH = \ - envutils.get_environment_variable_with_default( + self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'feathr_runtime_location') if self.credential is None: self.credential = DefaultAzureCredential(exclude_interactive_browser_credential=False) self.feathr_spark_laucher = _FeathrSynapseJobLauncher( - synapse_dev_url=envutils.get_environment_variable_with_default( + synapse_dev_url=self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'dev_url'), - pool_name=envutils.get_environment_variable_with_default( + pool_name=self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'pool_name'), - datalake_dir=envutils.get_environment_variable_with_default( + datalake_dir=self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'workspace_dir'), - executor_size=envutils.get_environment_variable_with_default( + executor_size=self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'executor_size'), - executors=envutils.get_environment_variable_with_default( + executors=self.envutils.get_environment_variable_with_default( 'spark_config', 'azure_synapse', 'executor_num'), credential=self.credential ) @@ -159,17 +158,17 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir # Spark job submission. The feathr jar hosted in cloud saves the time users needed to upload the jar from # their local env. 
self._FEATHR_JOB_JAR_PATH = \ - envutils.get_environment_variable_with_default( + self.envutils.get_environment_variable_with_default( 'spark_config', 'databricks', 'feathr_runtime_location') self.feathr_spark_laucher = _FeathrDatabricksJobLauncher( - workspace_instance_url=envutils.get_environment_variable_with_default( + workspace_instance_url=self.envutils.get_environment_variable_with_default( 'spark_config', 'databricks', 'workspace_instance_url'), - token_value=_EnvVaraibleUtil.get_environment_variable( + token_value=self.envutils.get_environment_variable( 'DATABRICKS_WORKSPACE_TOKEN_VALUE'), - config_template=envutils.get_environment_variable_with_default( + config_template=self.envutils.get_environment_variable_with_default( 'spark_config', 'databricks', 'config_template'), - databricks_work_dir=envutils.get_environment_variable_with_default( + databricks_work_dir=self.envutils.get_environment_variable_with_default( 'spark_config', 'databricks', 'work_dir') ) @@ -177,9 +176,9 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir # initialize registry - self.registry_delimiter = envutils.get_environment_variable_with_default( + self.registry_delimiter = self.envutils.get_environment_variable_with_default( 'feature_registry', 'purview', 'delimiter') - self.azure_purview_name = envutils.get_environment_variable_with_default( + self.azure_purview_name = self.envutils.get_environment_variable_with_default( 'feature_registry', 'purview', 'purview_name') # initialize the registry no matter whether we set purview name or not, given some of the methods are used there. self.registry = _FeatureRegistry(self.project_name, self.azure_purview_name, self.registry_delimiter, project_registry_tag, config_path = config_path, credential=self.credential) @@ -382,7 +381,7 @@ def _construct_redis_client(self): """Constructs the Redis client. The host, port, credential and other parameters can be set via environment parameters. """ - password = _EnvVaraibleUtil.get_environment_variable(REDIS_PASSWORD) + password = self.envutils.get_environment_variable(REDIS_PASSWORD) host = self.redis_host port = self.redis_port ssl_enabled = self.redis_ssl_enabled @@ -400,7 +399,7 @@ def get_offline_features(self, observation_settings: ObservationSettings, feature_query: Union[FeatureQuery, List[FeatureQuery]], output_path: str, - execution_configuratons: Union[SparkExecutionConfiguration ,Dict[str,str]] = None, + execution_configuratons: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, udf_files = None, verbose: bool = False ): @@ -449,7 +448,7 @@ def get_offline_features(self, write_to_file(content=config, full_file_name=config_file_path) return self._get_offline_features_with_config(config_file_path, execution_configuratons, udf_files=udf_files) - def _get_offline_features_with_config(self, feature_join_conf_path='feature_join_conf/feature_join.conf', execution_configuratons: Dict[str,str] = None, udf_files=[]): + def _get_offline_features_with_config(self, feature_join_conf_path='feature_join_conf/feature_join.conf', execution_configuratons: Dict[str,str] = {}, udf_files=[]): """Joins the features to your offline observation dataset based on the join config. 
Args: @@ -524,7 +523,16 @@ def wait_job_to_finish(self, timeout_sec: int = 300): else: raise RuntimeError('Spark job failed.') - def materialize_features(self, settings: MaterializationSettings, execution_configuratons: Union[SparkExecutionConfiguration ,Dict[str,str]] = None, verbose: bool = False): + def monitor_features(self, settings: MonitoringSettings, execution_configuratons: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, verbose: bool = False): + """Create an offline job to generate statistics to monitor feature data + + Args: + settings: Feature monitoring settings + execution_configuratons: a dict that will be passed to spark job when the job starts up, i.e. the "spark configurations". Note that not all of the configurations will be honored since some of the configurations are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of spark configurations. + """ + self.materialize_features(settings, execution_configuratons, verbose) + + def materialize_features(self, settings: MaterializationSettings, execution_configuratons: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, verbose: bool = False): """Materialize feature data Args: @@ -557,7 +565,7 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf if verbose and settings: FeaturePrinter.pretty_print_materialize_features(settings) - def _materialize_features_with_config(self, feature_gen_conf_path: str = 'feature_gen_conf/feature_gen.conf',execution_configuratons: Dict[str,str] = None, udf_files=[]): + def _materialize_features_with_config(self, feature_gen_conf_path: str = 'feature_gen_conf/feature_gen.conf',execution_configuratons: Dict[str,str] = {}, udf_files=[]): """Materializes feature data based on the feature generation config. The feature data will be materialized to the destination specified in the feature generation config. @@ -577,26 +585,31 @@ def _materialize_features_with_config(self, feature_gen_conf_path: str = 'featur Job configurations and job arguments (or sometimes called job parameters) have quite some overlaps (i.e. you can achieve the same goal by either using the job arguments/parameters vs. job configurations). But the job tags should just be used for metadata purpose.
''' optional_params = [] - if _EnvVaraibleUtil.get_environment_variable('KAFKA_SASL_JAAS_CONFIG'): + if self.envutils.get_environment_variable('KAFKA_SASL_JAAS_CONFIG'): optional_params = optional_params + ['--kafka-config', self._get_kafka_config_str()] - return self.feathr_spark_laucher.submit_feathr_job( - job_name=self.project_name + '_feathr_feature_materialization_job', - main_jar_path=self._FEATHR_JOB_JAR_PATH, - python_files=cloud_udf_paths, - main_class_name='com.linkedin.feathr.offline.job.FeatureGenJob', - arguments=[ + arguments = [ '--generation-config', self.feathr_spark_laucher.upload_or_get_cloud_path( - generation_config.generation_config_path), + generation_config.generation_config_path), # Local Config, comma seperated file names '--feature-config', self.feathr_spark_laucher.upload_or_get_cloud_path( - generation_config.feature_config), + generation_config.feature_config), '--redis-config', self._getRedisConfigStr(), '--s3-config', self._get_s3_config_str(), '--adls-config', self._get_adls_config_str(), '--blob-config', self._get_blob_config_str(), '--sql-config', self._get_sql_config_str(), - '--snowflake-config', self._get_snowflake_config_str() - ] + optional_params, + '--snowflake-config', self._get_snowflake_config_str(), + ] + optional_params + monitoring_config_str = self._get_monitoring_config_str() + if monitoring_config_str: + arguments.append('--monitoring-config') + arguments.append(monitoring_config_str) + return self.feathr_spark_laucher.submit_feathr_job( + job_name=self.project_name + '_feathr_feature_materialization_job', + main_jar_path=self._FEATHR_JOB_JAR_PATH, + python_files=cloud_udf_paths, + main_class_name='com.linkedin.feathr.offline.job.FeatureGenJob', + arguments=arguments, reference_files_path=[], configuration=execution_configuratons, ) @@ -613,7 +626,7 @@ def wait_job_to_finish(self, timeout_sec: int = 300): def _getRedisConfigStr(self): """Construct the Redis config string. The host, port, credential and other parameters can be set via environment variables.""" - password = _EnvVaraibleUtil.get_environment_variable(REDIS_PASSWORD) + password = self.envutils.get_environment_variable(REDIS_PASSWORD) host = self.redis_host port = self.redis_port ssl_enabled = self.redis_ssl_enabled @@ -631,8 +644,8 @@ def _get_s3_config_str(self): endpoint = self.s3_endpoint # if s3 endpoint is set in the feathr_config, then we need other environment variables # keys can't be only accessed through environment - access_key = _EnvVaraibleUtil.get_environment_variable('S3_ACCESS_KEY') - secret_key = _EnvVaraibleUtil.get_environment_variable('S3_SECRET_KEY') + access_key = self.envutils.get_environment_variable('S3_ACCESS_KEY') + secret_key = self.envutils.get_environment_variable('S3_SECRET_KEY') # HOCCON format will be parsed by the Feathr job config_str = """ S3_ENDPOINT: {S3_ENDPOINT} @@ -644,10 +657,10 @@ def _get_s3_config_str(self): def _get_adls_config_str(self): """Construct the ADLS config string for abfs(s). 
The Account, access key and other parameters can be set via environment variables.""" - account = _EnvVaraibleUtil.get_environment_variable('ADLS_ACCOUNT') + account = self.envutils.get_environment_variable('ADLS_ACCOUNT') # if ADLS Account is set in the feathr_config, then we need other environment variables # keys can't be only accessed through environment - key = _EnvVaraibleUtil.get_environment_variable('ADLS_KEY') + key = self.envutils.get_environment_variable('ADLS_KEY') # HOCCON format will be parsed by the Feathr job config_str = """ ADLS_ACCOUNT: {ADLS_ACCOUNT} @@ -658,10 +671,10 @@ def _get_blob_config_str(self): """Construct the Blob config string for wasb(s). The Account, access key and other parameters can be set via environment variables.""" - account = _EnvVaraibleUtil.get_environment_variable('BLOB_ACCOUNT') + account = self.envutils.get_environment_variable('BLOB_ACCOUNT') # if BLOB Account is set in the feathr_config, then we need other environment variables # keys can't be only accessed through environment - key = _EnvVaraibleUtil.get_environment_variable('BLOB_KEY') + key = self.envutils.get_environment_variable('BLOB_KEY') # HOCCON format will be parsed by the Feathr job config_str = """ BLOB_ACCOUNT: {BLOB_ACCOUNT} @@ -672,12 +685,12 @@ def _get_sql_config_str(self): """Construct the SQL config string for jdbc. The dbtable (query), user, password and other parameters can be set via environment variables.""" - table = _EnvVaraibleUtil.get_environment_variable('JDBC_TABLE') - user = _EnvVaraibleUtil.get_environment_variable('JDBC_USER') - password = _EnvVaraibleUtil.get_environment_variable('JDBC_PASSWORD') - driver = _EnvVaraibleUtil.get_environment_variable('JDBC_DRIVER') - auth_flag = _EnvVaraibleUtil.get_environment_variable('JDBC_AUTH_FLAG') - token = _EnvVaraibleUtil.get_environment_variable('JDBC_TOKEN') + table = self.envutils.get_environment_variable('JDBC_TABLE') + user = self.envutils.get_environment_variable('JDBC_USER') + password = self.envutils.get_environment_variable('JDBC_PASSWORD') + driver = self.envutils.get_environment_variable('JDBC_DRIVER') + auth_flag = self.envutils.get_environment_variable('JDBC_AUTH_FLAG') + token = self.envutils.get_environment_variable('JDBC_TOKEN') # HOCCON format will be parsed by the Feathr job config_str = """ JDBC_TABLE: {JDBC_TABLE} @@ -689,6 +702,22 @@ """.format(JDBC_TABLE=table, JDBC_USER=user, JDBC_PASSWORD=password, JDBC_DRIVER = driver, JDBC_AUTH_FLAG = auth_flag, JDBC_TOKEN = token) return config_str + def _get_monitoring_config_str(self): + """Construct monitoring-related config string.""" + url = self.envutils.get_environment_variable_with_default('monitoring', 'database', 'sql', 'url') + user = self.envutils.get_environment_variable_with_default('monitoring', 'database', 'sql', 'user') + password = self.envutils.get_environment_variable('MONITORING_DATABASE_SQL_PASSWORD') + if url: + # HOCCON format will be parsed by the Feathr job + config_str = """ + MONITORING_DATABASE_SQL_URL: "{url}" + MONITORING_DATABASE_SQL_USER: {user} + MONITORING_DATABASE_SQL_PASSWORD: {password} + """.format(url=url, user=user, password=password) + return config_str + else: + return "" + def _get_snowflake_config_str(self): """Construct the Snowflake config string for jdbc. The url, user, role and other parameters can be set via yaml config.
Password can be set via environment variables.""" @@ -708,7 +737,7 @@ def _get_snowflake_config_str(self): def _get_kafka_config_str(self): """Construct the Kafka config string. The endpoint, access key, secret key, and other parameters can be set via environment variables.""" - sasl = _EnvVaraibleUtil.get_environment_variable('KAFKA_SASL_JAAS_CONFIG') + sasl = self.envutils.get_environment_variable('KAFKA_SASL_JAAS_CONFIG') # HOCCON format will be parsed by the Feathr job config_str = """ KAFKA_SASL_JAAS_CONFIG: "{sasl}" diff --git a/feathr_project/feathr/constants.py b/feathr_project/feathr/constants.py index 13adb785b..bbe804fbc 100644 --- a/feathr_project/feathr/constants.py +++ b/feathr_project/feathr/constants.py @@ -15,6 +15,7 @@ REGISTRY_TYPEDEF_VERSION="v1" TYPEDEF_SOURCE=f'feathr_source_{REGISTRY_TYPEDEF_VERSION}' +# TODO: change the name from feathr_workspace_ to feathr_project_ TYPEDEF_FEATHR_PROJECT=f'feathr_workspace_{REGISTRY_TYPEDEF_VERSION}' TYPEDEF_DERIVED_FEATURE=f'feathr_derived_feature_{REGISTRY_TYPEDEF_VERSION}' TYPEDEF_ANCHOR=f'feathr_anchor_{REGISTRY_TYPEDEF_VERSION}' @@ -24,3 +25,4 @@ TYPEDEF_ARRAY_DERIVED_FEATURE=f"array" TYPEDEF_ARRAY_ANCHOR_FEATURE=f"array" +FEATHR_MAVEN_ARTIFACT="com.linkedin.feathr:feathr_2.12:0.4.0" \ No newline at end of file diff --git a/feathr_project/feathr/definition/monitoring_settings.py b/feathr_project/feathr/definition/monitoring_settings.py new file mode 100644 index 000000000..ee39f84d5 --- /dev/null +++ b/feathr_project/feathr/definition/monitoring_settings.py @@ -0,0 +1,8 @@ +from feathr.definition.materialization_settings import MaterializationSettings + + +# it's completely the same as MaterializationSettings. But we renamed it to improve usability. +# In the future, we may want to rely a separate system other than MaterializationSettings to generate stats. +class MonitoringSettings(MaterializationSettings): + """Settings about monitoring features. + """ diff --git a/feathr_project/feathr/definition/sink.py b/feathr_project/feathr/definition/sink.py index 54088c3af..73542fa3b 100644 --- a/feathr_project/feathr/definition/sink.py +++ b/feathr_project/feathr/definition/sink.py @@ -8,6 +8,27 @@ class Sink(HoconConvertible): """ pass +class MonitoringSqlSink(Sink): + """SQL-based sink that stores feature monitoring results. + + Attributes: + table_name: output table name + """ + def __init__(self, table_name: str) -> None: + self.table_name = table_name + + def to_feature_config(self) -> str: + """Produce the config used in feature monitoring""" + tm = Template(""" + { + name: MONITORING + params: { + table_name: "{{source.table_name}}" + } + } + """) + msg = tm.render(source=self) + return msg class RedisSink(Sink): """Redis-based sink use to store online feature data, can be used in batch job or streaming job. 
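Taken together, the pieces introduced in this change — `MonitoringSettings`, `MonitoringSqlSink`, the new `monitor_features()` client method, and the `MONITORING` output processor on the Spark side — form the end-to-end feature monitoring path. The sketch below is illustrative only and is not part of this diff: the job name, feature names, table name, and config path are placeholders, and `MonitoringSettings` is assumed to accept the same constructor arguments as `MaterializationSettings`, which it subclasses.

```python
import os
from feathr import FeathrClient, MonitoringSettings, MonitoringSqlSink

# The monitoring SQL password is read from this environment variable (or Key Vault),
# matching the MONITORING_DATABASE_SQL_PASSWORD secret added to the CI workflow.
os.environ["MONITORING_DATABASE_SQL_PASSWORD"] = "<sql-password>"  # placeholder

# Assumes a feathr_config.yaml that also defines the monitoring database url/user.
client = FeathrClient(config_path="./feathr_config.yaml")

# Feature statistics (coverage, cardinality, min/max, ...) are written to this SQL table.
monitor_sink = MonitoringSqlSink(table_name="monitoringFeatures")

# MonitoringSettings is assumed to reuse the MaterializationSettings constructor
# (name, sinks, feature_names); the names below are hypothetical.
settings = MonitoringSettings("monitoring_job",
                              sinks=[monitor_sink],
                              feature_names=["f_string", "f_boolean"])

client.monitor_features(settings)   # delegates to materialize_features() under the hood
client.wait_job_to_finish(timeout_sec=900)
```

On the Spark side, `FeatureGenSpec` maps the `MONITORING` output name to `FeatureMonitoringProcessor`, which computes the statistics and writes them to the SQL database configured through `monitoring_database_url`, `monitoring_database_user`, and `monitoring_database_password`.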
diff --git a/feathr_project/feathr/registry/_feature_registry_purview.py b/feathr_project/feathr/registry/_feature_registry_purview.py index 0a8b3af07..ca1f4af96 100644 --- a/feathr_project/feathr/registry/_feature_registry_purview.py +++ b/feathr_project/feathr/registry/_feature_registry_purview.py @@ -11,6 +11,7 @@ from tracemalloc import stop from typing import Dict, List, Optional, Tuple, Union from urllib.parse import urlparse +from time import sleep from azure.identity import DefaultAzureCredential from jinja2 import Template @@ -75,6 +76,7 @@ def _register_feathr_feature_types(self): type_feathr_project = EntityTypeDef( name=TYPEDEF_FEATHR_PROJECT, attributeDefs=[ + # TODO: this should be called "anchors" rather than "anchor_features" to make it less confusing. AtlasAttributeDef( name="anchor_features", typeName=TYPEDEF_ARRAY_ANCHOR, cardinality=Cardinality.SET), AtlasAttributeDef( @@ -219,7 +221,7 @@ def _parse_anchors(self, anchor_list: List[FeatureAnchor]) -> List[AtlasEntity]: # then parse the source of that anchor source_entity = self._parse_source(anchor.source) anchor_fully_qualified_name = self.project_name+self.registry_delimiter+anchor.name - original_id = self.get_feature_id(anchor_fully_qualified_name ) + original_id = self.get_feature_id(anchor_fully_qualified_name, type=TYPEDEF_ANCHOR ) original_anchor = self.get_feature_by_guid(original_id) if original_id else None merged_elements = self._merge_anchor(original_anchor,anchor_feature_entities) anchor_entity = AtlasEntity( @@ -733,18 +735,26 @@ def _delete_all_feathr_entities(self): :param guid: The guid or guids you want to remove. """ - entities = self.purview_client.discovery.search_entities( - "feathr*", limit=20) + # should not be larger than this, otherwise the backend might throw out error + batch_delte_size = 100 - # [print(entity) for entity in entities] - guid_list = [entity["id"] for entity in entities] + # use the `query` API so that it can return immediately (don't use the search_entity API as it will try to return all the results in a single request) - # should not be large than this, otherwise the backend might throw out error - batch_delte_size = 15 - for i in range(0, len(guid_list), batch_delte_size): - self.purview_client.delete_entity( - guid=guid_list[i:i+batch_delte_size]) + while True: + result = self.purview_client.discovery.query( + "feathr", limit=batch_delte_size) + logger.info("Total number of entities: {}", result['@search.count']) + + # if no results, break: + if result['@search.count'] == 0: + break + entities = result['value'] + guid_list = [entity["id"] for entity in entities] + self.purview_client.delete_entity(guid=guid_list) logger.info("{} feathr entities deleted", batch_delte_size) + # sleep here, otherwise backend might throttle + # process the next batch after sleep + sleep(1) @classmethod def _get_registry_client(self): @@ -753,26 +763,43 @@ def _get_registry_client(self): """ return self.purview_client - def list_registered_features(self, project_name: str = None, limit=50, starting_offset=0) -> List[Dict[str,str]]: + def list_registered_features(self, project_name: str, limit=1000, starting_offset=0) -> List[Dict[str,str]]: """ List all the already registered features.
If project_name is not provided or is None, it will return all the registered features; otherwise it will only return only features under this project """ - entities = self.purview_client.discovery.search_entities( - f"entityType:{TYPEDEF_ANCHOR_FEATURE} or entityType:{TYPEDEF_DERIVED_FEATURE}", limit=limit, starting_offset=starting_offset) + feature_list = [] + + if not project_name: + raise RuntimeError("project_name must be specified.") + + # get the corresponding features belongs to a certain project. + # note that we need to use "startswith" to filter out the features that don't belong to this project. + # see syntax here: https://docs.microsoft.com/en-us/rest/api/purview/catalogdataplane/discovery/query#discovery_query_andornested + query_filter = { + "and": [ + { + "or": + [ + {"entityType": TYPEDEF_DERIVED_FEATURE}, + {"entityType": TYPEDEF_ANCHOR_FEATURE} + ] + }, + { + "attributeName": "qualifiedName", + "operator": "startswith", + "attributeValue": project_name + self.registry_delimiter + } + ] + } + result = self.purview_client.discovery.query(filter=query_filter) + + entities = result['value'] + # entities = self.purview_client.discovery.search_entities(query = None, search_filter=query_filter, limit=limit) + for entity in entities: - if project_name: - # if project_name is a valid string, only append entities if the qualified name start with - # project_name+delimiter - qualified_name: str = entity["qualifiedName"] - # split the name based on delimiter - result = qualified_name.split(self.registry_delimiter) - if result[0].casefold() == project_name: - feature_list.append({"name":entity["name"],'id':entity['id'],"qualifiedName":entity['qualifiedName']}) - else: - # otherwise append all the entities - feature_list.append({"name":entity["name"],'id':entity['id'],"qualifiedName":entity['qualifiedName']}) + feature_list.append({"name":entity["name"],'id':entity['id'],"qualifiedName":entity['qualifiedName']}) return feature_list @@ -810,12 +837,27 @@ def get_feature_lineage(self, guid): """ return self.purview_client.get_entity_lineage(guid=guid) - def get_feature_id(self, qualifiedName): + def get_feature_id(self, qualifiedName, type: str): """ Get guid of a feature given its qualifiedName """ - search_term = "qualifiedName:{0}".format(qualifiedName) - entities = self.purview_client.discovery.search_entities(search_term) + # the search term should be full qualified name + # TODO: need to update the calling functions to add `type` field to make it more performant + # purview_client.get_entity(qualifiedName=qualifiedName) might not work here since it requires an additonal typeName parameter + # Currently still use the `query` API to get the result in a "full name match" way. + # self.purview_client.get_entity(qualifiedName=qualifiedName, typeName=type) + + # get the corresponding features belongs to a certain project. 
+ # note that we need to use "eq" to filter exactly this qualified name + # see syntax here: https://docs.microsoft.com/en-us/rest/api/purview/catalogdataplane/discovery/query#discovery_query_andornested + query_filter = { + "attributeName": "qualifiedName", + "operator": "eq", + "attributeValue": qualifiedName + } + result = self.purview_client.discovery.query(keywords = None, filter=query_filter) + entities = result['value'] + # There should be exactly one result, but we don't enforce the check here for entity in entities: if entity.get('qualifiedName') == qualifiedName: return entity.get('id') @@ -829,7 +871,7 @@ def search_features(self, searchTerm): entities = self.purview_client.discovery.search_entities(searchTerm) return entities - def _list_registered_entities_with_details(self, project_name: str = None, entity_type: Union[str, List[str]] = None, limit=50, starting_offset=0,) -> List[Dict]: + def _list_registered_entities_with_details(self, project_name: str, entity_type: Union[str, List[str]] = None, limit=1000, starting_offset=0,) -> List[Dict]: """ List all the already registered entities. entity_type should be one of: SOURCE, DERIVED_FEATURE, ANCHOR, ANCHOR_FEATURE, FEATHR_PROJECT, or a list of those values limit: a maximum 1000 will be enforced at the underlying API @@ -844,30 +886,56 @@ def _list_registered_entities_with_details(self, project_name: str = None, entit raise RuntimeError( f'only SOURCE, DERIVED_FEATURE, ANCHOR, ANCHOR_FEATURE, FEATHR_PROJECT are supported when listing the registered entities, {entity_type} is not one of them.') - # the search grammar is less documented in Atlas/Purview. - # Here's the query grammar: https://atlas.apache.org/2.0.0/Search-Advanced.html - search_string = "".join( - [f" or entityType:{e}" for e in entity_type_list]) - # remvoe the first additional " or " - search_string = search_string[4:] - result_entities = self.purview_client.discovery.search_entities( - search_string, limit=limit, starting_offset=starting_offset) + if project_name is None: + raise RuntimeError("You need to specify a project_name") + # the search grammar: + # https://docs.microsoft.com/en-us/azure/purview/how-to-search-catalog#search-query-syntax + # https://docs.microsoft.com/en-us/rest/api/datacatalog/data-catalog-search-syntax-reference + + # get the corresponding features belongs to a certain project. + # note that we need to use "startswith" to filter out the features that don't belong to this project. 
+ # see syntax here: https://docs.microsoft.com/en-us/rest/api/purview/catalogdataplane/discovery/query#discovery_query_andornested + # this search does the following: + # search all the entities that start with project_name+delimiter for all the search entities + # However, for TYPEDEF_FEATHR_PROJECT, it doesn't have delimiter in the qualifiedName + # Hence if TYPEDEF_FEATHR_PROJECT is in the `entity_type` input, we need to search for that specifically + # and finally "OR" the result to union them + query_filter = { + "or": + [{ + "and": [{ + # this is a list of the entity types that you want to query + "or": [{"entityType": e} for e in entity_type_list] + }, + { + "attributeName": "qualifiedName", + "operator": "startswith", + # use `project_name + self.registry_delimiter` to limit the search results + "attributeValue": project_name + self.registry_delimiter + }]}, + # if we are querying TYPEDEF_FEATHR_PROJECT, then "union" the result by using this query + { + "and": [{ + "or": [{"entityType": TYPEDEF_FEATHR_PROJECT}] if TYPEDEF_FEATHR_PROJECT in entity_type_list else None + }, + { + "attributeName": "qualifiedName", + "operator": "startswith", + "attributeValue": project_name + }]}] + } + # Important properties returned includes: # id (the guid of the entity), name, qualifiedName, @search.score, # and @search.highlights - guid_list = [] - for entity in result_entities: - if project_name: - # if project_name is a valid string, only append entities if the qualified name start with - # project_name+delimiter - qualified_name: str = entity["qualifiedName"] - # split the name based on delimiter - result = qualified_name.split(self.registry_delimiter) - if result[0].casefold() == project_name: - guid_list.append(entity["id"]) - else: - # otherwise append all the entities - guid_list.append(entity["id"]) + # TODO: it might be throttled in the backend and wait for the `pyapacheatlas` to fix this + # https://github.com/wjohnson/pyapacheatlas/issues/206 + # `pyapacheatlas` needs a bit optimization to avoid additional calls. + result_entities = self.purview_client.discovery.search_entities(query=None, search_filter=query_filter, limit = limit) + + # append the guid list. Since we are using project_name + delimiter to search, all the result will be valid. + guid_list = [entity["id"] for entity in result_entities] + entity_res = [] if guid_list is None or len(guid_list)==0 else self.purview_client.get_entity( guid=guid_list)["entities"] return entity_res @@ -879,15 +947,14 @@ def get_features_from_registry(self, project_name: str) -> Tuple[List[FeatureAnc Args: project_name (str): project name. 
""" - - entities = self._list_registered_entities_with_details(project_name=project_name,entity_type=[TYPEDEF_DERIVED_FEATURE, TYPEDEF_ANCHOR_FEATURE, TYPEDEF_FEATHR_PROJECT]) - if not entities: + all_entities_in_project = self._list_registered_entities_with_details(project_name=project_name,entity_type=[TYPEDEF_DERIVED_FEATURE, TYPEDEF_ANCHOR_FEATURE, TYPEDEF_FEATHR_PROJECT, TYPEDEF_ANCHOR, TYPEDEF_SOURCE]) + if not all_entities_in_project: # if the result is empty return (None, None) # get project entity, the else are feature entities (derived+anchor) - project_entity = [x for x in entities if x['typeName']==TYPEDEF_FEATHR_PROJECT][0] # there's only one available - feature_entities = [x for x in entities if x!=project_entity] + project_entity = [x for x in all_entities_in_project if x['typeName']==TYPEDEF_FEATHR_PROJECT][0] # there's only one available + feature_entities = [x for x in all_entities_in_project if (x['typeName']==TYPEDEF_ANCHOR_FEATURE or x['typeName']==TYPEDEF_DERIVED_FEATURE)] feature_entity_guid_mapping = {x['guid']:x for x in feature_entities} # this is guid for feature anchor (GROUP of anchor features) @@ -900,7 +967,6 @@ def get_features_from_registry(self, project_name: str) -> Tuple[List[FeatureAnc for derived_feature_entity_id in derived_feature_ids: # this will be used to generate DerivedFeature instance derived_feature_key_list = [] - for key in derived_feature_entity_id["attributes"]["key"]: derived_feature_key_list.append(TypedKey(key_column=key["key_column"], key_column_type=key["key_column_type"], full_name=key["full_name"], description=key["description"], key_column_alias=key["key_column_alias"])) @@ -908,30 +974,29 @@ def get_features_from_registry(self, project_name: str) -> Tuple[List[FeatureAnc # for feature anchor (GROUP), input features are splitted into input anchor features & input derived features anchor_feature_guid = [e["guid"] for e in derived_feature_entity_id["attributes"]["input_anchor_features"]] derived_feature_guid = [e["guid"] for e in derived_feature_entity_id["attributes"]["input_derived_features"]] - # for derived features, search all related input features. 
input_features_guid = self.search_input_anchor_features(derived_feature_guid,feature_entity_guid_mapping) - # chain the input features together - all_input_features = list(itertools.chain.from_iterable( - [self._get_features_by_guid(x) for x in input_features_guid+anchor_feature_guid])) - + # filter out features that is related with this derived feature + all_input_features = self._get_features_by_guid_or_entities(guid_list=input_features_guid+anchor_feature_guid, entity_list=all_entities_in_project) derived_feature_list.append(DerivedFeature(name=derived_feature_entity_id["attributes"]["name"], feature_type=self._get_feature_type_from_hocon(derived_feature_entity_id["attributes"]["type"]), transform=self._get_transformation_from_dict(derived_feature_entity_id["attributes"]['transformation']), key=derived_feature_key_list, input_features= all_input_features, registry_tags=derived_feature_entity_id["attributes"]["tags"])) - anchor_result = self.purview_client.get_entity(guid=anchor_guid)["entities"] + + # anchor_result = self.purview_client.get_entity(guid=anchor_guid)["entities"] + anchor_result = [x for x in all_entities_in_project if x['typeName']==TYPEDEF_ANCHOR] anchor_list = [] + for anchor_entity in anchor_result: feature_guid = [e["guid"] for e in anchor_entity["attributes"]["features"]] anchor_list.append(FeatureAnchor(name=anchor_entity["attributes"]["name"], - source=self._get_source_by_guid(anchor_entity["attributes"]["source"]["guid"]), - features=self._get_features_by_guid(feature_guid), + source=self._get_source_by_guid(anchor_entity["attributes"]["source"]["guid"], entity_list = all_entities_in_project), + features=self._get_features_by_guid_or_entities(guid_list = feature_guid, entity_list=all_entities_in_project), registry_tags=anchor_entity["attributes"]["tags"])) - return (anchor_list, derived_feature_list) def search_input_anchor_features(self,derived_guids,feature_entity_guid_mapping) ->List[str]: @@ -978,9 +1043,13 @@ def feathr_udf2(df) udf_source_code = [line+'\n' for line in udf_source_code_striped] return " ".join(udf_source_code) - def _get_source_by_guid(self, guid) -> Source: + def _get_source_by_guid(self, guid, entity_list) -> Source: + """give a entity list and the target GUID for the source entity, return a python `Source` object. + """ # TODO: currently return HDFS source by default. For JDBC source, it's currently implemented using HDFS Source so we should split in the future - source_entity = self.purview_client.get_entity(guid=guid)["entities"][0] + + # there should be only one entity available + source_entity = [x for x in entity_list if x['guid'] == guid][0] # if source_entity["attributes"]["path"] is INPUT_CONTEXT, it will also be assigned to this returned object return HdfsSource(name=source_entity["attributes"]["name"], @@ -1045,8 +1114,21 @@ def _get_transformation_from_dict(self, input: Dict) -> FeatureType: # no transformation function observed return None - def _get_features_by_guid(self, guid) -> List[FeatureAnchor]: - feature_entities = self.purview_client.get_entity(guid=guid)["entities"] + def _get_features_by_guid_or_entities(self, guid_list, entity_list) -> List[FeatureAnchor]: + """return a python list of the features that are referenced by a list of guids. + If entity_list is provided, use entity_list to reconstruct those features + This is for "anchor feature" only. 
+ """ + if not entity_list: + feature_entities = self.purview_client.get_entity(guid=guid_list)["entities"] + else: + guid_set = set(guid_list) + feature_entities = [x for x in entity_list if x['guid'] in guid_set] + + # raise error if we cannot find all the guid + if len(feature_entities) != len(guid_list): + raise RuntimeError("Number of `feature_entities` is less than provided GUID list for search. The project might be broken.") + feature_list=[] key_list = [] for feature_entity in feature_entities: diff --git a/feathr_project/feathr/secrets/akv_client.py b/feathr_project/feathr/secrets/akv_client.py index 84c0ff379..cdec01e12 100644 --- a/feathr_project/feathr/secrets/akv_client.py +++ b/feathr_project/feathr/secrets/akv_client.py @@ -8,15 +8,24 @@ def __init__(self, akv_name: str): self.akv_name = akv_name self.secret_client = None - def get_akv_secret(self, secret_name: str): + def get_feathr_akv_secret(self, secret_name: str): + """Get Feathr Secrets from Azure Key Vault. Note that this function will replace '_' in `secret_name` with '-' since Azure Key Vault doesn't support it + + Returns: + _type_: _description_ + """ if self.secret_client is None: self.secret_client = SecretClient( vault_url = f"https://{self.akv_name}.vault.azure.net", credential=DefaultAzureCredential() ) try: - secret = self.secret_client.get_secret(secret_name) - logger.debug(f"Secret: {secret_name} is retrieved from Key Vault {self.akv_name}.") + # replace '_' with '-' since Azure Key Vault doesn't support it + variable_replaced = secret_name.replace('_','-') #.upper() + logger.info('Fetching the secret {} from Key Vault {}.', variable_replaced, self.akv_name) + secret = self.secret_client.get_secret(variable_replaced) + logger.info('Secret {} fetched from Key Vault {}.', variable_replaced, self.akv_name) return secret.value except ResourceNotFoundError as e: - logger.error(f"Secret: {secret_name} cannot be found in Key Vault {self.akv_name}.") \ No newline at end of file + logger.error(f"Secret {secret_name} cannot be found in Key Vault {self.akv_name}.") + raise \ No newline at end of file diff --git a/feathr_project/feathr/spark_provider/.gitignore b/feathr_project/feathr/spark_provider/.gitignore new file mode 100644 index 000000000..ba64b52e6 --- /dev/null +++ b/feathr_project/feathr/spark_provider/.gitignore @@ -0,0 +1 @@ +!noop-1.0.jar \ No newline at end of file diff --git a/feathr_project/feathr/spark_provider/_abc.py b/feathr_project/feathr/spark_provider/_abc.py index b7ecc907d..998b9e88d 100644 --- a/feathr_project/feathr/spark_provider/_abc.py +++ b/feathr_project/feathr/spark_provider/_abc.py @@ -9,7 +9,7 @@ class SparkJobLauncher(ABC): @abstractmethod def upload_or_get_cloud_path(self, local_path_or_http_path: str): """upload a file from local path or an http path to the current work directory. Should support transferring file from an http path to cloud working storage, or upload directly from a local storage. 
- + Args: local_path_or_http_path (str): local path or http path """ @@ -19,7 +19,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): @abstractmethod def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], reference_files_path: List[str], job_tags: Dict[str, str] = None, - configuration: Dict[str, str] = None): + configuration: Dict[str, str] = {}): """ Submits the feathr job diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py index 3eca8a3a1..26d2bbe86 100644 --- a/feathr_project/feathr/spark_provider/_databricks_submission.py +++ b/feathr_project/feathr/spark_provider/_databricks_submission.py @@ -85,7 +85,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): elif src_parse_result.scheme.startswith('dbfs'): # passed a cloud path logger.info( - 'Skipping file {} as the file starts with dbfs:/', local_path_or_http_path) + 'Skip uploading file {} as the file starts with dbfs:/', local_path_or_http_path) returned_path = local_path_or_http_path elif src_parse_result.scheme.startswith(('wasb','s3','gs')): # if the path starts with a location that's not a local path @@ -111,12 +111,12 @@ def upload_local_file(self, local_path: str) -> str: file_name = os.path.basename(local_path) # returned paths for the uploaded file returned_path = os.path.join(self.databricks_work_dir, file_name) - # `local_path_or_http_path` will be either string or PathLib object, so normalize it to string + # `local_path_or_http_path` will be either string or PathLib object, so normalize it to string local_path = str(local_path) DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=local_path, dst=returned_path) return returned_path - def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None): + def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}): """ submit the feathr job to databricks Refer to the databricks doc for more details on the meaning of the parameters: @@ -140,10 +140,18 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: submission_params['run_name'] = job_name if 'existing_cluster_id' not in submission_params: # if users don't specify existing_cluster_id + # Solving this issue: Handshake fails trying to connect from Azure Databricks to Azure PostgreSQL with SSL + # https://docs.microsoft.com/en-us/answers/questions/170730/handshake-fails-trying-to-connect-from-azure-datab.html + configuration['spark.executor.extraJavaOptions'] = '-Djava.security.properties=' + configuration['spark.driver.extraJavaOptions'] = '-Djava.security.properties=' submission_params['new_cluster']['spark_conf'] = configuration submission_params['new_cluster']['custom_tags'] = job_tags # the feathr main jar file is anyway needed regardless it's pyspark or scala spark - submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) + if not main_jar_path: + logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") + submission_params['libraries'][0]['maven'] = { "coordinates": 
FEATHR_MAVEN_ARTIFACT } + else: + submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) # see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6 if python_files: # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask @@ -227,7 +235,7 @@ def get_job_tags(self) -> Dict[str, str]: assert self.res_job_id is not None # For result structure, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 result = RunsApi(self.api_client).get_run(self.res_job_id) - + if 'new_cluster' in result['cluster_spec']: custom_tags = result['cluster_spec']['new_cluster']['custom_tags'] return custom_tags @@ -235,7 +243,7 @@ def get_job_tags(self) -> Dict[str, str]: # this is not a new cluster; it's an existing cluster. logger.warning("Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration.") return None - + def download_result(self, result_path: str, local_folder: str): """ diff --git a/feathr_project/feathr/spark_provider/_synapse_submission.py b/feathr_project/feathr/spark_provider/_synapse_submission.py index adfa8e973..ba2d975f8 100644 --- a/feathr_project/feathr/spark_provider/_synapse_submission.py +++ b/feathr_project/feathr/spark_provider/_synapse_submission.py @@ -1,4 +1,6 @@ +from copy import deepcopy import os +import pathlib import re import time import urllib.request @@ -43,7 +45,8 @@ class _FeathrSynapseJobLauncher(SparkJobLauncher): """ Submits spark jobs to a Synapse spark cluster. """ - def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential = None): + + def __init__(self, synapse_dev_url: str, pool_name: str, datalake_dir: str, executor_size: str, executors: int, credential=None): # use DeviceCodeCredential if EnvironmentCredential is not available self.credential = credential # use the same credential for authentication to avoid further login. @@ -60,9 +63,11 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): Supports transferring file from an http path to cloud working storage, or upload directly from a local storage. 
""" logger.info('Uploading {} to cloud..', local_path_or_http_path) - res_path = self._datalake.upload_file_to_workdir(local_path_or_http_path) + res_path = self._datalake.upload_file_to_workdir( + local_path_or_http_path) - logger.info('{} is uploaded to location: {}', local_path_or_http_path, res_path) + logger.info('{} is uploaded to location: {}', + local_path_or_http_path, res_path) return res_path def download_result(self, result_path: str, local_folder: str): @@ -74,7 +79,7 @@ def download_result(self, result_path: str, local_folder: str): def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None, python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None, - configuration: Dict[str, str] = None): + configuration: Dict[str, str] = {}): """ Submits the feathr job Refer to the Apache Livy doc for more details on the meaning of the parameters: @@ -92,21 +97,53 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas job_name (str): name of the job main_jar_path (str): main file paths, usually your main jar file main_class_name (str): name of your main class - arguments (str): all the arugments you want to pass into the spark job - job_tags (str): tags of the job, for exmaple you might want to put your user ID, or a tag with a certain information + arguments (str): all the arguments you want to pass into the spark job + job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with a certain information configuration (Dict[str, str]): Additional configs for the spark job """ - assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.' - if main_jar_path.startswith('abfs'): - main_jar_cloud_path = main_jar_path - logger.info( - 'Cloud path {} is used for running the job: {}', main_jar_path, job_name) + + if configuration: + cfg = configuration.copy() # We don't want to mess up input parameters + else: + cfg = {} + if not main_jar_path: + # We don't have the main jar, use Maven + # Add Maven dependency to the job configuration + if "spark.jars.packages" in cfg: + cfg["spark.jars.packages"] = ",".join( + [cfg["spark.jars.packages"], FEATHR_MAVEN_ARTIFACT]) + else: + cfg["spark.jars.packages"] = FEATHR_MAVEN_ARTIFACT + + if not python_files: + # This is a JAR job + # Azure Synapse/Livy doesn't allow JAR job starts from Maven directly, we must have a jar file uploaded. + # so we have to use a dummy jar as the main file. 
+ logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") + # Use the no-op jar as the main file + # This is a dummy jar which contains only one `org.example.Noop` class with one empty `main` function which does nothing + current_dir = pathlib.Path(__file__).parent.resolve() + main_jar_path = os.path.join(current_dir, "noop-1.0.jar") + else: + # This is a PySpark job, no more things to do + pass + main_jar_cloud_path = None + if main_jar_path: + # Now we have a main jar, either feathr or noop + if main_jar_path.startswith('abfs'): + main_jar_cloud_path = main_jar_path + logger.info( + 'Cloud path {} is used for running the job: {}', main_jar_path, job_name) + else: + logger.info('Uploading jar from {} to cloud for running job: {}', + main_jar_path, job_name) + main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path) + logger.info('{} is uploaded to {} for running job: {}', + main_jar_path, main_jar_cloud_path, job_name) else: - logger.info('Uploading jar from {} to cloud for running job: {}', - main_jar_path, job_name) - main_jar_cloud_path = self._datalake.upload_file_to_workdir(main_jar_path) - logger.info('{} is uploaded to {} for running job: {}', - main_jar_path, main_jar_cloud_path, job_name) + # We don't have the main Jar, and this is a PySpark job so we don't use `noop.jar` either + # Keep `main_jar_cloud_path` as `None` as we already added maven package into cfg + pass reference_file_paths = [] for file_path in reference_files_path: @@ -120,7 +157,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas arguments=arguments, reference_files=reference_files_path, tags=job_tags, - configuration=configuration) + configuration=cfg) logger.info('See submitted job here: https://web.azuresynapse.net/en-us/monitoring/sparkapplication') return self.current_job_info @@ -247,8 +284,13 @@ def create_spark_batch_job(self, job_name, main_file, class_name=None, executor_cores = self.EXECUTOR_SIZE[self._executor_size]['Cores'] executor_memory = self.EXECUTOR_SIZE[self._executor_size]['Memory'] - # need to put the jar in as dependencies for pyspark job - jars = jars + [main_file] + # If we have a main jar, it needs to be added as dependencies for pyspark job + # Otherwise it's a PySpark job with Feathr JAR from Maven + if main_file: + jars = jars + [main_file] + elif not python_files: + # These 2 parameters should not be empty at the same time + raise ValueError("Main JAR is not set for the Spark job") # If file=main_file, then it's using only Scala Spark # If file=python_files[0], then it's using Pyspark @@ -319,7 +361,7 @@ def __init__(self, datalake_dir, credential=None): self.dir_client = self.file_system_client.get_directory_client('/') self.datalake_dir = datalake_dir + \ - '/' if datalake_dir[-1] != '/' else datalake_dir + '/' if datalake_dir[-1] != '/' else datalake_dir def upload_file_to_workdir(self, src_file_path: str) -> str: """ @@ -340,7 +382,7 @@ def upload_file_to_workdir(self, src_file_path: str) -> str: logger.info("{} is downloaded and then uploaded to location: {}", src_file_path, returned_path) elif src_parse_result.scheme.startswith('abfs') or src_parse_result.scheme.startswith('wasb'): # passed a cloud path - logger.info("Skipping file {} as it's already in the cloud", src_file_path) + logger.info("Skip uploading file {} as it's already in the cloud", src_file_path) returned_path = src_file_path else: # else it should be a local file path or dir @@ -394,7 +436,7 @@ def 
download_file(self, target_adls_directory: str, local_dir_cache: str): for folder in result_folders: folder_name = basename(folder) file_in_folder = [os.path.join(folder_name, basename(file_path.name)) for file_path in self.file_system_client.get_paths( - path=folder, recursive=False) if not file_path.is_directory] + path=folder, recursive=False) if not file_path.is_directory] local_paths = [os.path.join(local_dir_cache, file_name) for file_name in file_in_folder] self._download_file_list(local_paths, file_in_folder, directory_client) @@ -405,7 +447,7 @@ def download_file(self, target_adls_directory: str, local_dir_cache: str): self._download_file_list(local_paths, result_paths, directory_client) logger.info('Finish downloading files from {} to {}.', - target_adls_directory,local_dir_cache) + target_adls_directory, local_dir_cache) def _download_file_list(self, local_paths: List[str], result_paths, directory_client): ''' diff --git a/feathr_project/feathr/spark_provider/noop-1.0.jar b/feathr_project/feathr/spark_provider/noop-1.0.jar new file mode 100644 index 000000000..6b3b9ba56 Binary files /dev/null and b/feathr_project/feathr/spark_provider/noop-1.0.jar differ diff --git a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py index 183c727da..70341b339 100644 --- a/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py +++ b/feathr_project/feathr/udf/_preprocessing_pyudf_manager.py @@ -52,7 +52,7 @@ def build_anchor_preprocessing_metadata(anchor_list, local_workspace_dir): feature_names_to_func_mapping[string_feature_list] = _PreprocessingPyudfManager._parse_function_str_for_name(anchor.source.preprocessing) else: # it's a callable function - feature_names_to_func_mapping[string_feature_list] = anchor.source.preprocessing.__name__ + feature_names_to_func_mapping[string_feature_list] = anchor.source.preprocessing.__name__ if not features_with_preprocessing: return @@ -87,9 +87,9 @@ def _parse_function_str_for_name(source: str) -> str: @staticmethod def persist_pyspark_udf_to_file(user_func, local_workspace_dir): - """persist the pyspark UDF to a file in `local_workspace_dir` for later usage. - The user_func could be either a string that represents a function body, or a callable object. - The reason being - if we are defining a regular Python function, it will be a callable object; + """persist the pyspark UDF to a file in `local_workspace_dir` for later usage. + The user_func could be either a string that represents a function body, or a callable object. + The reason being - if we are defining a regular Python function, it will be a callable object; however if we reterive features from registry, the current implementation is to use plain strings to store the function body. In that case, the user_fuc will be string. 
""" if isinstance(user_func, str): diff --git a/feathr_project/feathr/utils/_envvariableutil.py b/feathr_project/feathr/utils/_envvariableutil.py index c4038736b..4f93c6249 100644 --- a/feathr_project/feathr/utils/_envvariableutil.py +++ b/feathr_project/feathr/utils/_envvariableutil.py @@ -2,20 +2,25 @@ import yaml from loguru import logger from feathr.secrets.akv_client import AzureKeyVaultClient - +from azure.core.exceptions import ResourceNotFoundError class _EnvVaraibleUtil(object): def __init__(self, config_path): self.config_path = config_path - self.akv_client = None + # Set to none first to avoid invalid reference + self.akv_name = None + self.akv_name = self.get_environment_variable_with_default( 'secrets', 'azure_key_vault', 'name') + self.akv_client = AzureKeyVaultClient(self.akv_name) if self.akv_name else None def get_environment_variable_with_default(self, *args): """Gets the environment variable for the variable key. Args: *args: list of keys in feathr_config.yaml file Return: - A environment variable for the variable key. If it's not set in the environment, then a default is retrieved - from the feathr_config.yaml file with the same config key. + A environment variable for the variable key. It will retrieve the value of the environment variables in the following order: + If the key is set in the environment variable, Feathr will use the value of that environment variable + If it's not set in the environment, then a default is retrieved from the feathr_config.yaml file with the same config key. + If it's not available in the feathr_config.yaml file, Feathr will try to reterive the value from key vault """ # if envs exist, just return the existing env variable without reading the file @@ -24,51 +29,58 @@ def get_environment_variable_with_default(self, *args): # make it work for lower case and upper case. env_variable = os.environ.get( env_keyword, os.environ.get(upper_env_keyword)) + + # If the key is set in the environment variable, Feathr will use the value of that environment variable if env_variable: return env_variable - # if the config path doesn't exist, just return - try: - assert os.path.exists(os.path.abspath(self.config_path)) - except: - logger.info("{} is not set and configuration file {} cannot be found. One of those should be set." , env_keyword, self.config_path) + # If it's not set in the environment, then a default is retrieved from the feathr_config.yaml file with the same config key. 
+ if os.path.exists(os.path.abspath(self.config_path)): + with open(os.path.abspath(self.config_path), 'r') as stream: + try: + yaml_config = yaml.safe_load(stream) + # concat all layers and check in environment variable + yaml_layer = yaml_config - with open(os.path.abspath(self.config_path), 'r') as stream: + # resolve one layer after another + for arg in args: + yaml_layer = yaml_layer[arg] + return yaml_layer + except KeyError as exc: + logger.info("{} not found in the config file.", env_keyword) + except yaml.YAMLError as exc: + logger.warning(exc) + + # If it's not available in the feathr_config.yaml file, Feathr will try to reterive the value from key vault + if self.akv_name: try: - yaml_config = yaml.safe_load(stream) - # concat all layers - # check in environment variable - yaml_layer = yaml_config - - # resolve one layer after another - for arg in args: - yaml_layer = yaml_layer[arg] - return yaml_layer - except KeyError as exc: - logger.info(exc) + return self.akv_client.get_feathr_akv_secret(env_keyword) + except ResourceNotFoundError: + # print out warning message if cannot find the env variable in all the resources + logger.warning('Environment variable {} not found in environment variable, default YAML config file, or key vault service.', env_keyword) return "" - except yaml.YAMLError as exc: - logger.info(exc) - @staticmethod - def get_environment_variable(variable_key): + def get_environment_variable(self, variable_key): """Gets the environment variable for the variable key. + Args: variable_key: environment variable key that is used to retrieve the environment variable Return: - A environment variable for the variable key. + A environment variable for the variable key. It will retrieve the value of the environment variables in the following order: + If the key is set in the environment variable, Feathr will use the value of that environment variable + If it's not available in the environment variable file, Feathr will try to reterive the value from key vault Raises: ValueError: If the environment variable is not set for this key, an exception is thrown. """ - password = os.environ.get(variable_key) - if not password: - logger.info(variable_key + - ' is not set in the environment variables.') - akv_name = os.environ.get("KEY_VAULT_NAME") - if akv_name: - logger.info('Fetching the value {} from Key Vault.', variable_key) - akv_client = AzureKeyVaultClient(akv_name) - password = akv_client.get_akv_secret(variable_key) - return password + env_var_value = os.environ.get(variable_key) + + if env_var_value: + return env_var_value + # If it's not available in the environment variable file, Feathr will try to reterive the value from key vault + logger.info(variable_key + ' is not set in the environment variables.') + + if self.akv_name: + return self.akv_client.get_feathr_akv_secret(variable_key) + \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml index ad26efe4e..c307d268a 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml +++ b/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml @@ -117,4 +117,7 @@ feature_registry: # controls whether the type system will be initialized or not. Usually this is only required to be executed once. 
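The Key Vault fallback above only works because `_EnvVaraibleUtil` can now build a client from the `secrets.azure_key_vault.name` entry added to feathr_config.yaml in this hunk. For readers unfamiliar with the underlying SDK, the calls wrapped by `AzureKeyVaultClient` look roughly like the sketch below (assuming the `azure-identity` and `azure-keyvault-secrets` packages; the vault and secret names are placeholders, not working values):

    from azure.identity import DefaultAzureCredential
    from azure.keyvault.secrets import SecretClient

    akv_name = "feathrazuretest3-kv"  # value of secrets.azure_key_vault.name
    secret_client = SecretClient(vault_url=f"https://{akv_name}.vault.azure.net",
                                 credential=DefaultAzureCredential())

    # Placeholder secret name; get_feathr_akv_secret() performs a lookup along these
    # lines, and the SDK raises ResourceNotFoundError when the secret does not exist.
    redis_password = secret_client.get_secret("REDIS-PASSWORD").value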
type_system_initialization: false - \ No newline at end of file + +secrets: + azure_key_vault: + name: feathrazuretest3-kv \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv new file mode 100644 index 000000000..476ea06f3 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv @@ -0,0 +1,11 @@ +product_id,category,price,quantity,recent_sold,made_in_state,discount +1,1,22,100,0,CA,7.5 +2,2,17,300,1,CA,7.5 +3,1,40,0,2,WA,7.5 +4,1,25,100,3,WA,7.5 +5,1,33,0,2,PA,0 +6,2,19,0,2,CA,7.5 +7,2,22,200,1,WA,7.5 +8,2,59,300,0,PA,8.5 +9,0,80,100,1,WA,8.5 +10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv index b180b2825..38fe25ceb 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv @@ -1,11 +1,35 @@ -user_id,event_timestamp,product_rating -1,2021-04-01,4 -2,2021-04-01,5 -3,2021-04-01,5 -4,2021-04-01,1 -5,2021-04-01,5 -6,2021-04-01,2 -7,2021-04-01,5 -8,2021-04-01,2 -9,2021-04-01,5 -10,2021-04-01,3 \ No newline at end of file +user_id,product_id,event_timestamp,product_rating +1,1,2021-04-01,4 +1,2,2021-04-01,4 +1,3,2021-04-01,4 +1,4,2021-04-01,4 +1,5,2021-04-01,4 +2,1,2021-04-01,5 +2,2,2021-04-01,5 +2,3,2021-04-01,5 +2,4,2021-04-01,5 +2,5,2021-04-01,5 +3,1,2021-04-01,5 +3,2,2021-04-01,5 +3,3,2021-04-01,5 +3,4,2021-04-01,5 +3,5,2021-04-01,5 +4,1,2021-04-01,1 +4,2,2021-04-01,1 +4,3,2021-04-01,1 +4,4,2021-04-01,1 +4,5,2021-04-01,1 +5,1,2021-04-01,5 +5,2,2021-04-01,5 +6,1,2021-04-01,2 +7,1,2021-04-01,5 +7,2,2021-04-01,5 +7,3,2021-04-01,5 +8,1,2021-04-01,2 +8,2,2021-04-01,2 +8,3,2021-04-01,2 +9,1,2021-04-01,5 +9,2,2021-04-01,5 +9,3,2021-04-01,5 +9,4,2021-04-01,5 +10,1,2021-04-01,3 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb index 3ccd51814..0ed133169 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb +++ b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb @@ -1,736 +1,736 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feathr Feature Store on Azure Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. 
It includes these steps:\n", - "\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", - "\n", - "Feathr has native cloud integration. To use Feathr on Azure, you only need three steps:\n", - "\n", - "- Get the `Principal ID` of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select \"Bash\" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI.\n", - "\n", - "[Launch Cloud Shell](https://shell.azure.com/bash)\n", - "\n", - "- Click the button below to deploy a minimal set of Feathr resources for demo purpose. You will need to fill in the `Principal ID` and `Resource Prefix`. You will need \"Owner\" permission of the selected subscription.\n", - "\n", - "[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Flinkedin%2Ffeathr%2Fmain%2Fdocs%2Fhow-to-guides%2Fazure_resource_provision.json)\n", - "\n", - "- Run the cells below.\n", - "\n", - "And the architecture is as below. In the above template, we are using Synapse as Spark provider, use Azure Data Lake Gen2 as offline store, and use Redis as online store, Azure Purview (Apache Atlas compatible) as feature reigstry. \n", - "\n", - "\n", - "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Install Feathr \n", - "\n", - "Install Feathr using pip:\n", - "\n", - "`pip install -U feathr pandavro scikit-learn`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resource_prefix = \"feathr_resource_prefix\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! 
pip install feathr azure-cli pandavro scikit-learn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Login to Azure with a device code (You will see instructions in the output):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! az login --use-device-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all the required credentials from Azure KeyVault" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "key_vault_name=resource_prefix+\"kv\"\n", - "synapse_workspace_url=resource_prefix+\"syws\"\n", - "adls_account=resource_prefix+\"dls\"\n", - "adls_fs_name=resource_prefix+\"fs\"\n", - "purview_name=resource_prefix+\"purview\"\n", - "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", - "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", - "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", - "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", - "retrieved_secret = client.get_secret(secretName).value\n", - "\n", - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", - "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", - "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", - "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", - "\n", - "# Set the resource link\n", - "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", - "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", - "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", - "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: 
Configure the required environment (Don't need to update if using the above Quick Start Template)\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", - "\n", - "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - " - 'AZURE_CLIENT_ID'\n", - " - 'AZURE_TENANT_ID'\n", - " - 'AZURE_CLIENT_SECRET'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: 's3.amazonaws.com'\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: 'feathrtestdb'\n", - " jdbc_table: 'feathrtesttable'\n", - " snowflake:\n", - " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", - " user: \"feathrintegration\"\n", - " role: \"ACCOUNTADMIN\"\n", - "spark_config:\n", - " spark_cluster: 'azure_synapse'\n", - " spark_result_output_parts: '1'\n", - " azure_synapse:\n", - " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", - " pool_name: 'spark3'\n", - " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", - " executor_size: 'Small'\n", - " executor_num: 4\n", - " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", - " databricks:\n", - " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", - " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", - " work_dir: 'dbfs:/feathr_getting_started'\n", - " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", - "online_store:\n", - " redis:\n", - " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " purview:\n", - " type_system_initialization: true\n", - " purview_name: 'feathrazuretest3-purview1'\n", - " delimiter: '__'\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", - "\n", - "You should setup the environment variables in 
order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", - "\n", - "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", - "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. 
See the python documentation to get the details on each input column.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession, DataFrame\n", - "def feathr_udf_day_calc(df: DataFrame) -> DataFrame:\n", - " from pyspark.sql.functions import dayofweek, dayofyear, col\n", - " df = df.withColumn(\"fare_amount_cents\", col(\"fare_amount\")*100)\n", - " return df\n", - "\n", - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " preprocessing=feathr_udf_day_calc,\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "f_trip_time_duration = Feature(name=\"f_trip_time_duration\",\n", - " feature_type=INT32,\n", - " transform=\"(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_total_fare_cents\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"SUM\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f_trip_time_distance = DerivedFeature(name=\"f_trip_time_distance\",\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " f_trip_distance, f_trip_time_duration],\n", - " transform=\"f_trip_distance * f_trip_time_duration\")\n", - "\n", - "f_trip_time_rounded = DerivedFeature(name=\"f_trip_time_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_time_duration],\n", - " transform=\"f_trip_time_duration % 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_time_distance, f_trip_time_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. 
\n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if client.spark_runtime == 'databricks':\n", - " output_path = 'dbfs:/feathrazure_test.avro'\n", - "else:\n", - " output_path = feathr_output_path\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_time_rounded\", \"f_is_long_trip_distance\", \"f_location_total_fare_cents\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path)\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", - " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", - " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", - " tmp_dir = tempfile.TemporaryDirectory()\n", - " client.feathr_spark_laucher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", - " dataframe_list = []\n", - " # assuming the result are in avro format\n", - " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", - " dataframe_list.append(pdx.read_avro(file))\n", - " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", - " tmp_dir.cleanup()\n", - " return vertical_concat_df\n", - "\n", - "df_res = get_result_df(client)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, 
y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then get the features from the online store (Redis):\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Registering and Fetching features\n", - "\n", - "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.register_features()\n", - "client.list_registered_features(project_name=\"feathr_getting_started\")" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985" - }, - "kernelspec": { - "display_name": "Python 3.9.12 ('product_env': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feathr Feature Store on Azure Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. It includes these steps:\n", + "\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install and set up Feathr with Azure\n", + "2. Create shareable features with Feathr feature definition configs.\n", + "3. Create a training dataset via point-in-time feature join.\n", + "4. Compute and write features.\n", + "5. Train a model using these features to predict fares.\n", + "6. Materialize feature value to online store.\n", + "7. Fetch feature value in real-time from online store for online scoring.\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", + "\n", + "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", + "\n", + "Feathr has native cloud integration. To use Feathr on Azure, you only need three steps:\n", + "\n", + "- Get the `Principal ID` of your account by running `az ad signed-in-user show --query id -o tsv` in the link below (Select \"Bash\" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI.\n", + "\n", + "[Launch Cloud Shell](https://shell.azure.com/bash)\n", + "\n", + "- Click the button below to deploy a minimal set of Feathr resources for demo purpose. You will need to fill in the `Principal ID` and `Resource Prefix`. 
You will need \"Owner\" permission of the selected subscription.\n", + "\n", + "[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Flinkedin%2Ffeathr%2Fmain%2Fdocs%2Fhow-to-guides%2Fazure_resource_provision.json)\n", + "\n", + "- Run the cells below.\n", + "\n", + "And the architecture is as below. In the above template, we are using Synapse as Spark provider, use Azure Data Lake Gen2 as offline store, and use Redis as online store, Azure Purview (Apache Atlas compatible) as feature reigstry. \n", + "\n", + "\n", + "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Install Feathr \n", + "\n", + "Install Feathr using pip:\n", + "\n", + "`pip install -U feathr pandavro scikit-learn`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resource_prefix = \"feathr_resource_prefix\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install feathr azure-cli pandavro scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Login to Azure with a device code (You will see instructions in the output):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! 
az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "import pandas as pd\n", + "import pandavro as pdx\n", + "from feathr import FeathrClient\n", + "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all the required credentials from Azure KeyVault" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + "synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", + "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. 
If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", + "\n", + "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'feathr_getting_started'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + " - 'AZURE_CLIENT_ID'\n", + " - 'AZURE_TENANT_ID'\n", + " - 'AZURE_CLIENT_SECRET'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: \"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", + " executor_size: 'Small'\n", + " executor_num: 4\n", + " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/feathr_getting_started'\n", + " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " purview:\n", + " type_system_initialization: true\n", + " purview_name: 'feathrazuretest3-purview1'\n", + " delimiter: '__'\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", + "\n", + "You should setup the environment variables in order to run this sample. 
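As a concrete illustration of this cell, the credentials it goes on to list can be exported directly from Python before the `FeathrClient` is created. The values below are placeholders for your own service principal, Redis, and (for Databricks) workspace token secrets, not working credentials:

    import os

    # Placeholder values -- replace with your own credentials before running.
    os.environ["AZURE_CLIENT_ID"] = "<service-principal-client-id>"
    os.environ["AZURE_TENANT_ID"] = "<tenant-id>"
    os.environ["AZURE_CLIENT_SECRET"] = "<service-principal-secret>"
    os.environ["REDIS_PASSWORD"] = "<redis-access-key>"

    # Databricks users set a workspace token instead of the service principal values:
    # os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = "<databricks-personal-access-token>"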
More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", + "\n", + "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", + "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View the data\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", + "\n", + "\n", + "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", + "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", + "It is merely a function/transformation executing against request data at runtime.\n", + "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Sources Section with UDFs\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. 
See the python documentation to get the details on each input column.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, DataFrame\n", + "def feathr_udf_day_calc(df: DataFrame) -> DataFrame:\n", + " from pyspark.sql.functions import dayofweek, dayofyear, col\n", + " df = df.withColumn(\"fare_amount_cents\", col(\"fare_amount\")*100)\n", + " return df\n", + "\n", + "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " preprocessing=feathr_udf_day_calc,\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Anchors and Features\n", + "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_trip_distance = Feature(name=\"f_trip_distance\",\n", + " feature_type=FLOAT, transform=\"trip_distance\")\n", + "f_trip_time_duration = Feature(name=\"f_trip_time_duration\",\n", + " feature_type=INT32,\n", + " transform=\"(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60\")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"cast_float(trip_distance)>30\"),\n", + " Feature(name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", + "]\n", + "\n", + "request_anchor = FeatureAnchor(name=\"request_features\",\n", + " source=INPUT_CONTEXT,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Window aggregation features\n", + "\n", + "For window aggregation features, see the supported fields below:\n", + "\n", + "Note that the `agg_func` should be any of these:\n", + "\n", + "| Aggregation Type | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", + "\n", + "\n", + "After you have defined features and sources, bring them together to build an anchor:\n", + "\n", + "\n", + "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
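The cell that follows only exercises `AVG`, `MAX`, and `SUM`. As one more illustration of the table above, a `LATEST` aggregation can be declared with the same `WindowAggTransformation` API; this sketch reuses the `location_id` key defined in the next cell, and the feature name and one-day window are invented for the example:

    f_location_latest_fare = Feature(name="f_location_latest_fare",
                                     key=location_id,
                                     feature_type=FLOAT,
                                     transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)",
                                                                       agg_func="LATEST",
                                                                       window="1d"))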
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "location_id = TypedKey(key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\")\n", + "agg_features = [Feature(name=\"f_location_avg_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"AVG\",\n", + " window=\"90d\")),\n", + " Feature(name=\"f_location_max_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"MAX\",\n", + " window=\"90d\")),\n", + " Feature(name=\"f_location_total_fare_cents\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"SUM\",\n", + " window=\"90d\")),\n", + " ]\n", + "\n", + "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", + " source=batch_source,\n", + " features=agg_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Derived Features Section\n", + "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_trip_time_distance = DerivedFeature(name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance, f_trip_time_duration],\n", + " transform=\"f_trip_distance * f_trip_time_duration\")\n", + "\n", + "f_trip_time_rounded = DerivedFeature(name=\"f_trip_time_rounded\",\n", + " feature_type=INT32,\n", + " input_features=[f_trip_time_duration],\n", + " transform=\"f_trip_time_duration % 10\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", + " f_trip_time_distance, f_trip_time_rounded])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create training data using point-in-time correct feature join\n", + "\n", + "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
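Conceptually, the point-in-time join attaches to each observation row the feature values that were known at or before that row's event timestamp. Feathr performs this on Spark using the feature query and observation settings shown in the next cell, but the idea can be illustrated with a toy pandas sketch (a conceptual analogy only, not Feathr's implementation; the data is made up):

    import pandas as pd

    obs = pd.DataFrame({"DOLocationID": [1, 1],
                        "event_ts": pd.to_datetime(["2020-04-10", "2020-04-20"])})
    feats = pd.DataFrame({"DOLocationID": [1, 1],
                          "feature_ts": pd.to_datetime(["2020-04-05", "2020-04-15"]),
                          "f_location_avg_fare": [12.3, 14.1]})

    # For each observation, keep the latest feature value at or before event_ts.
    joined = pd.merge_asof(obs.sort_values("event_ts"),
                           feats.sort_values("feature_ts"),
                           left_on="event_ts", right_on="feature_ts",
                           by="DOLocationID", direction="backward")
    print(joined)  # the 2020-04-10 row gets 12.3, the 2020-04-20 row gets the later 14.1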
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if client.spark_runtime == 'databricks':\n", + " output_path = 'dbfs:/feathrazure_test.avro'\n", + "else:\n", + " output_path = feathr_output_path\n", + "\n", + "\n", + "feature_query = FeatureQuery(\n", + " feature_list=[\"f_location_avg_fare\", \"f_trip_time_rounded\", \"f_is_long_trip_distance\", \"f_location_total_fare_cents\"], key=location_id)\n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=feature_query,\n", + " output_path=output_path)\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the result and show the result\n", + "\n", + "Let's use the helper function `get_result_df` to download the result and view it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", + " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", + " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", + " tmp_dir = tempfile.TemporaryDirectory()\n", + " client.feathr_spark_laucher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", + " dataframe_list = []\n", + " # assuming the result are in avro format\n", + " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", + " dataframe_list.append(pdx.read_avro(file))\n", + " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", + " tmp_dir.cleanup()\n", + " return vertical_concat_df\n", + "\n", + "df_res = get_result_df(client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a machine learning model\n", + "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove columns\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "final_df = df_res\n", + "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", + " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", + "final_df.fillna(0, inplace=True)\n", + "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", + "\n", + "\n", + "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", + " final_df[\"fare_amount\"],\n", + " test_size=0.2,\n", + " random_state=42)\n", + "model = GradientBoostingRegressor()\n", + "model.fit(train_x, train_y)\n", + "\n", + "y_predict = model.predict(test_x)\n", + "\n", + "y_actual = test_y.values.flatten().tolist()\n", + "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", + "\n", + "sum_actuals = sum_errors = 0\n", + "\n", + "for actual_val, predict_val in zip(y_actual, 
y_predict):\n", + " abs_error = actual_val - predict_val\n", + " if abs_error < 0:\n", + " abs_error = abs_error * -1\n", + "\n", + " sum_errors = sum_errors + abs_error\n", + " sum_actuals = sum_actuals + actual_val\n", + "\n", + "mean_abs_percent_error = sum_errors / sum_actuals\n", + "print(\"Model MAPE:\")\n", + "print(mean_abs_percent_error)\n", + "print()\n", + "print(\"Model Accuracy:\")\n", + "print(1 - mean_abs_percent_error)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Materialize feature value into offline/online storage\n", + "\n", + "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", + "and materialize the feature value to offline and/or online storage. \n", + "\n", + "We can push the generated features to the online store like below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", + "settings = MaterializationSettings(\"nycTaxiTable\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=500)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then get the features from the online store (Redis):\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching feature value for online inference\n", + "\n", + "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", + "`get_online_features` or `multi_get_online_features` API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering and Fetching features\n", + "\n", + "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.register_features()\n", + "client.list_registered_features(project_name=\"feathr_getting_started\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985" + }, + "kernelspec": { + "display_name": "Python 3.9.12 ('product_env': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb index 99bf54254..d3c297eae 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb +++ b/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo.ipynb @@ -1,755 +1,755 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feathr Feature Store on Azure Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts product ratings. It includes these steps:\n", - "\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predict users rating for a product. The feature flow is as below:\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/product_recommendation_overview.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", - "\n", - "Feathr has native cloud integration. To use Feathr on Azure, you only need three steps:\n", - "\n", - "- Get the `Principal ID` of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select \"Bash\" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). 
Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI.\n", - "\n", - "[Launch Cloud Shell](https://shell.azure.com/bash)\n", - "\n", - "- Click the button below to deploy a minimal set of Feathr resources for demo purpose. You will need to fill in the `Principal ID` and `Resource Prefix`. You will need \"Owner\" permission of the selected subscription.\n", - "\n", - "[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Flinkedin%2Ffeathr%2Fmain%2Fdocs%2Fhow-to-guides%2Fazure_resource_provision.json)\n", - "\n", - "- Run the cells below.\n", - "\n", - "And the architecture is as below. In the above template, we are using Synapse as Spark provider, use Azure Data Lake Gen2 as offline store, and use Redis as online store, Azure Purview (Apache Atlas compatible) as feature reigstry. \n", - "\n", - "\n", - "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Install Feathr \n", - "\n", - "Install Feathr using pip:\n", - "\n", - "`pip install -U feathr pandavro scikit-learn`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "resource_prefix = \"feathr_resource_prefix\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install feathr azure-cli pandavro scikit-learn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Login to Azure with a device code (You will see instructions in the output):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! 
az login --use-device-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get all the required credentials from Azure KeyVault" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "key_vault_name=resource_prefix+\"kv\"\n", - "synapse_workspace_url=resource_prefix+\"syws\"\n", - "adls_account=resource_prefix+\"dls\"\n", - "adls_fs_name=resource_prefix+\"fs\"\n", - "purview_name=resource_prefix+\"purview\"\n", - "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", - "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", - "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", - "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", - "retrieved_secret = client.get_secret(secretName).value\n", - "\n", - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", - "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", - "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", - "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", - "\n", - "# Set the resource link\n", - "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", - "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", - "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password\n", - "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", - "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", - "\n", - "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. 
If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", - "\n", - "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - " - 'AZURE_CLIENT_ID'\n", - " - 'AZURE_TENANT_ID'\n", - " - 'AZURE_CLIENT_SECRET'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: 's3.amazonaws.com'\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: 'feathrtestdb'\n", - " jdbc_table: 'feathrtesttable'\n", - " snowflake:\n", - " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", - " user: \"feathrintegration\"\n", - " role: \"ACCOUNTADMIN\"\n", - "spark_config:\n", - " spark_cluster: 'azure_synapse'\n", - " spark_result_output_parts: '1'\n", - " azure_synapse:\n", - " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", - " pool_name: 'spark3'\n", - " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", - " executor_size: 'Small'\n", - " executor_num: 4\n", - " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", - " databricks:\n", - " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", - " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", - " work_dir: 'dbfs:/feathr_getting_started'\n", - " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", - "online_store:\n", - " redis:\n", - " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " purview:\n", - " type_system_initialization: true\n", - " purview_name: 'feathrazuretest3-purview1'\n", - " delimiter: '__'\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", - "\n", - "You should setup the environment variables in order to run this sample. 
More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", - "\n", - "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", - "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts users product rating. To make it simple, let's just predict users' rating for one product. (We will expand the exmaple to predict ratings for arbitrary product in future tutorials.)\n", - "\n", - "We have 3 datasets to work with: one observation dataset(a.k.a. label dataset) and two raw datasets to generate features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Observation dataset(a.k.a. label dataset)\n", - "# Observation dataset usually comes with a event_timestamp to denote when the observation happened.\n", - "# The label here is product_rating. Our model objective is to predict a user's rating for this product.\n", - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_observation_mock_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# User profile dataset\n", - "# Used to generate user features\n", - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_profile_mock_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# User purchase history dataset.\n", - "# Used to generate user features. This is activity type data, so we need to use aggregation to genearte features.\n", - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. 
the value is 30 at year 2022.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession, DataFrame\n", - "def feathr_udf_preprocessing(df: DataFrame) -> DataFrame:\n", - " from pyspark.sql.functions import col\n", - " df = df.withColumn(\"tax_rate_decimal\", col(\"tax_rate\")/100)\n", - " df.show(10)\n", - " return df\n", - "\n", - "batch_source = HdfsSource(name=\"userProfileData\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_profile_mock_data.csv\",\n", - " preprocessing=feathr_udf_preprocessing)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "user_id = TypedKey(key_column=\"user_id\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"user id\",\n", - " full_name=\"product_recommendation.user_id\")\n", - "\n", - "feature_user_age = Feature(name=\"feature_user_age\",\n", - " key=user_id,\n", - " feature_type=INT32, transform=\"age\")\n", - "feature_user_tax_rate = Feature(name=\"feature_user_tax_rate\",\n", - " key=user_id,\n", - " feature_type=FLOAT,\n", - " transform=\"tax_rate_decimal\")\n", - "feature_user_gift_card_balance = Feature(name=\"feature_user_gift_card_balance\",\n", - " key=user_id,\n", - " feature_type=FLOAT,\n", - " transform=\"gift_card_balance\")\n", - "feature_user_has_valid_credit_card = Feature(name=\"feature_user_has_valid_credit_card\",\n", - " key=user_id,\n", - " feature_type=BOOLEAN,\n", - " transform=\"number_of_credit_cards > 0\")\n", - " \n", - "features = [\n", - " feature_user_age,\n", - " feature_user_tax_rate,\n", - " feature_user_gift_card_balance,\n", - " feature_user_has_valid_credit_card\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"anchored_features\",\n", - " source=batch_source,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. 
|\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "purchase_history_data = HdfsSource(name=\"purchase_history_data\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\",\n", - " event_timestamp_column=\"purchase_date\",\n", - " timestamp_format=\"yyyy-MM-dd\")\n", - " \n", - "agg_features = [Feature(name=\"feature_user_totla_purchase_in_90days\",\n", - " key=user_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(purchase_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\"))\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=purchase_history_data,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_user_purchasing_power = DerivedFeature(name=\"feature_user_purchasing_power\",\n", - " key=user_id,\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " feature_user_gift_card_balance, feature_user_has_valid_credit_card],\n", - " transform=\"feature_user_gift_card_balance + if_else(feature_user_has_valid_credit_card, 100, 0)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " feature_user_purchasing_power])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. 
\n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if client.spark_runtime == 'databricks':\n", - " output_path = 'dbfs:/feathrazure_test.avro'\n", - "else:\n", - " output_path = feathr_output_path\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"feature_user_age\", \n", - " \"feature_user_tax_rate\", \n", - " \"feature_user_gift_card_balance\", \n", - " \"feature_user_has_valid_credit_card\", \n", - " \"feature_user_totla_purchase_in_90days\",\n", - " \"feature_user_purchasing_power\"], key=user_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_observation_mock_data.csv\",\n", - " event_timestamp_column=\"event_timestamp\",\n", - " timestamp_format=\"yyyy-MM-dd\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path)\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", - " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", - " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", - " tmp_dir = tempfile.TemporaryDirectory()\n", - " client.feathr_spark_laucher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", - " dataframe_list = []\n", - " # assuming the result are in avro format\n", - " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", - " dataframe_list.append(pdx.read_avro(file))\n", - " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", - " tmp_dir.cleanup()\n", - " return vertical_concat_df\n", - "\n", - "df_res = get_result_df(client)\n", - "\n", - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop non-feature columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"event_timestamp\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['product_rating'] = final_df['product_rating'].astype(\"float64\")\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"product_rating\"], axis=1),\n", - " final_df[\"product_rating\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, 
y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"productRecommendationDemoFeature\")\n", - "settings = MaterializationSettings(\"productRecommendationFeatureSetting\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"feature_user_age\", \"feature_user_gift_card_balance\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then get the features from the online store (Redis):\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.get_online_features('productRecommendationDemoFeature', '2', [\n", - " 'feature_user_age', 'feature_user_gift_card_balance'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.multi_get_online_features('productRecommendationDemoFeature', ['1', '2'], [\n", - " 'feature_user_age', 'feature_user_gift_card_balance'])\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Registering and Fetching features\n", - "\n", - "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.register_features()\n", - "client.list_registered_features(project_name=\"feathr_getting_started\")" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985" - }, - "kernelspec": { - "display_name": "Python 3.9.12 ('product_env': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feathr Feature Store on Azure Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts product ratings. It includes these steps:\n", + "\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install and set up Feathr with Azure\n", + "2. Create shareable features with Feathr feature definition configs.\n", + "3. Create a training dataset via point-in-time feature join.\n", + "4. Compute and write features.\n", + "5. Train a model using these features to predict fares.\n", + "6. Materialize feature value to online store.\n", + "7. Fetch feature value in real-time from online store for online scoring.\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predict users rating for a product. The feature flow is as below:\n", + "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/product_recommendation_overview.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", + "\n", + "Feathr has native cloud integration. To use Feathr on Azure, you only need three steps:\n", + "\n", + "- Get the `Principal ID` of your account by running `az ad signed-in-user show --query id -o tsv` in the link below (Select \"Bash\" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI.\n", + "\n", + "[Launch Cloud Shell](https://shell.azure.com/bash)\n", + "\n", + "- Click the button below to deploy a minimal set of Feathr resources for demo purpose. You will need to fill in the `Principal ID` and `Resource Prefix`. 
You will need \"Owner\" permission of the selected subscription.\n", + "\n", + "[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Flinkedin%2Ffeathr%2Fmain%2Fdocs%2Fhow-to-guides%2Fazure_resource_provision.json)\n", + "\n", + "- Run the cells below.\n", + "\n", + "And the architecture is as below. In the above template, we are using Synapse as Spark provider, use Azure Data Lake Gen2 as offline store, and use Redis as online store, Azure Purview (Apache Atlas compatible) as feature reigstry. \n", + "\n", + "\n", + "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Install Feathr \n", + "\n", + "Install Feathr using pip:\n", + "\n", + "`pip install -U feathr pandavro scikit-learn`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resource_prefix = \"feathr_resource_prefix\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install feathr azure-cli pandavro scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Login to Azure with a device code (You will see instructions in the output):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! 
az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "import pandas as pd\n", + "import pandavro as pdx\n", + "from feathr import FeathrClient\n", + "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all the required credentials from Azure KeyVault" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + "synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", + "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. 
If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", + "\n", + "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'feathr_getting_started'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + " - 'AZURE_CLIENT_ID'\n", + " - 'AZURE_TENANT_ID'\n", + " - 'AZURE_CLIENT_SECRET'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: \"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", + " executor_size: 'Small'\n", + " executor_num: 4\n", + " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/feathr_getting_started'\n", + " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " purview:\n", + " type_system_initialization: true\n", + " purview_name: 'feathrazuretest3-purview1'\n", + " delimiter: '__'\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", + "\n", + "You should setup the environment variables in order to run this sample. 
More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml), which should be used as the source of truth. It also has more explanations on the meaning of each variable.\n", + "\n", + "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", + "To run this notebook, for Databricks users, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View the data\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts users' product ratings. To make it simple, let's just predict users' rating for one product. (We will expand the example to predict ratings for arbitrary products in future tutorials.)\n", + "\n", + "We have 3 datasets to work with: one observation dataset (a.k.a. label dataset) and two raw datasets to generate features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Observation dataset (a.k.a. label dataset)\n", + "# An observation dataset usually comes with an event_timestamp to denote when the observation happened.\n", + "# The label here is product_rating. Our model objective is to predict a user's rating for this product.\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_observation_mock_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# User profile dataset\n", + "# Used to generate user features\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_profile_mock_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# User purchase history dataset.\n", + "# Used to generate user features. This is activity-type data, so we need to use aggregation to generate features.\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping from an entity id (or key) and a timestamp to a feature value. For more details on feature definitions, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", + "\n", + "\n", + "1. The typed key (a.k.a. entity id) identifies the subject of the feature, e.g. a user id, 123.\n", + "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "3. The feature value is the actual value of that aspect at a particular time, e.g. 
the value is 30 in year 2022.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, in some cases, features such as those defined on top of request data may have no entity key or timestamp.\n", + "Such a feature is merely a function/transformation executed against request data at runtime.\n", + "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Sources Section with UDFs\n", + "A feature source is needed for anchored features; it describes the raw data from which the feature values are computed. See the Python documentation for details on each input column.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, DataFrame\n", + "def feathr_udf_preprocessing(df: DataFrame) -> DataFrame:\n", + "    from pyspark.sql.functions import col\n", + "    df = df.withColumn(\"tax_rate_decimal\", col(\"tax_rate\")/100)\n", + "    df.show(10)\n", + "    return df\n", + "\n", + "batch_source = HdfsSource(name=\"userProfileData\",\n", + "                          path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_profile_mock_data.csv\",\n", + "                          preprocessing=feathr_udf_preprocessing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Anchors and Features\n", + "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called a derived feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "user_id = TypedKey(key_column=\"user_id\",\n", + "                   key_column_type=ValueType.INT32,\n", + "                   description=\"user id\",\n", + "                   full_name=\"product_recommendation.user_id\")\n", + "\n", + "feature_user_age = Feature(name=\"feature_user_age\",\n", + "                           key=user_id,\n", + "                           feature_type=INT32, transform=\"age\")\n", + "feature_user_tax_rate = Feature(name=\"feature_user_tax_rate\",\n", + "                                key=user_id,\n", + "                                feature_type=FLOAT,\n", + "                                transform=\"tax_rate_decimal\")\n", + "feature_user_gift_card_balance = Feature(name=\"feature_user_gift_card_balance\",\n", + "                                         key=user_id,\n", + "                                         feature_type=FLOAT,\n", + "                                         transform=\"gift_card_balance\")\n", + "feature_user_has_valid_credit_card = Feature(name=\"feature_user_has_valid_credit_card\",\n", + "                                             key=user_id,\n", + "                                             feature_type=BOOLEAN,\n", + "                                             transform=\"number_of_credit_cards > 0\")\n", + "    \n", + "features = [\n", + "    feature_user_age,\n", + "    feature_user_tax_rate,\n", + "    feature_user_gift_card_balance,\n", + "    feature_user_has_valid_credit_card\n", + "]\n", + "\n", + "request_anchor = FeatureAnchor(name=\"anchored_features\",\n", + "                               source=batch_source,\n", + "                               features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Window aggregation features\n", + "\n", + "For window aggregation features, see the supported fields below:\n", + "\n", + "Note that the `agg_func` should be one of these:\n", + "\n", + "| Aggregation Type | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the numerical operation on the numeric inputs. 
|\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers.|\n", + "|LATEST| Any |Returns the latest non-null value from within the defined time window |\n", + "\n", + "\n", + "After you have defined features and sources, bring them together to build an anchor:\n", + "\n", + "\n", + "Note that if the data source is the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "purchase_history_data = HdfsSource(name=\"purchase_history_data\",\n", + "                                   path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\",\n", + "                                   event_timestamp_column=\"purchase_date\",\n", + "                                   timestamp_format=\"yyyy-MM-dd\")\n", + "    \n", + "agg_features = [Feature(name=\"feature_user_totla_purchase_in_90days\",\n", + "                        key=user_id,\n", + "                        feature_type=FLOAT,\n", + "                        transform=WindowAggTransformation(agg_expr=\"cast_float(purchase_amount)\",\n", + "                                                          agg_func=\"AVG\",\n", + "                                                          window=\"90d\"))\n", + "                ]\n", + "\n", + "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", + "                           source=purchase_history_data,\n", + "                           features=agg_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Derived Features Section\n", + "Derived features are features that are computed from other features. They could be computed from anchored features, or from other derived features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_user_purchasing_power = DerivedFeature(name=\"feature_user_purchasing_power\",\n", + "                                               key=user_id,\n", + "                                               feature_type=FLOAT,\n", + "                                               input_features=[\n", + "                                                   feature_user_gift_card_balance, feature_user_has_valid_credit_card],\n", + "                                               transform=\"feature_user_gift_card_balance + if_else(feature_user_has_valid_credit_card, 100, 0)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then we need to build those features so that they can be consumed later. Note that we have to build both the \"anchored\" and the \"derived\" features (the latter are not anchored to a source)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", + "    feature_user_purchasing_power])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create training data using point-in-time correct feature join\n", + "\n", + "A training dataset usually contains entity id columns, multiple feature columns, an event timestamp column, and a label/target column. \n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "which features should be joined to the observation data and how. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if client.spark_runtime == 'databricks':\n", + " output_path = 'dbfs:/feathrazure_test.avro'\n", + "else:\n", + " output_path = feathr_output_path\n", + "\n", + "\n", + "feature_query = FeatureQuery(\n", + " feature_list=[\"feature_user_age\", \n", + " \"feature_user_tax_rate\", \n", + " \"feature_user_gift_card_balance\", \n", + " \"feature_user_has_valid_credit_card\", \n", + " \"feature_user_totla_purchase_in_90days\",\n", + " \"feature_user_purchasing_power\"], key=user_id)\n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_observation_mock_data.csv\",\n", + " event_timestamp_column=\"event_timestamp\",\n", + " timestamp_format=\"yyyy-MM-dd\")\n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=feature_query,\n", + " output_path=output_path)\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the result and show the result\n", + "\n", + "Let's use the helper function `get_result_df` to download the result and view it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", + " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", + " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", + " tmp_dir = tempfile.TemporaryDirectory()\n", + " client.feathr_spark_laucher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", + " dataframe_list = []\n", + " # assuming the result are in avro format\n", + " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", + " dataframe_list.append(pdx.read_avro(file))\n", + " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", + " tmp_dir.cleanup()\n", + " return vertical_concat_df\n", + "\n", + "df_res = get_result_df(client)\n", + "\n", + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a machine learning model\n", + "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# drop non-feature columns\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "final_df = df_res\n", + "final_df.drop([\"event_timestamp\"], axis=1, inplace=True, errors='ignore')\n", + "final_df.fillna(0, inplace=True)\n", + "final_df['product_rating'] = final_df['product_rating'].astype(\"float64\")\n", + "\n", + "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"product_rating\"], axis=1),\n", + " final_df[\"product_rating\"],\n", + " test_size=0.2,\n", + " random_state=42)\n", + "model = GradientBoostingRegressor()\n", + "model.fit(train_x, train_y)\n", + "\n", + "y_predict = model.predict(test_x)\n", + "\n", + "y_actual = test_y.values.flatten().tolist()\n", + "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", + "\n", + "sum_actuals = sum_errors = 0\n", + "\n", + "for actual_val, predict_val in zip(y_actual, 
y_predict):\n", + " abs_error = actual_val - predict_val\n", + " if abs_error < 0:\n", + " abs_error = abs_error * -1\n", + "\n", + " sum_errors = sum_errors + abs_error\n", + " sum_actuals = sum_actuals + actual_val\n", + "\n", + "mean_abs_percent_error = sum_errors / sum_actuals\n", + "print(\"Model MAPE:\")\n", + "print(mean_abs_percent_error)\n", + "print()\n", + "print(\"Model Accuracy:\")\n", + "print(1 - mean_abs_percent_error)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Materialize feature value into offline/online storage\n", + "\n", + "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", + "and materialize the feature value to offline and/or online storage. \n", + "\n", + "We can push the generated features to the online store like below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"productRecommendationDemoFeature\")\n", + "settings = MaterializationSettings(\"productRecommendationFeatureSetting\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"feature_user_age\", \"feature_user_gift_card_balance\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=500)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then get the features from the online store (Redis):\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching feature value for online inference\n", + "\n", + "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", + "`get_online_features` or `multi_get_online_features` API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.get_online_features('productRecommendationDemoFeature', '2', [\n", + " 'feature_user_age', 'feature_user_gift_card_balance'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.multi_get_online_features('productRecommendationDemoFeature', ['1', '2'], [\n", + " 'feature_user_age', 'feature_user_gift_card_balance'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering and Fetching features\n", + "\n", + "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.register_features()\n", + "client.list_registered_features(project_name=\"feathr_getting_started\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985" + }, + "kernelspec": { + "display_name": "Python 3.9.12 ('product_env': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo_advanced.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo_advanced.ipynb new file mode 100644 index 000000000..a381529b8 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/product_recommendation_demo_advanced.ipynb @@ -0,0 +1,850 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feathr Feature Store on Azure Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feathr Feature Store to create a model that predicts users' ratings for different products on an e-commerce website.\n", + "\n", + "## Model Problem Statement\n", + "The e-commerce website has collected past user ratings for various products. The website also collected data about users and products, such as user age, product category, etc. Now we want to predict users' ratings for new products so that we can recommend those products to the users who are likely to rate them highly.\n", + "\n", + "After the model is trained, given a user_id, product_id pair and their features, we should be able to predict the product rating that the user will give for this product_id.\n", + "\n", + "(Compared with [the beginner version of product recommendation](product_recommendation_demo.ipynb), this tutorial expands the example by predicting ratings for all products.)\n", + "\n", + "## Feature Creation Illustration\n", + "In this example, our observation data has a compound entity key, where a record is uniquely identified by user_id and product_id. So there can be 3 types of features:\n", + "* User features that are different for different users but are the same for different products. 
For example, user age is different for different users but it's the same for all products(or it's product-agnostic).\n", + "* Product features that are different for different products but are the same for different users.\n", + "* User-to-product features that are different for different users AND different products. For example, a feature to represent if the user has bought this product before or not.\n", + "\n", + "We will focus on the first two in our example.\n", + "\n", + "The feature creation flow is as below:\n", + "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/product_recommendation_advanced.jpg?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", + "\n", + "Feathr has native cloud integration. To use Feathr on Azure, you only need three steps:\n", + "\n", + "- Get the `Principal ID` of your account by running `az ad signed-in-user show --query objectId -o tsv` in the link below (Select \"Bash\" if asked), and write down that value (something like `b65ef2e0-42b8-44a7-9b55-abbccddeefff`). Think this ID as something representing you when accessing Azure, and it will be used to grant permissions in the next step in the UI.\n", + "\n", + "[Launch Cloud Shell](https://shell.azure.com/bash)\n", + "\n", + "- Click the button below to deploy a minimal set of Feathr resources for demo purpose. You will need to fill in the `Principal ID` and `Resource Prefix`. You will need \"Owner\" permission of the selected subscription.\n", + "\n", + "[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Flinkedin%2Ffeathr%2Fmain%2Fdocs%2Fhow-to-guides%2Fazure_resource_provision.json)\n", + "\n", + "- Run the cells below.\n", + "\n", + "And the architecture is as below. In the above template, we are using Synapse as Spark provider, use Azure Data Lake Gen2 as offline store, and use Redis as online store, Azure Purview (Apache Atlas compatible) as feature reigstry. \n", + "\n", + "\n", + "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Install Feathr \n", + "\n", + "Install Feathr using pip:\n", + "\n", + "`pip install -U feathr pandavro scikit-learn`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resource_prefix = \"feathr_resource_prefix\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install feathr azure-cli pandavro scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Login to Azure with a device code (You will see instructions in the output):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "import pandas as pd\n", + "import pandavro as pdx\n", + "from feathr import FeathrClient\n", + "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all the required credentials from Azure KeyVault" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + "synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", + "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: 
Configure the required environment (Don't need to update if using the above Quick Start Template)\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", + "\n", + "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'feathr_getting_started'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + " - 'AZURE_CLIENT_ID'\n", + " - 'AZURE_TENANT_ID'\n", + " - 'AZURE_CLIENT_SECRET'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: \"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", + " executor_size: 'Small'\n", + " executor_num: 4\n", + " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/feathr_getting_started'\n", + " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " purview:\n", + " type_system_initialization: true\n", + " purview_name: 'feathrazuretest3-purview1'\n", + " delimiter: '__'\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", + "\n", + "You should setup the environment variables in 
order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", + "\n", + "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", + "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the raw source data\n", + "We have 4 datasets to work with: one observation dataset(a.k.a. label dataset), two raw datasets to generate features for users, one raw datasets to generate features for product." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Observation dataset(a.k.a. label dataset)\n", + "# Observation dataset usually comes with a event_timestamp to denote when the observation happened.\n", + "# The label here is product_rating. Our model objective is to predict a user's rating for this product.\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_observation_mock_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# User profile dataset\n", + "# Used to generate user features\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_profile_mock_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# User purchase history dataset.\n", + "# Used to generate user features. This is activity type data, so we need to use aggregation to genearte features.\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Product detail dataset.\n", + "# Used to generate product features.\n", + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Features with Feathr\n", + "Let's try to create features from those raw source data.\n", + "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", + "\n", + "\n", + "1. The typed key (a.k.a. entity key) identifies the subject of feature, e.g. a user id, 123.\n", + "2. The feature name is the aspect of the entity that the feature is indicating, e.g. 
the age of the user.\n", + "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", + "4. The timestamp indicates when the event happened. For example, the user purchased certain product on a certain timestamp. This is usually used for point-in-time join." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", + "It is merely a function/transformation executing against request data at runtime.\n", + "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n", + "(We won't cover this in the tutorial.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Sources Section with UDFs\n", + "\n", + "#### Define Anchors and Features\n", + "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature.\n", + "\n", + "#### Feature source\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, DataFrame\n", + "def feathr_udf_preprocessing(df: DataFrame) -> DataFrame:\n", + " from pyspark.sql.functions import col\n", + " df = df.withColumn(\"tax_rate_decimal\", col(\"tax_rate\")/100)\n", + " df.show(10)\n", + " return df\n", + "\n", + "batch_source = HdfsSource(name=\"userProfileData\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_profile_mock_data.csv\",\n", + " preprocessing=feathr_udf_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's define some features for users so our recommendation can be customized for users.\n", + "user_id = TypedKey(key_column=\"user_id\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"user id\",\n", + " full_name=\"product_recommendation.user_id\")\n", + "\n", + "feature_user_age = Feature(name=\"feature_user_age\",\n", + " key=user_id,\n", + " feature_type=INT32, transform=\"age\")\n", + "feature_user_tax_rate = Feature(name=\"feature_user_tax_rate\",\n", + " key=user_id,\n", + " feature_type=FLOAT,\n", + " transform=\"tax_rate_decimal\")\n", + "feature_user_gift_card_balance = Feature(name=\"feature_user_gift_card_balance\",\n", + " key=user_id,\n", + " feature_type=FLOAT,\n", + " transform=\"gift_card_balance\")\n", + "feature_user_has_valid_credit_card = Feature(name=\"feature_user_has_valid_credit_card\",\n", + " key=user_id,\n", + " feature_type=BOOLEAN,\n", + " transform=\"number_of_credit_cards > 0\")\n", + " \n", + "features = [\n", + " feature_user_age,\n", + " feature_user_tax_rate,\n", + " feature_user_gift_card_balance,\n", + " feature_user_has_valid_credit_card\n", + "]\n", + "\n", + "user_feature_anchor = FeatureAnchor(name=\"anchored_features\",\n", + " source=batch_source,\n", + " features=features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's define some features for the products so our recommendation can be 
customized for products.\n", + "product_batch_source = HdfsSource(name=\"productProfileData\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/product_detail_mock_data.csv\")\n", + "\n", + "product_id = TypedKey(key_column=\"product_id\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"product id\",\n", + " full_name=\"product_recommendation.product_id\")\n", + "\n", + "feature_product_quantity = Feature(name=\"feature_product_quantity\",\n", + " key=product_id,\n", + " feature_type=FLOAT, \n", + " transform=\"quantity\")\n", + "feature_product_price = Feature(name=\"feature_product_price\",\n", + " key=product_id,\n", + " feature_type=FLOAT,\n", + " transform=\"price\")\n", + " \n", + "product_features = [\n", + " feature_product_quantity,\n", + " feature_product_price\n", + "]\n", + "\n", + "product_anchor = FeatureAnchor(name=\"product_anchored_features\",\n", + " source=product_batch_source,\n", + " features=product_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Window aggregation features\n", + "\n", + "For window aggregation features, see the supported fields below:\n", + "\n", + "Note that the `agg_func` should be any of these:\n", + "\n", + "| Aggregation Type | Input Type | Description |\n", + "| --- | --- | --- |\n", + "| SUM, COUNT, MAX, MIN, AVG | Numeric | Applies the numerical operation to the numeric inputs. |\n", + "| MAX_POOLING, MIN_POOLING, AVG_POOLING | Numeric Vector | Applies the max/min/avg operation on a per-entry basis for a given collection of numbers. |\n", + "| LATEST | Any | Returns the latest non-null values within the defined time window. |\n", + "\n", + "\n", + "After you have defined features and sources, bring them together to build an anchor:\n", + "\n", + "\n", + "Note that if the data source is the observation data itself, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "purchase_history_data = HdfsSource(name=\"purchase_history_data\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_purchase_history_mock_data.csv\",\n", + " event_timestamp_column=\"purchase_date\",\n", + " timestamp_format=\"yyyy-MM-dd\")\n", + " \n", + "agg_features = [Feature(name=\"feature_user_totla_purchase_in_90days\",\n", + " key=user_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(purchase_amount)\",\n", + " agg_func=\"AVG\",\n", + " window=\"90d\"))\n", + " ]\n", + "\n", + "user_agg_feature_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", + " source=purchase_history_data,\n", + " features=agg_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Derived Features Section\n", + "Derived features are features that are computed from other features. They can be computed from anchored features, or from other derived features."
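As a hypothetical aside (not part of this notebook), the two product features anchored above could also be combined into a derived feature; the feature name and expression below are assumptions for illustration only.

```python
# Hypothetical example only (not part of this notebook): a derived feature
# computed from the two product features anchored above. The feature name and
# transform expression are assumptions.
feature_product_total_value = DerivedFeature(name="feature_product_total_value",
                                             key=product_id,
                                             feature_type=FLOAT,
                                             input_features=[feature_product_quantity, feature_product_price],
                                             transform="feature_product_quantity * feature_product_price")
```

Like the derived feature defined next, such a feature would also need to be included in `client.build_features(...)` before it can be queried.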
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "feature_user_purchasing_power = DerivedFeature(name=\"feature_user_purchasing_power\",\n", + " key=user_id,\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " feature_user_gift_card_balance, feature_user_has_valid_credit_card],\n", + " transform=\"feature_user_gift_card_balance + if_else(feature_user_has_valid_credit_card, 100, 0)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "client.build_features(anchor_list=[user_agg_feature_anchor, user_feature_anchor, product_anchor], derived_feature_list=[\n", + " feature_user_purchasing_power])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create training data using point-in-time correct feature join\n", + "\n", + "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. \n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if client.spark_runtime == 'databricks':\n", + " output_path = 'dbfs:/feathrazure_test.avro'\n", + "else:\n", + " output_path = 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/test123_temp/product_rec_new'\n", + " # output_path = feathr_output_path\n", + "\n", + "\n", + "user_feature_query = FeatureQuery(\n", + " feature_list=[\"feature_user_age\", \n", + " \"feature_user_tax_rate\", \n", + " \"feature_user_gift_card_balance\", \n", + " \"feature_user_has_valid_credit_card\", \n", + " \"feature_user_totla_purchase_in_90days\",\n", + " \"feature_user_purchasing_power\"\n", + " ], \n", + " key=user_id)\n", + "\n", + "product_feature_query = FeatureQuery(\n", + " feature_list=[\n", + " \"feature_product_quantity\",\n", + " \"feature_product_price\"\n", + " ], \n", + " key=product_id)\n", + "\n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/product_recommendation_sample/user_observation_mock_data.csv\",\n", + " event_timestamp_column=\"event_timestamp\",\n", + " timestamp_format=\"yyyy-MM-dd\")\n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=[user_feature_query, product_feature_query],\n", + " output_path=output_path)\n", + "client.wait_job_to_finish(timeout_sec=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the training dataset and show the result\n", + "\n", + "Let's use the helper function `get_result_df` to download the result and view it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", + " \"\"\"Download the job result 
dataset from cloud as a Pandas dataframe.\"\"\"\n", + " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", + " tmp_dir = tempfile.TemporaryDirectory()\n", + " client.feathr_spark_laucher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", + " dataframe_list = []\n", + " # assuming the result are in avro format\n", + " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", + " dataframe_list.append(pdx.read_avro(file))\n", + " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", + " tmp_dir.cleanup()\n", + " return vertical_concat_df\n", + "\n", + "df_res = get_result_df(client)\n", + "\n", + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a machine learning model\n", + "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import GradientBoostingRegressor\n", + "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"product_rating\"], axis=1),\n", + " final_df[\"product_rating\"],\n", + " test_size=0.2,\n", + " random_state=42)\n", + "model = GradientBoostingRegressor()\n", + "model.fit(train_x, train_y)\n", + "\n", + "y_predict = model.predict(test_x)\n", + "\n", + "y_actual = test_y.values.flatten().tolist()\n", + "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", + "\n", + "sum_actuals = sum_errors = 0\n", + "\n", + "for actual_val, predict_val in zip(y_actual, y_predict):\n", + " abs_error = actual_val - predict_val\n", + " if abs_error < 0:\n", + " abs_error = abs_error * -1\n", + "\n", + " sum_errors = sum_errors + abs_error\n", + " sum_actuals = sum_actuals + actual_val\n", + "\n", + "mean_abs_percent_error = sum_errors / sum_actuals\n", + "print(\"Model MAPE:\")\n", + "print(mean_abs_percent_error)\n", + "print()\n", + "print(\"Model Accuracy:\")\n", + "print(1 - mean_abs_percent_error)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Materialize feature value into offline/online storage\n", + "\n", + "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", + "and materialize the feature value to offline and/or online storage. 
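The cells below only demonstrate the online path with a `RedisSink`. For the offline side, a hedged sketch might look like the following, assuming `HdfsSink` takes an `output_path` (this constructor signature is an assumption here, not confirmed by this notebook); the setting name and path suffix are invented for illustration.

```python
# Hedged sketch, not part of this notebook: materialize the same user features
# to offline storage. Assumptions: HdfsSink accepts an `output_path`, the
# setting name and path suffix are invented, and backfill_time can be supplied
# exactly as in the Redis example that follows.
from feathr import HdfsSink

offline_sink = HdfsSink(output_path=feathr_output_path + '/materialized_user_features')
offline_settings = MaterializationSettings("user_feature_offline_setting",
                                           sinks=[offline_sink],
                                           feature_names=["feature_user_age",
                                                          "feature_user_gift_card_balance"])
client.materialize_features(offline_settings)
client.wait_job_to_finish(timeout_sec=1000)
```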
\n", + "\n", + "We can push the generated features to the online store like below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Materialize user features\n", + "# (You can only materialize features of same entity key into one table so we can only materialize user features first.)\n", + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"user_features\")\n", + "settings = MaterializationSettings(\"user_feature_setting\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"feature_user_age\", \"feature_user_gift_card_balance\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then get the features from the online store (Redis):\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching feature value for online inference\n", + "\n", + "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", + "`get_online_features` or `multi_get_online_features` API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.get_online_features('user_features', '2', [\n", + " 'feature_user_age', 'feature_user_gift_card_balance'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.multi_get_online_features('user_features', ['1', '2'], [\n", + " 'feature_user_age', 'feature_user_gift_card_balance'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Materialize product features\n", + "\n", + "We can also materialize product features into a separate table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Materialize product features\n", + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"product_features\")\n", + "settings = MaterializationSettings(\"product_feature_setting\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"feature_product_price\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.get_online_features('product_feature_setting', '2', [\n", + " 'feature_product_price'])\n", + "\n", + "client.get_online_features('product_features', '2', [\n", + " 'feature_product_price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering and Fetching features\n", + "\n", + "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.register_features()\n", + "client.list_registered_features(project_name=\"feathr_getting_started\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "b3c5d8fd79e029a19bf620c04a250a0cafa2291ba3ed87972a3e2a099b099985" + }, + "kernelspec": { + "display_name": "Python 3.9.12 ('product_env': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/feathr_project/test/prep_azure_kafka_test_data.py b/feathr_project/test/prep_azure_kafka_test_data.py index 7d4a195d3..674040be8 100644 --- a/feathr_project/test/prep_azure_kafka_test_data.py +++ b/feathr_project/test/prep_azure_kafka_test_data.py @@ -40,7 +40,8 @@ def send_avro_record_to_kafka(topic, record): bytes_writer = io.BytesIO() encoder = BinaryEncoder(bytes_writer) writer.write(record, encoder) - sasl = _EnvVaraibleUtil.get_environment_variable('KAFKA_SASL_JAAS_CONFIG') + envutils = _EnvVaraibleUtil() + sasl = envutils.get_environment_variable('KAFKA_SASL_JAAS_CONFIG') conf = { 'bootstrap.servers': KAFKA_BROKER, 'security.protocol': 'SASL_SSL', diff --git a/feathr_project/test/test_azure_feature_monitoring_e2e.py b/feathr_project/test/test_azure_feature_monitoring_e2e.py new file mode 100644 index 000000000..ed2cbebd2 --- /dev/null +++ b/feathr_project/test/test_azure_feature_monitoring_e2e.py @@ -0,0 +1,25 @@ +import os +from pathlib import Path + +from feathr import MonitoringSettings +from feathr import RedisSink, MonitoringSqlSink + +from test_fixture import (basic_test_setup, get_online_test_table_name) + + +def test_feature_monitoring(): + online_test_table = get_online_test_table_name("nycTaxiCITable") + test_workspace_dir = Path( + __file__).parent.resolve() / "test_user_workspace" + + client = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config.yaml")) + + monitor_sink = MonitoringSqlSink(table_name=online_test_table) + settings = MonitoringSettings("monitoringSetting", + 
sinks=[monitor_sink], + feature_names=[ + "f_location_avg_fare", "f_location_max_fare"]) + client.monitor_features(settings) + # just assume the job is successful without validating the actual result in Redis. Might need to consolidate + # this part with the test_feathr_online_store test case + client.wait_job_to_finish(timeout_sec=900) diff --git a/feathr_project/test/test_azure_spark_maven_e2e.py b/feathr_project/test/test_azure_spark_maven_e2e.py new file mode 100644 index 000000000..5aa51b4ab --- /dev/null +++ b/feathr_project/test/test_azure_spark_maven_e2e.py @@ -0,0 +1,63 @@ +import os +from datetime import datetime, timedelta +from pathlib import Path + +from click.testing import CliRunner +from feathr import BOOLEAN, FLOAT, INT32, ValueType +from feathr import FeathrClient +from feathr import ValueType +from feathr.utils.job_utils import get_result_df +from feathr import (BackfillTime, MaterializationSettings) +from feathr import FeatureQuery +from feathr import ObservationSettings +from feathr import RedisSink, HdfsSink +from feathr import TypedKey +from feathrcli.cli import init +import pytest + +from test_fixture import (basic_test_setup, get_online_test_table_name) + +def test_feathr_online_store_agg_features(): + """ + Test FeathrClient() get_online_features and batch_get can get data correctly. + """ + + online_test_table = get_online_test_table_name("nycTaxiCITable") + test_workspace_dir = Path( + __file__).parent.resolve() / "test_user_workspace" + # os.chdir(test_workspace_dir) + + # The `feathr_runtime_location` was commented out in this config file, so feathr should use + # Maven package as the dependency and `noop.jar` as the main file + client = basic_test_setup(os.path.join(test_workspace_dir, "feathr_config_maven.yaml")) + + backfill_time = BackfillTime(start=datetime( + 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1)) + redisSink = RedisSink(table_name=online_test_table) + settings = MaterializationSettings("nycTaxiTable", + sinks=[redisSink], + feature_names=[ + "f_location_avg_fare", "f_location_max_fare"], + backfill_time=backfill_time) + client.materialize_features(settings) + # just assume the job is successful without validating the actual result in Redis. Might need to consolidate + # this part with the test_feathr_online_store test case + client.wait_job_to_finish(timeout_sec=900) + + res = client.get_online_features(online_test_table, '265', [ + 'f_location_avg_fare', 'f_location_max_fare']) + # just assme there are values. We don't hard code the values for now for testing + # the correctness of the feature generation should be garunteed by feathr runtime. 
+ # ID 239 and 265 are available in the `DOLocationID` column in this file: + # https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-04.csv + # View more detials on this dataset: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page + assert len(res) == 2 + assert res[0] != None + assert res[1] != None + res = client.multi_get_online_features(online_test_table, + ['239', '265'], + ['f_location_avg_fare', 'f_location_max_fare']) + assert res['239'][0] != None + assert res['239'][1] != None + assert res['265'][0] != None + assert res['265'][1] != None \ No newline at end of file diff --git a/feathr_project/test/test_feature_registry.py b/feathr_project/test/test_feature_registry.py index 970fccdea..b5647d213 100644 --- a/feathr_project/test/test_feature_registry.py +++ b/feathr_project/test/test_feature_registry.py @@ -76,6 +76,9 @@ def test_feathr_register_features_partially(): client.register_features() time.sleep(30) full_registration = client.get_features_from_registry(client.project_name) + + now = datetime.now() + os.environ["project_config__project_name"] = ''.join(['feathr_ci_registry','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) client: FeathrClient = registry_test_setup_partially(os.path.join(test_workspace_dir, "feathr_config.yaml")) new_project_name = client.project_name diff --git a/feathr_project/test/test_fixture.py b/feathr_project/test/test_fixture.py index b7d431d33..145a4de81 100644 --- a/feathr_project/test/test_fixture.py +++ b/feathr_project/test/test_fixture.py @@ -171,12 +171,8 @@ def registry_test_setup(config_path: str): client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=derived_feature_list) return client def registry_test_setup_partially(config_path: str): - - - # use a new project name every time to make sure all features are registered correctly - now = datetime.now() - os.environ["project_config__project_name"] = ''.join(['feathr_ci_registry','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) - + """Register a partial of a project. Will call `generate_entities()` and register only the first anchor feature. + """ client = FeathrClient(config_path=config_path, project_registry_tag={"for_test_purpose":"true"}) request_anchor, agg_anchor, derived_feature_list = generate_entities() @@ -185,11 +181,8 @@ def registry_test_setup_partially(config_path: str): return client def registry_test_setup_append(config_path: str): - - - # use a new project name every time to make sure all features are registered correctly - now = datetime.now() - os.environ["project_config__project_name"] = ''.join(['feathr_ci_registry','_', str(now.minute), '_', str(now.second), '_', str(now.microsecond)]) + """Append features to a project. 
Will call `generate_entities()` and register from the 2nd anchor feature + """ client = FeathrClient(config_path=config_path, project_registry_tag={"for_test_purpose":"true"}) diff --git a/feathr_project/test/test_secrets_read.py b/feathr_project/test/test_secrets_read.py new file mode 100644 index 000000000..e1d14dcc6 --- /dev/null +++ b/feathr_project/test/test_secrets_read.py @@ -0,0 +1,61 @@ +import os +from datetime import datetime +from pathlib import Path +from unittest import result + +from click.testing import CliRunner +from feathr import (BOOLEAN, FLOAT, INT32, FeatureQuery, ObservationSettings, + SparkExecutionConfiguration, TypedKey, ValueType) +from feathr.client import FeathrClient +from feathr.utils.job_utils import get_result_df + +from test_fixture import basic_test_setup +from feathr.constants import OUTPUT_FORMAT + +# test parquet file read/write without an extension name +def test_feathr_get_secrets_from_key_vault(): + """ + Test if the program can read the key vault secrets as expected + """ + # TODO: need to test get_environment_variable() as well + os.environ['SECRETS__AZURE_KEY_VAULT__NAME'] = 'feathrazuretest3-kv' + + # the config below doesn't have `ONLINE_STORE__REDIS__HOST` for testing purpose + yaml_config = """ + project_config: + project_name: 'project_feathr_integration_test' + offline_store: + s3: + s3_enabled: true + s3_endpoint: 's3.amazonaws.com' + snowflake: + url: "dqllago-ol19457.snowflakecomputing.com" + user: "feathrintegration" + role: "ACCOUNTADMIN" + spark_config: + spark_cluster: 'databricks' + spark_result_output_parts: '1' + databricks: + workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' + workspace_token_value: '' + config_template: '{"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":2,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}}' + work_dir: 'dbfs:/feathr_getting_started' + feathr_runtime_location: '' + online_store: + redis: + port: 6380 + ssl_enabled: True + feature_registry: + purview: + type_system_initialization: false + purview_name: 'feathrazuretest3-purview1' + delimiter: '__' + """ + + with open("/tmp/feathr_config.yaml", "w") as text_file: + text_file.write(yaml_config) + + client = FeathrClient(config_path="/tmp/feathr_config.yaml") + # `redis_host` should be there since it's not available in the environment variable, and not in the config file, we expect we get it from azure key_vault + assert client.redis_host is not None + diff --git a/feathr_project/test/test_user_workspace/feathr_config.yaml b/feathr_project/test/test_user_workspace/feathr_config.yaml index 08c36a8c4..fdfc1d556 100644 --- a/feathr_project/test/test_user_workspace/feathr_config.yaml +++ b/feathr_project/test/test_user_workspace/feathr_config.yaml @@ -84,11 +84,11 @@ spark_config: feathr_runtime_location: "../../feathr-impl/build/libs/feathr-impl-0.6.0-cloud.jar" databricks: # workspace instance - workspace_instance_url: 'https://adb-5638037984879289.9.azuredatabricks.net/' + workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net/' workspace_token_value: '' # config string including run time information, spark version, machine size, etc. 
# the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs - config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_F4s','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}} + config_template: {"run_name":"FEATHR_FILL_IN","new_cluster":{"spark_version":"9.1.x-scala2.12","num_workers":2,"spark_conf":{"FEATHR_FILL_IN":"FEATHR_FILL_IN"},"instance_pool_id":"0403-214809-inlet434-pool-l9dj3kwz"},"libraries":[{"jar":"FEATHR_FILL_IN"}],"spark_jar_task":{"main_class_name":"FEATHR_FILL_IN","parameters":["FEATHR_FILL_IN"]}} # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/ work_dir: 'dbfs:/feathr_getting_started' # this is the default location so end users don't have to compile the runtime again. @@ -111,4 +111,10 @@ feature_registry: # delimiter indicates that how the project/workspace name, feature names etc. are delimited. By default it will be '__' # this is for global reference (mainly for feature sharing). For exmaple, when we setup a project called foo, and we have an anchor called 'taxi_driver' and the feature name is called 'f_daily_trips' # the feature will have a globally unique name called 'foo__taxi_driver__f_daily_trips' - delimiter: '__' \ No newline at end of file + delimiter: '__' + +monitoring: + database: + sql: + url: 'jdbc:postgresql://featuremonitoring.postgres.database.azure.com:5432/postgres' + user: "demo" diff --git a/feathr_project/test/test_user_workspace/feathr_config_maven.yaml b/feathr_project/test/test_user_workspace/feathr_config_maven.yaml new file mode 100644 index 000000000..ed3af5826 --- /dev/null +++ b/feathr_project/test/test_user_workspace/feathr_config_maven.yaml @@ -0,0 +1,118 @@ +# DO NOT MOVE OR DELETE THIS FILE + +# This file contains the configurations that are used by Feathr +# All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of this config file. +# For example, `feathr_runtime_location` for databricks can be overwritten by setting this environment variable: +# SPARK_CONFIG__DATABRICKS__FEATHR_RUNTIME_LOCATION +# Another example would be overwriting Redis host with this config: `ONLINE_STORE__REDIS__HOST` +# For example if you want to override this setting in a shell environment: +# export ONLINE_STORE__REDIS__HOST=feathrazure.redis.cache.windows.net + +# version of API settings +api_version: 1 +project_config: + project_name: 'project_feathr_integration_test' + # Information that are required to be set via environment variables. + required_environment_variables: + # the environemnt variables are required to run Feathr + # Redis password for your online store + - 'REDIS_PASSWORD' + # client IDs and client Secret for the service principal. Read the getting started docs on how to get those information. 
+ - 'AZURE_CLIENT_ID' + - 'AZURE_TENANT_ID' + - 'AZURE_CLIENT_SECRET' + optional_environment_variables: + # the environemnt variables are optional, however you will need them if you want to use some of the services: + - ADLS_ACCOUNT + - ADLS_KEY + - WASB_ACCOUNT + - WASB_KEY + - S3_ACCESS_KEY + - S3_SECRET_KEY + - JDBC_TABLE + - JDBC_USER + - JDBC_PASSWORD + - KAFKA_SASL_JAAS_CONFIG + +offline_store: + # paths starts with abfss:// or abfs:// + # ADLS_ACCOUNT and ADLS_KEY should be set in environment variable if this is set to true + adls: + adls_enabled: true + + # paths starts with wasb:// or wasbs:// + # WASB_ACCOUNT and WASB_KEY should be set in environment variable + wasb: + wasb_enabled: true + + # paths starts with s3a:// + # S3_ACCESS_KEY and S3_SECRET_KEY should be set in environment variable + s3: + s3_enabled: true + # S3 endpoint. If you use S3 endpoint, then you need to provide access key and secret key in the environment variable as well. + s3_endpoint: 's3.amazonaws.com' + + # jdbc endpoint + jdbc: + jdbc_enabled: true + jdbc_database: 'feathrtestdb' + jdbc_table: 'feathrtesttable' + + # snowflake endpoint + snowflake: + url: "dqllago-ol19457.snowflakecomputing.com" + user: "feathrintegration" + role: "ACCOUNTADMIN" + +spark_config: + # choice for spark runtime. Currently support: azure_synapse, databricks + # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa. + spark_cluster: 'azure_synapse' + # configure number of parts for the spark output for feature generation job + spark_result_output_parts: '1' + + azure_synapse: + dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net' + pool_name: 'spark3' + # workspace dir for storing all the required configuration files and the jar resources + workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_test_workspace' + executor_size: 'Small' + executor_num: 4 + + # Feathr Job configuration. Support local paths, path start with http(s)://, and paths start with abfs(s):// + # this is the default location so end users don't have to compile the runtime again. + # feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-0.5.0-SNAPSHOT.jar + # Unset this value will use default package on Maven + # feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar" + databricks: + # workspace instance + workspace_instance_url: 'https://adb-5638037984879289.9.azuredatabricks.net/' + workspace_token_value: '' + # config string including run time information, spark version, machine size, etc. + # the config follows the format in the databricks documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs + config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_F4s','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}} + # Feathr Job location. Support local paths, path start with http(s)://, and paths start with dbfs:/ + work_dir: 'dbfs:/feathr_getting_started' + + # this is the default location so end users don't have to compile the runtime again. 
+ # Unset this value will use default package on Maven + # feathr_runtime_location: "../../target/scala-2.12/feathr-assembly-0.5.0.jar" + +online_store: + redis: + # Redis configs to access Redis cluster + host: 'feathrazuretest3redis.redis.cache.windows.net' + port: 6380 + ssl_enabled: True + +feature_registry: + purview: + # Registry configs + # register type system in purview during feathr client initialization. This is only required to be executed once. + type_system_initialization: true + # configure the name of the purview endpoint + purview_name: 'feathrazuretest3-purview1' + # delimiter indicates that how the project/workspace name, feature names etc. are delimited. By default it will be '__' + # this is for global reference (mainly for feature sharing). For exmaple, when we setup a project called foo, and we have an anchor called 'taxi_driver' and the feature name is called 'f_daily_trips' + # the feature will have a globally unique name called 'foo__taxi_driver__f_daily_trips' + delimiter: '__' \ No newline at end of file diff --git a/feathr_project/test/test_utils/query_sql.py b/feathr_project/test/test_utils/query_sql.py new file mode 100644 index 000000000..8e14b8cda --- /dev/null +++ b/feathr_project/test/test_utils/query_sql.py @@ -0,0 +1,43 @@ +import psycopg2 +from feathr._envvariableutil import _EnvVaraibleUtil + +# script to query SQL database for debugging purpose + +def show_table(cursor, table_name): + cursor.execute("select * from " + table_name + ";") + print(cursor.fetchall()) + + q = """ + SELECT column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_name = %s; + """ + + cur = conn.cursor() + cur.execute(q, (table_name,)) # (table_name,) passed as tuple + print(cur.fetchall()) + + +# Update connection string information +host = "featuremonitoring.postgres.database.azure.com" +dbname = "postgres" +user = "demo" +password = _EnvVaraibleUtil.get_environment_variable('SQL_TEST_PASSWORD') +sslmode = "require" + +# Construct connection string +conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode) +conn = psycopg2.connect(conn_string) +print("Connection established") + +cursor = conn.cursor() + +show_table(cursor, "f_int") +cursor.execute("select * from f_location_avg_fare;") +print(cursor.fetchall()) + + +# Clean up +conn.commit() +cursor.close() +conn.close() \ No newline at end of file diff --git a/registry/sql-registry/.dockerignore b/registry/sql-registry/.dockerignore new file mode 100644 index 000000000..bc0ed1f7a --- /dev/null +++ b/registry/sql-registry/.dockerignore @@ -0,0 +1,3 @@ +__pycache__ +.env +.vscode diff --git a/registry/sql-registry/.gitignore b/registry/sql-registry/.gitignore new file mode 100644 index 000000000..ed2a6faed --- /dev/null +++ b/registry/sql-registry/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.env +.vscode +.idea diff --git a/registry/sql-registry/Dockerfile b/registry/sql-registry/Dockerfile new file mode 100644 index 000000000..d2647021d --- /dev/null +++ b/registry/sql-registry/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.9 + +COPY ./ /usr/src + +WORKDIR /usr/src +RUN pip install -r requirements.txt + +# Start web server +CMD [ "uvicorn","main:app","--host", "0.0.0.0", "--port", "80" ] diff --git a/registry/sql-registry/README.md b/registry/sql-registry/README.md new file mode 100644 index 000000000..f06ca7def --- /dev/null +++ b/registry/sql-registry/README.md @@ -0,0 +1,5 @@ +# SQL-Based Registry for Feathr + +This is the reference 
implementation of [the Feathr API spec](./api-spec.md), base on SQL databases instead of PurView. + +Please note that this implementation uses iterations of `select` to retrieve graph lineages, this approach is very inefficient and should **not** be considered as production-ready. We only suggest to use this implementation for testing/researching purposes. \ No newline at end of file diff --git a/registry/sql-registry/api-spec.md b/registry/sql-registry/api-spec.md new file mode 100644 index 000000000..1b14cae8b --- /dev/null +++ b/registry/sql-registry/api-spec.md @@ -0,0 +1,366 @@ +# Feathr Registry API Specifications + +## Data Models + +### EntityType +Type: Enum + +| Value | +|-----------------------------| +| `feathr_workspace_v1` | +| `feathr_source_v1` | +| `feathr_anchor_v1` | +| `feathr_anchor_feature_v1` | +| `feathr_derived_feature_v1` | + +### ValueType +Type: Enum + +| Value | +|---------------| +| `UNSPECIFIED` | +| `BOOL` | +| `INT32` | +| `INT64` | +| `FLOAT` | +| `DOUBLE` | +| `STRING` | +| `BYTES` | + +### VectorType +Type: Enum + +| Value | +|----------| +| `TENSOR` | + +### TensorCategory +Type: Enum + +| Value | +|----------| +| `DENSE` | +| `SPARSE` | + +### FeatureType +Type: Object + +| Field | Type | +|----------------|-------------------------------------| +| type | [`VectorType`](#valuetype) | +| tensorCategory | [`TensorCategory`](#tensorcategory) | +| dimensionType | [`array`](#valuetype) | +| valType | [`ValueType`](#valuetype) | + +### TypedKey +Type: Object + +| Field | Type | +|------------------|-----------------------------| +| key_column | `string` | +| key_column_type | [`ValueType`](#valuetype) | +| full_name | `string`, optional | +| description | `string`, optional | +| key_column_alias | `string`, optional | + +### ExpressionTransformation +Type: Object + +| Field | Type | +|----------------|----------| +| transform_expr | `string` | + +### WindowAggregationTransformation +Type: Object + +| Field | Type | +|----------|--------------------| +| def_expr | `string` | +| agg_func | `string`, optional | +| window | `string`, optional | +| group_by | `string`, optional | +| filter | `string`, optional | +| limit | `number`, optional | + +### UdfTransformation +Type: Object + +| Field | Type | +|-------|----------| +| name | `string` | + +### EntityReference +Type: Object + +| Field | Type | Comments | +|------------------|-----------------------------|--------------------------------------| +| guid | `Guid` | | +| typeName | [`EntityType`](#entitytype) | | +| uniqueAttributes | `map` | Contains `qualifiedName` only so far | + +### ProjectAttributes +Type: Object + +| Field | Type | +|------------------|----------------------------------------------| +| qualifiedName | `string` | +| name | `string` | +| anchors | [`array`](#entityreference) | +| sources | [`array`](#entityreference) | +| anchor_features | [`array`](#entityreference) | +| derived_features | [`array`](#entityreference) | +| tags | `map` | + +### SourceAttributes +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| path | `string` | +| preprocessing | `string`, optional | +| eventTimestampColumn | `string`, optional | +| timestampFormat | `string`, optional | +| type | `string` | +| tags | `map` | + +### AnchorAttributes +Type: Object + +| Field | Type | +|---------------|----------------------------------------------| +| qualifiedName | `string` | +| name | `string` | +| features | 
[`array`](#entityreference) | +| source | [`EntityReference`](#entityreference) | +| tags | `map` | + +### AnchorFeatureAttributes +Type: Object + +| Field | Type | +|----------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| type | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| tags | `map` | + +### DerivedFeatureAttributes +Type: Object + +| Field | Type | +|------------------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| type | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| input_anchor_features | [`array`](#entityreference) | +| input_derived_features | [`array`](#entityreference) | +| tags | `map` | + +### EntityStatus +Type: Enum + +| Value | +|----------| +| `ACTIVE` | + +### Entity +Type: Object + +| Field | Type | +|----------------|---------------------------------| +| guid | `Guid` | +| lastModifiedTS | `string` | +| status | [`EntityStatus`](#entitystatus) | +| displayText | `string` | +| typeName | [`EntityType`](#entitytype) | +| attributes | [`ProjectAttributes`](#projectattributes)
`or` [`SourceAttributes`](#sourceattributes)
`or` [`AnchorAttributes`](#anchorattributes)
`or` [`AnchorFeatureAttributes`](#anchorfeatureattributes)
`or` [`DerivedFeatureAttributes`](#derivedfeatureattributes) | + +### RelationshipType +Type: Enum + +| Value | +|-------------| +| `BelongsTo` | +| `Contains` | +| `Produces` | +| `Consumes` | + +### Relationship +Type: Object + +| Field | Type | +|------------------|-----------------------------------------| +| relationshipId | `Guid` | +| relationshipType | [`RelationshipType`](#relationshiptype) | +| fromEntityId | `Guid` | +| toEntityId | `Guid` | + +### ProjectDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| tags | `map` | + + +### SourceDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| path | `string` | +| preprocessing | `string`, optional | +| eventTimestampColumn | `string`, optional | +| timestampFormat | `string`, optional | +| type | `string` | +| tags | `map` | + +### AnchorDefinition +Type: Object + +| Field | Type | +|----------------------|-----------------------| +| qualifiedName | `string` | +| name | `string` | +| source_id | `Guid` | +| tags | `map` | + +### AnchorFeatureDefinition +Type: Object + +| Field | Type | +|----------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| featureType | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| tags | `map` | + +### DerivedFeatureDefinition +Type: Object + +| Field | Type | +|------------------------|--------------------------------| +| qualifiedName | `string` | +| name | `string` | +| featureType | [`FeatureType`](#featuretype) | +| transformation | [`ExpressionTransformation`](#expressiontransformation)
`or` [`WindowAggregationTransformation`](#windowaggregationtransformation)
`or` [`UdfTransformation`](#udftransformation) | +| key | [`array`](#typedkey) | +| input_anchor_features | `array` | +| input_derived_features | `array` | +| tags | `map` | + + +### EntitiesAndRelationships +Type: Object + +| Field | Type | +|---------------|----------------------------------------| +| guidEntityMap | [`map`](#entity) | +| relations | [`array`](#relationship) | + + +## Feathr Registry API + +### `GET /projects` +List **names** of all projects. + +Response Type: `array` + +### `GET /projects/{project}` +Get everything defined in the project + +Response Type: [`EntitiesAndRelationships`](#entitiesandrelationships) + +### `GET /projects/{project}/datasources` +Get all sources defined in the project. + +Response Type: [`array`](#entity) + +### `GET /projects/{project}/features` +Get all anchor features and derived features in the project, or only features meet the search criteria in the project. + +Query Parameters: + +| Field | Type | +|---------|--------| +| keyword | string | +| size | number | +| offset | number | + + +Response Type: Object + +| Field | Type | +|----------|----------------------------| +| features | [`array`](#entity) | + +### `GET /features/:feature` +Get feature details. + +Response Type: Object + +| Field | Type | Comments | +|-----------------|-----------------------|-----------------------------| +| entity | [`Entity`](#entity) | | +| referredEntities| `map` | For compatibility, not used | + +### `POST /projects` +Create new project + ++ Request Type: [`ProjectDefinition`](#projectdefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/datasources` +Create new source in the project + ++ Request Type: [`SourceDefinition`](#sourcedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/anchors` +Create new anchor in the project + ++ Request Type: [`AnchorDefinition`](#anchordefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/anchors/{anchor}/features` +Create new anchor feature in the project under specified anchor + ++ Request Type: [`AnchorFeatureDefinition`](#anchorfeaturedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | + +### `POST /projects/{project}/derivedfeatures` +Create new derived feature in the project + ++ Request Type: [`DerivedFeatureDefinition`](#derivedfeaturedefinition) ++ Response Type: Object + +| Field | Type | +|-------|------| +| guid | Guid | diff --git a/registry/sql-registry/main.py b/registry/sql-registry/main.py new file mode 100644 index 000000000..a40fae89c --- /dev/null +++ b/registry/sql-registry/main.py @@ -0,0 +1,77 @@ +import os +from typing import Optional +from fastapi import APIRouter, FastAPI, HTTPException +from starlette.middleware.cors import CORSMiddleware +from registry import * +from registry.db_registry import DbRegistry +from registry.models import EntityType + +rp = "/" +try: + rp = os.environ["API_BASE"] + if rp[0] != '/': + rp = '/' + rp +except: + pass +print("Using API BASE: ", rp) + +registry = DbRegistry() +app = FastAPI() +router = APIRouter() + +# Enables CORS +app.add_middleware(CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@router.get("/projects") +def get_projects() -> list[str]: + return registry.get_projects() + + +@router.get("/projects/{project}") +def 
get_projects(project: str) -> dict: + return registry.get_project(project).to_dict() + + +@router.get("/projects/{project}/datasources") +def get_project_datasources(project: str) -> list: + p = registry.get_entity(project) + source_ids = [s.id for s in p.attributes.sources] + sources = registry.get_entities(source_ids) + return list([e.to_dict() for e in sources]) + + +@router.get("/projects/{project}/features") +def get_project_features(project: str, keyword: Optional[str] = None) -> list: + if keyword is None: + p = registry.get_entity(project) + feature_ids = [s.id for s in p.attributes.anchor_features] + \ + [s.id for s in p.attributes.derived_features] + features = registry.get_entities(feature_ids) + return list([e.to_dict() for e in features]) + else: + efs = registry.search_entity(keyword, [EntityType.AnchorFeature, EntityType.DerivedFeature]) + feature_ids = [ef.id for ef in efs] + features = registry.get_entities(feature_ids) + return list([e.to_dict() for e in features]) + + +@router.get("/features/{feature}") +def get_feature(feature: str) -> dict: + e = registry.get_entity(feature) + if e.entity_type not in [EntityType.DerivedFeature, EntityType.AnchorFeature]: + raise HTTPException(status_code=404, detail=f"Feature {feature} not found") + return e + + +@router.get("/features/{feature}/lineage") +def get_feature_lineage(feature: str) -> dict: + lineage = registry.get_lineage(feature) + return lineage.to_dict() + + +app.include_router(prefix = rp, router=router) diff --git a/registry/sql-registry/registry/__init__.py b/registry/sql-registry/registry/__init__.py new file mode 100644 index 000000000..5ce157408 --- /dev/null +++ b/registry/sql-registry/registry/__init__.py @@ -0,0 +1,6 @@ +__all__ = ["interface", "models", "database", "db_registry"] + +from registry.models import * +from registry.interface import Registry +from registry.database import DbConnection, connect +from registry.db_registry import DbRegistry \ No newline at end of file diff --git a/registry/sql-registry/registry/database.py b/registry/sql-registry/registry/database.py new file mode 100644 index 000000000..d82568972 --- /dev/null +++ b/registry/sql-registry/registry/database.py @@ -0,0 +1,85 @@ +from abc import ABC, abstractmethod +import threading +from distutils.log import debug, warn +import os +import pymssql + + +providers = [] + +class DbConnection(ABC): + @abstractmethod + def execute(self, sql: str, *args, **kwargs) -> list[dict]: + pass + +def quote(id): + if isinstance(id, str): + return f"'{id}'" + else: + return ",".join([f"'{i}'" for i in id]) + + +def parse_conn_str(s: str) -> dict: + """ + TODO: Not a sound and safe implementation, but useful enough in this case + as the connection string is provided by users themselves. 
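+
+    Hypothetical example of the expected ADO-style input:
+        Server=tcp:example.database.windows.net,1433;Initial Catalog=feathr;User ID=demo;Password=<secret>
+    which this function turns into
+        {"host": "example.database.windows.net", "database": "feathr", "user": "demo", "password": "<secret>"}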
+ """ + parts = dict([p.strip().split("=", 1) + for p in s.split(";") if len(p.strip()) > 0]) + server = parts["Server"].split(":")[1].split(",")[0] + return { + "host": server, + "database": parts["Initial Catalog"], + "user": parts["User ID"], + "password": parts["Password"], + # "charset": "utf-8", ## For unknown reason this causes connection failure + } + + +class MssqlConnection(DbConnection): + @staticmethod + def connect(*args, **kwargs): + conn_str = os.environ["CONNECTION_STR"] + if "Server=" not in conn_str: + debug("`CONNECTION_STR` is not in ADO connection string format") + return None + return MssqlConnection(parse_conn_str(conn_str)) + + def __init__(self, params): + self.params = params + self.make_connection() + self.mutex = threading.Lock() + + def make_connection(self): + self.conn = pymssql.connect(**self.params) + + def execute(self, sql: str, *args, **kwargs) -> list[dict]: + debug(f"SQL: `{sql}`") + # NOTE: Only one cursor is allowed at the same time + retry = 0 + while True: + try: + with self.mutex: + c = self.conn.cursor(as_dict=True) + c.execute(sql, *args, **kwargs) + return c.fetchall() + except pymssql.OperationalError: + warn("Database error, retrying...") + # Reconnect + self.make_connection() + retry += 1 + if retry >= 3: + # Stop retrying + raise + pass + + +providers.append(MssqlConnection) + + +def connect(): + for p in providers: + ret = p.connect() + if ret is not None: + return ret + raise RuntimeError("Cannot connect to database") \ No newline at end of file diff --git a/registry/sql-registry/registry/db_registry.py b/registry/sql-registry/registry/db_registry.py new file mode 100644 index 000000000..f5456c5e5 --- /dev/null +++ b/registry/sql-registry/registry/db_registry.py @@ -0,0 +1,194 @@ +from typing import Optional, Tuple, Union +from uuid import UUID +from registry import Registry +from registry import connect +from registry.models import Edge, EntitiesAndRelations, Entity, EntityRef, EntityType, RelationshipType, _to_type, _to_uuid +import json + + +def quote(id): + if isinstance(id, str): + return f"'{id}'" + else: + return ",".join([f"'{i}'" for i in id]) + + +class DbRegistry(Registry): + def __init__(self): + self.conn = connect() + + def get_projects(self) -> list[str]: + ret = self.conn.execute( + f"select qualified_name from entities where entity_type='{EntityType.Project}'") + return list([r["qualified_name"] for r in ret]) + + def get_entity(self, id_or_name: Union[str, UUID]) -> Entity: + return self._fill_entity(self._get_entity(id_or_name)) + + def get_entities(self, ids: list[UUID]) -> list[Entity]: + return list([self._fill_entity(e) for e in self._get_entities(ids)]) + + def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: + try: + id = _to_uuid(id_or_name) + return id + except ValueError: + pass + # It is a name + ret = self.conn.execute( + f"select entity_id from entities where qualified_name='{id_or_name}'") + return ret[0]["entity_id"] + + def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: + rows = self.conn.execute(fr''' + select edge_id, from_id, to_id, conn_type + from edges + where from_id = '{self.get_entity_id(id_or_name)}' + and conn_type = '{relationship.name}' + ''') + return list([Edge(**row) for row in rows]) + + def get_lineage(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get feature lineage on both upstream and downstream + Returns [entity_id:entity] map and list of edges have been traversed. 
+ """ + id = self.get_entity_id(id_or_name) + upstream_entities, upstream_edges = self._bfs( + id, RelationshipType.Consumes) + downstream_entities, downstream_edges = self._bfs( + id, RelationshipType.Produces) + return EntitiesAndRelations( + upstream_entities + downstream_entities, + upstream_edges + downstream_edges) + + def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + This function returns not only the project itself, but also everything in the project + """ + project = self._get_entity(id_or_name) + edges = set(self.get_neighbors(id_or_name, RelationshipType.Contains)) + ids = list([e.to_id for e in edges]) + children = self._get_entities(ids) + child_map = dict([(e.id, e) for e in children]) + project.attributes.children = children + for anchor in project.attributes.anchors: + conn = self.get_neighbors(anchor.id, RelationshipType.Contains) + feature_ids = [e.to_id for e in conn] + edges = edges.union(conn) + features = list([child_map[id] for id in feature_ids]) + anchor.attributes.features = features + source_id = self.get_neighbors(anchor.id, RelationshipType.Consumes)[0].to_id + anchor.attributes.source = child_map[source_id] + for df in project.attributes.derived_features: + conn = self.get_neighbors(anchor.id, RelationshipType.Consumes) + input_ids = [e.to_id for e in conn] + edges = edges.union(conn) + features = list([child_map[id] for id in input_ids]) + df.attributes.input_features = features + all_edges = self._get_edges(ids) + return EntitiesAndRelations([project] + children, list(edges.union(all_edges))) + + def _fill_entity(self, e: Entity) -> Entity: + """ + Entities in the DB contains only attributes belong to itself, but the returned + data model contains connections/contents, so we need to fill this gap + """ + if e.entity_type == EntityType.Project: + edges = self.get_neighbors(e.id, RelationshipType.Contains) + ids = list([e.to_id for e in edges]) + children = self._get_entities(ids) + e.attributes.children = children + return e + if e.entity_type == EntityType.Anchor: + conn = self.get_neighbors(e.id, RelationshipType.Contains) + feature_ids = [e.to_id for e in conn] + features = self._get_entities(feature_ids) + e.attributes.features = features + source_id = self.get_neighbors(e.id, RelationshipType.Consumes)[0].to_id + source = self.get_entity(source_id) + e.attributes.source = source + return e + if e.entity_type == EntityType.DerivedFeature: + conn = self.get_neighbors(e.id, RelationshipType.Consumes) + feature_ids = [e.to_id for e in conn] + features = self._get_entities(feature_ids) + e.attributes.input_features = features + return e + return e + + def _get_edges(self, ids: list[UUID], types: list[RelationshipType] = []) -> list[Edge]: + sql = fr"""select edge_id, from_id, to_id, conn_type from edges + where from_id in ({quote(ids)}) + and to_id in ({quote(ids)})""" + if len(types)>0: + sql = fr"""select edge_id, from_id, to_id, conn_type from edges + where conn_type in ({quote(types)}) + and from_id in ({quote(ids)}) + and to_id in ({quote(ids)})""" + rows = self.conn.execute(sql) + return list([_to_type(row, Edge) for row in rows]) + + def _get_entity(self, id_or_name: Union[str, UUID]) -> Entity: + row = self.conn.execute(fr''' + select entity_id, qualified_name, entity_type, attributes + from entities + where entity_id = '{self.get_entity_id(id_or_name)}' + ''')[0] + row["attributes"] = json.loads(row["attributes"]) + return _to_type(row, Entity) + + def _get_entities(self, ids: list[UUID]) -> list[Entity]: + rows = 
self.conn.execute(fr''' + select entity_id, qualified_name, entity_type, attributes + from entities + where entity_id in ({quote(ids)}) + ''') + ret = [] + for row in rows: + row["attributes"] = json.loads(row["attributes"]) + ret.append(Entity(**row)) + return ret + + def _bfs(self, id: UUID, conn_type: RelationshipType) -> Tuple[list[Entity], list[Edge]]: + """ + Breadth first traversal + Starts from `id`, follow edges with `conn_type` only. + + WARN: There is no depth limit. + """ + connections = [] + to_ids = [{ + "to_id": id, + }] + # BFS over SQL + while len(to_ids) != 0: + to_ids = self._bfs_step(to_ids, conn_type) + connections.extend(to_ids) + ids = set([id]) + for r in connections: + ids.add(r["from_id"]) + ids.add(r["to_id"]) + entities = self.get_entities(ids) + edges = list([Edge(**c) for c in connections]) + return (entities, edges) + + def _bfs_step(self, ids: list[UUID], conn_type: RelationshipType) -> set[dict]: + """ + One step of the BFS process + Returns all edges that connect to node ids the next step + """ + ids = list([id["to_id"] for id in ids]) + sql = fr"""select edge_id, from_id, to_id, conn_type from edges where conn_type = '{conn_type.name}' and from_id in ({quote(ids)})""" + return self.conn.execute(sql) + + def search_entity(self, + keyword: str, + type: list[EntityType]) -> list[EntityRef]: + """ + WARN: This search function is implemented via `like` operator, which could be extremely slow. + """ + types = ",".join([quote(str(t)) for t in type]) + sql = fr'''select entity_id as id, qualified_name, entity_type as type from entities where qualified_name like %s and entity_type in ({types})''' + rows = self.conn.execute(sql, ('%' + keyword + '%', )) + return list([EntityRef(**row) for row in rows]) diff --git a/registry/sql-registry/registry/interface.py b/registry/sql-registry/registry/interface.py new file mode 100644 index 000000000..406c52ace --- /dev/null +++ b/registry/sql-registry/registry/interface.py @@ -0,0 +1,69 @@ +from abc import ABC, abstractmethod +from typing import Union +from uuid import UUID +from registry.database import DbConnection + +from registry.models import * + +class Registry(ABC): + @abstractmethod + def get_projects(self) -> list[str]: + """ + Returns the names of all projects + """ + pass + + @abstractmethod + def get_entity(self, id_or_name: Union[str, UUID]) -> Entity: + """ + Get one entity by its id or qualified name + """ + pass + + @abstractmethod + def get_entities(self, ids: list[UUID]) -> list[Entity]: + """ + Get list of entities by their ids + """ + pass + + @abstractmethod + def get_entity_id(self, id_or_name: Union[str, UUID]) -> UUID: + """ + Get entity id by its name + """ + pass + + @abstractmethod + def get_neighbors(self, id_or_name: Union[str, UUID], relationship: RelationshipType) -> list[Edge]: + """ + Get list of edges with specified type that connect to this entity. + The edge contains fromId and toId so we can follow to the entity it connects to + """ + pass + + @abstractmethod + def get_lineage(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get all the upstream and downstream entities of an entity, along with all edges connect them. + Only meaningful to features and data sources. 
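+
+        The returned `EntitiesAndRelations` serializes via `to_dict()` into
+        the same shape the REST endpoint exposes, i.e. a `guidEntityMap` of
+        entities plus a `relations` list of edges.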
+ """ + pass + + @abstractmethod + def get_project(self, id_or_name: Union[str, UUID]) -> EntitiesAndRelations: + """ + Get a project and everything inside of it, both entities and edges + """ + pass + + @abstractmethod + def search_entity(self, + keyword: str, + type: list[EntityType], + project: Optional[Union[str, UUID]] = None) -> list[EntityRef]: + """ + Search entities with specified type that also match the keyword in a project + """ + pass + diff --git a/registry/sql-registry/registry/models.py b/registry/sql-registry/registry/models.py new file mode 100644 index 000000000..3c08d2692 --- /dev/null +++ b/registry/sql-registry/registry/models.py @@ -0,0 +1,728 @@ +from abc import ABC, abstractmethod +from enum import Enum +from typing import Optional, Union +from uuid import UUID +import json +import re + + +def _to_snake(d, level: int = 0): + """ + Convert `string`, `list[string]`, or all keys in a `dict` into snake case + The maximum length of input string or list is 100, or it will be truncated before being processed, for dict, the exception will be thrown if it has more than 100 keys. + the maximum nested level is 10, otherwise the exception will be thrown + """ + if level >= 10: + raise ValueError("Too many nested levels") + if isinstance(d, str): + d = d[:100] + return re.sub(r'([A-Z]\w+$)', r'_\1', d).lower() + if isinstance(d, list): + d = d[:100] + return [_to_snake(i, level + 1) if isinstance(i, (dict, list)) else i for i in d] + if len(d) > 100: + raise ValueError("Dict has too many keys") + return {_to_snake(a, level + 1): _to_snake(b, level + 1) if isinstance(b, (dict, list)) else b for a, b in d.items()} + + +def _to_type(value, type): + """ + Convert `value` into `type`, + or `list[type]` if `value` is a list + NOTE: This is **not** a generic implementation, only for objects in this module + """ + if isinstance(value, type): + return value + if isinstance(value, list): + return list([_to_type(v, type) for v in value]) + if isinstance(value, dict): + if hasattr(type, "new"): + try: + # The convention is to use `new` method to create the object from a dict + return type.new(**_to_snake(value)) + except TypeError: + pass + return type(**_to_snake(value)) + if issubclass(type, Enum): + try: + n = int(value) + return type(n) + except ValueError: + pass + if hasattr(type, "new"): + try: + # As well as Enum types, some of them have alias that cannot be handled by default Enum constructor + return type.new(value) + except KeyError: + pass + return type[value] + return type(value) + + +def _to_uuid(value): + return _to_type(value, UUID) + + +class ValueType(Enum): + UNSPECIFIED = 0 + BOOLEAN = 1 + INT = 2 + LONG = 3 + FLOAT = 4 + DOUBLE = 5 + STRING = 6 + BYTES = 7 + + +class VectorType(Enum): + TENSOR = 0 + + +class TensorCategory(Enum): + DENSE = 0 + SPARSE = 1 + + +class EntityType(Enum): + Project = 1 + Source = 2 + Anchor = 3 + AnchorFeature = 4 + DerivedFeature = 5 + + @staticmethod + def new(v): + return { + "feathr_workspace_v1": EntityType.Project, + "feathr_source_v1": EntityType.Source, + "feathr_anchor_v1": EntityType.Anchor, + "feathr_anchor_feature_v1": EntityType.AnchorFeature, + "feathr_derived_feature_v1": EntityType.DerivedFeature, + }[v] + + def __str__(self): + return { + EntityType.Project: "feathr_workspace_v1", + EntityType.Source: "feathr_source_v1", + EntityType.Anchor: "feathr_anchor_v1", + EntityType.AnchorFeature: "feathr_anchor_feature_v1", + EntityType.DerivedFeature: "feathr_derived_feature_v1", + }[self] + + +class RelationshipType(Enum): + 
Contains = 1 + BelongsTo = 2 + Consumes = 3 + Produces = 4 + + +class ToDict(ABC): + """ + This ABC is used to convert object to dict, then JSON. + """ + @abstractmethod + def to_dict(self) -> dict: + pass + + def to_json(self, indent=None) -> str: + return json.dumps(self.to_dict(), indent=indent) + + +class FeatureType(ToDict): + def __init__(self, + type: Union[str, VectorType], + tensor_category: Union[str, TensorCategory], + dimension_type: list[Union[str, ValueType]], + val_type: Union[str, ValueType]): + self.type = _to_type(type, VectorType) + self.tensor_category = _to_type(tensor_category, TensorCategory) + self.dimension_type = _to_type(dimension_type, ValueType) + self.val_type = _to_type(val_type, ValueType) + + def to_dict(self) -> dict: + return { + "type": self.type.name, + "tensorCategory": self.tensor_category.name, + "dimensionType": [t.name for t in self.dimension_type], + "valType": self.val_type.name, + } + + +class TypedKey(ToDict): + def __init__(self, + key_column: str, + key_column_type: ValueType, + full_name: Optional[str] = None, + description: Optional[str] = None, + key_column_alias: Optional[str] = None): + self.key_column = key_column + self.key_column_type = _to_type(key_column_type, ValueType) + self.full_name = full_name + self.description = description + self.key_column_alias = key_column_alias + + def to_dict(self) -> dict: + ret = { + "key_column": self.key_column, + "key_column_type": self.key_column_type.name, + } + if self.full_name is not None: + ret["full_name"] = self.full_name + if self.description is not None: + ret["description"] = self.full_name + if self.key_column_alias is not None: + ret["key_column_alias"] = self.key_column_alias + return ret + + +class Transformation(ToDict): + @staticmethod + def new(**kwargs): + if "transform_expr" in kwargs: + return ExpressionTransformation(**kwargs) + elif "def_expr" in kwargs: + return WindowAggregationTransformation(**kwargs) + elif "name" in kwargs: + return UdfTransformation(**kwargs) + else: + raise ValueError(kwargs) + + +class ExpressionTransformation(Transformation): + def __init__(self, transform_expr: str): + self.transform_expr = transform_expr + + def to_dict(self) -> dict: + return { + "transform_expr": self.transform_expr + } + + +class WindowAggregationTransformation(Transformation): + def __init__(self, + def_expr: str, + agg_func: Optional[str] = None, + window: Optional[str] = None, + group_by: Optional[str] = None, + filter: Optional[str] = None, + limit: Optional[int] = None): + self.def_expr = def_expr + self.agg_func = agg_func + self.window = window + self.group_by = group_by + self.filter = filter + self.limit = limit + + def to_dict(self) -> dict: + ret = { + "def_expr": self.def_expr, + } + if self.agg_func is not None: + ret["agg_func"] = self.agg_func + if self.window is not None: + ret["window"] = self.window + if self.group_by is not None: + ret["group_by"] = self.group_by + if self.filter is not None: + ret["filter"] = self.filter + if self.limit is not None: + ret["limit"] = self.limit + return ret + + +class UdfTransformation(Transformation): + def __init__(self, name: str): + self.name = name + + def to_dict(self) -> dict: + return { + "name": self.name + } + + +class EntityRef(ToDict): + def __init__(self, + id: UUID, + type: Union[str, EntityType], + qualified_name: Optional[str] = None, + uniq_attr: dict = {}): + self.id = id + self.type = _to_type(type, EntityType) + if qualified_name is not None: + self.uniq_attr = {"qualifiedName": qualified_name} + else: + 
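+            # no qualified name supplied; keep whatever unique attributes the caller passed in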
self.uniq_attr = uniq_attr + + @property + def entity_type(self) -> EntityType: + return self.type + + @property + def qualified_name(self) -> EntityType: + return self.uniq_attr['qualifiedName'] + + def get_ref(self): + return self + + def to_dict(self) -> dict: + return { + "guid": str(self.id), + "typeName": str(self.type), + "uniqueAttributes": self.uniq_attr, + } + + +class Attributes(ToDict): + @staticmethod + def new(entity_type: Union[str, EntityType], **kwargs): + return { + EntityType.Project: ProjectAttributes, + EntityType.Source: SourceAttributes, + EntityType.Anchor: AnchorAttributes, + EntityType.AnchorFeature: AnchorFeatureAttributes, + EntityType.DerivedFeature: DerivedFeatureAttributes, + }[_to_type(entity_type, EntityType)](**kwargs) + + +class Entity(ToDict): + def __init__(self, + entity_id: Union[str, UUID], + qualified_name: str, + entity_type: Union[str, EntityType], + attributes: Union[dict, Attributes], + **kwargs): + self.id = _to_uuid(entity_id) + self.qualified_name = qualified_name + self.entity_type = _to_type(entity_type, EntityType) + if isinstance(attributes, Attributes): + self.attributes = attributes + else: + self.attributes = Attributes.new( + entity_type, **_to_snake(attributes)) + + def get_ref(self) -> EntityRef: + return EntityRef(self.id, + self.attributes.entity_type, + self.qualified_name) + + def to_dict(self) -> dict: + return { + "guid": str(self.id), + "lastModifiedTS": "1", + "status": "ACTIVE", + "displayText": self.attributes.name, + "typeName": str(self.attributes.entity_type), + "attributes": self.attributes.to_dict(), + } + + +class ProjectAttributes(Attributes): + def __init__(self, + name: str, + children: list[Union[dict, Entity]] = [], + tags: dict = {}, + **kwargs): + self.name = name + self.tags = tags + self._children = [] + if len(children) > 0: + self.children = children + + @property + def entity_type(self) -> EntityType: + return EntityType.Project + + @property + def children(self): + return self._children + + @children.setter + def children(self, v: list[Union[dict, Entity]]): + for f in v: + if isinstance(f, Entity): + self._children.append(f) + elif isinstance(f, dict): + self._children.append(_to_type(f, Entity)) + else: + raise TypeError(f) + + @property + def sources(self): + return [ + e for e in self.children if e.entity_type == EntityType.Source] + + @property + def anchors(self): + return [ + e for e in self.children if e.entity_type == EntityType.Anchor] + + @property + def anchor_features(self): + return [ + e for e in self.children if e.entity_type == EntityType.AnchorFeature] + + @property + def derived_features(self): + return [ + e for e in self.children if e.entity_type == EntityType.DerivedFeature] + + def to_dict(self) -> dict: + return { + "qualifiedName": self.name, + "name": self.name, + "sources": list([e.get_ref().to_dict() for e in self.sources]), + "anchors": list([e.get_ref().to_dict() for e in self.anchors]), + "anchor_features": list([e.get_ref().to_dict() for e in self.anchor_features]), + "derived_features": list([e.get_ref().to_dict() for e in self.derived_features]), + "tags": self.tags, + } + + +class SourceAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: str, + path: str, + preprocessing: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + timestamp_format: Optional[str] = None, + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.type = type + self.path = path + self.preprocessing = preprocessing + 
self.event_timestamp_column = event_timestamp_column + self.timestamp_format = timestamp_format + self.tags = tags + + @property + def entity_type(self) -> EntityType: + return EntityType.Source + + def to_dict(self) -> dict: + ret = { + "qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type, + "path": self.path, + "tags": self.tags, + } + if self.preprocessing is not None: + ret["preprocessing"] = self.preprocessing + if self.event_timestamp_column is not None: + ret["eventTimestampColumn"] = self.event_timestamp_column + if self.timestamp_format is not None: + ret["timestampFormat"] = self.timestamp_format + return ret + + +class AnchorAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + # source: Optional[Union[dict, EntityRef, Entity]] = None, + # features: list[Union[dict, EntityRef, Entity]] = [], + tags: dict = {}, + **kwargs): + self.qualified_name = qualified_name + self.name = name + self._source = None + self._features = [] + # if source is not None: + # self._source = source.get_ref() + # if len(features)>0: + # self._set_feature(features) + self.tags = tags + + @property + def entity_type(self) -> EntityType: + return EntityType.Anchor + + @property + def source(self) -> EntityRef: + return self._source + + @source.setter + def source(self, s): + if isinstance(s, Entity): + self._source = s.get_ref() + elif isinstance(s, EntityRef): + self._source = s + elif isinstance(s, dict): + self._source = _to_type(s, Entity).get_ref() + else: + raise TypeError(s) + + @property + def features(self): + return self._features + + @features.setter + def features(self, features): + self._features = [] + for f in features: + if isinstance(f, Entity): + self._features.append(f.get_ref()) + elif isinstance(f, EntityRef): + self._features.append(f) + elif isinstance(f, dict): + self._features.append(_to_type(f, Entity).get_ref()) + else: + raise TypeError(f) + + def to_dict(self) -> dict: + ret = { + "qualifiedName": self.qualified_name, + "name": self.name, + "features": list([e.get_ref().to_dict() for e in self.features]), + "tags": self.tags, + } + if self.source is not None: + ret["source"] = self.source.get_ref().to_dict() + return ret + + +class AnchorFeatureAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.type = _to_type(type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.tags = tags + + @property + def entity_type(self) -> EntityType: + return EntityType.AnchorFeature + + def to_dict(self) -> dict: + return { + "qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type.to_dict(), + "transformation": self.transformation.to_dict(), + "key": list([k.to_dict() for k in self.key]), + "tags": self.tags, + } + + +class DerivedFeatureAttributes(Attributes): + def __init__(self, + qualified_name: str, + name: str, + type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + # input_anchor_features: list[Union[dict, EntityRef, Entity]] = [], + # input_derived_features: list[Union[dict, EntityRef, Entity]] = [], + tags: dict = {}, + **kwargs): + self.qualified_name = qualified_name + self.name = name + self.type = _to_type(type, FeatureType) + self.transformation = 
_to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self._input_anchor_features = [] + self._input_derived_features = [] + self.tags = tags + # self._set_input_anchor_features(input_anchor_features) + # self._set_input_derived_features(input_derived_features) + + @property + def entity_type(self) -> EntityType: + return EntityType.DerivedFeature + + @property + def input_features(self): + return self._input_anchor_features + self._input_derived_features + + @input_features.setter + def input_features(self, v: Union[dict, Entity]): + self._input_anchor_features = [] + self._input_derived_features = [] + for f in v: + e = None + if isinstance(f, Entity): + e = f + elif isinstance(f, dict): + e = _to_type(f, Entity) + else: + raise TypeError(f) + + if e.entity_type == EntityType.AnchorFeature: + self._input_anchor_features.append(e) + elif e.entity_type == EntityType.DerivedFeature: + self._input_derived_features.append(e) + else: + pass + + @property + def input_anchor_features(self): + return self._input_anchor_features + + # @input_anchor_features.setter + # def input_anchor_features(self, v): + # self._input_anchor_features = [] + # for f in v: + # if isinstance(f, Entity): + # self._input_anchor_features.append(f.get_ref()) + # elif isinstance(f, EntityRef): + # self._input_anchor_features.append(f) + # elif isinstance(f, dict): + # self._input_anchor_features.append( + # to_type(f, Entity).get_ref()) + # else: + # raise TypeError(f) + + @property + def input_derived_features(self): + return self._input_derived_features + + # @input_derived_features.setter + # def input_derived_features(self, v): + # self._input_derived_features = [] + # for f in v: + # if isinstance(f, Entity): + # self._input_derived_features.append(f.get_ref()) + # elif isinstance(f, EntityRef): + # self._input_derived_features.append(f) + # elif isinstance(f, dict): + # self._input_derived_features.append( + # to_type(f, Entity).get_ref()) + # else: + # raise TypeError(f) + + def to_dict(self) -> dict: + return { + "qualifiedName": self.qualified_name, + "name": self.name, + "type": self.type.to_dict(), + "transformation": self.transformation.to_dict(), + "key": list([k.to_dict() for k in self.key]), + "input_anchor_features": [e.get_ref().to_dict() for e in self.input_anchor_features], + "input_derived_features": [e.get_ref().to_dict() for e in self.input_derived_features], + "tags": self.tags, + } + + +class Edge(ToDict): + def __init__(self, + edge_id: Union[str, UUID], + from_id: Union[str, UUID], + to_id: Union[str, UUID], + conn_type: Union[str, RelationshipType]): + self.id = _to_uuid(edge_id) + self.from_id = _to_uuid(from_id) + self.to_id = _to_uuid(to_id) + self.conn_type = _to_type(conn_type, RelationshipType) + + def __eq__(self, o: object) -> bool: + # Edge ID is kinda useless + return self.from_id == o.from_id and self.to_id == o.to_id and self.conn_type == o.conn_type + + def __hash__(self) -> int: + return hash((self.from_id, self.to_id, self.conn_type)) + + def to_dict(self) -> dict: + return { + "relationshipId": str(self.id), + "fromEntityId": str(self.from_id), + "toEntityId": str(self.to_id), + "relationshipType": self.conn_type.name, + } + + +class EntitiesAndRelations(ToDict): + def __init__(self, entities: list[Entity], edges: list[Edge]): + self.entities = dict([(e.id, e) for e in entities]) + self.edges = set(edges) + + def to_dict(self) -> dict: + return { + "guidEntityMap": dict([(str(id), self.entities[id].to_dict()) for id in self.entities]), + "relations": 
list([e.to_dict() for e in self.edges]), + } + + +class ProjectDef: + def __init__(self, qualified_name: str, tags: dict = {}): + self.qualified_name = qualified_name + self.name = qualified_name + self.tags = tags + + +class SourceDef: + def __init__(self, + qualified_name: str, + name: str, + path: str, + type: str, + preprocessing: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + timestamp_format: Optional[str] = None, + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.path = path + self.type = type + self.preprocessing = preprocessing + self.event_timestamp_column = event_timestamp_column + self.timestamp_format = timestamp_format + self.tags = tags + + +class AnchorDef: + def __init__(self, + qualified_name: str, + name: str, + source_id: Union[str, UUID], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.source_id = _to_uuid(source_id) + self.tags = tags + + +class AnchorFeatureDef: + def __init__(self, + qualified_name: str, + name: str, + feature_type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.feature_type = _to_type(feature_type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.tags = tags + + +class DerivedFeatureDef: + def __init__(self, + qualified_name: str, + name: str, + feature_type: Union[dict, FeatureType], + transformation: Union[dict, Transformation], + key: list[Union[dict, TypedKey]], + input_anchor_features: list[Union[str, UUID]], + input_derived_features: list[Union[str, UUID]], + tags: dict = {}): + self.qualified_name = qualified_name + self.name = name + self.feature_type = _to_type(feature_type, FeatureType) + self.transformation = _to_type(transformation, Transformation) + self.key = _to_type(key, TypedKey) + self.input_anchor_features = _to_uuid(input_anchor_features) + self.input_derived_features = _to_uuid(input_derived_features) + self.tags = tags diff --git a/registry/sql-registry/requirements.txt b/registry/sql-registry/requirements.txt new file mode 100644 index 000000000..c6d61de98 --- /dev/null +++ b/registry/sql-registry/requirements.txt @@ -0,0 +1,3 @@ +pymssql +fastapi +uvicorn \ No newline at end of file diff --git a/registry/sql-registry/scripts/schema.sql b/registry/sql-registry/scripts/schema.sql new file mode 100644 index 000000000..d7258d577 --- /dev/null +++ b/registry/sql-registry/scripts/schema.sql @@ -0,0 +1,15 @@ +create table entities +( + entity_id varchar(50) not null primary key, + qualified_name varchar(200) not null, + entity_type varchar(100) not null, + attributes NVARCHAR(MAX) not null, +) + +create table edges +( + edge_id varchar(50) not null primary key, + from_id varchar(50) not null, + to_id varchar(50) not null, + conn_type varchar(20) not null, +) \ No newline at end of file diff --git a/registry/sql-registry/scripts/test_data.sql b/registry/sql-registry/scripts/test_data.sql new file mode 100644 index 000000000..a248d56fe --- /dev/null +++ b/registry/sql-registry/scripts/test_data.sql @@ -0,0 +1,92 @@ +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'feathr_ci_registry_12_33_182947__PASSTHROUGH', 'feathr_source_v1', '{"path": "PASSTHROUGH", "qualifiedName": "feathr_ci_registry_12_33_182947__PASSTHROUGH", "name": "PASSTHROUGH", "type": 
"PASSTHROUGH"}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('dc24b1d5-206d-40db-b10a-606dd16a0297', 'feathr_ci_registry_12_33_182947__request_features__f_is_long_trip_distance', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_is_long_trip_distance", "name": "f_is_long_trip_distance", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "BOOLEAN"}, "transformation": {"transform_expr": "cast_float(trip_distance)>30"}, "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('c626c41c-d6c2-4b16-a267-6cdeea497c52', 'feathr_ci_registry_12_33_182947__f_trip_time_rounded', 'feathr_derived_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_rounded", "name": "f_trip_time_rounded", "input_derived_features": [], "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "INT"}, "transformation": {"transform_expr": "f_trip_time_duration % 10"}, "input_anchor_features": [{"guid": "103baca1-377a-4ddf-8429-5da91026c269", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_time_duration"}}], "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('537bc481-aa15-4a3b-be4e-2042da6f5a09', 'feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_max_fare', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_max_fare", "name": "f_location_max_fare", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "FLOAT"}, "transformation": {"filter": null, "agg_func": "MAX", "limit": null, "group_by": null, "window": "90d", "def_expr": "cast_float(fare_amount)"}, "key": [{"full_name": "nyc_taxi.location_id", "key_column": "DOLocationID", "description": "location id in NYC", "key_column_alias": "DOLocationID", "key_column_type": "2"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('479c6306-5fdb-4e06-9008-c18f68db52a4', 'feathr_ci_registry_12_33_182947__f_trip_time_rounded_plus', 'feathr_derived_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_rounded_plus", "name": "f_trip_time_rounded_plus", "input_derived_features": [{"guid": "c626c41c-d6c2-4b16-a267-6cdeea497c52", "typeName": "feathr_derived_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_rounded"}}], "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "INT"}, "transformation": {"transform_expr": "f_trip_time_rounded + 100"}, "input_anchor_features": [], "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) 
+values('c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'feathr_ci_registry_12_33_182947__nycTaxiBatchSource', 'feathr_source_v1', '{"timestamp_format": "yyyy-MM-dd HH:mm:ss", "path": "wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04.csv", "event_timestamp_column": "lpep_dropoff_datetime", "preprocessing": " def add_new_dropoff_and_fare_amount_column(df: DataFrame):\n df = df.withColumn(\"new_lpep_dropoff_datetime\", col(\"lpep_dropoff_datetime\"))\n df = df.withColumn(\"new_fare_amount\", col(\"fare_amount\") + 1000000)\n return df\n", "qualifiedName": "feathr_ci_registry_12_33_182947__nycTaxiBatchSource", "name": "nycTaxiBatchSource", "type": "wasbs", "tags": {"for_test_purpose": "true"}}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_avg_fare', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_avg_fare", "name": "f_location_avg_fare", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "FLOAT"}, "transformation": {"filter": null, "agg_func": "AVG", "limit": null, "group_by": null, "window": "90d", "def_expr": "cast_float(fare_amount)"}, "key": [{"full_name": "nyc_taxi.location_id", "key_column": "DOLocationID", "description": "location id in NYC", "key_column_alias": "DOLocationID", "key_column_type": "2"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('cd7306a7-c458-45e8-a00b-44a2f2117135', 'feathr_ci_registry_12_33_182947', 'feathr_workspace_v1', '{"anchor_features": [{"guid": "a5c47bd8-3729-45fa-8701-b8b76ada150a", "typeName": "feathr_anchor_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures"}}, {"guid": "260325a5-27f9-40d1-8697-c727feb1dbdc", "typeName": "feathr_anchor_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features"}}], "derived_features": [{"guid": "226b42ee-0c34-4329-b935-744aecc63fb4", "typeName": "feathr_derived_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_distance"}}, {"guid": "c626c41c-d6c2-4b16-a267-6cdeea497c52", "typeName": "feathr_derived_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_rounded"}}, {"guid": "479c6306-5fdb-4e06-9008-c18f68db52a4", "typeName": "feathr_derived_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_rounded_plus"}}], "qualifiedName": "feathr_ci_registry_12_33_182947", "name": "feathr_ci_registry_12_33_182947", "tags": {"for_test_purpose": "true"}}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'feathr_ci_registry_12_33_182947__request_features__f_trip_distance', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_distance", "name": "f_trip_distance", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "FLOAT"}, "transformation": {"transform_expr": "trip_distance"}, "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}], "tags": {"for_test_purpose": "true"}}'); +insert into entities 
(entity_id, qualified_name, entity_type, attributes) +values('103baca1-377a-4ddf-8429-5da91026c269', 'feathr_ci_registry_12_33_182947__request_features__f_trip_time_duration', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_time_duration", "name": "f_trip_time_duration", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "INT"}, "transformation": {"transform_expr": "(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60"}, "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('a5c47bd8-3729-45fa-8701-b8b76ada150a', 'feathr_ci_registry_12_33_182947__aggregationFeatures', 'feathr_anchor_v1', '{"features": [{"guid": "2a052ccd-3e31-46a7-bffb-2ab1302b1b00", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_avg_fare"}}, {"guid": "537bc481-aa15-4a3b-be4e-2042da6f5a09", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures__f_location_max_fare"}}], "qualifiedName": "feathr_ci_registry_12_33_182947__aggregationFeatures", "name": "aggregationFeatures", "source": {"guid": "c4a0ae0f-09cc-43bf-94e9-21ff178fbda6", "typeName": "feathr_source_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__nycTaxiBatchSource"}}}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('260325a5-27f9-40d1-8697-c727feb1dbdc', 'feathr_ci_registry_12_33_182947__request_features', 'feathr_anchor_v1', '{"features": [{"guid": "5316c516-77f9-4be4-a7ec-8bf6e893e2aa", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_distance"}}, {"guid": "103baca1-377a-4ddf-8429-5da91026c269", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_time_duration"}}, {"guid": "dc24b1d5-206d-40db-b10a-606dd16a0297", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_is_long_trip_distance"}}, {"guid": "2380fe5b-ce2a-401e-98bf-af8b98460f67", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_day_of_week"}}], "qualifiedName": "feathr_ci_registry_12_33_182947__request_features", "name": "request_features", "source": {"guid": "a4cfbc03-c65d-4f32-be3d-1d11247c9cdd", "typeName": "feathr_source_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__PASSTHROUGH"}}, "tags": {"for_test_purpose": "true"}}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('2380fe5b-ce2a-401e-98bf-af8b98460f67', 'feathr_ci_registry_12_33_182947__request_features__f_day_of_week', 'feathr_anchor_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_day_of_week", "name": "f_day_of_week", "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "INT"}, "transformation": {"transform_expr": "dayofweek(lpep_dropoff_datetime)"}, "key": [{"full_name": 
"feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); +insert into entities (entity_id, qualified_name, entity_type, attributes) +values('226b42ee-0c34-4329-b935-744aecc63fb4', 'feathr_ci_registry_12_33_182947__f_trip_time_distance', 'feathr_derived_feature_v1', '{"qualifiedName": "feathr_ci_registry_12_33_182947__f_trip_time_distance", "name": "f_trip_time_distance", "input_derived_features": [], "type": {"type": "TENSOR", "tensorCategory": "DENSE", "dimensionType": [], "valType": "FLOAT"}, "transformation": {"transform_expr": "f_trip_distance * f_trip_time_duration"}, "input_anchor_features": [{"guid": "5316c516-77f9-4be4-a7ec-8bf6e893e2aa", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_distance"}}, {"guid": "103baca1-377a-4ddf-8429-5da91026c269", "typeName": "feathr_anchor_feature_v1", "uniqueAttributes": {"qualifiedName": "feathr_ci_registry_12_33_182947__request_features__f_trip_time_duration"}}], "key": [{"full_name": "feathr.dummy_typedkey", "key_column": "NOT_NEEDED", "description": "A dummy typed key for passthrough/request feature.", "key_column_alias": "NOT_NEEDED", "key_column_type": "UNSPECIFIED"}]}'); + +insert into edges (edge_id, from_id, to_id, conn_type) values ('455f7195-8463-4c60-9cf0-65bd9db0ae0a', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('a2777fd2-1136-40d0-8686-47b5b5fed1ef', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('ca88290d-03d1-4641-bf36-cbf3280b3e9d', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('1da26ec9-6608-4971-ac04-8ed170543325', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('9ae3a7c0-0163-4170-b0cf-81705e5b6aca', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('50068cf8-7f5e-482a-a018-68bf27f89f6d', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('898a10fd-0315-4fcb-9803-0144047033c7', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('08540367-3d1f-4d57-8af1-36b3a11762ed', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('d05ca3c6-2610-4352-9be7-c4d8dd6ab6b6', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'dc24b1d5-206d-40db-b10a-606dd16a0297', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('ed6ca745-ee85-403c-b1bf-a3f1a0463132', 'dc24b1d5-206d-40db-b10a-606dd16a0297', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('55d85311-8e42-4273-a538-ef4dc33b1570', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'Contains'); +insert into 
edges (edge_id, from_id, to_id, conn_type) values ('215cdc37-678a-4c56-a390-76d6471fa629', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('4166aca1-e0f4-4883-b2ff-0051ee80a830', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('e1174ff1-0b1a-4b03-a1f5-b8b923a131ba', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('b314c95b-e4ea-4a39-9584-9b7eb0a6b8d2', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('7a37d438-8c33-4cc8-a0bb-9db3963f073a', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('31265953-4820-470f-8cfc-38efefec9fa7', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '2380fe5b-ce2a-401e-98bf-af8b98460f67', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('58c448cf-87bd-4f36-92d5-e6f6de48569d', '2380fe5b-ce2a-401e-98bf-af8b98460f67', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('c86e4e49-88c9-4ac8-a1b8-ff473e1dc588', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '103baca1-377a-4ddf-8429-5da91026c269', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('3fddedcf-c590-43b9-aaf4-4c4ce6600f2e', '103baca1-377a-4ddf-8429-5da91026c269', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('be429eb2-758d-4783-b166-cfcc7d2fb4f2', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('bd66f19f-3508-4b13-ba40-598dd4abbd0d', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('178c7c56-0f25-4048-be18-435ab5a169f4', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '479c6306-5fdb-4e06-9008-c18f68db52a4', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('c8dc4c34-7950-4a04-a4bd-829a4c20ab4e', '479c6306-5fdb-4e06-9008-c18f68db52a4', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('92f143c0-59a8-461b-b9cc-179f3403dd38', 'cd7306a7-c458-45e8-a00b-44a2f2117135', '226b42ee-0c34-4329-b935-744aecc63fb4', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('e9a6f066-f4cc-4de4-bb26-2bc9522760a4', '226b42ee-0c34-4329-b935-744aecc63fb4', 'cd7306a7-c458-45e8-a00b-44a2f2117135', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('805b2c95-fc22-48b1-81cc-f86e3f2c2956', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('e54835e6-b399-4d49-8ab8-6b452cbc00ca', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('02a774bb-0a1c-4874-b9e2-bbd4806c2f3e', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'Contains'); +insert into 
edges (edge_id, from_id, to_id, conn_type) values ('17c74f58-18bc-4c4a-974b-2732a84576c3', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('c8dded57-22f8-46d0-a29c-3e7c072ebee4', '260325a5-27f9-40d1-8697-c727feb1dbdc', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('7e2995b8-2a4f-49c1-b72c-80c8c6d1e060', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('3ba9b460-5878-43c1-bb00-c88916bf5e6b', '260325a5-27f9-40d1-8697-c727feb1dbdc', '103baca1-377a-4ddf-8429-5da91026c269', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('a4bea335-611d-49cc-9e94-b18896103d40', '103baca1-377a-4ddf-8429-5da91026c269', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('f0d65e5b-9762-4c65-9333-83aa6c1beb75', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'dc24b1d5-206d-40db-b10a-606dd16a0297', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('cd72ee26-b867-4321-8566-7daa488f7a61', 'dc24b1d5-206d-40db-b10a-606dd16a0297', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('bc211906-6d48-4649-8e49-518cd89c61f8', '260325a5-27f9-40d1-8697-c727feb1dbdc', '2380fe5b-ce2a-401e-98bf-af8b98460f67', 'Contains'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('aeafba5d-b352-4c36-8fd5-d135cea70605', '2380fe5b-ce2a-401e-98bf-af8b98460f67', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'BelongsTo'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('2b6126be-63fd-4140-8891-e0b5db880573', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('89a348be-ae8d-40ad-a1cd-12d00b981b2a', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', '260325a5-27f9-40d1-8697-c727feb1dbdc', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('7e6219a9-e433-4706-9145-f791a58ef7c3', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('a447062c-c75e-48b2-bbe3-5d9eab770c26', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('faf70a4b-2358-4881-952b-aae3cec55053', '103baca1-377a-4ddf-8429-5da91026c269', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('b6647b84-5043-4ac2-8b31-3db7c2d6cf32', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', '103baca1-377a-4ddf-8429-5da91026c269', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('b63cb71d-71c1-49a1-93bd-ea63cb4ab4e7', 'dc24b1d5-206d-40db-b10a-606dd16a0297', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('c582313e-9840-4ae6-827c-d2a01363ee6b', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'dc24b1d5-206d-40db-b10a-606dd16a0297', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('a2d2ca10-11eb-4e6f-9a5c-b8196a56d10a', '2380fe5b-ce2a-401e-98bf-af8b98460f67', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', 'Consumes'); +insert into 
edges (edge_id, from_id, to_id, conn_type) values ('50701b58-58ef-4b6e-ac00-5f69d8ceebf6', 'a4cfbc03-c65d-4f32-be3d-1d11247c9cdd', '2380fe5b-ce2a-401e-98bf-af8b98460f67', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('6d010032-02fb-4e94-aef7-85e56d5cf99c', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('421b8c1f-01b9-49c3-8820-ee05313d103a', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'a5c47bd8-3729-45fa-8701-b8b76ada150a', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('b6b8af3a-2531-46bc-9b1e-acf3a4c51396', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('323d044f-bbbd-45fc-a93c-40ee0f17ab87', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', '2a052ccd-3e31-46a7-bffb-2ab1302b1b00', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('cbe5c0d4-621b-4e6b-a6c1-76c8ca6105f2', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('bedbe18b-a7c3-40d3-b230-22f9ac5c6c76', 'c4a0ae0f-09cc-43bf-94e9-21ff178fbda6', '537bc481-aa15-4a3b-be4e-2042da6f5a09', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('84902105-164c-4fc3-9690-e638f16c3075', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', '103baca1-377a-4ddf-8429-5da91026c269', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('44668672-1520-4371-98d2-3ed48bddf9ea', '103baca1-377a-4ddf-8429-5da91026c269', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('e8fe5609-53e7-4793-a39d-2ce75c630cd9', '479c6306-5fdb-4e06-9008-c18f68db52a4', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('591285ff-53dd-4e27-a30d-754fe97ea3be', 'c626c41c-d6c2-4b16-a267-6cdeea497c52', '479c6306-5fdb-4e06-9008-c18f68db52a4', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('df901204-59c1-4f3e-87f0-8ac07e90bc39', '226b42ee-0c34-4329-b935-744aecc63fb4', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('23a05c64-b204-4c48-b906-9fabb6ea298b', '5316c516-77f9-4be4-a7ec-8bf6e893e2aa', '226b42ee-0c34-4329-b935-744aecc63fb4', 'Produces'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('3721c600-6423-4a38-a9b2-d85b426b6eaa', '226b42ee-0c34-4329-b935-744aecc63fb4', '103baca1-377a-4ddf-8429-5da91026c269', 'Consumes'); +insert into edges (edge_id, from_id, to_id, conn_type) values ('5f9b86fe-bdc9-4a76-b07c-876b9d7c1ee1', '103baca1-377a-4ddf-8429-5da91026c269', '226b42ee-0c34-4329-b935-744aecc63fb4', 'Produces'); + diff --git a/registry/sql-registry/test/test_basic.py b/registry/sql-registry/test/test_basic.py new file mode 100644 index 000000000..22e343aba --- /dev/null +++ b/registry/sql-registry/test/test_basic.py @@ -0,0 +1,17 @@ +import registry +r=registry.DbRegistry() + +l=r.get_lineage('226b42ee-0c34-4329-b935-744aecc63fb4').to_dict() +assert(len(l["guidEntityMap"]) == 4) + +af1=r.get_entity('2380fe5b-ce2a-401e-98bf-af8b98460f67') +af2=r.get_entity('feathr_ci_registry_12_33_182947__request_features__f_day_of_week') +assert(af1.to_dict()==af2.to_dict()) 
+df1=r.get_entity('226b42ee-0c34-4329-b935-744aecc63fb4') +df2=r.get_entity('feathr_ci_registry_12_33_182947__f_trip_time_distance') +assert(df1.to_dict()==df2.to_dict()) + +p=r.get_project('feathr_ci_registry_12_33_182947') +assert(len(p.to_dict()['guidEntityMap'])==14) + +es=r.search_entity("time", [registry.EntityType.DerivedFeature]) diff --git a/ui/README.md b/ui/README.md index 0dea719f5..cd6106c71 100644 --- a/ui/README.md +++ b/ui/README.md @@ -1,40 +1,60 @@ # Feathr Feature Store UI -This directory hosts Feathr Feature Store UI code. Visit [demo site](https://aka.ms/feathrdemo) to try Feathr Feature Store UI, +This directory hosts Feathr Feature Store UI code. -## Prerequisites -Install the latest version of [NodeJS](https://nodejs.org/en/) LTS v14.x or v16.x. Make sure to use npm no later than 6.x. Run `node --version` and `npm --version` to verify installed versions. +## Live Demo -## Build and run locally +Check out the latest Feathr Feature Store UI live demo [here](https://aka.ms/feathrdemo). Use one of the following accounts when you are prompted to log in: +- A work or school organization account, which includes Office 365 subscribers. +- A personal Microsoft account, i.e. an account used to access Skype, Outlook.com, OneDrive, and Xbox LIVE. + +## Development Getting Started + +### Prerequisites + +1. Install the latest [Node](https://nodejs.org/en/) v16.x. Run `node --version` to verify the installed version. + +### Build and run locally Each command in this section should be run from the root directory of the repository. Open terminal, go to root of this repository and run following commands. -```bash cd ui npm install npm start ``` -This should launch [http://localhost:3000/](http://localhost:3000/) on your web browser. The page will reload when you make code changes and save. +This should launch [http://localhost:3000](http://localhost:3000) in your web browser. The page will reload when you make code changes and save. -## Deploying +#### [Optional] Override configurations for local development +- **Point to a different backend endpoint**: by default, the UI talks to the live backend API at https://feathr-registry.azurewebsites.net/docs. To point to a custom backend API (e.g. one running locally), create a .env.local in this directory and set REACT_APP_API_ENDPOINT, for example: +``` +REACT_APP_API_ENDPOINT=http://localhost:8080 +``` +- **Use different authentication settings**: by default, the UI authenticates with an Azure AD application that has multi-tenant authentication enabled. To use a different Azure AD application, create a .env.local in this directory and set REACT_APP_AAD_APP_CLIENT_ID and REACT_APP_AAD_APP_AUTHORITY, for example: +``` +REACT_APP_AAD_APP_CLIENT_ID=<REPLACE_WITH_YOUR_CLIENT_ID> +REACT_APP_AAD_APP_AUTHORITY=https://login.microsoftonline.com/<REPLACE_WITH_YOUR_TENANT_ID> +``` -- For static file based deployment, run `npm run build` and upload `build/` to your server. +### Deploying + +- For static file based deployment, run `npm run build` and upload `build/` to your server. - For docker image based deployment, run `docker -t .` to build image and push to your container registry. -## Lint +### Lint -To lint typescript code files, sim run: -```bash +To lint ts and tsx code, run: +``` npm run lint:fix ``` -This command will Automatically fix all problems that can be fixed, and list the rest problems requires manual fix. -Linting rules are configured in [.eslintrc](.eslintrc) file. +This command will automatically fix all problems that can be fixed and list the remaining problems that require a manual fix. +Linting rules are configured in the [.eslintrc](.eslintrc) file. [Read More](https://eslint.org/docs/rules/). -## Project Structure +### Project Structure ``` src/
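The two override bullets above map directly onto how the client consumes its settings: Create React App inlines `REACT_APP_*` variables at build time, and the two AAD values feed the MSAL client that is centralized in `ui/src/utils/utils.tsx` later in this diff. A minimal sketch of that wiring; the missing-variable guard is illustrative only and not part of this change:

```typescript
import { Configuration, PublicClientApplication } from "@azure/msal-browser";

// Values come from .env.local or the build environment; CRA only exposes variables
// prefixed with REACT_APP_, and .env.local changes require restarting `npm start`.
const clientId = process.env.REACT_APP_AAD_APP_CLIENT_ID;
const authority = process.env.REACT_APP_AAD_APP_AUTHORITY;

if (!clientId || !authority) {
  // Illustrative guard: fail fast when the overrides are missing or misspelled.
  throw new Error("Set REACT_APP_AAD_APP_CLIENT_ID and REACT_APP_AAD_APP_AUTHORITY");
}

const msalConfig: Configuration = {
  auth: {
    clientId,
    authority,
    redirectUri: window.location.origin,
  },
};

export const msalInstance = new PublicClientApplication(msalConfig);
```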
diff --git a/ui/src/api/api.tsx b/ui/src/api/api.tsx index 653a2677a..ee509411f 100644 --- a/ui/src/api/api.tsx +++ b/ui/src/api/api.tsx @@ -1,11 +1,14 @@ import Axios from "axios"; import { DataSource, Feature, FeatureLineage, UserRole, Role } from "../models/model"; +import { InteractionRequiredAuthError, PublicClientApplication } from "@azure/msal-browser"; import mockUserRole from "./mock/userrole.json"; +import { getMsalConfig } from "../utils/utils"; const API_ENDPOINT = process.env.REACT_APP_API_ENDPOINT + "/api/v1"; -const token = "mockAppServiceKey"; +const msalInstance = getMsalConfig(); export const fetchDataSources = async (project: string) => { + const token = await getIdToken(msalInstance); return Axios .get(`${ API_ENDPOINT }/projects/${ project }/datasources?code=${ token }`, { headers: {} }) @@ -15,6 +18,7 @@ export const fetchDataSources = async (project: string) => { }; export const fetchProjects = async () => { + const token = await getIdToken(msalInstance); return Axios .get<[]>(`${ API_ENDPOINT }/projects?code=${ token }`, { @@ -26,6 +30,7 @@ }; export const fetchFeatures = async (project: string, page: number, limit: number, keyword: string) => { + const token = await getIdToken(msalInstance); return Axios .get(`${ API_ENDPOINT }/projects/${ project }/features?code=${ token }`, { @@ -38,6 +43,7 @@ }; export const fetchFeature = async (project: string, featureId: string) => { + const token = await getIdToken(msalInstance); return Axios .get(`${ API_ENDPOINT }/features/${ featureId }?code=${ token }`, {}) .then((response) => { @@ -46,6 +52,7 @@ export const fetchFeature = async (project: string, featureId: string) => { }; export const fetchProjectLineages = async (project: string) => { + const token = await getIdToken(msalInstance); return Axios .get(`${ API_ENDPOINT }/projects/${ project }?code=${ token }`, {}) .then((response) => { @@ -54,6 +61,7 @@ export const fetchProjectLineages = async (project: string) => { }; export const fetchFeatureLineages = async (project: string) => { + const token = await getIdToken(msalInstance); return Axios .get(`${ API_ENDPOINT }/features/lineage/${ project }?code=${ token }`, {}) .then((response) => { @@ -63,8 +71,9 @@ // Following are place-holder code export const createFeature = async (feature: Feature) => { + const token = await getIdToken(msalInstance); return Axios - .post(`${ API_ENDPOINT }/features`, feature, + .post(`${ API_ENDPOINT }/features?code=${ token }`, feature, { headers: { "Content-Type": "application/json;" }, params: {}, @@ -76,8 +85,9 @@ } export const updateFeature = async (feature: Feature, id: string) => { + const token = await getIdToken(msalInstance); feature.guid = id; - return await Axios.put(`${ API_ENDPOINT }/features/${ id }`, feature, + return await Axios.put(`${ API_ENDPOINT }/features/${ id }?code=${ token }`, feature, { headers: { "Content-Type": "application/json;" }, params: {}, @@ -89,8 +99,9 @@ }; export 
const deleteFeature = async (qualifiedName: string) => { + const token = await getIdToken(msalInstance); return await Axios - .delete(`${ API_ENDPOINT }/features/${ qualifiedName }`, + .delete(`${ API_ENDPOINT }/features/${ qualifiedName }?code=${ token }`, { headers: { "Content-Type": "application/json;" }, params: {}, @@ -107,6 +118,7 @@ export const listUserRole = async () => { }; export const getUserRole = async (userName: string) => { + const token = await getIdToken(msalInstance); return await Axios .get(`${ API_ENDPOINT }/user/${userName}/userroles?code=${ token }`, {}) .then((response) => { @@ -115,8 +127,9 @@ export const getUserRole = async (userName: string) => { } export const addUserRole = async (role: Role) => { + const token = await getIdToken(msalInstance); return await Axios - .post(`${ API_ENDPOINT }/user/${role.userName}/userroles/new`, role, + .post(`${ API_ENDPOINT }/user/${role.userName}/userroles/new?code=${ token }`, role, { headers: { "Content-Type": "application/json;" }, params: {}, @@ -128,8 +141,9 @@ export const addUserRole = async (role: Role) => { } export const deleteUserRole = async (role: Role) => { + const token = await getIdToken(msalInstance); return await Axios - .post(`${ API_ENDPOINT }/user/${role.userName}/userroles/delete`, role, + .post(`${ API_ENDPOINT }/user/${role.userName}/userroles/delete?code=${ token }`, role, { headers: { "Content-Type": "application/json;" }, params: {}, @@ -139,3 +153,23 @@ export const deleteUserRole = async (role: Role) => { return error.response; }); } + +export const getIdToken = async( msalInstance: PublicClientApplication ): Promise => { + const activeAccount = msalInstance.getActiveAccount(); // This will only return a non-null value if you have logic somewhere else that calls the setActiveAccount API + const accounts = msalInstance.getAllAccounts(); + const request = { + scopes: ["User.Read"], + account: activeAccount || accounts[0] + }; + // Silently acquire an token for a given set of scopes. Will use cached token if available, otherwise will attempt to acquire a new token from the network via refresh token. 
+ await msalInstance.acquireTokenSilent(request).then(response => { + return response.idToken + }).catch(error => { + if (error instanceof InteractionRequiredAuthError) { + msalInstance.acquireTokenPopup(request).then(response => { + return response.idToken + }); + } + }) + return "" +} \ No newline at end of file diff --git a/ui/src/app.tsx b/ui/src/app.tsx index 2a21eb373..3abae7935 100644 --- a/ui/src/app.tsx +++ b/ui/src/app.tsx @@ -2,7 +2,7 @@ import React from "react"; import { BrowserRouter, Route, Routes } from "react-router-dom"; import { Layout } from "antd"; import { QueryClient, QueryClientProvider } from "react-query"; -import { Configuration, InteractionType, PublicClientApplication, } from "@azure/msal-browser"; +import { InteractionType } from "@azure/msal-browser"; import { MsalAuthenticationTemplate, MsalProvider } from "@azure/msal-react"; import Header from "./components/header/header"; import SideMenu from "./components/sidemenu/siteMenu"; @@ -15,17 +15,11 @@ import Monitoring from "./pages/monitoring/monitoring"; import LineageGraph from "./pages/feature/lineageGraph"; import Management from "./pages/management/management"; import RoleManagement from "./pages/management/roleManagement"; +import { getMsalConfig } from "./utils/utils"; const queryClient = new QueryClient(); -const msalConfig: Configuration = { - auth: { - clientId: process.env.REACT_APP_AAD_APP_CLIENT_ID, - authority: process.env.REACT_APP_AAD_APP_AUTHORITY, - redirectUri: window.location.origin, - }, -}; -const msalClient = new PublicClientApplication(msalConfig); +const msalClient = getMsalConfig(); const App: React.FC = () => { return ( diff --git a/ui/src/components/graph/graph.tsx b/ui/src/components/graph/graph.tsx index 166cf19ae..64e9a7922 100644 --- a/ui/src/components/graph/graph.tsx +++ b/ui/src/components/graph/graph.tsx @@ -150,7 +150,7 @@ const Graph: React.FC = ({ data, nodeId }) => { if (isNode(element)) { resetHighlight(); highlightPath(element, false); - setURLSearchParams({ nodeId: element.data.id }); + setURLSearchParams({ nodeId: element.data.id, featureType: element.data.subtitle }); } } } onNodeDragStop={ onNodeDragStop } diff --git a/ui/src/components/graph/graphNodeDetails.tsx b/ui/src/components/graph/graphNodeDetails.tsx new file mode 100644 index 000000000..56e848fe6 --- /dev/null +++ b/ui/src/components/graph/graphNodeDetails.tsx @@ -0,0 +1,87 @@ +import React, { useEffect, useState } from 'react'; +import { useParams, useSearchParams } from "react-router-dom"; +import { fetchFeature } from '../../api'; +import { Feature } from "../../models/model"; +import { LoadingOutlined } from "@ant-design/icons"; +import { Card, Spin } from "antd"; + +type Params = { + project: string; + featureId: string; +} + +const GraphNodeDetails: React.FC = () => { + const [searchParams] = useSearchParams(); + const { project } = useParams() as Params; + const nodeId = searchParams.get('nodeId') as string; + const featureType = searchParams.get('featureType') as string; + const [feature, setFeature] = useState(); + const [loading, setLoading] = useState(false); + + const isFeature = (featureType:string) => { + return featureType === 'feathr_anchor_feature_v1' || featureType === 'feathr_derived_feature_v1' + } + + useEffect(() => { + const fetchFeatureData = async () => { + setFeature(undefined); + if (nodeId && isFeature(featureType)) { + setLoading(true); + const data = await fetchFeature(project, nodeId); + setFeature(data); + setLoading(false); + } + }; + + fetchFeatureData(); + }, [nodeId]); 
+
+  return (
+    <>
+      {
+        loading
+          ? (<Spin indicator={<LoadingOutlined />} />)
+          : (<div>
+              { feature?.attributes.transformation &&
+                <Card>
+                  { feature.attributes.transformation.transform_expr &&
+                    <p>transform_expr: { feature.attributes.transformation.transform_expr }</p> }
+                  { feature.attributes.transformation.filter &&
+                    <p>filter: { feature.attributes.transformation.filter }</p> }
+                  { feature.attributes.transformation.agg_func &&
+                    <p>agg_func: { feature.attributes.transformation.agg_func }</p> }
+                  { feature.attributes.transformation.limit &&
+                    <p>limit: { feature.attributes.transformation.limit }</p> }
+                  { feature.attributes.transformation.group_by &&
+                    <p>group_by: { feature.attributes.transformation.group_by }</p> }
+                  { feature.attributes.transformation.window &&
+                    <p>window: { feature.attributes.transformation.window }</p> }
+                  { feature.attributes.transformation.def_expr &&
+                    <p>def_expr: { feature.attributes.transformation.def_expr }</p> }
+                </Card>
+              }
+              { feature?.attributes.key && feature.attributes.key.length > 0 &&
+                <Card>
+                  <p>full_name: { feature.attributes.key[0].full_name }</p>
+                  <p>key_column: { feature.attributes.key[0].key_column }</p>
+                  <p>description: { feature.attributes.key[0].description }</p>
+                  <p>key_column_alias: { feature.attributes.key[0].key_column_alias }</p>
+                  <p>key_column_type: { feature.attributes.key[0].key_column_type }</p>
+                </Card>
+              }
+              { feature?.attributes.type &&
+                <Card>
+                  <p>dimension_type: { feature.attributes.type.dimension_type }</p>
+                  <p>tensor_category: { feature.attributes.type.tensor_category }</p>
+                  <p>type: { feature.attributes.type.type }</p>
+                  <p>val_type: { feature.attributes.type.val_type }</p>
+                </Card>
+              }
+            </div>)
+      }
+    </>
+  )
+}
+
+
+export default GraphNodeDetails;
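Together with the `graph.tsx` change earlier in this diff, the new `GraphNodeDetails` component forms a small URL-based contract: clicking a node writes `nodeId` and `featureType` (the node's `subtitle`) into the search params, and the details panel reads them back and calls `fetchFeature` only for anchor or derived features. A condensed sketch of that flow; the `GraphNodeData` shape and the `loadDetails` helper are illustrative, not part of the change:

```typescript
import { fetchFeature } from "../../api";

// Assumed shape of a clicked node, mirroring element.data in graph.tsx.
type GraphNodeData = { id: string; subtitle: string };

// The two type names GraphNodeDetails treats as features.
const FEATURE_TYPES = ["feathr_anchor_feature_v1", "feathr_derived_feature_v1"];

// graph.tsx encodes the selection into the URL on click ...
export const toSearchParams = (node: GraphNodeData): URLSearchParams =>
  new URLSearchParams({ nodeId: node.id, featureType: node.subtitle });

// ... and GraphNodeDetails decodes it, fetching details only for feature nodes.
export const loadDetails = async (project: string, params: URLSearchParams) => {
  const nodeId = params.get("nodeId");
  const featureType = params.get("featureType");
  if (nodeId && featureType && FEATURE_TYPES.includes(featureType)) {
    return fetchFeature(project, nodeId);
  }
  return undefined; // data sources and other node types have no feature attributes to show
};
```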
diff --git a/ui/src/pages/feature/lineageGraph.tsx b/ui/src/pages/feature/lineageGraph.tsx index 3cfcf5452..5035a7fa9 100644 --- a/ui/src/pages/feature/lineageGraph.tsx +++ b/ui/src/pages/feature/lineageGraph.tsx @@ -1,5 +1,5 @@ import React, { useEffect, useState } from 'react'; -import { Card, Radio, Spin } from 'antd'; +import { Card, Col, Radio, Row, Spin } from 'antd'; import { useParams, useSearchParams } from "react-router-dom"; import { Elements } from 'react-flow-renderer'; import Graph from "../../components/graph/graph"; @@ -7,6 +7,7 @@ import { generateEdge, generateNode } from "../../components/graph/utils"; import { fetchProjectLineages } from "../../api"; import { FeatureLineage } from "../../models/model"; import { LoadingOutlined } from "@ant-design/icons"; +import GraphNodeDetails from "../../components/graph/graphNodeDetails"; type Params = { project: string; @@ -24,6 +25,7 @@ const LineageGraph: React.FC = () => { // Fetch lineage data from server side, invoked immediately after component is mounted useEffect(() => { const fetchLineageData = async () => { + setLoading(true); const data = await fetchProjectLineages(project); setLineageData(data); setLoading(false); @@ -105,9 +107,20 @@
-      { loading
-        ? <Spin indicator={<LoadingOutlined />} />
-        : <Graph data={elements} nodeId={nodeId} /> }
+      {
+        loading
+          ? (<Spin indicator={<LoadingOutlined />} />)
+          : (
+            <Row>
+              <Col>
+                <Graph data={elements} nodeId={nodeId} />
+              </Col>
+              <Col>
+                <GraphNodeDetails />
+              </Col>
+            </Row>
+          )
+      }
); diff --git a/ui/src/utils/utils.tsx b/ui/src/utils/utils.tsx new file mode 100644 index 000000000..8ae041e27 --- /dev/null +++ b/ui/src/utils/utils.tsx @@ -0,0 +1,12 @@ +import { Configuration, PublicClientApplication } from "@azure/msal-browser"; + +export const getMsalConfig = () => { + const msalConfig: Configuration = { + auth: { + clientId: process.env.REACT_APP_AAD_APP_CLIENT_ID, + authority: process.env.REACT_APP_AAD_APP_AUTHORITY, + redirectUri: window.location.origin, + }, + }; + return new PublicClientApplication(msalConfig); +} \ No newline at end of file
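One caveat on the `getIdToken` helper added to `ui/src/api/api.tsx` above: the token is only returned from inside the `.then`/`.catch` callbacks, so the enclosing async function always resolves to the trailing `return ""` and every request ends up with an empty `code` parameter. A corrected sketch that keeps the same silent-first, popup-fallback flow; the explicit `Promise<string>` return type and the rethrow of unexpected errors are assumptions rather than part of the original change:

```typescript
import { InteractionRequiredAuthError, PublicClientApplication } from "@azure/msal-browser";

export const getIdToken = async (msalInstance: PublicClientApplication): Promise<string> => {
  const activeAccount = msalInstance.getActiveAccount(); // non-null only if setActiveAccount was called elsewhere
  const accounts = msalInstance.getAllAccounts();
  const request = {
    scopes: ["User.Read"],
    account: activeAccount || accounts[0],
  };
  try {
    // Prefers a cached token and refreshes silently when possible.
    const response = await msalInstance.acquireTokenSilent(request);
    return response.idToken;
  } catch (error) {
    // Fall back to an interactive popup only when silent acquisition cannot proceed.
    if (error instanceof InteractionRequiredAuthError) {
      const response = await msalInstance.acquireTokenPopup(request);
      return response.idToken;
    }
    throw error;
  }
};
```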