In [1]:
import os 
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from feast import FeatureStore

import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

# Spark/Hadoop setting under which the access key of the `myfeastadls`
# storage account is registered. The key value itself is read from the
# STORAGE_ACCOUNT_KEY environment variable and never written into the notebook.
adls_account_key_conf = "fs.azure.account.key.myfeastadls.dfs.core.windows.net"

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    # Executor sizing: 1 GiB of memory and a single core per executor.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    # Extra jars resolved at start-up for Azure storage access.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-data-lake-store-sdk:2.3.10",
    )
    .config("spark.hadoop.fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    .config(adls_account_key_conf, os.environ["STORAGE_ACCOUNT_KEY"])
    .config("spark.databricks.delta.formatCheck.enabled", False)
    .getOrCreate()
)

# Register the storage key on the live session's runtime configuration as well.
spark.conf.set(adls_account_key_conf, os.environ["STORAGE_ACCOUNT_KEY"])

# Root URI of the HDFS namenode.
hdfs = "hdfs://namenode:8020"
# Feast feature store backed by the repository in ./fs_online.
fs = FeatureStore("./fs_online")

# Another option - authenticate with oauth2
    # https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/#rdd-api-1
    # https://docs.microsoft.com/en-us/azure/databricks/clusters/configure#--spark-configuration

    # spark.hadoop.fs.azure.account.oauth2.client.id.<datalake>.dfs.core.windows.net <sp client id>
    # spark.hadoop.fs.azure.account.auth.type.<datalake>.dfs.core.windows.net OAuth
    # spark.hadoop.fs.azure.account.oauth.provider.type.<datalake>.dfs.core.windows.net org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider
    # spark.hadoop.fs.azure.account.oauth2.client.secret.<datalake>.dfs.core.windows.net {{secrets/secret/secret}}
    # spark.hadoop.fs.azure.account.oauth2.client.endpoint.<datalake>.dfs.core.windows.net https://login.microsoftonline.com/<tenant>/oauth2/token



:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
com.microsoft.azure#azure-data-lake-store-sdk added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-40481444-7de3-46fc-a532-1ecf3063df0a;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.1 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found com.microsoft.azure#azure-storage;7.0.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.10.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found com.microsoft.azure#azure-keyvault-core;1.0.0 in central
	found com.google.guava#guava;27.0-jre in central
	found com.google.guava#failureaccess;1.0 in central
	found com.google.guava#listenablefut

# Feature Discovery

In [9]:
# Feature discovery: print every entity and feature view registered in the
# ./fs_online feature repository.

fs = FeatureStore("./fs_online")

section_rule = "*" * 100   # printed under each section title
item_rule = "=" * 100      # printed after each listed object

entities = fs.list_entities()
print(f"Entities\n{section_rule}\n")
for entity in entities:
    print(f"Entity: {entity.name}\n")
    print(entity)
    print(item_rule)


print(f"\nFeature Views\n{section_rule}\n")
for feature_view in fs.list_feature_views():
    print(f"Feature View: {feature_view.name}\n")
    print(feature_view)
    print(item_rule)

Entities
****************************************************************************************************

Entity: driver

{
  "spec": {
    "name": "driver",
    "valueType": "STRING",
    "joinKey": "taxi_id"
  },
  "meta": {
    "createdTimestamp": "2022-05-19T19:43:17.269765Z",
    "lastUpdatedTimestamp": "2022-05-19T19:43:17.269765Z"
  }
}
Entity: trip_id

{
  "spec": {
    "name": "trip_id",
    "valueType": "STRING",
    "joinKey": "trip_id"
  },
  "meta": {
    "createdTimestamp": "2022-05-23T01:05:37.720234Z",
    "lastUpdatedTimestamp": "2022-05-23T01:05:37.720234Z"
  }
}
Entity: taxi_id

{
  "spec": {
    "name": "taxi_id",
    "valueType": "STRING",
    "joinKey": "taxi_id"
  },
  "meta": {
    "createdTimestamp": "2022-05-23T01:15:19.459273Z",
    "lastUpdatedTimestamp": "2022-05-23T01:15:19.459273Z"
  }
}
Entity: read_id

{
  "spec": {
    "name": "read_id",
    "valueType": "STRING",
    "joinKey": "read_id"
  },
  "meta": {
    "createdTimestamp": "2022-06-28T10:49:



# Get Historical Data from Feature Store

In [2]:
# Entity dataframe handed to `fs.get_historical_features`: the
# (taxi_id, event_timestamp) pairs of every hourly taxi-trips record whose
# `created` value is on or before 2022-04-15.
# The path is built from the `hdfs` root defined in the setup cell so the
# namenode address lives in a single place.
entity_df = (
    spark.read
    .load(f"{hdfs}/gold/chicago/f_taxi_trips_hourly")
    .filter(F.col("created") <= "2022-04-15")
    .select("taxi_id", "event_timestamp")
)
# Disabled alternative, kept for reference: derive a `read_id` key from the
# event timestamp and use distinct (read_id, event_timestamp) rows instead.
# .withColumn("read_id", (F.unix_timestamp(F.col("event_timestamp"),"dd-MM-yyyy HH:00:00").cast("string")))
# .select("read_id", "event_timestamp")
# .distinct()
# .sort("read_id")

                                                                                

In [3]:
# Preview the first five (taxi_id, event_timestamp) entity rows.
entity_df.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+-------------------+
|             taxi_id|    event_timestamp|
+--------------------+-------------------+
|0c16d63294bfa9a1d...|2022-04-01 13:00:00|
|dfe93dd8fbdee8c0f...|2022-04-01 13:00:00|
|3c07027096c12ad3f...|2022-04-01 13:00:00|
|48c0b5669ed50a0dc...|2022-04-01 13:00:00|
|21ccb8006a50da9b1...|2022-04-01 13:00:00|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [4]:
# Feature references to retrieve, written as "<feature view>:<feature>".
historical_feature_refs = [
    # hourly station readings
    "fv_chi_station_reads_hourly:precipitation_type",
    "fv_chi_station_reads_hourly:avg_temp",
    "fv_chi_station_reads_hourly:total_rain",
    # hourly taxi-trip aggregates
    "fv_chi_taxi_trips_hourly:avg_trip_time",
    "fv_chi_taxi_trips_hourly:avg_trip_dist",
    "fv_chi_taxi_trips_hourly:avg_trip_fare",
]

# Request historical feature values for every entity row. The entity rows are
# converted from a Spark DataFrame to pandas before being handed to Feast.
entity_pdf = entity_df.toPandas()
hist_features = fs.get_historical_features(
    entity_df=entity_pdf,
    features=historical_feature_refs,
)

                                                                                

In [5]:
# Convert the result of the historical retrieval above into a Spark DataFrame.
hist_df = hist_features.to_spark_df()

In [6]:
# Print the column names and types of the historical feature DataFrame.
hist_df.printSchema()

root
 |-- taxi_id: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- precipitation_type: string (nullable = true)
 |-- avg_temp: double (nullable = true)
 |-- total_rain: double (nullable = true)
 |-- avg_trip_time: double (nullable = true)
 |-- avg_trip_dist: double (nullable = true)
 |-- avg_trip_fare: double (nullable = true)



In [7]:
# Print the leading rows of the historical feature DataFrame.
hist_df.show()

22/06/29 02:46:25 WARN TaskSetManager: Stage 6 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
22/06/29 02:49:17 WARN TaskSetManager: Stage 13 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
22/06/29 02:49:18 WARN TaskSetManager: Stage 14 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+-------------------+------------------+------------------+----------+------------------+------------------+------------------+
|             taxi_id|    event_timestamp|precipitation_type|          avg_temp|total_rain|     avg_trip_time|     avg_trip_dist|     avg_trip_fare|
+--------------------+-------------------+------------------+------------------+----------+------------------+------------------+------------------+
|21ccb8006a50da9b1...|2022-04-01 13:00:00|                no|1.8333333333333333|     136.2|            1959.0|             17.13|             42.25|
|52b413067437984fb...|2022-04-01 17:00:00|                no|2.1233333333333335|     136.2|            1561.0| 5.715000000000001|           17.7175|
|a5c7281e5955cd080...|2022-04-01 19:00:00|                no| 2.033333333333333|     136.2|             701.0|             5.715|            18.375|
|025c4a64d4348a818...|2022-04-01 19:00:00|                no| 2.033333333333333|     136.2|             28

In [8]:
# Print the SQL text exposed by the retrieval job's `query` attribute, to
# inspect how the entity rows are joined with each feature view.
print(hist_features.query)


/*
 Compute a deterministic hash for the `left_table_query_string` that will be used throughout
 all the logic as the field to GROUP BY the data
*/
CREATE OR REPLACE TEMPORARY VIEW entity_dataframe AS (
    SELECT *,
        event_timestamp AS entity_timestamp
        
            ,CONCAT(
                
                CAST(event_timestamp AS STRING)
            ) AS fv_chi_station_reads_hourly__entity_row_unique_id
        
            ,CONCAT(
                
                    CAST(taxi_id AS STRING),
                
                CAST(event_timestamp AS STRING)
            ) AS fv_chi_taxi_trips_hourly__entity_row_unique_id
        
    FROM feast_entity_df_9d3dca1686e34da1a44161af016807d8
);

---EOS---



CREATE OR REPLACE TEMPORARY VIEW fv_chi_station_reads_hourly__cleaned AS (

    WITH fv_chi_station_reads_hourly__entity_dataframe AS (
        SELECT
            
            entity_timestamp,
            fv_chi_station_reads_hourly__entity_row_unique_id
        FROM en

# Get features from the online store for serving

In [19]:
# Show a single historical row with untruncated column values (full taxi_id).
hist_df.limit(1).show(truncate=False)

22/06/29 02:01:45 WARN TaskSetManager: Stage 90 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
22/06/29 02:01:45 WARN TaskSetManager: Stage 91 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
22/06/29 02:03:36 WARN TaskSetManager: Stage 97 contains a task of very large size (4691 KiB). The maximum recommended task size is 1000 KiB.
22/06/29 02:04:43 ERROR TaskSchedulerImpl: Lost executor 0 on 10.0.5.7: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/29 02:04:43 WARN TaskSetManager: Lost task 2.0 in stage 103.0 (TID 206) (10.0.5.7 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/06/29 02:04:50 WARN TaskSetManager: Lost ta

+--------------------------------------------------------------------------------------------------------------------------------+-------------------+------------------+------------------+----------+-------------+-------------+-------------+
|taxi_id                                                                                                                         |event_timestamp    |precipitation_type|avg_temp          |total_rain|avg_trip_time|avg_trip_dist|avg_trip_fare|
+--------------------------------------------------------------------------------------------------------------------------------+-------------------+------------------+------------------+----------+-------------+-------------+-------------+
|0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286|2022-04-01 13:00:00|no                |1.8333333333333333|136.2     |1154.0       |13.55        |34.0         |
+-------------------------------

                                                                                

In [14]:
# Collect a single historical row as a list of Row objects.
# NOTE(review): the saved output below this cell lists columns (e.g.
# avg_trip_tips, trips_count) that are not in the hist_df schema printed
# earlier, so it appears to come from an older run - re-run to refresh.
hist_df.limit(1).collect()

22/05/23 01:43:31 WARN TaskSetManager: Stage 61 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
22/05/23 01:46:02 WARN TaskSetManager: Stage 65 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

[Row(taxi_id='0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286', event_timestamp=datetime.datetime(2022, 4, 1, 13, 0), avg_trip_time=1154.0, avg_trip_dist=13.55, avg_trip_fare=34.0, avg_trip_tips=7.7, total_fare_hour=34.0, total_tips_hour=7.7, trips_count=1, created=datetime.date(2022, 4, 1), precipitation_type='no', avg_temp=1.8333333333333333, total_rain=136.2)]

## Materialize Features to online store

In [9]:
from datetime import datetime

# Time window whose feature values are loaded into the online store.
materialize_start = datetime(2022, 4, 1)
materialize_end = datetime(2022, 5, 30)

# Materialize one feature view per call, in this order, so each view gets its
# own progress output.
for view_name in ("fv_chi_taxi_trips_hourly", "fv_chi_station_reads_hourly"):
    fs.materialize(
        start_date=materialize_start,
        end_date=materialize_end,
        feature_views=[view_name],
    )



Materializing [1m[32m1[0m feature views from [1m[32m2022-04-01 00:00:00+00:00[0m to [1m[32m2022-05-30 00:00:00+00:00[0m into the [1m[32mredis[0m online store.

[1m[32mfv_chi_taxi_trips_hourly[0m:




Pulling latest features from spark offline store


100%|██████████████████████████████████████████████████████████| 2195/2195 [00:02<00:00, 876.48it/s]


Materializing [1m[32m1[0m feature views from [1m[32m2022-04-01 00:00:00+00:00[0m to [1m[32m2022-05-30 00:00:00+00:00[0m into the [1m[32mredis[0m online store.

[1m[32mfv_chi_station_reads_hourly[0m:




Pulling latest features from spark offline store


22/06/29 02:54:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/29 02:54:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/29 02:55:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.60it/s]


## Retrieve features from online store

![Online Store Data Model](docs/img/feast-data-model.png)


In [24]:
# Entity keys to look up in the online store: one taxi and one read_id.
# NOTE(review): read_id is passed as a formatted timestamp string - confirm
# this matches the read_id values written by materialization.
entity_rows = [
    {
        "taxi_id": "0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286",
        "read_id": "2022-04-01 13:00:00",
    }
]

# Feature references to serve, written as "<feature view>:<feature>".
online_feature_refs = [
    "fv_chi_taxi_trips_hourly:avg_trip_time",
    "fv_chi_taxi_trips_hourly:avg_trip_dist",
    "fv_chi_taxi_trips_hourly:avg_trip_fare",
    "fv_chi_station_reads_hourly:precipitation_type",
    "fv_chi_station_reads_hourly:avg_temp",
]

# Last expression of the cell: the response as a plain dict, shown as output.
fs.get_online_features(
    entity_rows=entity_rows,
    features=online_feature_refs,
).to_dict()

{'taxi_id': ['0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286'],
 'read_id': ['2022-04-01 13:00:00'],
 'avg_trip_time': [722.0],
 'avg_trip_fare': [10.039999961853027],
 'avg_trip_dist': [2.2200000286102295],
 'precipitation_type': ['no'],
 'avg_temp': [21.823333740234375]}

In [35]:
# Display the feature views registered in the feature store (sources, ttl, schema).
fs.list_feature_views()

[<FeatureView(name = fv_chi_taxi_trips_hourly, entities = ['taxi_id'], stream_source = None, batch_source = {
   "type": "BATCH_SPARK",
   "timestampField": "event_timestamp",
   "createdTimestampColumn": "created",
   "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
   "name": "chi_taxi_trips_hourly",
   "sparkOptions": {
     "path": "hdfs://namenode:8020/gold/chicago/f_taxi_trips_hourly",
     "fileFormat": "parquet"
   }
 }, source = {
   "type": "BATCH_SPARK",
   "timestampField": "event_timestamp",
   "createdTimestampColumn": "created",
   "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
   "name": "chi_taxi_trips_hourly",
   "sparkOptions": {
     "path": "hdfs://namenode:8020/gold/chicago/f_taxi_trips_hourly",
     "fileFormat": "parquet"
   }
 }, ttl = 0:00:00, schema = [avg_trip_time-Float32, avg_trip_dist-Float32, avg_trip_fare-Float32, avg_trip_tips-Float32, to