In [1]:
import os 
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from feast import FeatureStore

import warnings
# Silence library warnings for the rest of the notebook session.
# The filter has to be installed at module level: when it is set inside a
# `with warnings.catch_warnings():` block, the previous filters are restored
# as soon as the block exits, so nothing is actually suppressed.
warnings.simplefilter('ignore')

# SECURITY: credentials are read from the environment only.
# Earlier revisions of this cell carried a storage-account key and an Azure
# client secret in commented-out code; those values must be treated as
# compromised and rotated. Never paste secrets into the notebook, even in
# comments -- they end up in version control and in shared exports.
# os.environ[...] raises KeyError up front if STORAGE_ACCOUNT_KEY is missing.
ADLS_ACCOUNT_KEY_CONF = "fs.azure.account.key.myfeastadls.dfs.core.windows.net"

# Spark session attached to the standalone cluster master, with the
# hadoop-azure / ADLS SDK jars pulled in through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-data-lake-store-sdk:2.3.10",
    )
    .config("spark.hadoop.fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    .config(ADLS_ACCOUNT_KEY_CONF, os.environ["STORAGE_ACCOUNT_KEY"])
    .config("spark.databricks.delta.formatCheck.enabled", False)
    .getOrCreate()
)

# Repeat the account key on the runtime session conf.
# NOTE(review): presumably needed when getOrCreate() hands back an already
# running session -- confirm whether both settings are required.
spark.conf.set(ADLS_ACCOUNT_KEY_CONF, os.environ["STORAGE_ACCOUNT_KEY"])

# Base URI of the HDFS namenode.
hdfs = "hdfs://namenode:8020"

# Feast feature store backed by the repo in ./fs_online.
fs = FeatureStore("./fs_online")



:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
com.microsoft.azure#azure-data-lake-store-sdk added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0a9bc3b8-f14f-47bc-bd42-685917edc1c8;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.1 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found com.microsoft.azure#azure-storage;7.0.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.10.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found com.microsoft.azure#azure-keyvault-core;1.0.0 in central
	found com.google.guava#guava;27.0-jre in central
	found com.google.guava#failureaccess;1.0 in central
	found com.google.guava#listenablefut

In [2]:
# Feature Discovery
# List every entity and feature view registered in the ./fs_online repo.

fs = FeatureStore("./fs_online")
separator = "=" * 100

entities = fs.list_entities()
print("Entities\n")
for entity in entities:
    # Entity name followed by a rule line.
    print(f"{entity.name}")
    print(separator)


print("\nFeature Views")
for feature_view in fs.list_feature_views():
    # Rule line, the view's name, then the view's full printed representation.
    print(separator)
    print(f"Feature View: {feature_view.name}\n")
    print(feature_view)

Entities

driver
trip_id
taxi_id

Feature Views
Feature View: fv_chi_station_reads_hourly

{
  "spec": {
    "name": "fv_chi_station_reads_hourly",
    "features": [
      {
        "name": "precipitation_type",
        "valueType": "STRING"
      },
      {
        "name": "avg_temp",
        "valueType": "FLOAT"
      },
      {
        "name": "total_rain",
        "valueType": "FLOAT"
      }
    ],
    "ttl": "0s",
    "batchSource": {
      "type": "BATCH_SPARK",
      "timestampField": "event_timestamp",
      "createdTimestampColumn": "created",
      "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
      "name": "chi_station_reads_hourly_fv",
      "sparkOptions": {
        "path": "abfss://gold@myfeastadls.dfs.core.windows.net/chicago/weather/station_reads_hourly_fv",
        "fileFormat": "parquet"
      }
    },
    "online": true
  },
  "meta": {
    "createdTimestamp": "2022-05-19T01:57:59.647444Z",
    "lastUpdated



In [3]:
# Entity dataframe for the historical retrieval: hourly taxi-trip rows read
# from HDFS, restricted to rows whose `created` value is on or before 2022-04-10.
# (An earlier draft reduced this to distinct, sorted (read_id, event_timestamp)
# pairs derived from event_timestamp; the full rows are used instead.)
entity_df = (
    spark.read
    .load("hdfs://namenode:8020/gold/chicago/f_taxi_trips_hourly")
    .filter(F.col("created") <= "2022-04-10")
)

                                                                                

In [4]:
# Peek at the first five entity rows.
entity_df.show(n=5)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+-------------------+-------------+-------------+------------------+------------------+---------------+-----------------+-----------+----------+
|             taxi_id|    event_timestamp|avg_trip_time|avg_trip_dist|     avg_trip_fare|     avg_trip_tips|total_fare_hour|  total_tips_hour|trips_count|   created|
+--------------------+-------------------+-------------+-------------+------------------+------------------+---------------+-----------------+-----------+----------+
|0c16d63294bfa9a1d...|2022-04-01 13:00:00|       1154.0|        13.55|              34.0|               7.7|           34.0|              7.7|          1|2022-04-01|
|dfe93dd8fbdee8c0f...|2022-04-01 13:00:00|        269.8|        1.108|               7.5|             1.642|           37.5|8.209999999999999|          5|2022-04-01|
|3c07027096c12ad3f...|2022-04-01 13:00:00|       1390.0|        12.06|31.416666666666668|6.6000000000000005|          94.25|             19.8|          3|2022-04-01|
|48c

                                                                                

In [8]:
# Feature references ("<feature view>:<feature>") to attach to each entity row.
weather_feature_refs = [
    "fv_chi_station_reads_hourly:precipitation_type",
    "fv_chi_station_reads_hourly:avg_temp",
    "fv_chi_station_reads_hourly:total_rain",
]

# The Spark entity dataframe is collected to pandas before being handed to Feast.
hist_features = fs.get_historical_features(
    entity_df=entity_df.toPandas(),
    features=weather_feature_refs,
)

                                                                                

In [9]:
# Convert the result of get_historical_features into a Spark DataFrame.
hist_df = hist_features.to_spark_df()

In [10]:
# Print the column names and types of the retrieved dataframe
# (entity columns plus the requested feature columns).
hist_df.printSchema()

root
 |-- taxi_id: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- avg_trip_time: double (nullable = true)
 |-- avg_trip_dist: double (nullable = true)
 |-- avg_trip_fare: double (nullable = true)
 |-- avg_trip_tips: double (nullable = true)
 |-- total_fare_hour: double (nullable = true)
 |-- total_tips_hour: double (nullable = true)
 |-- trips_count: long (nullable = true)
 |-- created: date (nullable = true)
 |-- precipitation_type: string (nullable = true)
 |-- avg_temp: double (nullable = true)
 |-- total_rain: double (nullable = true)



In [11]:
# Preview a few trip columns next to the weather features retrieved for them.
preview_columns = [
    "taxi_id",
    "avg_trip_time",
    "avg_trip_dist",
    "precipitation_type",
    "avg_temp",
    "total_rain",
]
hist_df.select(*preview_columns).show()

22/05/23 01:33:13 WARN TaskSetManager: Stage 33 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
22/05/23 01:33:14 WARN TaskSetManager: Stage 34 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+------------------+------------------+------------------+------------------+----------+
|             taxi_id|     avg_trip_time|     avg_trip_dist|precipitation_type|          avg_temp|total_rain|
+--------------------+------------------+------------------+------------------+------------------+----------+
|d133de68d7dfb2069...|            1932.0|              17.8|                no|               0.6|     136.2|
|65e6fa122a48ea774...|            1797.5|             9.945|                no|0.7133333333333333|     136.2|
|cf2a68db5901a09fc...|            3248.0|             14.29|                no|2.1233333333333335|     136.2|
|6794d3cb4e473ce49...|1639.3333333333333| 6.713333333333334|                no|2.1233333333333335|     136.2|
|52b413067437984fb...|            1561.0| 5.715000000000001|                no|2.1233333333333335|     136.2|
|79014a1d9bac0ae75...|            1560.0|              11.9|                no|2.1233333333333335|     136.2|
|5da013ec6

In [14]:
# Fetch a single row to the driver as a list of Row objects.
hist_df.take(1)

22/05/23 01:43:31 WARN TaskSetManager: Stage 61 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
22/05/23 01:46:02 WARN TaskSetManager: Stage 65 contains a task of very large size (4247 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

[Row(taxi_id='0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286', event_timestamp=datetime.datetime(2022, 4, 1, 13, 0), avg_trip_time=1154.0, avg_trip_dist=13.55, avg_trip_fare=34.0, avg_trip_tips=7.7, total_fare_hour=34.0, total_tips_hour=7.7, trips_count=1, created=datetime.date(2022, 4, 1), precipitation_type='no', avg_temp=1.8333333333333333, total_rain=136.2)]

In [19]:
# Online lookup for a single taxi, keyed by its taxi_id.
entity_rows = [
    {
        "taxi_id": "0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286",
    }
]

# Feature references ("<feature view>:<feature>") to read from the online store.
online_feature_refs = [
    "fv_chi_taxi_trips_hourly:avg_trip_time",
    "fv_chi_taxi_trips_hourly:avg_trip_dist",
    "fv_chi_taxi_trips_hourly:avg_trip_fare",
]

fs.get_online_features(
    entity_rows=entity_rows,
    features=online_feature_refs,
).to_dict()

{'taxi_id': ['0c16d63294bfa9a1d6452cfaf53bd8479acb2161f88c3f49cfc760ef4964b9b717e7695b79231a7dae2dc16e9aa22d4ad0feaa49138d1b15a7607e2e9c2b3286'],
 'avg_trip_fare': [10.039999961853027],
 'avg_trip_dist': [2.2200000286102295],
 'avg_trip_time': [722.0]}

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 39666)
Traceback (most recent call last):
  File "/usr/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.9/socketserver.py", line 720, in __init__
    self.handle()
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 239, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/lo