In [1]:
import os 
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from feast import FeatureStore

import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "1g").\
        config("spark.executor.cores", 1).\
        getOrCreate()

spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
spark.conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure.account.key.myfeastadls.dfs.core.windows.net", os.environ["STORAGE_ACCOUNT_KEY"])
spark.conf.set("spark.databricks.delta.formatCheck.enabled", False)

hdfs = "hdfs://namenode:8020"
fs = FeatureStore("./fs_online")


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/23 01:54:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# FEATURE ENGINEERING

In [12]:
df = spark.read.load(f"{hdfs}/silver/chicago/taxi_trips")

gdf = df.\
withColumn("event_timestamp", F.date_format(
    F.to_timestamp(
        F.col("trip_start_timestamp")
    ), "yyyy-MM-dd HH:00:00"
)).\
groupBy("taxi_id", "event_timestamp").\
agg(
    F.avg("trip_seconds").alias("avg_trip_time"),
    F.avg("trip_miles").alias("avg_trip_dist"),
    F.avg("fare").alias("avg_trip_fare"),
    F.avg("tips").alias("avg_trip_tips"),
    F.sum("fare").alias("total_fare_hour"),
    F.sum("tips").alias("total_tips_hour"),
    F.count("taxi_id").alias("trips_count")
).\
withColumn("created", F.to_date("event_timestamp", "yyyy-MM-dd HH:00:00"))
# withColumn("trip_id", F.concat_ws(
#     "",
#     F.col("taxi_id"), 
#     F.unix_timestamp(F.col("event_timestamp"),"yyyy-MM-dd HH:00:00").cast("string")
# ))

gdf.repartition("created").\
write.\
mode("overwrite").\
partitionBy("created").\
save(f"{hdfs}/gold/chicago/f_taxi_trips_hourly")

                                                                                

# REGISTERING CREATED FEATURE IN CENTRAL FEATURE REGISTRY

In [13]:
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource
from feast import Feature, FeatureView, ValueType
from datetime import timedelta, datetime
from feast import Entity

# Feature Source Definition
driver_stats_source = SparkSource(
    file_format="parquet",
    path=f"{hdfs}/gold/chicago/f_taxi_trips_hourly",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
    name="chi_taxi_trips_hourly"
)

# Feature Definition
driver_stats_fv = FeatureView(
    name="fv_chi_taxi_trips_hourly",
    entities=["taxi_id"],
    features=[
        Feature(name="avg_trip_time", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_dist", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_fare", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_tips", dtype=ValueType.FLOAT),
        Feature(name="total_tips_hour", dtype=ValueType.FLOAT),
        Feature(name="trips_count", dtype=ValueType.FLOAT),
    ],
    batch_source=driver_stats_source
)

# Entity definition => entity == primary key 

driver_entity = Entity(name="taxi_id", value_type=ValueType.STRING)

## Apply Feature and Entity to Registry ---- feast apply

In [4]:
fs.apply([driver_entity, driver_stats_fv])

## Validate that feature metadata was created in registry

In [5]:
from feast import FeatureStore
fs = FeatureStore("./fs_online")

In [14]:
fs.get_feature_view("fv_chi_taxi_trips_hourly")

<FeatureView(name = fv_chi_taxi_trips_hourly, entities = ['taxi_id'], stream_source = None, batch_source = {
  "type": "BATCH_SPARK",
  "timestampField": "event_timestamp",
  "createdTimestampColumn": "created",
  "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
  "name": "chi_taxi_trips_hourly",
  "sparkOptions": {
    "path": "hdfs://namenode:8020/gold/chicago/f_taxi_trips_hourly",
    "fileFormat": "parquet"
  }
}, source = {
  "type": "BATCH_SPARK",
  "timestampField": "event_timestamp",
  "createdTimestampColumn": "created",
  "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
  "name": "chi_taxi_trips_hourly",
  "sparkOptions": {
    "path": "hdfs://namenode:8020/gold/chicago/f_taxi_trips_hourly",
    "fileFormat": "parquet"
  }
}, ttl = 0:00:00, schema = [avg_trip_time-Float32, avg_trip_dist-Float32, avg_trip_fare-Float32, avg_trip_tips-Float32, total_tips_hour-Float32

In [15]:
from feast import FeatureStore
fs = FeatureStore("./fs_online")

print("Entities\n")
for en in fs.list_entities():
    print(f"{en.name}")
    print("="*100)

    
print("\nFeature Views")
for f in fs.list_feature_views():
    print("="*100+"")    
    print(f"Feature View: {f.name}\n")
    print(f)

Entities

driver
trip_id
taxi_id

Feature Views
Feature View: fv_chi_station_reads_hourly

{
  "spec": {
    "name": "fv_chi_station_reads_hourly",
    "features": [
      {
        "name": "precipitation_type",
        "valueType": "STRING"
      },
      {
        "name": "avg_temp",
        "valueType": "FLOAT"
      },
      {
        "name": "total_rain",
        "valueType": "FLOAT"
      }
    ],
    "ttl": "0s",
    "batchSource": {
      "type": "BATCH_SPARK",
      "timestampField": "event_timestamp",
      "createdTimestampColumn": "created",
      "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
      "name": "chi_station_reads_hourly_fv",
      "sparkOptions": {
        "path": "abfss://gold@myfeastadls.dfs.core.windows.net/chicago/weather/station_reads_hourly_fv",
        "fileFormat": "parquet"
      }
    },
    "online": true
  },
  "meta": {
    "createdTimestamp": "2022-05-19T01:57:59.647444Z",
    "lastUpdated

## Materialize feature to Online Store

In [2]:
from datetime import datetime
fs.materialize(
    start_date=datetime(2022, 4, 1),
    end_date=datetime(2022, 5, 30),
    feature_views=["fv_chi_taxi_trips_hourly"]
)



Materializing [1m[32m1[0m feature views from [1m[32m2022-04-01 00:00:00+00:00[0m to [1m[32m2022-05-30 00:00:00+00:00[0m into the [1m[32mredis[0m online store.

[1m[32mfv_chi_taxi_trips_hourly[0m:




Pulling latest features from spark offline store


100%|██████████████████████████████████████████████████████████| 2195/2195 [00:02<00:00, 957.33it/s]
