In [1]:
import os 
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from feast import FeatureStore

import warnings
# Silence library warnings for the rest of the notebook session.
# The filter is installed globally on purpose: when set inside
# `warnings.catch_warnings()` it is rolled back as soon as the `with` block
# exits, so nothing after that block would actually be suppressed.
warnings.simplefilter('ignore')

# Spark session on the standalone cluster; each executor is limited to
# 1 core and 1 GB of memory.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    .getOrCreate()
)

# "dynamic" makes a partitioned overwrite replace only the partitions present
# in the DataFrame being written instead of wiping the whole table.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Azure storage access; the account key is read from the environment so it
# never appears in the notebook (raises KeyError if the variable is unset).
spark.conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure.account.key.myfeastadls.dfs.core.windows.net", os.environ["STORAGE_ACCOUNT_KEY"])

# Base URI of the HDFS namenode, used as the prefix for every table path below.
hdfs = "hdfs://namenode:8020"
# Feast feature store handle created from the ./fs_online directory.
fs = FeatureStore("./fs_online")


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/19 02:09:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# FEATURE ENGINEERING

In [2]:
# Build hourly per-taxi trip statistics from the silver trips table and write
# them to the gold layer, partitioned by day.
df = spark.read.load(f"{hdfs}/silver/chicago/taxi_trips")

gdf = (
    df
    # Truncate the trip start to the hour and keep it as a real timestamp.
    # Formatting it into a "dd-MM-yyyy HH:00:00" string loses the type: such
    # strings do not sort chronologically and cannot be compared with the
    # timestamps used in point-in-time lookups on this column.
    .withColumn(
        "event_timestamp",
        F.date_trunc("hour", F.to_timestamp(F.col("trip_start_timestamp"))),
    )
    .groupBy("taxi_id", "event_timestamp")
    .agg(
        F.avg("trip_seconds").alias("avg_trip_time"),
        F.avg("trip_miles").alias("avg_trip_dist"),
        F.avg("fare").alias("avg_trip_fare"),
        F.avg("tips").alias("avg_trip_tips"),
        F.sum("fare").alias("total_fare_hour"),
        F.sum("tips").alias("total_tips_hour"),
        F.count("trip_id").alias("trips_count"),
    )
    # Calendar day of the hour bucket; used as the partition column.
    .withColumn("created", F.to_date("event_timestamp"))
    # Synthetic key for one (taxi, hour) row: taxi_id followed by the epoch
    # seconds of the hour bucket.
    .withColumn(
        "trip_id",
        F.concat_ws(
            "",
            F.col("taxi_id"),
            F.unix_timestamp(F.col("event_timestamp")).cast("string"),
        ),
    )
)

# NOTE(review): if this table was already written with a string
# event_timestamp, day partitions that are not rewritten by this run keep the
# old column type -- confirm whether a full rebuild is needed.
(
    gdf.repartition("created")
    .write
    .mode("overwrite")
    .partitionBy("created")
    .save(f"{hdfs}/gold/chicago/f_taxi_trips_hourly")
)

                                                                                

# REGISTERING CREATED FEATURE IN CENTRAL FEATURE REGISTRY

In [3]:
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource
from feast import Feature, FeatureView, ValueType
from datetime import timedelta, datetime
from feast import Entity

# Feature source definition: the gold-layer hourly trip table.
trip_stats_source = SparkSource(
    file_format="parquet",
    path=f"{hdfs}/gold/chicago/f_taxi_trips_hourly",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
    name="chi_taxi_trips_hourly"
)

# Feature view definition: the aggregate columns of the hourly table that are
# exposed as features, keyed by the `trip_id` entity.
trip_stats_fv = FeatureView(
    name="fv_chi_taxi_trips_hourly",
    entities=["trip_id"],
    features=[
        Feature(name="avg_trip_time", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_dist", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_fare", dtype=ValueType.FLOAT),
        Feature(name="avg_trip_tips", dtype=ValueType.FLOAT),
        # total_fare_hour is produced by the feature-engineering step next to
        # total_tips_hour, so it is registered here as well.
        Feature(name="total_fare_hour", dtype=ValueType.FLOAT),
        Feature(name="total_tips_hour", dtype=ValueType.FLOAT),
        # NOTE(review): trips_count comes from a row count, so an integer
        # value type may be a better fit than FLOAT -- confirm before changing.
        Feature(name="trips_count", dtype=ValueType.FLOAT),
    ],
    batch_source=trip_stats_source
)

# Entity definition: the entity acts as the primary key of the feature view.
trip_entity = Entity(name="trip_id", value_type=ValueType.STRING)



## Apply the feature view and entity to the registry (equivalent to `feast apply`)

In [4]:
# Register the entity and the feature view with the feature store.
fs.apply(
    [
        trip_entity,
        trip_stats_fv,
    ]
)

## Validate that feature metadata was created in registry

In [9]:
# Print every feature view currently known to the feature store, one after another.
registered_views = fs.list_feature_views()
for f in registered_views:
    print(f)

{
  "spec": {
    "name": "fv_chi_station_reads_hourly",
    "features": [
      {
        "name": "precipitation_type",
        "valueType": "STRING"
      },
      {
        "name": "avg_temp",
        "valueType": "FLOAT"
      },
      {
        "name": "total_rain",
        "valueType": "FLOAT"
      }
    ],
    "ttl": "0s",
    "batchSource": {
      "type": "BATCH_SPARK",
      "timestampField": "event_timestamp",
      "createdTimestampColumn": "created",
      "dataSourceClassType": "feast.infra.offline_stores.contrib.spark_offline_store.spark_source.SparkSource",
      "name": "chi_station_reads_hourly_fv",
      "sparkOptions": {
        "path": "abfss://gold@myfeastadls.dfs.core.windows.net/chicago/weather/station_reads_hourly_fv",
        "fileFormat": "parquet"
      }
    },
    "online": true
  },
  "meta": {
    "createdTimestamp": "2022-05-19T01:57:59.647444Z",
    "lastUpdatedTimestamp": "2022-05-19T01:57:59.647444Z"
  }
}
{
  "spec": {
    "name": "fv_chi_taxi_trip



In [27]:
# Preview the entity keys stored in the gold hourly table, without truncating
# the long trip_id values.
entt = (
    spark.read
    .load(f"{hdfs}/gold/chicago/f_taxi_trips_hourly")
    .select("trip_id")
)
entt.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+
|trip_id                                                                                                                                   |
+------------------------------------------------------------------------------------------------------------------------------------------+
|e51e2c30caec952b40b8329a68b498e18ce8a1f40fa75c71e425e9426db562ac617b0a28e1c69f5c579048f75a43a2dc066c17448ab65f5016acca10558df3ed1648818000|
|dd9b35afedd096ac55661acaab6074c2d2fb289f592d7b9bd366cebd1633d8a8ebe58f217c5389c86b7c5731aef8018cacc109c152965a663cab562bac68b7a41648818000|
|bbb57d3a935ecb5c620c671bf5c2bc9b0ad1a7ac1124ec452050ee3b58e917004f9a6c6e733c13f23fca3b93720be55ea517aceeafbb724f766b95581f35c1af1648818000|
|992641c74ae28a22b5d72e1f9fcc89ebda221b88c7bb03db9d914a7bba1ae11601c36b633ed91cfe2283f8baeb56f8fd6bfacbb5884bf1681759d0a2375a5c471648825200|
|0609b5525eef

In [28]:
# Build the entity dataframe explicitly so this cell works on a fresh kernel:
# it needs the entity key (`trip_id`) together with the `event_timestamp` at
# which the feature values should be looked up.
import pandas as pd

# Upper bound on the rows pulled to the driver for this lookup.
ENTITY_SAMPLE_ROWS = 1000

entity_df = (
    spark.read
    .load(f"{hdfs}/gold/chicago/f_taxi_trips_hourly")
    .select("trip_id", "event_timestamp")
    .limit(ENTITY_SAMPLE_ROWS)
    .toPandas()
)

# The stored column may be a "dd-MM-yyyy HH:00:00" string; parse it only when
# it is not already a datetime column.
if not pd.api.types.is_datetime64_any_dtype(entity_df["event_timestamp"]):
    entity_df["event_timestamp"] = pd.to_datetime(
        entity_df["event_timestamp"], format="%d-%m-%Y %H:%M:%S"
    )

# Retrieval job for the historical values of the requested feature.
hist = fs.get_historical_features(
    entity_df=entity_df,
    features=["fv_chi_taxi_trips_hourly:avg_trip_time"]
)

