In [None]:
%pip install databricks-feature-store

In [None]:
# One cleaned Parquet extract per year, 2011 through 2016 inclusive.
files = [
    f"s3a://ayushman-hops/taxidata_cleaned/{year}.parquet"
    for year in range(2011, 2017)
]
# Scale factor: maximum number of raw trip rows used by the rest of the notebook.
sf = 10_000_000
# NOTE(review): limit() without an explicit ordering may not return the same
# rows on every action -- confirm this is acceptable for the benchmark.
raw_data = (
    spark.read
    .parquet(*files)
    .limit(sf)
)
display(raw_data.count())
display(raw_data)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType, IntegerType, StringType
from pytz import timezone
import time


@udf(returnType=IntegerType())
def is_weekend(dt):
    """Spark UDF: 1 if ``dt`` falls on a Saturday or Sunday in New York time, else 0.

    Args:
        dt: value of a timestamp column for one row (presumably a
            ``datetime.datetime`` as handed over by Spark -- confirm), or None.

    Returns:
        1 for Saturday/Sunday, 0 for any other day, and None when ``dt`` is
        None so that null timestamps yield a null flag.
    """
    if dt is None:
        # Without this guard a single null timestamp raises AttributeError
        # inside the UDF and fails the whole Spark task.
        return None
    tz = "America/New_York"
    local_dt = dt.astimezone(timezone(tz))
    return int(local_dt.weekday() >= 5)  # 5 = Saturday, 6 = Sunday


def filter_df_by_ts(df, ts_column, start_date, end_date):
    """Restrict ``df`` to rows whose ``ts_column`` lies in ``[start_date, end_date)``.

    Args:
        df: DataFrame to filter.
        ts_column: name of the column to compare; if falsy, ``df`` is
            returned untouched.
        start_date: inclusive lower bound; skipped when falsy.
        end_date: exclusive upper bound; skipped when falsy.

    Returns:
        The filtered DataFrame (or ``df`` itself when no bound applies).
    """
    if not ts_column:
        return df

    filtered = df
    if start_date:
        filtered = filtered.filter(col(ts_column) >= start_date)
    if end_date:
        filtered = filtered.filter(col(ts_column) < end_date)
    return filtered

In [None]:
from pyspark.sql.functions import col, floor, count, mean
from pyspark.sql.window import Window

def pickup_features_fn(df, ts_column, start_date, end_date):
    """Aggregate pickup fare statistics per location in 15-minute buckets.

    Args:
        df: trips DataFrame; the columns ``tpep_pickup_datetime``,
            ``pu_location_id``, ``pu_borough`` and ``fare_amount`` are read.
        ts_column: column used for the date-range filter.
        start_date: inclusive lower bound of the filter, or None.
        end_date: exclusive upper bound of the filter, or None.

    Returns:
        DataFrame with columns ``location_id``, ``borough``, ``ts`` (start of
        the 15-minute bucket), ``mean_fare_window_1h_pickup_zip`` and
        ``count_trips_window_1h_pickup_zip``. The feature names carry a
        ``1h`` suffix although the bucket is 15 minutes wide; they are kept
        because the feature lookups later in the notebook reference them.
    """
    df = filter_df_by_ts(df, ts_column, start_date, end_date)
    df = df.withColumn('tpep_pickup_datetime', df['tpep_pickup_datetime'].cast('timestamp'))

    window_seconds = 15 * 60
    epoch_seconds = df['tpep_pickup_datetime'].cast('long')
    # floor(sec / width) is only the bucket *index*; multiply by the width
    # again so the cast yields the bucket's real start time rather than a
    # timestamp a few weeks after 1970-01-01.
    # NOTE(review): ts is the bucket start; confirm whether the bucket end is
    # wanted for point-in-time lookups.
    bucket_start = (floor(epoch_seconds / window_seconds) * window_seconds).cast('timestamp')
    df = df.withColumn('window', bucket_start)

    pickup_features = (
        df.groupby('pu_location_id', 'pu_borough', 'window')
        .agg(
            mean('fare_amount').alias('mean_fare_window_1h_pickup_zip'),
            count('fare_amount').alias('count_trips_window_1h_pickup_zip')
        )
        .withColumnRenamed('pu_location_id', 'location_id')
        .withColumnRenamed('pu_borough', 'borough')
        .withColumnRenamed('window', 'ts')
    )

    return pickup_features

def dropoff_features_fn(df, ts_column, start_date, end_date):
    """Count dropoffs per location in 30-minute buckets and flag weekend buckets.

    Args:
        df: trips DataFrame; the columns ``tpep_dropoff_datetime``,
            ``do_location_id`` and ``do_borough`` are read.
        ts_column: column used for the date-range filter.
        start_date: inclusive lower bound of the filter, or None.
        end_date: exclusive upper bound of the filter, or None.

    Returns:
        DataFrame with columns ``location_id``, ``borough``, ``ts`` (start of
        the 30-minute bucket), ``count_trips_window_30m_dropoff_zip`` and the
        boolean ``dropoff_is_weekend``.
    """
    df = filter_df_by_ts(df, ts_column, start_date, end_date)
    df = df.withColumn('tpep_dropoff_datetime', df['tpep_dropoff_datetime'].cast('timestamp'))

    window_seconds = 30 * 60
    epoch_seconds = df['tpep_dropoff_datetime'].cast('long')
    # floor(sec / width) is only the bucket *index*; multiply by the width
    # again so the cast yields the bucket's real start time rather than a
    # timestamp shortly after 1970-01-01.
    bucket_start = (floor(epoch_seconds / window_seconds) * window_seconds).cast('timestamp')
    df = df.withColumn('window', bucket_start)

    dropoff_features = (
        df.groupby('do_location_id', 'do_borough', 'window')
        .agg(count('do_borough').alias('count_trips_window_30m_dropoff_zip'))
        .withColumnRenamed('do_location_id', 'location_id')
        .withColumnRenamed('do_borough', 'borough')
        .withColumnRenamed('window', 'ts')
    )
    # Spark's dayofweek() numbers days 1 = Sunday ... 7 = Saturday, unlike
    # Python's weekday() where 5/6 are Saturday/Sunday.
    dropoff_features = dropoff_features.withColumn(
        'dropoff_is_weekend', dayofweek(dropoff_features['ts']).isin([1, 7])
    )

    return dropoff_features


In [None]:
from datetime import datetime

# Build both feature DataFrames from the same raw trips over the same date
# range; each one is filtered on its own timestamp column.
pickup_features = pickup_features_fn(
    raw_data,
    "tpep_pickup_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)
dropoff_features = dropoff_features_fn(
    raw_data,
    "tpep_dropoff_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)

In [None]:
# Show the first five pickup feature rows; the trailing count() is the cell's
# last expression, so the notebook renders the total number of feature rows.
display(pickup_features.head(5))
pickup_features.count()

In [None]:
# Render the dropoff feature DataFrame; the trailing count() is the cell's
# last expression, so the notebook renders the total number of feature rows.
display(dropoff_features)
dropoff_features.count()

In [None]:
%sql CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;

In [None]:
from databricks.feature_store import FeatureStoreClient
# Shared Feature Store client used by the table-creation and training-set cells.
fs = FeatureStoreClient()

In [None]:
# Uncomment to drop the feature tables before re-creating them:
# fs.drop_table("feature_store_taxi_example.trip_pickup_time_series_features")
# fs.drop_table("feature_store_taxi_example.trip_dropoff_time_series_features")

In [None]:
# Register both feature DataFrames as feature tables. Each table is keyed by
# location_id, with ts declared as the timestamp key.
fs.create_table(
    name="feature_store_taxi_example.trip_pickup_time_series_features",
    primary_keys=["location_id"],
    timestamp_keys=["ts"],
    df=pickup_features,
    description="Taxi Fares. Pickup Time Series Features",
)
fs.create_table(
    name="feature_store_taxi_example.trip_dropoff_time_series_features",
    primary_keys=["location_id"],
    timestamp_keys=["ts"],
    df=dropoff_features,
    description="Taxi Fares. Dropoff Time Series Features",
)

In [None]:
# Re-inspect the raw trips that the training set will be built from.
display(raw_data)

In [None]:
# Number of raw trip rows (bounded above by the limit applied when loading).
raw_data.count()

In [None]:
from databricks.feature_store import FeatureLookup

# Look features up in the tables this notebook registers with fs.create_table,
# so that a fresh Restart & Run All does not depend on tables created elsewhere.
pickup_features_table = "feature_store_taxi_example.trip_pickup_time_series_features"
dropoff_features_table = "feature_store_taxi_example.trip_dropoff_time_series_features"

# lookup_key names the raw-data column matched against the table's primary key
# (location_id); timestamp_lookup_key names the raw-data column used for the
# point-in-time match against the table's timestamp key.
pickup_feature_lookups = [
    FeatureLookup(
        table_name=pickup_features_table,
        feature_names=[
            "mean_fare_window_1h_pickup_zip",
            "count_trips_window_1h_pickup_zip",
        ],
        lookup_key=["pu_location_id"],
        timestamp_lookup_key="tpep_pickup_datetime",
    ),
]

dropoff_feature_lookups = [
    FeatureLookup(
        table_name=dropoff_features_table,
        feature_names=["count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"],
        lookup_key=["do_location_id"],
        timestamp_lookup_key="tpep_dropoff_datetime",
    ),
]

In [None]:
# Columns passed as exclude_columns to fs.create_training_set in the benchmark cells.
exclude_columns = ["ts"]

# Benchmark PIT-correct Join to Parquet

In [None]:
# Scale Factor is decided by size of raw_data dataframe

import time
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during a long-running job.
start = time.perf_counter()
# Create the training set that includes the raw input data merged with corresponding features from both feature tables
training_set = fs.create_training_set(
    raw_data,
    feature_lookups=pickup_feature_lookups + dropoff_feature_lookups,
    label="fare_amount",
    exclude_columns=exclude_columns,
)

# Load the TrainingSet as a DataFrame; writing it to Parquet is the action
# that actually executes the point-in-time join being timed.
training_df = training_set.load_df()
training_df.write.mode("overwrite").parquet("training")
print(f"time: {time.perf_counter() - start}")
# Outside the timed region: report how many rows the join produced.
display(training_df.count())


# Benchmark PIT-correct Join to Memory

In [None]:
# Scale Factor is decided by size of raw_data dataframe

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during a long-running job.
start = time.perf_counter()
# Create the training set that includes the raw input data merged with corresponding features from both feature tables
training_set_count = fs.create_training_set(
    raw_data,
    feature_lookups=pickup_feature_lookups + dropoff_feature_lookups,
    label="fare_amount",
    exclude_columns=exclude_columns,
)

# Load the TrainingSet as a DataFrame and materialize it on the driver.
# toPandas() alone both executes the join and brings the rows into memory;
# a separate collect() beforehand would execute the join a second time and
# roughly double the measured time.
training_df = training_set_count.load_df()
pdf = training_df.toPandas()
print(f"time: {time.perf_counter() - start}")

In [None]:
# Render the pandas copy of the training set produced by the in-memory benchmark.
display(pdf)