In [None]:
%pip install "feast[spark,aws,redis]"

## Try fetching data from a data source

In [None]:
%sql
-- Preview the trailing one-day spend aggregates per user for the 2022/04 partitions.
SELECT
  user_id,
  to_timestamp(timestamp) AS timestamp,
  SUM(amt) OVER trailing_day AS amt_sum_1d_10m,
  AVG(amt) OVER trailing_day AS amt_mean_1d_10m
FROM demo_fraud_v2.transactions
WHERE
  partition_0 = "2022" AND
  partition_1 = "04"
-- Both aggregates share one window: each user's rows from the preceding day up to the current row.
WINDOW trailing_day AS (
  PARTITION BY user_id
  ORDER BY to_timestamp(timestamp)
  RANGE BETWEEN INTERVAL 1 day PRECEDING AND CURRENT ROW
)
LIMIT 10

user_id,timestamp,amt_sum_1d_10m,amt_mean_1d_10m
user_268308151877,2022-04-07T22:12:37.304+0000,1.79,1.79
user_268308151877,2022-04-07T22:23:06.566+0000,10.87,5.4350000000000005
user_268308151877,2022-04-07T22:26:15.216+0000,16.85,5.616666666666667
user_268308151877,2022-04-07T22:31:28.924+0000,101.88,25.47
user_268308151877,2022-04-07T22:31:41.513+0000,109.8,21.96
user_268308151877,2022-04-07T22:39:28.687+0000,214.13,35.68833333333333
user_268308151877,2022-04-07T22:39:42.791+0000,275.17,39.31
user_268308151877,2022-04-07T22:39:48.833+0000,355.01,44.37625
user_268308151877,2022-04-07T22:40:42.840+0000,466.32,51.81333333333333
user_268308151877,2022-04-07T22:41:20.686+0000,491.95,49.195


In [None]:
from feast import FeatureStore, RepoConfig
from feast.repo_config import RegistryConfig
from feast.infra.online_stores.dynamodb import DynamoDBOnlineStoreConfig
from feast.infra.offline_stores.contrib.spark_offline_store.spark import SparkOfflineStoreConfig
import os
import warnings

# Hide deprecation/runtime warnings so the cell output stays readable.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# The SQL registry connection string embeds credentials, so it can be supplied through the
# FEAST_REGISTRY_PATH environment variable instead of being edited into the notebook.
# The fallback is the original placeholder; its bracketed host part must be replaced before use.
REGISTRY_PATH = os.environ.get(
    "FEAST_REGISTRY_PATH",
    "postgresql://postgres:mysecretpassword@[YOUR-RDS-ENDPOINT:PORT]:5432/feast",
)

# Programmatic Feast configuration: SQL registry, Spark offline store, DynamoDB online store.
repo_config = RepoConfig(
    registry=RegistryConfig(
        registry_type="sql",
        path=REGISTRY_PATH,
    ),
    project="feast_demo",
    provider="local",
    offline_store=SparkOfflineStoreConfig(
        spark_conf={
            "spark.ui.enabled": "false",
            "spark.eventLog.enabled": "false",
            "spark.sql.catalogImplementation": "hive",
            "spark.sql.parser.quotedRegexColumnNames": "true",
            "spark.sql.session.timeZone": "UTC",
        }
    ),
    online_store=DynamoDBOnlineStoreConfig(region="us-west-1"),
    entity_key_serialization_version=2,
)
# Feature store handle used by the cells that follow.
store = FeatureStore(config=repo_config)

## Managing data pipelines

### Part 1: Moving the latest feature values into a low-latency store

In [None]:
from datetime import datetime
# Load feature values for the 2022-04-20 -> 2022-04-21 window into the online store.
materialize_from = datetime(2022, 4, 20)
materialize_until = datetime(2022, 4, 21)
store.materialize(start_date=materialize_from, end_date=materialize_until)

### Part 2: Computing + pushing streaming feature values

In [None]:
from datetime import timedelta
from pyspark.sql.functions import col, from_json, from_utc_timestamp, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, BooleanType, IntegerType

def raw_data_deserialization(df):
    """Decode the JSON payload in the ``data`` column into typed transaction columns.

    Every payload field is first parsed as a string; numeric fields are then cast
    (``amt``, ``merch_lat``, ``merch_long`` to double, ``is_fraud`` to long) and
    ``timestamp`` is converted with ``from_utc_timestamp(..., 'UTC')``.

    Args:
        df: DataFrame exposing a ``data`` column that can be cast to STRING.

    Returns:
        DataFrame with one column per payload field.
    """
    field_names = (
        'user_id',
        'transaction_id',
        'category',
        'amt',
        'is_fraud',
        'merchant',
        'merch_lat',
        'merch_long',
        'timestamp',
    )
    payload_schema = StructType(
        [StructField(name, StringType(), False) for name in field_names]
    )

    def payload_field(name, cast_to=None):
        # Pull one field out of the parsed struct, optionally casting it, under its own name.
        field = col('payload.' + name)
        if cast_to is not None:
            field = field.cast(cast_to)
        return field.alias(name)

    json_rows = df.selectExpr('cast (data as STRING) jsonData')
    parsed_rows = json_rows.select(from_json('jsonData', payload_schema).alias('payload'))

    return parsed_rows.select(
        payload_field('user_id'),
        payload_field('transaction_id'),
        payload_field('category'),
        payload_field('amt', 'double'),
        payload_field('is_fraud', 'long'),
        payload_field('merchant'),
        payload_field('merch_lat', 'double'),
        payload_field('merch_long', 'double'),
        from_utc_timestamp('payload.timestamp', 'UTC').alias('timestamp'),
    )

def ingest_and_deserialize_kinesis_df():
    """Read the Kinesis stream, deserialize its payloads and attach a 24-hour watermark.

    Uses a ``spark`` session that is not created in this cell (presumably provided by
    the notebook runtime -- confirm for other environments).

    Returns:
        The streaming DataFrame produced by ``raw_data_deserialization`` with a
        watermark of 86400 seconds on its ``timestamp`` column.
    """
    options = {
        "streamName": "[REDACTED STREAM NAME]",
        "roleArn": "[REDACTED ROLE]",
        "region": "us-west-2",
        "shardFetchInterval": "30s",
        "initialPosition": "latest"
    }
    reader = spark.readStream.format("kinesis").options(**options)
    df = reader.load()
    df = raw_data_deserialization(df)
    # timedelta.seconds is only the seconds component of the duration (0 for exactly one day);
    # total_seconds() gives the full 86400 seconds that a 24-hour watermark needs.
    watermark_seconds = int(timedelta(hours=24).total_seconds())
    watermark = "{} seconds".format(watermark_seconds)
    df = df.withWatermark("timestamp", watermark)
    return df

# 1) Ingest and deserialize streaming events from Kinesis stream
# `df` is the watermarked streaming DataFrame that the aggregation step consumes.
df = ingest_and_deserialize_kinesis_df()

In [None]:
from pyspark.sql.functions import *
import pandas as pd

# 2) Aggregate feature values
# One-day windows that slide every 10 minutes, keyed by user.
sliding_day_window = window(
    timeColumn="timestamp",
    windowDuration="1 day",
    slideDuration="10 minutes",
)
windowed_by_user = df.withWatermark("timestamp", "1 second").groupBy("user_id", sliding_day_window)
windowed_totals = windowed_by_user.agg(
    sum("amt").alias("amt_sum_1d_10m"),
    avg("amt").alias("amt_mean_1d_10m"),
)
# The end of each window becomes the event timestamp of the aggregated feature row.
stream_agg = windowed_totals.select(
    "user_id",
    col("window.end").alias("timestamp"),
    "amt_sum_1d_10m",
    "amt_mean_1d_10m",
)

# 3) Push transformed features into Feast
from feast import FeatureStore
# Rebuild the store handle from repo_config; send_to_feast pushes through this object.
store = FeatureStore(config=repo_config)

def send_to_feast(df, epoch):
    """Push the most recent aggregated window per user from one micro-batch into Feast.

    Args:
        df: Micro-batch DataFrame exposing ``toPandas()``; expected to carry
            ``user_id`` and ``timestamp`` columns.
        epoch: Batch identifier supplied by ``foreachBatch``; unused.

    Returns:
        None. Nothing is pushed when the batch is empty, has no ``timestamp``
        column, or has no row with a non-null ``user_id``.
    """
    pandas_df: pd.DataFrame = df.toPandas()
    if pandas_df.empty or "timestamp" not in pandas_df:
        return

    # Keep only the latest window per user: sort ascending and keep the last row of each user.
    # The previous descending sort + nth(-1) selected the *earliest* window instead, and
    # groupby().nth() moves user_id into the index on pandas < 2.0; drop_duplicates keeps it
    # as a regular column on every version.
    latest_per_user = (
        pandas_df.dropna(subset=["user_id"])  # groupby also ignored null keys
        .sort_values(by=["user_id", "timestamp"])
        .drop_duplicates(subset="user_id", keep="last")
        .reset_index(drop=True)
    )
    if latest_per_user.empty:
        return
    store.push("transactions_1d", latest_per_user)

# 4) Launch streaming job to do all the above
# Every 30 seconds, hand the newly appended aggregate rows to send_to_feast.
feast_sink = stream_agg.writeStream.outputMode("append")
feast_sink = feast_sink.option("checkpointLocation", "/tmp/feast-workshop/")
feast_sink = feast_sink.trigger(processingTime="30 seconds")
feast_sink = feast_sink.foreachBatch(send_to_feast)
query = feast_sink.start()

# 5) (not shown) Orchestrate this job so it's self-healing. Monitor to ensure features land in Feast.
# For the demo, wait up to 30 seconds and then stop the stream.
query.awaitTermination(timeout=30)
query.stop()

## Retrieve features

In [None]:
from feast import FeatureStore
store = FeatureStore(config=repo_config)

# Training a model needs labeled events plus point-in-time correct features; Feast supplies the features.
# Labeled events: the historical log of user transactions (including is_fraud).
# 1) Select the labeled transactions and join the model's features onto them
transactions_table = store.get_data_source("transactions_source").get_table_query_string()
entity_sql = f"""
    SELECT
        transaction_id,
        user_id,
        is_fraud, 
        to_timestamp(timestamp) as event_timestamp
    FROM {transactions_table}
    WHERE 
      timestamp BETWEEN '2022-04-20' AND '2022-04-21' AND
      partition_0 = "2022" AND
      partition_1 = "04" AND
      partition_2 = "20"
"""
model_v1_features = store.get_feature_service("model_v1")
retrieval_job = store.get_historical_features(
    entity_df=entity_sql,
    features=model_v1_features,
)
training_df = retrieval_job.to_spark_df()
# model.fit(training_df)
display(training_df)

transaction_id,user_id,is_fraud,event_timestamp,amt_sum_1d_10m,amt_mean_1d_10m
a8c85a3f58e882c2110a35830590ccbd,user_939970169861,0,2022-04-20T02:05:46.992+0000,5563.599999999999,48.3791304347826
75f3724fe64eb38dae8894f7c48b104e,user_268308151877,0,2022-04-20T02:08:26.746+0000,2727.120000000001,64.9314285714286
ad6e8d8de754a147b4d17e09030612aa,user_724235628997,0,2022-04-20T21:26:11.008+0000,66914.74999999991,63.24645557655946
0146e9e40725044663b1dad0065bdcf9,user_222506789984,0,2022-04-20T16:27:53.176+0000,100661.0,71.79814550641942
a1cc5776a227037a51a9ba7dd695e50e,user_502567604689,0,2022-04-20T16:28:42.259+0000,25737.45000000001,54.18410526315792
18f4311b646729b4bb21c1a6bdec5307,user_917975462998,0,2022-04-20T16:29:38.415+0000,14541.389999999992,64.91691964285711
075767768afb06cd3f4a23e6addc1fb0,user_930691958107,0,2022-04-20T20:15:34.737+0000,35127.49000000001,62.06270318021203
2b3d6c106d2d48c3ccb592303fe4b01f,user_26990816968,0,2022-04-20T00:56:27.096+0000,3238.430000000001,101.20093750000004
0956e5370b01c5e940fc2956419a89ab,user_884240387242,0,2022-04-20T19:23:49.273+0000,109534.00999999998,68.16055382700684
d2624c5102ae17177657424731e10260,user_939970169861,0,2022-04-20T13:12:58.497+0000,47894.19000000009,64.5474258760109


In [None]:
# 2) For an incoming transaction, look up the latest feature values for its user.
# Note: Feast decouples retrieval from ingestion, so the caller does not need to know
# whether a value came from the batch source or the streaming source.
model_v1_service = store.get_feature_service("model_v1")
lookup_rows = [{"user_id": "user_268308151877"}]
online_response = store.get_online_features(
    features=model_v1_service,
    entity_rows=lookup_rows,
)
feature_vector = online_response.to_dict()
# model.predict(feature_vector)

def print_online_features(feature_vector):
    """Print each entry of a feature dictionary as ``key  :  value``, ordered by key.

    Args:
        feature_vector: Mapping from name to value, such as the dict returned by
            ``get_online_features(...).to_dict()``.
    """
    # Iterate over the parameter itself; the previous body read an undefined name `features`.
    for key, value in sorted(feature_vector.items()):
        print(key, " : ", value)

# Show the online feature values fetched for the example user.
print_online_features(feature_vector)