In [1]:
!pip install feldera



In [2]:
import hopsworks
import json
import datetime
from hsfs.feature import Feature
from hopsworks.client.exceptions import RestAPIError
import config

from feldera import FelderaClient, PipelineBuilder
from utils.feldera import read_stream_connector, insert_stream_connector


- Register at Feldera (https://try.feldera.com) and generate a Feldera API key.
- Copy the Feldera API key and create a Hopsworks secret named `feldera` that holds it: https://docs.hopsworks.ai/latest/user_guides/projects/secrets/create_secret/


In [3]:
# Log in to Hopsworks; `project` is reused by the later cells (feature store, Kafka API, topic names).
project = hopsworks.login()
# Read the Feldera API key from the Hopsworks secret named "feldera" instead of hardcoding it.
secrets_api = hopsworks.get_secrets_api()
feldera_api_key = secrets_api.get_secret("feldera").value

2024-10-28 18:08:54,731 INFO: Python Engine initialized.


In [4]:
# Feldera client pointed at try.feldera.com, authenticated with the key read from Hopsworks secrets.
client = FelderaClient(
    "https://try.feldera.com",
    api_key=feldera_api_key,
)

In [6]:
# Get a handle to the project's feature store; feature groups are created through it in later cells.
fs = project.get_feature_store()

In [7]:
import json
import os
from confluent_kafka import Producer
# Name of the Kafka topic carrying live transactions; prefixed with the project name.
KAFKA_TOPIC_NAME = f"{project.name}_real_time_live_transactions"
# Name under which the Avro schema for that topic is registered.
SCHEMA_NAME = "live_transactions_schema"
# Kafka API handle of the project, used below to list/create topics and schemas.
kafka_api = project.get_kafka_api()

In [None]:
# Uncomment and run this cell to delete the schema and topic

# topics = kafka_api.get_topics()
# if topics[0].name == KAFKA_TOPIC_NAME:
#     try:
#         topics[0].delete()
#         print(f"Deleted topic {KAFKA_TOPIC_NAME}")
#     except hopsworks.RestAPIError:
#         print("Could not find topic to delete")

# try:
#     schema = kafka_api.get_schema(SCHEMA_NAME, 1)
#     print(f"Deleting schema {SCHEMA_NAME}")
#     schema.delete()
# except hopsworks.RestAPIError:
#     print("Could not find schema to delete")

In [None]:
# Avro record schema for the live-transactions topic.
# Every field is a union with "null" (i.e. optional); `ts` is a long carrying
# the timestamp-micros logical type.
schema = {
    "type": "record",
    "name": SCHEMA_NAME,
    "namespace": "ai.hopsworks.examples.feldera.fraud",
    "fields": [
        {"name": field_name, "type": ["null", avro_type]}
        for field_name, avro_type in (
            ("t_id", "string"),
            ("ts", {"type": "long", "logicalType": "timestamp-micros"}),
            ("cc_num", "string"),
            ("merchant_id", "string"),
            ("amount", "double"),
            ("ip_addr", "string"),
            ("card_present", "boolean"),
        )
    ],
}

# Register the schema and create the topic only when the topic does not exist yet,
# so that re-running this cell is harmless.
existing_topic_names = {topic.name for topic in kafka_api.get_topics()}
if KAFKA_TOPIC_NAME in existing_topic_names:
    print("Did not create topic")
else:
    kafka_api.create_schema(SCHEMA_NAME, schema)
    kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)
    print("Created topic")

In [None]:
# Explicit schema of the aggregated credit-card feature group: identifiers, the
# previous-transaction columns and the count/sum aggregates per time window.
# Each description now names the same window as its feature (three of them
# previously referred to the wrong window).
features = [
    Feature(name="t_id", type="string"),
    Feature(name="event_time", type="timestamp"),
    Feature(name="cc_num", type="string"),
    Feature(name="prev_t_id", type="string"),
    Feature(name="prev_ip_transaction", type="string"),
    Feature(name="prev_card_present_transaction", type="boolean"),
    Feature(name="count_trans_last_10mins", description="Number of transactions in the last 10 minutes", type="double"),
    Feature(name="sum_trans_last_10mins", description="Sum of the amount of transactions in the last 10 minutes", type="double"),
    Feature(name="count_trans_last_hr", description="Number of transactions in the last hour", type="double"),
    Feature(name="sum_trans_last_hr", description="Sum of the amount of transactions in the last hour", type="double"),
    Feature(name="count_trans_last_day", description="Number of transactions in the last day", type="double"),
    Feature(name="sum_trans_last_day", description="Sum of the amount of transactions in the last day", type="double"),
    Feature(name="count_trans_last_week", description="Number of transactions in the last week", type="double"),
    Feature(name="sum_trans_last_week", description="Sum of the amount of transactions in the last week", type="double"),
]

# Online-enabled feature group keyed by card number, with its own project-prefixed Kafka topic.
cc_trans_aggs_fg = fs.get_or_create_feature_group(
    name="cc_trans_aggs_fg",
    version=1,
    description="Time window aggregations and previous transaction for grouped by credit card",
    topic_name = f"{project.name}_cc_trans_aggs_fg",
    primary_key=["cc_num"],
    event_time="event_time",
    statistics_config={'histograms': True, 'correlations': False},
    online_enabled=True,
)

# Persist the feature group metadata using the explicit schema above (no data is passed here).
cc_trans_aggs_fg.save(features)
## Set a materialization job for the feature group in Hopsworks
# NOTE(review): the cron expression looks Quartz-style (7 fields) and presumably fires
# on every fifth hour — confirm against the Hopsworks job scheduling documentation.
cc_trans_aggs_fg.materialization_job.schedule(cron_expression = "0 0 */5 ? * * *", start_time=datetime.datetime.now(tz=datetime.timezone.utc))

In [None]:
# define source and sinks for streaming pipeline
# Source: connector config for reading the live-transactions Kafka topic.
transaction_source_config = read_stream_connector(fs, KAFKA_TOPIC_NAME)
# Sink: connector config for inserting into the aggregated feature group.
# The feature group variable defined in this notebook is `cc_trans_aggs_fg`;
# the previously referenced `cc_trans_aggs` is not defined and raised NameError.
cc_trans_aggs_sink_config = insert_stream_connector(project, fs, cc_trans_aggs_fg)


### Create SQL program parameterized by source and sink connector configurations.

In [None]:
def build_card_sql(transaction_source_config: str, windowed_sink_config: str) -> str:
    """Render the Feldera SQL program that aggregates card spend over hopping windows.

    The program declares the TRANSACTIONS input table, three hopping-window views
    over its `ts` column, one SUM(amount) aggregate per window grouped by card and
    window end, and a COMBINED output view joining the three aggregates per card.

    Fixes relative to the previous version:
      * removed a stray ``c`` that preceded the first SQL statement;
      * the window descriptor now uses ``ts``, the only timestamp column the
        table declares (``transaction_time`` does not exist);
      * COMBINED exposes ``T1.date_time AS ts``; the aggregate views have no
        ``ts`` column, their timestamp column is ``date_time``.

    NOTE(review): the Feldera/Calcite documentation lists HOP's interval arguments
    as (slide, size); the intervals here are passed as (7 HOUR | 1 DAY | 3 DAY,
    1 HOUR), which looks swapped with respect to the view names — confirm.
    NOTE(review): COMBINED joins the aggregates on ``cc_num`` only, so window ends
    of the three views are not aligned — confirm whether ``date_time`` should be
    part of the join condition.

    Args:
        transaction_source_config: Connector configuration (JSON text) placed
            inside the ``'connectors'`` list of the TRANSACTIONS table.
        windowed_sink_config: Connector configuration (JSON text) placed inside
            the ``'connectors'`` list of the COMBINED view.

    Returns:
        The complete SQL program as a single string.
    """
    return f"""
    -- get source transactions table
    CREATE TABLE TRANSACTIONS(
        t_id STRING,
        ts TIMESTAMP,
        cc_num STRING,
        merchant_id STRING,
        amount DOUBLE,
        ip_addr STRING,
        card_present BOOLEAN
    ) WITH (
        'connectors' = '[{transaction_source_config}]'
    );

    -- Create a 7 hour hopping window aggregations from data from transactions table
    CREATE LOCAL VIEW SEVENHOURHOP as
        SELECT *
        FROM TABLE(HOP(TABLE TRANSACTIONS, DESCRIPTOR(ts), INTERVAL 7 HOUR, INTERVAL 1 HOUR));

    -- Create a 24 hour hopping window aggregations from data from transactions table
    CREATE LOCAL VIEW DAYHOP as
        SELECT *
        FROM TABLE(HOP(TABLE TRANSACTIONS, DESCRIPTOR(ts), INTERVAL 1 DAY, INTERVAL 1 HOUR));

    -- Create a 3 day hopping window aggregations from data from transactions table
    CREATE LOCAL VIEW TREEDAYHOP as
        SELECT *
        FROM TABLE(HOP(TABLE TRANSACTIONS, DESCRIPTOR(ts), INTERVAL 3 DAY, INTERVAL 1 HOUR));

    -- Compute aggregates from it
    CREATE LOCAL VIEW SEVENHOURWINDOWED AS
        SELECT
            SUM(amount) AS sum_amount,
            window_end AS date_time,
            cc_num
        FROM SEVENHOURHOP
        GROUP BY cc_num, window_end;

    CREATE LOCAL VIEW DAYWINDOWED AS
        SELECT
            SUM(amount) AS sum_amount,
            window_end AS date_time,
            cc_num
        FROM DAYHOP
        GROUP BY cc_num, window_end;

    CREATE LOCAL VIEW TREEDAYWINDOWED AS
        SELECT
            SUM(amount) AS sum_amount,
            window_end AS date_time,
            cc_num
        FROM TREEDAYHOP
        GROUP BY cc_num, window_end;


    CREATE VIEW COMBINED
    WITH (
        'connectors' = '[{windowed_sink_config}]'
    ) AS
    SELECT
            T1.cc_num AS cc_num,
            T1.date_time AS ts,
            T1.sum_amount AS c_spend_amount_7h,
            T2.sum_amount AS c_spend_amount_1d,
            T3.sum_amount  AS c_spend_amount_3d
    FROM
            SEVENHOURWINDOWED T1 JOIN DAYWINDOWED T2
            ON
                T1.cc_num = T2.cc_num
            JOIN TREEDAYWINDOWED T3
        ON
            T1.cc_num = T3.cc_num;

    """

In [None]:
# Render the card-aggregation SQL with the connector configs, build the
# "hopsworks_kafka_card" pipeline via PipelineBuilder(...).create_or_replace(), and start it.
# NOTE(review): `card_windowed_sink_config` is not assigned in any cell visible here
# (only `cc_trans_aggs_sink_config` is) — confirm where it should come from; as written
# this cell raises NameError on a fresh kernel.
card_sql = build_card_sql(transaction_source_config, card_windowed_sink_config)
card_pipeline = PipelineBuilder(client, name = "hopsworks_kafka_card", sql = card_sql).create_or_replace()
card_pipeline.start()

In [None]:
# Create SQL program parameterized by source and sink connector configurations.
def build_last_tr_sql(transaction_source_config: str, windowed_sink_config: str) -> str:
    """Render the Feldera SQL program that attaches previous-transaction columns.

    For every transaction the program looks up, per card (`cc_num`) and ordered
    by `ts`, the preceding transaction and exposes its timestamp, IP address and
    card-present flag next to the current transaction.

    Fixes relative to the previous version:
      * the TRANSACTIONS table now declares `ip_addr` and `card_present`, as the
        Kafka schema and the other SQL builders in this notebook do (instead of
        `transaction_category_name` / `location`);
      * the lagging view uses the existing `ts` and `t_id` columns (the table has
        no `transaction_time` / `transaction_id`) and actually defines
        `prev_ts_transaction`, `prev_ip_transaction` and
        `prev_card_present_transaction`, which the output view selects;
      * `event_time` in the output view is derived from `ts` (no `event_time`
        column existed).
    The column names of the `activity_delta` output view are unchanged.

    Args:
        transaction_source_config: Connector configuration (JSON text) placed
            inside the ``'connectors'`` list of the TRANSACTIONS table.
        windowed_sink_config: Connector configuration (JSON text) placed inside
            the ``'connectors'`` list of the `activity_delta` view.

    Returns:
        The complete SQL program as a single string.
    """
    return f"""
    -- get source transactions table
    CREATE TABLE TRANSACTIONS(
        t_id STRING,
        ts TIMESTAMP,
        cc_num STRING,
        merchant_id STRING,
        amount DOUBLE,
        ip_addr STRING,
        card_present BOOLEAN
    ) WITH (
        'connectors' = '[{transaction_source_config}]'
    );

    -- Create a lagging view (shift(1) per card in pandas terms) from the transactions table
    CREATE LOCAL VIEW laging_view AS SELECT
        *,
        LAG(ts) OVER (PARTITION BY cc_num ORDER BY ts ASC) AS prev_ts_transaction,
        LAG(t_id) OVER (PARTITION BY cc_num ORDER BY ts ASC) AS prev_t_id,
        LAG(ip_addr) OVER (PARTITION BY cc_num ORDER BY ts ASC) AS prev_ip_transaction,
        LAG(card_present) OVER (PARTITION BY cc_num ORDER BY ts ASC) AS prev_card_present_transaction
    FROM transactions;

    -- Expose the current transaction together with its predecessor's attributes
    CREATE VIEW activity_delta
    WITH (
        'connectors' = '[{windowed_sink_config}]'
    ) AS
        SELECT
        cc_num,
        t_id,
        ts AS event_time,
        prev_ts_transaction,
        prev_ip_transaction,
        prev_card_present_transaction
    FROM laging_view;
    """

# Render the previous-transaction SQL and build the "hopsworks_delta_kafka_last_tr" pipeline from it.
# NOTE(review): `last_transaction_sink_config` is not assigned in any cell visible here —
# confirm where it should come from; as written this raises NameError on a fresh kernel.
last_tr_sql = build_last_tr_sql(transaction_source_config, last_transaction_sink_config)
last_tr_pipeline = PipelineBuilder(client, name = "hopsworks_delta_kafka_last_tr", sql = last_tr_sql).create_or_replace()
# Start the Feldera pipeline.
last_tr_pipeline.start()

In [None]:
# Create SQL program parameterized by source and sink connector configurations.
def build_tr_sql(transaction_source_config: str, transaction_sink_config: str) -> str:
    """Render the pass-through Feldera SQL program for raw transactions.

    The program declares the TRANSACTIONS input table and an `activity_delta`
    view that selects every column of it unchanged.

    NOTE(review): this table names its timestamp column `event_time`, while the
    Kafka schema and the other builders in this notebook call it `ts` — confirm
    which name the source connector delivers.

    Args:
        transaction_source_config: Connector configuration (JSON text) placed
            inside the ``'connectors'`` list of the TRANSACTIONS table.
        transaction_sink_config: Connector configuration (JSON text) placed
            inside the ``'connectors'`` list of the `activity_delta` view.

    Returns:
        The complete SQL program as a single string.
    """
    # Assembled line by line; the leading empty entry and the trailing
    # whitespace-only entry reproduce the layout of a triple-quoted block.
    sql_lines = [
        "",
        "    -- get source transactions table    ",
        "    CREATE TABLE TRANSACTIONS(",
        "        t_id STRING,",
        "        event_time TIMESTAMP,",
        "        cc_num STRING,",
        "        merchant_id STRING,",
        "        amount DOUBLE,",
        "        ip_addr STRING,",
        "        card_present BOOLEAN",
        "    ) WITH (",
        f"        'connectors' = '[{transaction_source_config}]'",
        "    );",
        "    ",
        "    -- Compute aggregates from it",
        "    CREATE VIEW activity_delta ",
        "    WITH (",
        f"        'connectors' = '[{transaction_sink_config}]'",
        "    ) AS",
        "        SELECT *",
        "    FROM transactions;",
        "    ",
    ]
    return "\n".join(sql_lines)

# Render the pass-through SQL and build the "hopsworks_kafka_tr" pipeline from it.
# NOTE(review): `transaction_sink_config` is not assigned in any cell visible here —
# confirm where it should come from; as written this raises NameError on a fresh kernel.
tr_sql = build_tr_sql(transaction_source_config, transaction_sink_config)
tr_pipeline = PipelineBuilder(client, name = "hopsworks_kafka_tr", sql = tr_sql).create_or_replace()
# Start the Feldera pipeline.
tr_pipeline.start()