In [1]:
from kafka import KafkaConsumer
from pyflink.table import EnvironmentSettings, TableEnvironment

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch

In [4]:
# Preview the first ten records published on the "transaction" topic.
consumer = KafkaConsumer("transaction")
try:
    # zip() against range(10) bounds the consumer iterator to ten items;
    # each printed item is an (index, ConsumerRecord) pair.
    for msg in zip(range(10), consumer):
        print(msg)
finally:
    # Release the broker connection instead of leaving it open for the
    # lifetime of the kernel.
    consumer.close()

(0, ConsumerRecord(topic='transaction', partition=0, offset=90000, timestamp=1705686143809, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2012-02-23 00:10:01", "cc_num": 4428780000000000000, "merchant": "fraud_Olson, Becker and Koch", "category": "gas_transport", "amt": 82.55, "first": "Richard", "last": "Waters", "zipcode": "53186", "dob": "1/2/46", "trans_num": "dbf31d83eebdfe96d2fa213df2043586", "is_fraud": 0, "user_id": 7109464218691269943}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=337, serialized_header_size=-1))
(1, ConsumerRecord(topic='transaction', partition=0, offset=90001, timestamp=1705686143809, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2012-02-23 00:12:05", "cc_num": 6011490000000000, "merchant": "fraud_Schmitt Inc", "category": "gas_transport", "amt": 77.47, "first": "Gary", "last": "Barnes", "zipcode": "71762", "dob": "6/11/86", "trans_num": "8e2ab99602a3bc2ca943609b74b64871", "is_fraud": 0, "user_id

In [4]:
# Selects which Kafka bootstrap address the table definitions below use:
# "localhost:9092" when True, "kafka:29092" otherwise.
local = True

# 1. Create a TableEnvironment in streaming mode.
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# Run the job with a default parallelism of 1.
table_env.get_config().set("parallelism.default", "1")

# The `flink` backend does not create `TableEnvironment` objects; pass
# the `TableEnvironment` object created above to `ibis.flink.connect`.
connection = ibis.flink.connect(table_env)

# Flink's streaming connectors aren't part of the binary distribution.
# Link the Kafka connector by adding its JAR file. The statement is issued
# through the public `TableEnvironment.execute_sql` API rather than the
# backend's private `_exec_sql` helper; it runs against the same environment
# that `connection` wraps.
table_env.execute_sql("ADD JAR '../../flink-sql-connector-kafka-3.0.2-1.18.jar'")

<pyflink.table.table_result.TableResult at 0x7f07e58e0a00>

In [5]:
# 2. Create source table
source_topic_name = "transaction"
kafka_offset = "earliest-offset"

# Column names and types declared for the records read from the topic.
source_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "trans_date_trans_time": dt.timestamp(scale=3),
        "cc_num": dt.int64,
        "amt": dt.float64,
        "trans_num": dt.str,
        "merchant": dt.str,
        "category": dt.str,
        "is_fraud": dt.int32,
        "first": dt.str,
        "last": dt.str,
        "dob": dt.str,
        "zipcode": dt.str,
    }
)

# Pick the broker address that matches where the notebook is running.
if local:
    source_bootstrap_servers = "localhost:9092"
else:
    source_bootstrap_servers = "kafka:29092"

# Kafka connector properties for the source table.
source_configs = {
    "connector": "kafka",
    "topic": source_topic_name,
    "properties.bootstrap.servers": source_bootstrap_servers,
    "properties.group.id": "test",
    "scan.startup.mode": kafka_offset,
    "format": "json",
}

# Watermark on the transaction timestamp with a 15-second allowed delay.
source_watermark = ibis.watermark(
    time_col="trans_date_trans_time",
    allowed_delay=ibis.interval(seconds=15),
)

# Register the source table from the schema, connector properties and
# watermark defined above.
source_table = connection.create_table(
    source_topic_name,
    schema=source_schema,
    tbl_properties=source_configs,
    watermark=source_watermark,
)


In [6]:
# Evaluate the source table expression as the cell's last statement so the
# notebook displays it.
source_table

In [7]:
# 3. Feature generation using the Flink backend
# Every feature below is aggregated over the same trailing window: rows are
# partitioned by user_id, ordered by trans_date_trans_time, and the range
# spans the 360 minutes leading up to (and including) the current row.
last_360min_window = ibis.window(
    group_by=source_table.user_id,
    order_by=source_table.trans_date_trans_time,
    range=(-ibis.interval(minutes=360), 0),
)

user_trans_amt_last_360m_agg = source_table.select(
    source_table.user_id,
    # Maximum transaction amount over the window.
    source_table.amt.max()
    .over(last_360min_window)
    .name("user_max_trans_amt_last_360min"),
    # Minimum transaction amount over the window.
    source_table.amt.min()
    .over(last_360min_window)
    .name("user_min_trans_amt_last_360min"),
    # Mean transaction amount over the window.
    source_table.amt.mean()
    .over(last_360min_window)
    .name("user_mean_trans_amt_last_360min"),
    # Number of transactions over the window.
    source_table.amt.count()
    .over(last_360min_window)
    .name("user_trans_count_last_360min"),
    # Keep the event time alongside the features.
    source_table.trans_date_trans_time,
)

In [11]:
# 4. Create sink
sink_topic_name = "user_trans_amt_last_360min"

# One column per windowed feature, plus the user id and the event time.
sink_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "user_max_trans_amt_last_360min": dt.float64,
        "user_min_trans_amt_last_360min": dt.float64,
        "user_mean_trans_amt_last_360min": dt.float64,
        "user_trans_count_last_360min": dt.int64,
        # Used for a future temporal join.
        "trans_date_trans_time": dt.timestamp(scale=3),
    }
)

# Pick the broker address that matches where the notebook is running.
if local:
    sink_bootstrap_servers = "localhost:9092"
else:
    sink_bootstrap_servers = "kafka:29092"

# Kafka connector properties for writing results to the sink topic.
sink_configs = {
    "connector": "kafka",
    "topic": sink_topic_name,
    "properties.bootstrap.servers": sink_bootstrap_servers,
    # "debezium-json" is needed for a future temporal join.
    "format": "debezium-json",
}

# Register the sink table, replacing any existing definition of it.
sink_table = connection.create_table(
    sink_topic_name,
    schema=sink_schema,
    tbl_properties=sink_configs,
    overwrite=True,
)
sink_table

In [12]:
# 5. Emit query result to sink table
# Insert the windowed-aggregate expression into the table registered under
# `sink_topic_name`. The call is the cell's last expression, so its return
# value (a pyflink TableResult) is shown as the cell output.
connection.insert(sink_topic_name, user_trans_amt_last_360m_agg)



<pyflink.table.table_result.TableResult at 0x7f07dc5d3070>

In [13]:
if local:
    # Use the Kafka Python client to stream ten records from the sink topic.
    # Waiting on the consumer also keeps the script running; otherwise, the
    # mini cluster will shut down upon script completion.
    consumer = KafkaConsumer(sink_topic_name)
    try:
        # zip() against range(10) stops the loop after ten records; the
        # index is discarded and only the record is printed.
        for _, msg in zip(range(10), consumer):
            print(msg)
    finally:
        # Release the broker connection once the preview is done.
        consumer.close()

ConsumerRecord(topic='user_trans_amt_last_360min', partition=0, offset=498016, timestamp=1705623329919, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":7584046863221534119,"user_max_trans_amt_last_360min":9.28,"user_min_trans_amt_last_360min":9.28,"user_mean_trans_amt_last_360min":9.28,"user_trans_count_last_360min":1,"trans_date_trans_time":"2012-03-05 10:02:42"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=258, serialized_header_size=-1)
ConsumerRecord(topic='user_trans_amt_last_360min', partition=0, offset=498017, timestamp=1705623329919, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-227411383219759975,"user_max_trans_amt_last_360min":238.98,"user_min_trans_amt_last_360min":129.62,"user_mean_trans_amt_last_360min":184.3,"user_trans_count_last_360min":2,"trans_date_trans_time":"2012-03-05 10:05:20"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=263, serialized