In [1]:
from kafka import KafkaConsumer
from pyflink.table import EnvironmentSettings, TableEnvironment

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch

In [2]:
# Peek at the first 10 records on the raw "transaction" topic.
consumer = KafkaConsumer("transaction")
try:
    # Each item is an (index, ConsumerRecord) pair. zip() stops as soon as
    # range(10) is exhausted, so an 11th record is never requested.
    for msg in zip(range(10), consumer):
        print(msg)
finally:
    # Release the consumer's network resources even if the cell is
    # interrupted while waiting for records.
    consumer.close()

(0, ConsumerRecord(topic='transaction', partition=0, offset=1138, timestamp=1705451256257, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2024-01-17 00:27:36.257", "cc_num": 4099710000000000, "merchant": "fraud_Thiel PLC", "category": "misc_pos", "amt": 3.61, "first": "James", "last": "Stephens", "zipcode": "22968", "dob": "7/7/75", "trans_num": "f12f60901509562f538b1d3fa7cd5372", "is_fraud": 0, "user_id": 4662397635715803005}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=319, serialized_header_size=-1))
(1, ConsumerRecord(topic='transaction', partition=0, offset=1139, timestamp=1705451256358, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2024-01-17 00:27:36.358", "cc_num": 4908850000000000, "merchant": "fraud_White and Sons", "category": "home", "amt": 14.13, "first": "Lauren", "last": "Torres", "zipcode": "76050", "dob": "7/24/92", "trans_num": "7a76dfb0f8e26cc30f9ba00b27d51b22", "is_fraud": 0, "user_id": -5845256892773176

In [3]:
# Toggle between a local run (Kafka on localhost) and the "kafka" host.
local = True

# 1. Create a TableEnvironment in streaming mode.
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# Use a default parallelism of 1 so that all the data is written to one file.
table_env.get_config().set("parallelism.default", "1")

# The `flink` backend does not create `TableEnvironment` objects; pass
# the `TableEnvironment` object created above to `ibis.flink.connect`.
connection = ibis.flink.connect(table_env)

# Flink's streaming connectors aren't part of the binary distribution.
# Link the Kafka connector for cluster execution by adding a JAR file.
# The statement is issued through the public PyFlink API on the same
# TableEnvironment that backs `connection`, instead of through the ibis
# backend's private `_exec_sql` helper.
table_env.execute_sql("ADD JAR '../../flink-sql-connector-kafka-3.0.2-1.18.jar'")

<pyflink.table.table_result.TableResult at 0x7efd5c710c40>

In [4]:
# 2. Create source table
source_topic_name = "transaction"
kafka_offset = "earliest-offset"

# Kafka connector properties for reading the source topic as JSON.
source_configs = {
    "connector": "kafka",
    "topic": source_topic_name,
    "properties.bootstrap.servers": "localhost:9092" if local else "kafka:29092",
    "properties.group.id": "test",
    "scan.startup.mode": kafka_offset,
    "format": "json",
}

# Column names and types declared for the source table.
source_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "trans_date_trans_time": dt.timestamp(scale=3),
        "cc_num": dt.int64,
        "amt": dt.float64,
        "trans_num": dt.str,
        "merchant": dt.str,
        "category": dt.str,
        "is_fraud": dt.int32,
        "first": dt.str,
        "last": dt.str,
        "dob": dt.str,
        "zipcode": dt.str,
    }
)

# Watermark on the transaction timestamp with a 15-second allowed delay.
source_watermark = ibis.watermark(
    time_col="trans_date_trans_time",
    allowed_delay=ibis.interval(seconds=15),
)

# Register the source table from the schema, connector properties and
# watermark defined above.
source_table = connection.create_table(
    source_topic_name,
    schema=source_schema,
    tbl_properties=source_configs,
    watermark=source_watermark,
)


In [5]:
# Show the source table expression returned by `connection.create_table`;
# as the last expression in the cell, it is rendered as the cell's output.
source_table

In [6]:
# 3. Feature Generation using Flink backend
# Window specification for aggregating over the last 5 minutes.
# The aggregation is partitioned by user_id and ordered by trans_date_trans_time.
# The window range is set to the interval from 5 minutes ago to the current time.
last_5min_per_user = ibis.window(
    group_by=source_table.user_id,
    order_by=source_table.trans_date_trans_time,
    range=(-ibis.interval(minutes=5), 0),
)

# Project exactly the three columns declared by the sink table schema:
# user_id, the max-amount feature, and the event timestamp.
#
# NOTE: every selected column needs a unique name. A second aggregate such as
# `amt.min()` must not reuse "user_max_trans_amt_last_5min"; give it its own
# name (e.g. "user_min_trans_amt_last_5min") and add a matching column to the
# sink schema before inserting.
user_max_trans_amt_last_5min = source_table.select(
    source_table.user_id,
    # Maximum transaction amount over the specified window.
    source_table.amt.max()
    .over(last_5min_per_user)
    .name("user_max_trans_amt_last_5min"),
    source_table.trans_date_trans_time,
)
user_max_trans_amt_last_5min

In [7]:
# 4. Create sink table
sink_topic_name = "user_max_trans_amt_last_5min"

# Kafka connector properties used when writing results to the sink topic.
sink_configs = {
    "connector": "kafka",
    "topic": sink_topic_name,
    "properties.bootstrap.servers": "localhost:9092" if local else "kafka:29092",
    # "debezium-json" is needed for future temporal join.
    "format": "debezium-json",
}

# Column names and types declared for the sink table.
sink_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "user_max_trans_amt_last_5min": dt.float64,
        # used for future temporal join
        "trans_date_trans_time": dt.timestamp(scale=3),
    }
)

# Register the sink table and display the resulting table expression.
sink_table = connection.create_table(
    sink_topic_name,
    schema=sink_schema,
    tbl_properties=sink_configs,
)
sink_table

In [8]:
# 5. Emit query result to sink table
# Insert the rows produced by the `user_max_trans_amt_last_5min` expression
# into the table registered under `sink_topic_name`. The call's return value
# is the last expression in the cell, so it is shown as the cell's output.
connection.insert(sink_topic_name, user_max_trans_amt_last_5min)



<pyflink.table.table_result.TableResult at 0x7efd5c505300>

In [9]:
if local:
    # Use the Kafka Python client to stream records from the sink topic.
    # Otherwise, the mini cluster will shut down upon script completion.
    consumer = KafkaConsumer(sink_topic_name)
    try:
        # zip() stops as soon as range(10) is exhausted, so only the first
        # 10 records are read and an 11th is never requested.
        for _, msg in zip(range(10), consumer):
            print(msg)
    finally:
        # Release the consumer's network resources even if the cell is
        # interrupted while waiting for records.
        consumer.close()

ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1340, timestamp=1705451276669, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-7259942366803278983,"user_max_trans_amt_last_5min":74.41,"trans_date_trans_time":"2024-01-17 00:27:41.494"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=152, serialized_header_size=-1)
ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1341, timestamp=1705451276669, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-7880314530316364770,"user_max_trans_amt_last_5min":41.46,"trans_date_trans_time":"2024-01-17 00:27:41.594"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=152, serialized_header_size=-1)
ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1342, timestamp=1705451276869, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":4229273253148774328,"user_