In [1]:
from kafka import KafkaConsumer
from pyflink.table import EnvironmentSettings, TableEnvironment

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch

In [4]:
consumer = KafkaConsumer("transaction")
for msg in zip(range(10), consumer):
    print(msg)

ValueError: Invalid file object: None

In [5]:
local = True

# 1. create a TableEnvironment
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# write all the data to one file
table_env.get_config().set("parallelism.default", "1")

# The `flink` backend does not create `TableEnvironment` objects; pass
# the `TableEnvironment` object created above to `ibis.flink.connect`.
connection = ibis.flink.connect(table_env)

# Flink’s streaming connectors aren't part of the binary distribution.
# Link the Kafka connector for cluster execution by adding a JAR file.
connection._exec_sql("ADD JAR '../../flink-sql-connector-kafka-3.0.2-1.18.jar'")

<pyflink.table.table_result.TableResult at 0x7f2465798c10>

In [6]:
# 2. Create source table
source_topic_name = "transaction"
kafka_offset = "earliest-offset"
source_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "trans_date_trans_time": dt.timestamp(scale=3),
        "cc_num": dt.int64,
        "amt": dt.float64,
        "trans_num": dt.str,
        "merchant": dt.str,
        "category": dt.str ,      
        "is_fraud": dt.int32,
        "first": dt.str,
        "last": dt.str,
        "dob": dt.str,
        "zipcode": dt.str,
    }
)

# Configure the source table with Kafka connector properties.
source_configs = {
    "connector": "kafka",
    "topic": source_topic_name,
    "properties.bootstrap.servers": "localhost:9092" if local else "kafka:29092",
    "properties.group.id": "test",
    "scan.startup.mode": kafka_offset,
    "format": "json",
}

# Create the source table using the defined schema, Kafka connector properties,
# and set watermarking for real-time processing with a 15-second delay.
source_table = connection.create_table(
    source_topic_name,
    schema=source_schema,
    tbl_properties=source_configs,
    watermark=ibis.watermark(
        time_col="trans_date_trans_time", allowed_delay=ibis.interval(seconds=15)
    ),
)


In [7]:
source_table

In [9]:
# 3. Feature Generation using Flink backend
# Define a window specification for aggregating maximum transaction amount over the last 5 minutes.
# The aggregation is partitioned by user_id and ordered by trans_date_trans_time.
# The window range is set to the interval from 5 minutes ago to the current time.
user_trans_amt_agg = source_table[
    source_table.user_id,
    # Calculate the maximum transaction amount over the specified window.
    source_table.amt.max().over(
        ibis.window(
            group_by=source_table.user_id,
            order_by=source_table.trans_date_trans_time,
            range=(-ibis.interval(minutes=360), 0),
        )
    ).name("user_max_trans_amt_last_60min"),
    # Calculate the min transaction amount over the specified window.
    source_table.amt.min().over(
        ibis.window(
            group_by=source_table.user_id,
            order_by=source_table.trans_date_trans_time,
            range=(-ibis.interval(minutes=360), 0),
        )
    ).name("user_min_trans_amt_last_60min"),
    # Calculate the average transaction amount over the specified window.
    source_table.amt.mean().over(
        ibis.window(
            group_by=source_table.user_id,
            order_by=source_table.trans_date_trans_time,
            range=(-ibis.interval(minutes=360), 0),
        )
    ).name("user_mean_trans_amt_last_60min"),
    source_table.trans_date_trans_time
]

In [12]:
# 4. Creat Sink
sink_topic_name = "user_trans_amt_last_360min"
sink_schema = sch.Schema(
    {
        "user_id": dt.int64,
        "user_max_trans_amt_last_5min": dt.float64,
        "user_min_trans_amt_last_5min": dt.float64,
        "trans_date_trans_time": dt.timestamp(scale=3), # used for future temporal join
    }
)

# Configure the sink table with Kafka connector properties for writing results.
sink_configs = {
    "connector": "kafka",
    "topic": sink_topic_name,
    "properties.bootstrap.servers": "localhost:9092" if local else "kafka:29092",
    "format": "debezium-json", # "debezium-json" is needed for future temporal join.
}

sink_table = connection.create_table(
    sink_topic_name, schema=sink_schema, tbl_properties=sink_configs, overwrite=True
)
sink_table

In [13]:
# 5. Emit query result to sink table
connection.insert(sink_topic_name, user_max_trans_amt_last_5min)



<pyflink.table.table_result.TableResult at 0x7f24b0b52fe0>

In [14]:
if local:
    # Use the Kafka Python client to stream records from the sink topic.
    # Otherwise, the mini cluster will shut down upon script completion.
    consumer = KafkaConsumer(sink_topic_name)
    for _, msg in zip(range(10), consumer):
        print(msg)

ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1444, timestamp=1705533214152, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-5404855628358389721,"user_max_trans_amt_last_5min":96.29,"user_min_trans_amt_last_5min":96.29,"trans_date_trans_time":"2024-01-17 23:13:18.824"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=189, serialized_header_size=-1)
ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1445, timestamp=1705533216354, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-4479665259876654427,"user_max_trans_amt_last_5min":7.77,"user_min_trans_amt_last_5min":7.77,"trans_date_trans_time":"2024-01-17 23:13:19.525"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=187, serialized_header_size=-1)
ConsumerRecord(topic='user_max_trans_amt_last_5min', partition=0, offset=1446, timestamp=1705533216354, timestamp_type=0, key=No