In [1]:
from kafka import KafkaConsumer
from pyflink.table import EnvironmentSettings, TableEnvironment

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch

In [2]:
# Peek at the raw source topic: print the first ten (index, record) pairs.
# `zip(range(10), ...)` stops the otherwise endless consumer iteration after
# ten records.
consumer = KafkaConsumer("transaction")
try:
    for msg in zip(range(10), consumer):
        print(msg)
finally:
    # Close the client once the preview is done (or if the loop is
    # interrupted) so its broker connections are not left open.
    consumer.close()

(0, ConsumerRecord(topic='transaction', partition=0, offset=2871294, timestamp=1705710502529, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2013-02-23 21:45:37", "cc_num": 3583640000000000, "merchant": "fraud_Nolan-Williamson", "category": "kids_pets", "amt": 46.83, "first": "Crystal", "last": "Gamble", "zipcode": "19149", "dob": "1/1/85", "trans_num": "937dd8f9ed831fd6f589185540253c03", "is_fraud": 0, "user_id": 3305555113370964802}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=324, serialized_header_size=-1))
(1, ConsumerRecord(topic='transaction', partition=0, offset=2871295, timestamp=1705710502530, timestamp_type=0, key=None, value=b'{"trans_date_trans_time": "2013-02-23 21:46:14", "cc_num": 3568260000000000, "merchant": "fraud_Johns-Hoeger", "category": "entertainment", "amt": 107.75, "first": "Timothy", "last": "Gomez", "zipcode": "46765", "dob": "11/24/94", "trans_num": "575eb60c60d07f27f84281affbe0dd7b", "is_fraud": 0, "user_id": 4

In [3]:
# Toggle between a run against a broker on localhost and a run against the
# `kafka:29092` address; the connector configurations below read this flag.
local = True

# 1. Create a TableEnvironment in streaming mode.
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# Set the default parallelism to 1.
table_env.get_config().set("parallelism.default", "1")

# The `flink` backend does not create `TableEnvironment` objects; pass
# the `TableEnvironment` object created above to `ibis.flink.connect`.
connection = ibis.flink.connect(table_env)

# Flink's streaming connectors aren't part of the binary distribution.
# Link the Kafka connector by adding its JAR file. The statement is run
# through the public `TableEnvironment.execute_sql` API on the environment the
# connection wraps, instead of the backend's private `_exec_sql` helper.
table_env.execute_sql("ADD JAR '../flink-sql-connector-kafka-3.0.2-1.18.jar'")

<pyflink.table.table_result.TableResult at 0x7f5229618340>

In [4]:
# 2. Create source table
source_topic_name = "transaction"
kafka_offset = "earliest-offset"

# Column names and types declared for the messages on the source topic.
source_schema = sch.Schema(
    dict(
        user_id=dt.int64,
        trans_date_trans_time=dt.timestamp(scale=3),
        cc_num=dt.int64,
        amt=dt.float64,
        trans_num=dt.str,
        merchant=dt.str,
        category=dt.str,
        is_fraud=dt.int32,
        first=dt.str,
        last=dt.str,
        dob=dt.str,
        zipcode=dt.str,
    )
)

# The broker address depends on the `local` flag.
if local:
    source_bootstrap_servers = "localhost:9092"
else:
    source_bootstrap_servers = "kafka:29092"

# Kafka connector properties for the source table.
source_configs = {
    "connector": "kafka",
    "topic": source_topic_name,
    "properties.bootstrap.servers": source_bootstrap_servers,
    "properties.group.id": "test",
    "scan.startup.mode": kafka_offset,
    "format": "json",
}

# Watermark on the transaction timestamp with a 15-second allowed delay.
source_watermark = ibis.watermark(
    time_col="trans_date_trans_time",
    allowed_delay=ibis.interval(seconds=15),
)

# Register the source table from the schema, connector properties and
# watermark defined above.
source_table = connection.create_table(
    source_topic_name,
    schema=source_schema,
    tbl_properties=source_configs,
    watermark=source_watermark,
)


In [5]:
# Bare expression as the last line of the cell: shows the source table
# expression as the cell's output.
source_table

In [6]:
# 3. Feature Generation using Flink backend
# Compute per-user rolling statistics of the transaction amount over the
# trailing 360 minutes. The window is partitioned by user_id, ordered by
# trans_date_trans_time, and ranges from 360 minutes before the current row
# up to the current row. It is defined once and shared by every aggregate so
# the four features cannot drift apart.
last_360min_window = ibis.window(
    group_by=source_table.user_id,
    order_by=source_table.trans_date_trans_time,
    range=(-ibis.interval(minutes=360), 0),
)

user_trans_amt_last_360m_agg = source_table.select(
    source_table.user_id,
    # Maximum transaction amount over the window.
    source_table.amt.max()
    .over(last_360min_window)
    .name("user_max_trans_amt_last_360min"),
    # Minimum transaction amount over the window.
    source_table.amt.min()
    .over(last_360min_window)
    .name("user_min_trans_amt_last_360min"),
    # Mean transaction amount over the window.
    source_table.amt.mean()
    .over(last_360min_window)
    .name("user_mean_trans_amt_last_360min"),
    # Number of transactions over the window.
    source_table.amt.count()
    .over(last_360min_window)
    .name("user_trans_count_last_360min"),
    # Keep the event timestamp alongside the features.
    source_table.trans_date_trans_time,
)

In [8]:
# 4. Create the sink table
sink_topic_name = "user_trans_amt_last_360min"

# Sink columns: the user key, the four 360-minute features, and the
# transaction timestamp.
sink_schema = sch.Schema(
    dict(
        user_id=dt.int64,
        user_max_trans_amt_last_360min=dt.float64,
        user_min_trans_amt_last_360min=dt.float64,
        user_mean_trans_amt_last_360min=dt.float64,
        user_trans_count_last_360min=dt.int64,
        # Used for a future temporal join.
        trans_date_trans_time=dt.timestamp(scale=3),
    )
)

# The broker address depends on the `local` flag.
if local:
    sink_bootstrap_servers = "localhost:9092"
else:
    sink_bootstrap_servers = "kafka:29092"

# Kafka connector properties for writing results.
sink_configs = {
    "connector": "kafka",
    "topic": sink_topic_name,
    "properties.bootstrap.servers": sink_bootstrap_servers,
    # "debezium-json" is needed for a future temporal join.
    "format": "debezium-json",
}

# Create the sink table (replacing any existing definition) and display it.
sink_table = connection.create_table(
    sink_topic_name,
    schema=sink_schema,
    tbl_properties=sink_configs,
    overwrite=True,
)
sink_table

In [9]:
# 5. Emit query result to sink table
# Inserts the `user_trans_amt_last_360m_agg` expression into the table named by
# `sink_topic_name`; the call's return value is shown as the cell output.
# NOTE: `sink_topic_name` is reassigned by a later cell, so this cell must run
# before that one to target the intended sink.
connection.insert(sink_topic_name, user_trans_amt_last_360m_agg)



<pyflink.table.table_result.TableResult at 0x7f5228232bc0>

In [10]:
if local:
    # Use the Kafka Python client to stream records from the sink topic.
    # Otherwise, the mini cluster will shut down upon script completion.
    consumer = KafkaConsumer(sink_topic_name)
    try:
        # `zip(range(10), ...)` stops the consumer iteration after ten records.
        for _, msg in zip(range(10), consumer):
            print(msg)
    finally:
        # Close the client so its broker connections are not left open when
        # the loop finishes or is interrupted.
        consumer.close()

ConsumerRecord(topic='user_trans_amt_last_360min', partition=0, offset=3582329, timestamp=1705710539783, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":-9066734533508613396,"user_max_trans_amt_last_360min":62.07,"user_min_trans_amt_last_360min":6.33,"user_mean_trans_amt_last_360min":25.04999999999999,"user_trans_count_last_360min":5,"trans_date_trans_time":"2012-03-10 19:39:43"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=273, serialized_header_size=-1)
ConsumerRecord(topic='user_trans_amt_last_360min', partition=0, offset=3582330, timestamp=1705710539783, timestamp_type=0, key=None, value=b'{"before":null,"after":{"user_id":2858219371830316374,"user_max_trans_amt_last_360min":309.96,"user_min_trans_amt_last_360min":4.43,"user_mean_trans_amt_last_360min":107.69666666666666,"user_trans_count_last_360min":3,"trans_date_trans_time":"2012-03-10 19:40:32"},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialize

## Alternative ways to calculate window aggregations

In [15]:
# Different window options:
#   1. tumble(): windows have a fixed size and do not overlap.
#   2. hop(): windows have a fixed size and can overlap if the slide is
#      smaller than the window size.
#   3. cumulate(): windows don't have a fixed size and do overlap.
#
# Here the stream is split into 360-minute tumbling windows keyed on the
# transaction timestamp.
windowed_stream = source_table.window_by(
    time_col=source_table.trans_date_trans_time,
).tumble(window_size=ibis.interval(minutes=360))

# Aggregate the transaction amount per user within each window.
user_trans_amt_last_360m_agg_windowed_stream = windowed_stream.group_by(
    ["window_start", "window_end", "user_id"]
).agg(
    user_max_trans_amt_last_360min=windowed_stream.amt.max(),
    user_min_trans_amt_last_360min=windowed_stream.amt.min(),
    user_mean_trans_amt_last_360min=windowed_stream.amt.mean(),
)

In [16]:
# 4. Create the sink table for the windowed aggregation
sink_topic_name = "user_trans_amt_last_360min_windowed"

# Sink columns: the window bounds, the user key, and the three per-window
# amount statistics.
sink_schema = sch.Schema(
    dict(
        # Used for a future temporal join.
        window_start=dt.timestamp(scale=3),
        # Used for a future temporal join.
        window_end=dt.timestamp(scale=3),
        user_id=dt.int64,
        user_max_trans_amt_last_360min=dt.float64,
        user_min_trans_amt_last_360min=dt.float64,
        user_mean_trans_amt_last_360min=dt.float64,
    )
)

# The broker address depends on the `local` flag.
if local:
    sink_bootstrap_servers = "localhost:9092"
else:
    sink_bootstrap_servers = "kafka:29092"

# Kafka connector properties for writing results.
sink_configs = {
    "connector": "kafka",
    "topic": sink_topic_name,
    "properties.bootstrap.servers": sink_bootstrap_servers,
    # "debezium-json" is needed for a future temporal join.
    "format": "debezium-json",
}

# Create the sink table (replacing any existing definition) and display it.
sink_table = connection.create_table(
    sink_topic_name,
    schema=sink_schema,
    tbl_properties=sink_configs,
    overwrite=True,
)
sink_table

In [17]:
# 5. Emit query result to sink table
# Inserts the `user_trans_amt_last_360m_agg_windowed_stream` expression into
# the table named by `sink_topic_name`; the call's return value is shown as
# the cell output.
connection.insert(sink_topic_name, user_trans_amt_last_360m_agg_windowed_stream)

<pyflink.table.table_result.TableResult at 0x7f5228127880>

In [14]:
if local:
    # Use the Kafka Python client to stream records from the sink topic.
    # Otherwise, the mini cluster will shut down upon script completion.
    consumer = KafkaConsumer(sink_topic_name)
    try:
        # `zip(range(10), ...)` stops the consumer iteration after ten records.
        for _, msg in zip(range(10), consumer):
            print(msg)
    finally:
        # Close the client so its broker connections are not left open when
        # the loop finishes or is interrupted.
        consumer.close()

ConsumerRecord(topic='user_trans_amt_last_360min_windowed', partition=0, offset=1154519, timestamp=1705710553974, timestamp_type=0, key=None, value=b'{"before":null,"after":{"window_start":"2012-04-17 12:00:00","window_end":"2012-04-17 18:00:00","user_id":5430991157310724696,"user_max_trans_amt_last_360min":3.6,"user_min_trans_amt_last_360min":1.69,"user_mean_trans_amt_last_360min":2.645},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=251, serialized_header_size=-1)
ConsumerRecord(topic='user_trans_amt_last_360min_windowed', partition=0, offset=1154520, timestamp=1705710553974, timestamp_type=0, key=None, value=b'{"before":null,"after":{"window_start":"2012-04-17 12:00:00","window_end":"2012-04-17 18:00:00","user_id":542587461304505579,"user_max_trans_amt_last_360min":84.02,"user_min_trans_amt_last_360min":84.02,"user_mean_trans_amt_last_360min":84.02},"op":"c"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=253, seriali