In [1]:
import sys

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch
from kafka import KafkaConsumer, TopicPartition
from pyflink.table import EnvironmentSettings, TableEnvironment
import pandas as pd
import json

In [61]:
from pathlib import Path; import pyflink
print(Path(pyflink.__spec__.origin).parent / "bin" / "flink")

/home/codespace/.local/lib/python3.10/site-packages/pyflink/bin/flink


In [66]:
# !/home/codespace/.local/lib/python3.10/site-packages/pyflink/bin/flink run --jobmanager localhost:8081 --python ../create_multiple_streaming_features_from_yaml.py

Run this python script to produce multiple streaming features to kafka
```python
python src/create_single_streaming_feature.py
```

In [None]:
local = False
bootstrap_servers =  "localhost:9092" if local else "kafka:29092"

In [2]:
consumer = KafkaConsumer()
consumer.topics()

{'category_amt_max_1m',
 'category_amt_max_5m',
 'category_amt_max_60m',
 'category_amt_min_1m',
 'category_amt_min_5m',
 'category_amt_min_60m',
 'category_amt_sum_1m',
 'category_amt_sum_5m',
 'category_amt_sum_60m',
 'cc_num_amt_max_1m',
 'cc_num_amt_max_5m',
 'cc_num_amt_max_60m',
 'cc_num_amt_min_1m',
 'cc_num_amt_min_5m',
 'cc_num_amt_min_60m',
 'cc_num_amt_sum_1m',
 'cc_num_amt_sum_5m',
 'cc_num_amt_sum_60m',
 'merchant_amt_max_1m',
 'merchant_amt_max_5m',
 'merchant_amt_max_60m',
 'merchant_amt_min_1m',
 'merchant_amt_min_5m',
 'merchant_amt_min_60m',
 'merchant_amt_sum_1m',
 'merchant_amt_sum_5m',
 'merchant_amt_sum_60m',
 'payment_msg',
 'sink',
 'train_data',
 'transaction',
 'transaction_sink',
 'user_amt_max_1m',
 'user_amt_max_5m',
 'user_amt_max_60m',
 'user_amt_min_1m',
 'user_amt_min_5m',
 'user_amt_min_60m',
 'user_amt_sum_1m',
 'user_amt_sum_5m',
 'user_amt_sum_60m',
 'user_max_trans_amt_last_5min',
 'zipcode_amt_max_1m',
 'zipcode_amt_max_5m',
 'zipcode_amt_max_60m'

In [43]:
# Specify the topic and partition
topic = 'user_amt_max_5m'
partition = 0  # Replace with the partition number you want to set the offset for

# Create a KafkaConsumer
consumer = KafkaConsumer()
consumer.assign([TopicPartition(topic, partition)])
# Set the offset to the earliest available offset
consumer.seek_to_beginning(TopicPartition(topic, partition))
consumer.poll(200)

{}

In [44]:
import sys
sys.path.append('../src')
from utils.util import create_and_connect_kafka_table, create_schema

In [64]:
local = True

# 1. create a TableEnvironment
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# write all the data to one file
table_env.get_config().set("parallelism.default", "1")

# The `flink` backend does not create `TableEnvironment` objects; pass
# the `TableEnvironment` object created above to `ibis.flink.connect`.
connection = ibis.flink.connect(table_env)

# Flink’s streaming connectors aren't part of the binary distribution.
# Link the Kafka connector for cluster execution by adding a JAR file.
connection._exec_sql("ADD JAR '../../flink-sql-connector-kafka-3.0.2-1.18.jar'")

<pyflink.table.table_result.TableResult at 0x7f96eaae1180>

In [46]:
from schemas.source_schema import CreditCardTransaction
t = create_and_connect_kafka_table(
        connection=connection,
        topic_name="transaction",
        schema=CreditCardTransaction.__annotations__,
        time_col="trans_date_trans_time",
        kafka_data_format="json"
    )

In [47]:
t

In [48]:
original_cols = t.columns

In [49]:
import yaml
features_config_list = []
with open("../features/features.yaml", 'r') as file:
    features_config = yaml.load_all(file, Loader=yaml.FullLoader)
    for conf in features_config:
            
        features_config_list.append(conf)

In [50]:
col_names = []
for feature in features_config_list:
        for name_prefix, conf in feature.items():
            for agg_func in conf['agg_func']:

                primary_keys = conf['primary_keys']
                agg_col = conf['agg_col']
                time_col = conf['time_col']

                for interval_in_minutes in conf['interval_in_minutes']:              
                    name = f"{name_prefix}_{agg_func}_{interval_in_minutes}m"
                    schema = create_schema(name_prefix, primary_keys, agg_func, time_col, interval_in_minutes)
                    print(f"name = {name}")
                    sink_primary_key = [keys['name'] for keys in primary_keys][0]
                    f = create_and_connect_kafka_table(connection, name, schema, time_col, sink_primary_key)
                    rname = "{name}_" + f"{name}"
                    t  = t.asof_join(f, predicates=[t[time_col] >= f[time_col], t[sink_primary_key] == f[sink_primary_key]], rname=rname)
                    col_names.append(name)



name = cc_num_amt_sum_1m
name = cc_num_amt_sum_5m
name = cc_num_amt_min_1m
name = cc_num_amt_min_5m
name = cc_num_amt_max_1m
name = cc_num_amt_max_5m
name = user_amt_sum_1m
name = user_amt_sum_5m
name = user_amt_min_1m
name = user_amt_min_5m
name = user_amt_max_1m
name = user_amt_max_5m
name = zipcode_amt_sum_1m
name = zipcode_amt_sum_5m
name = zipcode_amt_min_1m
name = zipcode_amt_min_5m
name = zipcode_amt_max_1m
name = zipcode_amt_max_5m
name = merchant_amt_sum_1m
name = merchant_amt_sum_5m
name = merchant_amt_min_1m
name = merchant_amt_min_5m
name = merchant_amt_max_1m
name = merchant_amt_max_5m
name = category_amt_sum_1m
name = category_amt_sum_5m
name = category_amt_min_1m
name = category_amt_min_5m
name = category_amt_max_1m
name = category_amt_max_5m


In [51]:
col_names

['cc_num_amt_sum_1m',
 'cc_num_amt_sum_5m',
 'cc_num_amt_min_1m',
 'cc_num_amt_min_5m',
 'cc_num_amt_max_1m',
 'cc_num_amt_max_5m',
 'user_amt_sum_1m',
 'user_amt_sum_5m',
 'user_amt_min_1m',
 'user_amt_min_5m',
 'user_amt_max_1m',
 'user_amt_max_5m',
 'zipcode_amt_sum_1m',
 'zipcode_amt_sum_5m',
 'zipcode_amt_min_1m',
 'zipcode_amt_min_5m',
 'zipcode_amt_max_1m',
 'zipcode_amt_max_5m',
 'merchant_amt_sum_1m',
 'merchant_amt_sum_5m',
 'merchant_amt_min_1m',
 'merchant_amt_min_5m',
 'merchant_amt_max_1m',
 'merchant_amt_max_5m',
 'category_amt_sum_1m',
 'category_amt_sum_5m',
 'category_amt_min_1m',
 'category_amt_min_5m',
 'category_amt_max_1m',
 'category_amt_max_5m']

In [52]:
train_data = t[original_cols + col_names ]

In [53]:
class CreditCardTransaction:
    trans_num: dt.str # primary key
    user_id: dt.int64 # user_id
    cc_num: dt.int64 # creadit card number
    amt: dt.float64 # credit card transaction amount
    merchant: dt.str
    category: dt.str     
    is_fraud: dt.int32 # Fraud Label
    first: dt.str # first name
    last: dt.str # last name
    dob: dt.str # date of birth
    zipcode: dt.str
    trans_date_trans_time: dt.timestamp(scale=3)

In [54]:
sink_schema = sch.Schema(
    {
        "trans_num": dt.str,
        "user_id": dt.int64,
        "cc_num": dt.int64,
        "amt": dt.float64,
        "merchant": dt.str,
        "category": dt.str ,      
        "is_fraud": dt.int32,
        "first": dt.str,
        "last": dt.str,
        "dob": dt.str,
        "zipcode": dt.str,
        "trans_date_trans_time": dt.timestamp(scale=3),
        'cc_num_amt_sum_5m': dt.float64,
        'cc_num_amt_sum_60m': dt.float64,
        'cc_num_amt_min_5m': dt.float64,
        'cc_num_amt_min_60m': dt.float64,
        'cc_num_amt_max_5m': dt.float64,
        'cc_num_amt_max_60m': dt.float64,
        'user_amt_sum_5m': dt.float64,
        'user_amt_sum_60m': dt.float64,
        'user_amt_min_5m': dt.float64,
        'user_amt_min_60m': dt.float64,
        'user_amt_max_5m': dt.float64,
        'user_amt_max_60m': dt.float64,
        'zipcode_amt_sum_5m': dt.float64,
        'zipcode_amt_sum_60m': dt.float64,
        'zipcode_amt_min_5m': dt.float64,
        'zipcode_amt_min_60m': dt.float64,
        'zipcode_amt_max_5m': dt.float64,
        'zipcode_amt_max_60m': dt.float64,
        'merchant_amt_sum_5m': dt.float64,
        'merchant_amt_sum_60m': dt.float64,
        'merchant_amt_min_5m': dt.float64,
        'merchant_amt_min_60m': dt.float64,
        'merchant_amt_max_5m': dt.float64,
        'merchant_amt_max_60m': dt.float64,
        'category_amt_sum_5m': dt.float64,
        'category_amt_sum_60m': dt.float64,
        'category_amt_min_5m': dt.float64,
        'category_amt_min_60m': dt.float64,
        'category_amt_max_5m': dt.float64,
        'category_amt_max_60m': dt.float64,
    }
)

sink_configs = {
    "connector": "kafka",
    "topic": "train_data",
    "properties.bootstrap.servers": "localhost:9092" if local else "kafka:29092",
    "scan.startup.mode": "earliest-offset",
    # "format": "json",
    "format": "debezium-json",
}

t0 = connection.create_table(
    "train_data",
    schema=sink_schema,
    tbl_properties=sink_configs,
    watermark=ibis.watermark(
        time_col="trans_date_trans_time", allowed_delay=ibis.interval(seconds=15)
    ),
    primary_key = "trans_num",
    overwrite=True
)

In [55]:
t0

In [56]:
connection.insert("train_data", train_data)

<pyflink.table.table_result.TableResult at 0x7f96eaab6f20>

In [57]:

# Use the Kafka Python client to stream records from the sink topic.
# Otherwise, the mini cluster will shut down upon script completion.
consumer = KafkaConsumer("train_data")
rows = []
for _, msg in zip(range(1000), consumer):
    print(msg)
    rows.append(msg)

KeyboardInterrupt: 

In [None]:
import json

import pandas as pd

df = pd.DataFrame([json.loads(row.value.decode('utf-8'))['after'] for row in rows])

df

Unnamed: 0,trans_num,user_id,cc_num,amt,merchant,category,is_fraud,first,last,dob,...,merchant_amt_min_5m,merchant_amt_min_60m,merchant_amt_max_5m,merchant_amt_max_60m,category_amt_sum_5m,category_amt_sum_60m,category_amt_min_5m,category_amt_min_60m,category_amt_max_5m,category_amt_max_60m
0,0b242abb623afc578575680df30655b9,4800525036158247729,2703190000000000,4.97,"fraud_Rippin, Kub and Mann",misc_net,0,Jennifer,Banks,3/9/88,...,4.97,4.97,4.97,4.97,4.97,4.97,4.97,4.97,4.97,4.97
1,1f76529f8574734946361c461b024d99,-4687780397107955484,630423000000,107.23,"fraud_Heller, Gutmann and Zieme",grocery_pos,0,Stephanie,Gill,6/21/78,...,107.23,107.23,107.23,107.23,107.23,107.23,107.23,107.23,107.23,107.23
2,cb598ec00d349dc95e7b3c18de5ae800,929063974751684726,4599740000000000000,82.80,"fraud_Heller, Gutmann and Zieme",grocery_pos,0,Mary,Myers,12/30/64,...,82.80,82.80,107.23,107.23,190.03,190.03,82.80,82.80,107.23,107.23
3,a1a22d70485983eac12b5b88dad1cf95,1615055666352648205,38859500000000,220.11,fraud_Lind-Buckridge,entertainment,0,Edward,Sanchez,1/19/62,...,220.11,220.11,220.11,220.11,220.11,220.11,220.11,220.11,220.11,220.11
4,6b849c168bdad6f867558c3793159a81,2274729778245383962,3534090000000000,45.00,"fraud_Kutch, Hermiston and Farrell",gas_transport,0,Jeremy,White,1/12/67,...,45.00,45.00,45.00,45.00,45.00,45.00,45.00,45.00,45.00,45.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,740ba21b3f6dac605752b84ef73f56e4,-3655859676031312324,4452370000000000,19.12,fraud_Champlin and Sons,home,0,Linda,Davis,3/4/78,...,19.12,19.12,19.12,19.12,19.12,19.12,19.12,19.12,19.12,19.12
996,fc3e6cd1be3853f5bcd8fda06df5a7e8,7004744375844464512,4951650000000000,77.89,fraud_Hoppe-Parisian,kids_pets,0,Kimberly,Miller,6/15/76,...,77.89,77.89,77.89,77.89,77.89,77.89,77.89,77.89,77.89,77.89
997,91561e65399053586ad098022374ec45,3252491939450519963,3576020000000000,9.19,fraud_Hickle Group,shopping_pos,0,Dawn,Gray,12/30/04,...,9.19,9.19,142.88,142.88,279.16,279.16,9.19,9.19,142.88,142.88
998,f73411d6097ddcf5d1e45786bf55023e,-5585615269734509262,4025610000000000,1.39,fraud_Schuppe LLC,entertainment,0,Krystal,Key,3/20/49,...,1.39,1.39,1.39,1.39,1.39,1.39,1.39,1.39,1.39,1.39
