### Setup PythonPath for either local execution or execution in Colab

In [2]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('ccfraud',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Local environment
Added the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book


## Synthetic Data Generator for Credit Card Transaction Data

In [3]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
import json
import ipaddress
import os
import hopsworks
import pandas as pd
from mlfs.config import settings
import json


fake = Faker('en_US')

# Helper functions
def random_date_vec(start, end, size):
    """Generate a vector of random datetimes between `start` and `end`."""
    delta_seconds = (end - start).total_seconds()
    random_seconds = np.random.randint(0, int(delta_seconds), size)
    return pl.Series([start + timedelta(seconds=int(s)) for s in random_seconds])

def right_skewed_amount(size, mean=200, median=50):
    """Generate a right-skewed amount with mean and median."""
    return pl.Series(np.random.gamma(2, 100, size).clip(0.01, 10000).round(2))

def random_risk_score(size):
    """Generate right-skewed risk scores."""
    return pl.Series(np.random.beta(2, 8, size))


def random_amazon_ip():
    network = ipaddress.IPv4Network('3.0.0.0/8')
    return str(fake.ipv4(network=network))

def random_ip():
    return fake.ipv4()

def generate_ip():
    if random.choice([True, False]):
        return random_amazon_ip()  # 50% Amazon IP
    else:
        return random_ip()  

def is_amazon_ip(ip):
    network = ipaddress.IPv4Network('3.0.0.0/8')
    try:
        return ipaddress.IPv4Address(ip) in network
    except ipaddress.AddressValueError:
        return False

def generate_exponential_value(scale, max_value):
    value = np.random.exponential(scale)
    return min(value, max_value) 

# Generate Transactions Table
n_transactions = 500_000
n_cards = 10_000
n_customers = 8_000
n_merchants = 2_000
n_banks = 1_000
n_days = 10

now = datetime.now()

transaction_df = pl.DataFrame({
    "t_id": pl.Series([fake.uuid4() for _ in range(n_transactions)]),
    "cc_num": pl.Series(np.random.randint(1, n_cards, n_transactions).astype(str)),
    "merchant_id": pl.Series(np.random.randint(1, n_merchants, n_transactions).astype(str)),
    "amount": right_skewed_amount(n_transactions),
    "ip_address": pl.Series([generate_ip() for _ in range(n_transactions)]),
    "ts": random_date_vec(now - timedelta(days=n_days), now, n_transactions),
})

# card_present is computed using an existing column
transaction_df = transaction_df.with_columns( pl.col("ip_address").map_elements(is_amazon_ip).alias("card_present"))

# Generate Cards Table
issue_dates = random_date_vec(now - timedelta(days=5*365), now, n_cards)
expiry_dates = issue_dates + pl.Series(np.random.randint(3*365, 5*365, n_cards)).cast(pl.Duration)
card_df = pl.DataFrame({
    "cc_num": pl.Series(np.arange(1, n_cards+1).astype(str)),
    "account_id": pl.Series(np.random.randint(1, n_cards, n_cards).astype(str)),
    "bank_id": pl.Series(np.random.randint(1, n_banks, n_cards).astype(str)),
    "card_type": pl.Series(np.random.choice(['Credit', 'Debit', 'Prepaid'], n_cards, p=[0.6, 0.35, 0.05])),
    "issue_date": issue_dates,
    "expiry_date": expiry_dates,
    "status": pl.Series(np.random.choice(['Active', 'Blocked', 'Lost/Stolen'], n_cards, p=[0.95, 0.04, 0.01]))
})

# Generate Customers Table
customer_df = pl.DataFrame({
    "account_id": pl.Series(np.arange(1, n_customers+1).astype(str)),
    "name": pl.Series(fake.name() for _ in range(n_customers)),
    "address": pl.Series(fake.address() for _ in range(n_customers)),
    "email": pl.Series(fake.email() for _ in range(n_customers)),
    "creation_date": random_date_vec(now - timedelta(days=10*365), now, n_customers),
    "debt_end_prev_month": pl.Series(generate_exponential_value(0, 50000) for _ in range(n_customers)),
    "last_modified": random_date_vec(now - timedelta(days=30), now - timedelta(days=1), n_customers),
})

# Generate Merchants Table
merchant_df = pl.DataFrame({
    "merchant_id": pl.Series(np.arange(1, n_merchants+1).astype(str)),
    "merchant_name": pl.Series(fake.company() for _ in range(n_merchants)),
    "category": pl.Series(np.random.choice(['Grocery', 'Electronics', 'Clothing', 'Entertainment', 'Restaurants'], n_merchants)),
    "country": pl.Series(fake.country() for _ in range(n_merchants)),
    "last_modified": random_date_vec(now - timedelta(days=8), now - timedelta(days=7), n_merchants)
})

    # "cnt_chargeback_prev_week": right_skewed_amount(n_merchants, 100, 50),
    # "average_transaction_amount": right_skewed_amount(n_merchants, 100, 50)

# Generate Banks Table
bank_df = pl.DataFrame({
    "bank_id": pl.Series(np.arange(1, n_banks+1).astype(str)),
    "country": pl.Series(fake.country() for _ in range(n_banks)),
    "credit_rating": pl.Series(np.random.choice(['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'C'], n_banks)),
    "last_modified": random_date_vec(now - timedelta(days=7), now - timedelta(days=6), n_banks),
})

# Check time taken
time_taken = datetime.now() - now
print(f"Time taken={time_taken}")


Time taken=0:00:29.481881


In [4]:
transaction_df.head()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present
str,str,str,f64,str,datetime[μs],bool
"""17fb63c2-56c9-4cf6-80a2-cc4166…","""3433""","""1961""",180.46,"""16.0.0.0/7""",2025-01-08 08:29:56.430086,False
"""5e8f6964-be62-4d23-8909-8f1942…","""8386""","""381""",130.95,"""9.0.0.0/10""",2025-01-03 09:21:43.430086,False
"""a03e83ed-1eae-4bb0-bc36-20bbd6…","""7177""","""432""",221.79,"""81.190.185.23""",2024-12-31 14:11:22.430086,False
"""79b492d2-3748-4a72-9758-149aaa…","""7078""","""243""",191.24,"""111.44.111.176/29""",2024-12-30 15:11:21.430086,False
"""0a375d83-dfb8-4758-bb3d-8f9c72…","""3784""","""386""",137.5,"""211.0.0.0/8""",2025-01-01 08:45:23.430086,False


In [5]:
n_fraud_transactions = 5000

# Function to generate random IP addresses
def generate_fraud_ip():
    return f"{np.random.randint(0, 256)}.{np.random.randint(0, 256)}.{np.random.randint(0, 256)}.{np.random.randint(0, 256)}"

# Simulate right-skewed transaction amounts
def right_skewed_amount(n):
    return np.random.gamma(shape=2, scale=50, size=n)

# Generate random boolean values
def random_boolean(size=None):
    return np.random.choice([True, False], size=size)

# Generate random timestamps in a vectorized way
def random_date_vec(start, end, n):
    delta = (end - start).total_seconds()
    return [start + timedelta(seconds=np.random.randint(0, delta)) for _ in range(n)]

# Generate grouped transactions
def generate_grouped_transactions(n_transactions, min_group=4, max_group=10, window=30, two_ip=False, card_present_val=False):
    transactions = []
    current_time = now - timedelta(days=n_days)
    cc_nums = np.random.randint(1, n_cards, n_transactions).astype(str)
    
    while len(transactions) < n_fraud_transactions:
        group_size = np.random.randint(min_group, max_group + 1)
        group_size = min(group_size, n_transactions - len(transactions))
        
        base_time = current_time + timedelta(minutes=np.random.randint(0, window))
        time_offsets = np.random.normal(loc=15, scale=5, size=group_size)
        time_offsets = np.clip(time_offsets, 0, window * 60)
        cc_num = np.random.choice(cc_nums)
        
        for i, offset in enumerate(time_offsets):
            transactions.append({
                "t_id": fake.uuid4(),
                "cc_num": cc_num,
                "merchant_id": str(np.random.randint(1, n_merchants)),
                "amount": np.random.gamma(2, 50),
                "ip_address": generate_fraud_ip() if not two_ip or i % 2 == 0 else generate_fraud_ip(),
                "ts": base_time + timedelta(seconds=offset),
                "card_present": card_present_val if not two_ip else True
            })
        
        # Move to next window
        current_time += timedelta(minutes=window)
    
    return transactions

# Generate 5000 transactions (4-10 per group)
transactions_chain = generate_grouped_transactions(n_fraud_transactions, 4, 10, 30)

# Generate 2000 transactions (2 per group, different IPs, card_present=True)
transactions_loc = generate_grouped_transactions(n_fraud_transactions, 2, 2, 10, two_ip=True, card_present_val=True)

transactions_fraud = pl.DataFrame(transactions_chain + transactions_loc)
# Combine and create DataFrame
transactions_fraud.head()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present
str,str,str,f64,str,datetime[μs],bool
"""5db14ba9-e317-4a54-ae06-e73828…","""9034""","""99""",15.962583,"""163.9.160.25""",2024-12-29 10:24:15.587592,False
"""d7e54270-f707-48d5-b34f-fad578…","""9034""","""1536""",41.633051,"""3.62.105.242""",2024-12-29 10:24:17.399387,False
"""d5309219-82fd-4574-bf7f-334160…","""9034""","""1561""",49.074603,"""174.173.113.223""",2024-12-29 10:24:16.528860,False
"""0a747cd9-725b-4d0b-b478-ccae62…","""9034""","""1998""",79.793348,"""22.180.247.109""",2024-12-29 10:24:16.610745,False
"""fc9b3af2-7776-4e0a-975f-6bf0d2…","""9034""","""1214""",227.80841,"""246.63.174.76""",2024-12-29 10:24:19.877930,False


In [6]:
transaction_df = pl.concat([transaction_df, transactions_fraud])
print(transaction_df.head())

shape: (5, 7)
┌─────────────────┬────────┬─────────────┬────────┬────────────────┬────────────────┬──────────────┐
│ t_id            ┆ cc_num ┆ merchant_id ┆ amount ┆ ip_address     ┆ ts             ┆ card_present │
│ ---             ┆ ---    ┆ ---         ┆ ---    ┆ ---            ┆ ---            ┆ ---          │
│ str             ┆ str    ┆ str         ┆ f64    ┆ str            ┆ datetime[μs]   ┆ bool         │
╞═════════════════╪════════╪═════════════╪════════╪════════════════╪════════════════╪══════════════╡
│ 17fb63c2-56c9-4 ┆ 3433   ┆ 1961        ┆ 180.46 ┆ 16.0.0.0/7     ┆ 2025-01-08 08: ┆ false        │
│ cf6-80a2-cc4166 ┆        ┆             ┆        ┆                ┆ 29:56.430086   ┆              │
│ …               ┆        ┆             ┆        ┆                ┆                ┆              │
│ 5e8f6964-be62-4 ┆ 8386   ┆ 381         ┆ 130.95 ┆ 9.0.0.0/10     ┆ 2025-01-03 09: ┆ false        │
│ d23-8909-8f1942 ┆        ┆             ┆        ┆                ┆ 21:43.43

In [7]:

# Generate Historical Transaction Fraud Reports Table
n_fraud_reports = int(n_fraud_transactions * 2)
fraud_df = pl.DataFrame({
    "report_id": pl.Series([fake.uuid4() for _ in range(n_fraud_reports)]),
    "t_id": transactions_fraud["t_id"],
    "report_time": random_date_vec(now, now + timedelta(days=1), n_fraud_reports),
    "fraud_type": pl.Series(np.random.choice(['Card Not Present', 'Skimming', 'Lost/Stolen'], n_fraud_reports, p=[0.5, 0.3, 0.2]))
})



In [8]:
print(fraud_df)

shape: (10_000, 4)
┌──────────────────────────────┬──────────────────────────────┬─────────────────┬──────────────────┐
│ report_id                    ┆ t_id                         ┆ report_time     ┆ fraud_type       │
│ ---                          ┆ ---                          ┆ ---             ┆ ---              │
│ str                          ┆ str                          ┆ datetime[μs]    ┆ str              │
╞══════════════════════════════╪══════════════════════════════╪═════════════════╪══════════════════╡
│ a9fa10a3-4231-4879-ac53-7876 ┆ 5db14ba9-e317-4a54-ae06-e738 ┆ 2025-01-08      ┆ Lost/Stolen      │
│ 04…                          ┆ 28…                          ┆ 21:25:41.430086 ┆                  │
│ eb9307f8-35bb-4e91-892e-309c ┆ d7e54270-f707-48d5-b34f-fad5 ┆ 2025-01-08      ┆ Lost/Stolen      │
│ 7a…                          ┆ 78…                          ┆ 14:51:37.430086 ┆                  │
│ 64db3c32-f5df-4191-8984-8f83 ┆ d5309219-82fd-4574-bf7f-3341 ┆ 2025-01-

In [9]:
weekly_fraud = (
    transactions_fraud
    .with_columns(
        pl.col("ts").dt.week().alias("week"),
        pl.col("ts").dt.year().alias("year")
    )
    .group_by(["merchant_id", "year", "week"])
    .agg(
        pl.sum("amount").alias("total_fraud_amount")
    )
)

# Calculate total transactions per merchant per week and month
weekly_transactions = (
    transaction_df
    .with_columns(
        pl.col("ts").dt.week().alias("week"),
        pl.col("ts").dt.year().alias("year")
    )
    .group_by(["merchant_id", "year", "week"])
    .agg(
        pl.sum("amount").alias("total_amount")
    )
)

# Calculate fraud rate by week and month
weekly_fraud_rate = (
    weekly_fraud.join(
        weekly_transactions, on=["merchant_id", "year", "week"], how="inner"
    )
    .with_columns(
        (pl.col("total_fraud_amount") / pl.col("total_amount")).alias("weekly_fraud_rate")
    )
    .join(
        merchant_df, on="merchant_id", how="inner"
    )
    .filter(
        (pl.col("year") == pl.col("last_modified").dt.year()) &
        (pl.col("week") == pl.col("last_modified").dt.week())
    )
)

merchant_df = merchant_df.join(weekly_fraud_rate.select(["merchant_id", "weekly_fraud_rate"]), on="merchant_id", how="left")

nan_count = merchant_df["weekly_fraud_rate"].is_nan().sum()

print("NaN count in weekly_fraud_rate:", nan_count)


NaN count in weekly_fraud_rate: 0


In [10]:
merchant_df = merchant_df.with_columns(
    pl.col("weekly_fraud_rate").fill_null(0)
)
print(merchant_df)

shape: (2_000, 6)
┌─────────────┬─────────────────┬─────────────┬─────────────────┬─────────────────┬────────────────┐
│ merchant_id ┆ merchant_name   ┆ category    ┆ country         ┆ last_modified   ┆ weekly_fraud_r │
│ ---         ┆ ---             ┆ ---         ┆ ---             ┆ ---             ┆ ate            │
│ str         ┆ str             ┆ str         ┆ str             ┆ datetime[μs]    ┆ ---            │
│             ┆                 ┆             ┆                 ┆                 ┆ f64            │
╞═════════════╪═════════════════╪═════════════╪═════════════════╪═════════════════╪════════════════╡
│ 1           ┆ Larson Group    ┆ Clothing    ┆ Northern        ┆ 2024-12-31      ┆ 0.022447       │
│             ┆                 ┆             ┆ Mariana Islands ┆ 17:20:20.430086 ┆                │
│ 2           ┆ Gonzales-Patric ┆ Clothing    ┆ Guinea-Bissau   ┆ 2025-01-01      ┆ 0.002302       │
│             ┆ k               ┆             ┆                 ┆ 06:59:1

In [11]:
# Check if HOPSWORKS_API_KEY env variable is set or if it is set in ~/.env
if 'HOPSWORKS_API_KEY' not in os.environ and settings.HOPSWORKS_API_KEY:
    api_key = settings.HOPSWORKS_API_KEY.get_secret_value()
    os.environ['HOPSWORKS_API_KEY'] = api_key

# Initialize Hopsworks connection and get a reference to the feature store using the project
project = hopsworks.login()
fs = project.get_feature_store()


2025-01-08 10:23:32,481 INFO: Initializing external client
2025-01-08 10:23:32,482 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-08 10:23:34,798 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/17565


In [12]:
# Define descriptions for each table schema's features
feature_descriptions = {
    "credit_card_transactions": {
        "t_id": "Unique identifier for each transaction.",
        "cc_num": "Identifier for the card used in the transaction.",
        "merchant_id": "Identifier for the merchant where the transaction took place.",
        "ts": "Timestamp for when the transaction occurred.",
        "amount": "Monetary amount of the transaction, right-skewed, median $50, mean $200.",
        "ip_address": "IP address for  the credit terminal that executed the transaction",
        "card_present": "Point-of-sale terminal (true) or an online transaction (false)."
    },
    "card_details": {
        "cc_num": "Unique identifier for each card.",
        "account_id": "Owner of the card.",
        "bank_id": "Issuer of the card.",
        "card_type": "Type of card ('Credit', 'Debit', 'Prepaid').",
        "issue_date": "Date when the card was issued.",
        "expiry_date": "Card's expiration date, typically 3 to 5 years after issue.",
        "status": "Current status of the card ('Active', 'Blocked', 'Lost/Stolen')."
    },
    "account_details": {
        "account_id": "Unique identifier for each customer.",
        "name": "First and last name of the customer.",
        "email": "Email address of the customer.",
        "address": "Address of the customer.",
        "debt_end_prev_month": "Outstanding credit card debt at end of most recent month",        
        "creation_date": "Date when the customer joined.",
        "last_modified": "Timestamp of last update for this row."
    },
    "merchant_details": {
        "merchant_id": "Unique identifier for each merchant.",
        "merchant_name": "Name of the merchant.",
        "category": "Category of the merchant (e.g., 'Grocery', 'Electronics').",
        "country": "Location of the merchant.",
        "last_modified": "Timestamp of last update for this row."
    },
    "bank_details": {
        "bank_id": "Unique identifier for each merchant.",
        "country": "Home country for the bank.",
        "credit_rating": "Bank credit rating (from AAA to C)",
        "last_modified": "Timestamp of last update for this row."
    },
    "cc_fraud": {
        "report_id": "Unique identifier for each fraud report.",
        "t_id": "The transaction reported as fraudulent.",
        "report_time": "Timestamp when the fraud was reported.",
        "fraud_type": "Type of fraud ('Card Not Present', 'Skimming', 'Lost/Stolen')."
    },
}

# Function to create and insert DataFrame into a feature group
def create_feature_group(feature_group_name, description, df, feature_descriptions, event_time, online, wait):
    # Create feature group in the Hopsworks feature store
    feature_group = fs.get_or_create_feature_group(
        name=feature_group_name,
        version=1,
        description=description,
        primary_key=list(df.columns[:1]),
        event_time=event_time,
        online_enabled=online
    )
    
    # Insert data into the feature group
    feature_group.insert(df, wait=wait)
    
    # Update feature descriptions
    for feature, desc in feature_descriptions.items():
        feature_group.update_feature_description(feature, desc)


In [None]:
# Create and insert data into feature groups
create_feature_group(
    "credit_card_transactions",
    "Records of all financial transactions made using cards.",
    transaction_df,
    feature_descriptions["credit_card_transactions"],
    "ts",
    False,
    False
)


In [None]:
create_feature_group(
    "card_details",
    "Information about the cards used by customers.",
    card_df,
    feature_descriptions["card_details"],
    None,
    False,
    False
)


In [None]:

create_feature_group(
    "account_details",
    "Personal information about the customer acoounts.",
    customer_df,
    feature_descriptions["account_details"],
    None,
    False,
    False
)


In [None]:

create_feature_group(
    "merchant_details",
    "Details about merchants who execute credit card transactions.",
    merchant_df,
    feature_descriptions["merchant_details"],
    None,
    False,
    False    
)



In [None]:

create_feature_group(
    "bank_details",
    "Details about banks - credit card issuers.",
    bank_df,
    feature_descriptions["bank_details"],
    None,
    False,
    True    
)



In [None]:
create_feature_group(
    "cc_fraud",
    "Records of past fraudulent activities reported.",
    fraud_df,
    feature_descriptions["cc_fraud"],
    "report_time",
    False,
    False    
)



In [None]:
# import json
# import os
# from confluent_kafka import Producer

In [None]:
# KAFKA_TOPIC_NAME = f"{project.name}_real_time_live_transactions"
# SCHEMA_NAME = "live_transactions_schema"
# kafka_api = project.get_kafka_api()

In [None]:
# schema = {
#     "type": "record",
#     "name": SCHEMA_NAME,
#     "namespace": "ai.hopsworks.examples.feldera.fraud",
#     "fields": [
#         {
#             "name": "t_id",
#             "type": [
#                 "null",
#                 "string"
#             ]
#         },
#         {
#             "name": "ts",
#             "type": [
#                 "null",
#                 {
#                     "type": "long",
#                     "logicalType": "timestamp-micros"
#                 }
#             ]
#         },
#         {
#             "name": "cc_num",
#             "type": [
#                 "null",
#                 "string"
#             ]
#         },
#         {
#             "name": "merchant_id",
#             "type": [
#                 "null",
#                 "string"
#             ]
#         },
#         {
#             "name": "amount",
#             "type": [
#                 "null",
#                 "double"
#             ]
#         },
#         {
#             "name": "ip_addr",
#             "type": [
#                 "null",
#                 "string"
#             ]
#         },
#         {
#             "name": "card_present",
#             "type": [
#                 "null",
#                 "boolean"
#             ]
#         }
#     ]
# }

# if KAFKA_TOPIC_NAME not in [topic.name for topic in kafka_api.get_topics()]:
#     kafka_api.create_schema(SCHEMA_NAME, schema)
#     kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)
#     print("Created topic")
# else:
#     print("Did not create topic")

In [None]:
# # Uncomment and run this cell to delete the schema and topic

# topics = kafka_api.get_topics()
# if topics[0].name == KAFKA_TOPIC_NAME:
#     try:
#         topics[0].delete()
#         print(f"Deleted topic {KAFKA_TOPIC_NAME}")
#     except hopsworks.RestAPIError:
#         print("Could not find topic to delete")

# try:
#     schema = kafka_api.get_schema(SCHEMA_NAME, 1)
#     print(f"Deleting schema {SCHEMA_NAME}")
#     schema.delete()
# except hopsworks.RestAPIError:
#     print("Could not find schema to delete")

In [None]:
# # Get kafka config, so that you can produce to the Topic
# kafka_config = fs._storage_connector_api.get_kafka_connector(fs.id, True).confluent_options()

# print(kafka_config)
# producer = Producer(kafka_config)

# for index, transaction in transaction_df.iterrows():
#     producer.produce(KAFKA_TOPIC_NAME, transaction.to_json())
    
#     if index % 50000 == 0:
#         producer.flush()
#         print(f'Finished sending index {index}')

# producer.flush()