In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the object returned by ``get_ipython()``.

    Returns:
        True when the substring is present, False otherwise.
    """
    return "google.colab" in str(get_ipython())

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's dependencies by shelling out to pip and uv.

    Runs two shell commands in order:

    1. ``pip install --upgrade uv`` to install or upgrade the ``uv`` tool.
    2. ``uv pip install`` with ``--all-extras --system`` against
       ``pyproject.toml``, which is resolved relative to the current working
       directory, so the caller must already be in the repository root.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if not is_google_colab():
    # Local run: start from the current working directory and walk up out of
    # `ccfraud` and then `notebooks` if the notebook was launched from there,
    # so that root_dir ends up at the repository root.
    root_dir = Path().absolute()
    for nested_dir in ("ccfraud", "notebooks"):
        if root_dir.name == nested_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")
else:
    # Colab run: fetch the repository and its dependencies first; the working
    # directory after cloning is used as the root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

print(f"Root dir: {root_dir}")

# Make the repository root importable (appended to sys.path only once).
root_on_path = root_dir in sys.path
if not root_on_path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Load the Hopsworks settings from <root_dir>/.env.
# The import is deliberately placed after the sys.path update above
# (presumably `mlfs` lives under root_dir -- confirm).
env_file = f"{root_dir}/.env"
from mlfs import config
settings = config.HopsworksSettings(_env_file=env_file)

Local environment
Root dir: /home/jdowling/Projects/mlfs-book
Added the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
import hopsworks
from datetime import datetime, timedelta
import synth_transactions as st  # if you keep module separation, else import functions directly
from hsfs.feature import Feature
from mlfs.ccfraud.features import merchant_fg

# --- Dataset sizes (adjust as needed) ---
num_merchants = 500
num_banks = 1000
num_accounts = 10000
num_cards = 20000
num_transactions = 500_000  # lower this for quick test runs

# --- Fraud mix ---
fraud_rate = 0.0001  # 0.01%
chain_attack_ratio = 0.9  # 90% chain attacks, 10% geographic attacks

# --- Time windows, all anchored on a fixed reference date ---
one_year = timedelta(days=365)
current_date = datetime(2025, 10, 5)
transactions_start_date = current_date - timedelta(days=30)
issue_date = current_date - 3 * one_year
expiry_date = current_date + 3 * one_year
account_creation_start_date = current_date - 5 * one_year
# The three "last modified" windows all open one year before current_date.
account_last_modified_start_date = current_date - one_year
bank_last_modified_start_date = current_date - one_year
merchant_last_modified_start_date = current_date - one_year

# Connect to Hopsworks and fetch the project's feature store.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()

print("Starting data generation process...")

# 1. Merchants
merchant_df = st.generate_merchant_details(
    rows=num_merchants,
    start_date=merchant_last_modified_start_date,
    end_date=current_date,
)



2025-11-10 21:59:14,258 INFO: Initializing external client
2025-11-10 21:59:14,260 INFO: Base URL: https://stagingmain.devnet.hops.works:443
2025-11-10 21:59:15,329 INFO: Python Engine initialized.

Logged in to project, explore it here https://stagingmain.devnet.hops.works:443/p/122
Starting data generation process...
Generating merchant details...


In [3]:
# Register the merchant table as an online-enabled feature group.
# No explicit `features=` schema is passed to the helper.
# Positional arguments after the DataFrame: name, description, then
# ["merchant_id"] and "last_modified" (presumably primary key and event-time
# column -- confirm against st.create_feature_group_with_descriptions).
merchant_fg = st.create_feature_group_with_descriptions(
    fs,
    merchant_df,
    "merchant_details",
    "Details about merchants that execute transactions",
    ["merchant_id"],
    "last_modified",
    online_enabled=True,
)


# 2. Banks
# Generate the bank table and register it as an online-enabled feature group.
# No explicit `features=` schema is passed to the helper.
bank_df = st.generate_bank_details(
    rows=num_banks,
    start_date=bank_last_modified_start_date,
    end_date=current_date,
)
bank_fg = st.create_feature_group_with_descriptions(
    fs,
    bank_df,
    "bank_details",
    "Details about banks that issue credit cards",
    ["bank_id"],
    "last_modified",
    online_enabled=True,
)

# 3. Accounts
# Generate the account table and register it as an online-enabled feature group.
account_df = st.generate_account_details(
    rows=num_accounts,
    account_creation_start_date=account_creation_start_date,
    current_date=current_date,
    account_last_modified_start_date=account_last_modified_start_date,
)
account_fg = st.create_feature_group_with_descriptions(
    fs,
    account_df,
    "account_details",
    "Information about the account and card",
    ["account_id"],
    "last_modified",
    online_enabled=True,
)

# 4. Cards
# Generate the card table and register it as an online-enabled feature group
# with an explicit, project-prefixed topic name.
card_df = st.generate_card_details(
    rows=num_cards,
    num_accounts=num_accounts,
    num_banks=num_banks,
    current_date=current_date,
    issue_date=issue_date,
    expiry_date=expiry_date,
)
# NOTE(review): the description below is identical to the one used for
# "account_details" -- possibly a copy-paste; confirm the intended wording.
card_fg = st.create_feature_group_with_descriptions(
    fs,
    card_df,
    "card_details",
    "Information about the account and card",
    ["cc_num"],
    "last_modified",
    topic_name=f"{project.name}_card_details",
    online_enabled=True,
)

# 5. Transactions, generated from the existing card and merchant tables.
transaction_df = st.generate_credit_card_transactions_from_existing(
    card_df=card_df,
    merchant_df=merchant_df,
    start_date=transactions_start_date,
    end_date=current_date,
    rows=num_transactions,
    tid_offset=0,
    seed=42,
)

# Add fraud. generate_fraud returns a replacement transaction table together
# with a separate table of fraud records; transaction_df is rebound to the
# returned table.
transaction_df, fraud_df = st.generate_fraud(
    transaction_df=transaction_df,
    card_df=card_df,
    merchant_df=merchant_df,
    fraud_rate=fraud_rate,
    chain_attack_ratio=chain_attack_ratio,
    seed=42,
)

# Both feature groups below are created with online_enabled=False.
transactions_fg = st.create_feature_group_with_descriptions(
    fs,
    transaction_df,
    "credit_card_transactions",
    "Details about credit card transactions",
    ["t_id"],
    "ts",
    topic_name=f"{project.name}_credit_card_transactions",
    online_enabled=False,
)

fraud_fg = st.create_feature_group_with_descriptions(
    fs,
    fraud_df,
    "cc_fraud",
    "Credit card transaction fraud",
    ["t_id"],
    "ts",
    online_enabled=False,
)

Creating feature group: merchant_details
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/73


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████████| Rows 500/500 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: merchant_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/merchant_details_1_offline_fg_materialization/executions
  Added description for: merchant_id
  Added description for: category
  Added description for: country
  Added description for: cnt_chrgeback_prev_day
  Added description for: cnt_chrgeback_prev_week
  Added description for: cnt_chrgeback_prev_month
  Added description for: last_modified
Generating bank details...
Creating feature group: bank_details
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/74


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████| Rows 1000/1000 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: bank_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/bank_details_1_offline_fg_materialization/executions
  Added description for: bank_id
  Added description for: country
  Added description for: credit_rating
  Added description for: last_modified
Generating account details...
Creating feature group: account_details
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/75


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████| Rows 10000/10000 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: account_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/account_details_1_offline_fg_materialization/executions
  Added description for: account_id
  Added description for: name
  Added description for: address
  Added description for: debt_end_prev_month
  Added description for: last_modified
  Added description for: creation_date
  Added description for: end_date
Generating card details...
Creating feature group: card_details
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/76


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████| Rows 20000/20000 | Elapsed Time: 00:07 | Remaining Time: 00:00


Launching job: card_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/card_details_1_offline_fg_materialization/executions
  Added description for: cc_num
  Added description for: cc_expiry_date
  Added description for: account_id
  Added description for: bank_id
  Added description for: issue_date
  Added description for: card_type
  Added description for: status
  Added description for: last_modified
Generating credit card transactions from existing card + merchant tables...
Generating fraudulent transactions...
Total transactions: 500000
Generating 50 fraudulent transactions:
  - Chain attacks: 45 transactions
  - Geographic fraud: 5 transactions
Generated 49 fraudulent transaction records
  - Chain attacks: 45
  - Geographic fraud: 4
Updated transaction_df now has 500049 total transactions
Creating feature group: credit_card_transactions
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/77

Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████| Rows 500049/500049 | Elapsed Time: 02:04 | Remaining Time: 00:00


Launching job: credit_card_transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/credit_card_transactions_1_offline_fg_materialization/executions
  Added description for: t_id
  Added description for: cc_num
  Added description for: account_id
  Added description for: merchant_id
  Added description for: amount
  Added description for: ip_address
  Added description for: card_present
  Added description for: ts
Creating feature group: cc_fraud
Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/78


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████████| Rows 49/49 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: cc_fraud_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/cc_fraud_1_offline_fg_materialization/executions
  Added description for: t_id
  Added description for: cc_num
  Added description for: explanation
  Added description for: ts


In [4]:

print("All feature groups created successfully!")
print("\nSummary Statistics:")

# Row count of every generated table, reported in creation order.
generated_tables = {
    "Merchant Details": merchant_df,
    "Bank Details": bank_df,
    "Account Details": account_df,
    "Card Details": card_df,
    "Credit Card Transactions": transaction_df,
    "Fraudulent Credit Card Transactions": fraud_df,
}
for table_label, table in generated_tables.items():
    print(f"{table_label}: {len(table)} rows")


All feature groups created successfully!

Summary Statistics:
Merchant Details: 500 rows
Bank Details: 1000 rows
Account Details: 10000 rows
Card Details: 20000 rows
Credit Card Transactions: 500049 rows
Fraudulent Credit Card Transactions: 49 rows
