In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring "google.colab" in the string form of
    the object returned by IPython's `get_ipython()`.

    Returns:
        True when the substring is present, False otherwise.
    """
    return "google.colab" in str(get_ipython())

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into the cloned directory.

    Relies on IPython's `!` shell escape and `%cd` magic, so it can only be
    called from inside an IPython/Jupyter kernel. After it returns, the
    notebook's working directory is `mlfs-book` (relative to where it started).
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's dependencies using `uv`.

    Runs two shell commands through IPython's `!` escape: first pip
    installs/upgrades `uv`, then `uv pip install` is invoked with
    `--all-extras --system` against `pyproject.toml`. The requirements path is
    relative, so this must run from the directory that contains
    `pyproject.toml`.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if not is_google_colab():
    root_dir = Path().absolute()
    # If the notebook was launched from <repo>/notebooks/ccfraud or
    # <repo>/notebooks, walk back up so root_dir is the repository root.
    for trailing_dir in ('ccfraud', 'notebooks'):
        if root_dir.name == trailing_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")
else:
    # On Colab the repository is cloned first; clone_repository() also changes
    # into it, so the working directory read afterwards is the repository root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

print(f"Root dir: {root_dir}")

# Put the root directory on sys.path (once) so project packages can be imported.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Deferred import: presumably `mlfs` lives under root_dir, so it is imported
# only after the sys.path update above.
from mlfs import config

# Load the settings from the file <root_dir>/.env
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /home/jdowling/Projects/mlfs-book
Added the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
import hopsworks
from datetime import datetime, timedelta
from mlfs.ccfraud import synth_transactions as st  # if you keep module separation, else import functions directly
from hsfs.feature import Feature
from mlfs.ccfraud.features import merchant_fg

# --- Dataset sizes (adjust as needed) ---------------------------------------
num_merchants = 500
num_banks = 1000
num_accounts = 10000
num_cards = 20000
num_transactions = 500_000  # set lower for testing if needed

# --- Fraud injection ---------------------------------------------------------
fraud_rate = 0.0001        # 0.01%
chain_attack_ratio = 0.9   # 90% chain attacks, 10% geographic attacks

# --- Time windows, all relative to one fixed reference date ------------------
current_date = datetime(2025, 10, 5)
_one_year = timedelta(days=365)

transactions_start_date = current_date - timedelta(days=30)
issue_date = current_date - 3 * _one_year
expiry_date = current_date + 3 * _one_year
account_creation_start_date = current_date - 5 * _one_year
account_last_modified_start_date = current_date - _one_year
bank_last_modified_start_date = current_date - _one_year
merchant_last_modified_start_date = current_date - _one_year

# --- Connect to Hopsworks (keeps original flow) ------------------------------
# NOTE(review): hostname_verification is switched off here — confirm this is
# only intended for the development/staging cluster.
project = hopsworks.login(
    engine="python",
    hostname_verification=False,
)
fs = project.get_feature_store()

print("Starting data generation process...")

# 1. Merchants — only the DataFrame is generated in this cell.
merchant_df = st.generate_merchant_details(
    rows=num_merchants,
    start_date=merchant_last_modified_start_date,
    end_date=current_date,
)



2025-11-22 07:47:15,850 INFO: Initializing external client
2025-11-22 07:47:15,852 INFO: Base URL: https://stagingmain.devnet.hops.works:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.7) by running 'pip install hopsworks==4.7.*'







2025-11-22 07:47:16,913 INFO: Python Engine initialized.

Logged in to project, explore it here https://stagingmain.devnet.hops.works:443/p/122
Starting data generation process...
Generating merchant details...


In [3]:
# Every feature group below is registered through
# st.create_feature_group_with_descriptions, called positionally with
# (feature store, DataFrame, name, description, primary key, event-time column).
# Statement order is kept strictly sequential: generate, then register.

# Seed shared by the transaction generator and the fraud generator.
generation_seed = 42

# --- 1. Merchants ------------------------------------------------------------
# merchant_df comes from the previous cell.
# NOTE: this assignment rebinds `merchant_fg`, which the previous cell imported
# from mlfs.ccfraud.features.
merchant_fg = st.create_feature_group_with_descriptions(
    fs, merchant_df,
    "merchant_details",
    "Details about merchants that execute transactions",
    ["merchant_id"], "last_modified",
    online_enabled=True,
)

# --- 2. Banks ----------------------------------------------------------------
bank_df = st.generate_bank_details(
    rows=num_banks,
    start_date=bank_last_modified_start_date,
    end_date=current_date,
)
bank_fg = st.create_feature_group_with_descriptions(
    fs, bank_df,
    "bank_details",
    "Details about banks that issue credit cards",
    ["bank_id"], "last_modified",
    online_enabled=True,
)

# --- 3. Accounts -------------------------------------------------------------
account_df = st.generate_account_details(
    rows=num_accounts,
    account_creation_start_date=account_creation_start_date,
    current_date=current_date,
    account_last_modified_start_date=account_last_modified_start_date,
)
account_fg = st.create_feature_group_with_descriptions(
    fs, account_df,
    "account_details",
    "Information about the account and card",
    ["account_id"], "last_modified",
    online_enabled=True,
)

# --- 4. Cards ----------------------------------------------------------------
card_df = st.generate_card_details(
    rows=num_cards,
    num_accounts=num_accounts,
    num_banks=num_banks,
    current_date=current_date,
    issue_date=issue_date,
    expiry_date=expiry_date,
)
# NOTE(review): the description string is the same one used for
# account_details — confirm that is intended.
card_fg = st.create_feature_group_with_descriptions(
    fs, card_df,
    "card_details",
    "Information about the account and card",
    ["cc_num"], "last_modified",
    topic_name=f"{project.name}_card_details",
    online_enabled=True,
)

# --- 5. Transactions (FROM existing card + merchant) -------------------------
transaction_df = st.generate_credit_card_transactions_from_existing(
    card_df=card_df,
    merchant_df=merchant_df,
    start_date=transactions_start_date,
    end_date=current_date,
    rows=num_transactions,
    tid_offset=0,
    seed=generation_seed,
)

# generate_fraud returns a (transactions, fraud) pair; transaction_df is
# rebound to the first element.
transaction_df, fraud_df = st.generate_fraud(
    transaction_df=transaction_df,
    card_df=card_df,
    merchant_df=merchant_df,
    fraud_rate=fraud_rate,
    chain_attack_ratio=chain_attack_ratio,
    seed=generation_seed,
)

transactions_fg = st.create_feature_group_with_descriptions(
    fs, transaction_df,
    "credit_card_transactions",
    "Details about credit card transactions",
    ["t_id"], "ts",
    topic_name=f"{project.name}_credit_card_transactions",
    online_enabled=False,
)

fraud_fg = st.create_feature_group_with_descriptions(
    fs, fraud_df,
    "cc_fraud",
    "Credit card transaction fraud",
    ["t_id"], "ts",
    online_enabled=False,
)

Creating feature group: merchant_details


ConnectionError: Stream 25 was reset by remote peer. Reason: 0x1.

In [None]:

# Row counts of every generated DataFrame, emitted one per line by a single
# print call (sep="\n" puts each argument on its own line).
print(
    "All feature groups created successfully!",
    "\nSummary Statistics:",
    f"Merchant Details: {len(merchant_df)} rows",
    f"Bank Details: {len(bank_df)} rows",
    f"Account Details: {len(account_df)} rows",
    f"Card Details: {len(card_df)} rows",
    f"Credit Card Transactions: {len(transaction_df)} rows",
    f"Fraudulent Credit Card Transactions: {len(fraud_df)} rows",
    sep="\n",
)
