In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

root_dir = Path().absolute()
# Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
if root_dir.parts[-1:] == ('notebooks',):
    root_dir = Path(*root_dir.parts[:-1])
    sys.path.append(str(root_dir))
if root_dir.parts[-1:] == ('ccfraud',):
    root_dir = Path(*root_dir.parts[:-1])
    sys.path.append(str(root_dir))
root_dir = str(root_dir) 

print(f"Root dir: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Root dir: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
import hopsworks
from datetime import datetime, timedelta
from ccfraud import synth_transactions as st  # if you keep module separation, else import functions directly
from hsfs.feature import Feature
from ccfraud.features import merchant_fg

# parameters (adjust as needed)
num_merchants = 500
num_banks = 100
num_accounts = 1000
num_cards = 2000
num_transactions = 50_000  # set lower for testing if needed
fraud_rate=0.001  # 0.01%
chain_attack_ratio=0.9 # 90% chain attacks, 10% geographic attacks

current_date = datetime(2025, 12, 5)
transactions_start_date = current_date - timedelta(days=30)
issue_date = current_date - timedelta(days=365 * 3)
expiry_date = current_date + timedelta(days=365 * 3)
account_creation_start_date = current_date - timedelta(days=365 * 5)
account_last_modified_start_date = current_date - timedelta(days=365)
bank_last_modified_start_date = current_date - timedelta(days=365)
merchant_last_modified_start_date = current_date - timedelta(days=365)

# Connect to Hopsworks (keeps original flow)
project = hopsworks.login()
fs = project.get_feature_store()

print("Starting data generation process...")

# 1. Merchants
merchant_df = st.generate_merchant_details(rows=num_merchants, start_date=merchant_last_modified_start_date, end_date=current_date)



2025-12-28 10:41:18,532 INFO: Initializing external client
2025-12-28 10:41:18,533 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-12-28 10:41:24,978 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398
Starting data generation process...
Generating merchant details...


In [5]:
card_df = st.generate_card_details(rows=num_cards,
                                num_accounts=num_accounts,
                                num_banks=num_banks,
                                current_date=current_date,
                                issue_date=issue_date,
                                expiry_date=expiry_date)
card_fg = st.create_feature_group_with_descriptions(
    fs,
    card_df,
    "card_details",
    "Information about the account and card",
    ["cc_num"],
    "last_modified",
    topic_name=f"{project.name}_card_details_onlinefs",
    online_enabled=True
)


Generating card details...
Creating feature group: card_details


RestAPIError: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/398/featurestores/335/featuregroups). Server response: 
HTTP code: 400, HTTP reason: Bad Request, body: b'{"errorCode":270089,"usrMsg":"project: dowlingj, featurestoreId: 335","errorMsg":"The feature group you are trying to create does already exist."}', error code: 270089, error msg: The feature group you are trying to create does already exist., user msg: project: dowlingj, featurestoreId: 335

In [3]:
# 1. Merchants
merchant_fg = st.create_feature_group_with_descriptions(
    fs,
    merchant_df,
    "merchant_details",
    "Details about merchants that execute transactions",
    ["merchant_id"],
    "last_modified",
    online_enabled=True
)

# 2. Banks
bank_df = st.generate_bank_details(rows=num_banks, start_date=bank_last_modified_start_date, end_date=current_date)
bank_fg = st.create_feature_group_with_descriptions(
    fs,
    bank_df,
    "bank_details",
    "Details about banks that issue credit cards",
    ["bank_id"],
    "last_modified",
    online_enabled=True
)

# 3. Accounts
account_df = st.generate_account_details(rows=num_accounts,
                                      account_creation_start_date=account_creation_start_date,
                                      current_date=current_date,
                                      account_last_modified_start_date=account_last_modified_start_date)

# IMPORTANT: Assign home locations to cardholders for realistic geographic patterns
account_df = st.assign_cardholder_home_locations(account_df, seed=42)

account_fg = st.create_feature_group_with_descriptions(
    fs,
    account_df,
    "account_details",
    "Information about the account and card",
    ["account_id"],
    "last_modified",
    online_enabled=True
)

# 4. Cards
card_df = st.generate_card_details(rows=num_cards,
                                num_accounts=num_accounts,
                                num_banks=num_banks,
                                current_date=current_date,
                                issue_date=issue_date,
                                expiry_date=expiry_date)
card_fg = st.create_feature_group_with_descriptions(
    fs,
    card_df,
    "card_details",
    "Information about the account and card",
    ["cc_num"],
    "last_modified",
    topic_name=f"{project.name}_card_details_onlinefs",
    online_enabled=True
)

# 5. Transactions - Using NEW function with location continuity
# This generates realistic transaction patterns where cardholders stay in their home country
# most of the time, with realistic travel patterns and appropriate time gaps
transaction_df = st.generate_credit_card_transactions_with_location_continuity(
    card_df=card_df,
    account_df=account_df,  # Must have 'home_country' column from assign_cardholder_home_locations
    merchant_df=merchant_df,
    start_date=transactions_start_date,
    end_date=current_date,
    rows=num_transactions,
    tid_offset=0,
    seed=42
)

# 6. Fraud - Now with improved geographic and chain attack patterns
transaction_df, fraud_df = st.generate_fraud(
    transaction_df=transaction_df,
    card_df=card_df,
    merchant_df=merchant_df,
    fraud_rate=fraud_rate,
    chain_attack_ratio = chain_attack_ratio,
    seed=42
)

transactions_fg = st.create_feature_group_with_descriptions(
    fs,
    transaction_df,
    "credit_card_transactions",
    "Details about credit card transactions",
    ["t_id"],
    "ts",
    topic_name=f"{project.name}_credit_card_transactions_onlinefs",
    online_enabled=True
)

fraud_fg = st.create_feature_group_with_descriptions(
    fs,
    fraud_df,
    "cc_fraud",
    "Credit card transaction fraud",
    ["t_id"],
    "ts",
    online_enabled=False
)

Creating feature group: merchant_details
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1876419


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████████| Rows 500/500 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: merchant_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/merchant_details_1_offline_fg_materialization/executions
  Added description for: merchant_id
  Added description for: category
  Added description for: country
  Added description for: cnt_chrgeback_prev_day
  Added description for: cnt_chrgeback_prev_week
  Added description for: cnt_chrgeback_prev_month
  Added description for: last_modified
Generating bank details...
Creating feature group: bank_details
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1878479


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████████| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: bank_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/bank_details_1_offline_fg_materialization/executions
  Added description for: bank_id
  Added description for: country
  Added description for: credit_rating
  Added description for: last_modified
Generating account details...
Assigning home locations to cardholders...
Creating feature group: account_details
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1876420


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████| Rows 1000/1000 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: account_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/account_details_1_offline_fg_materialization/executions
  Added description for: account_id
  Added description for: name
  Added description for: address
  Added description for: debt_end_prev_month
  Added description for: last_modified
  Added description for: creation_date
  Added description for: end_date
Generating card details...
Creating feature group: card_details
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1876421


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████| Rows 2000/2000 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: card_details_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/card_details_1_offline_fg_materialization/executions
  Added description for: cc_num
  Added description for: cc_expiry_date
  Added description for: account_id
  Added description for: bank_id
  Added description for: issue_date
  Added description for: card_type
  Added description for: status
  Added description for: last_modified
Generating credit card transactions with location continuity (vectorized)...
  Generating 50000 transactions for 2000 cards...
  Average 25 transactions per card
  Assigning countries to transactions...
  Assigning merchants...
  Generating amounts and other attributes...
  Generating IP addresses...
  Generated 50000 transactions
Generating fraudulent transactions...
Total transactions: 50000
Generating 50 fraudulent transactions:
  - Chain attacks: 45 transactions
  - Geographic fraud: 5 transa

Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████| Rows 50049/50049 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: credit_card_transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/credit_card_transactions_1_offline_fg_materialization/executions
  Added description for: t_id
  Added description for: cc_num
  Added description for: account_id
  Added description for: merchant_id
  Added description for: amount
  Added description for: ip_address
  Added description for: card_present
  Added description for: ts
Creating feature group: cc_fraud
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1878480


Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████████████| Rows 49/49 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: cc_fraud_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/cc_fraud_1_offline_fg_materialization/executions
  Added description for: t_id
  Added description for: cc_num
  Added description for: explanation
  Added description for: ts


In [4]:

print("All feature groups created successfully!")
print("\nSummary Statistics:")
print(f"Merchant Details: {len(merchant_df)} rows")
print(f"Bank Details: {len(bank_df)} rows")
print(f"Account Details: {len(account_df)} rows")
print(f"Card Details: {len(card_df)} rows")
print(f"Credit Card Transactions: {len(transaction_df)} rows")
print(f"Fraudulent Credit Card Transactions: {len(fraud_df)} rows")


All feature groups created successfully!

Summary Statistics:
Merchant Details: 500 rows
Bank Details: 100 rows
Account Details: 1000 rows
Card Details: 2000 rows
Credit Card Transactions: 50049 rows
Fraudulent Credit Card Transactions: 49 rows
