In [1]:
"""Create dummy data for testing ETL pipeline."""
import os
from pathlib import Path

# Directory that receives the generated CSV files (created if missing).
base_path = Path("data/dummy/")
base_path.mkdir(parents=True, exist_ok=True)

# Logical table name -> CSV file inside base_path.
# File names are spelled out because they cannot be derived from the keys
# (e.g. "deposit_transactions" is stored as "deposit_transaction_table.csv").
table_paths = {
    table_name: base_path / csv_name
    for table_name, csv_name in (
        ("customer", "customer_table.csv"),
        ("accounts", "accounts_table.csv"),
        ("account_ownership", "account_ownership_table.csv"),
        ("deposit_transactions", "deposit_transaction_table.csv"),
        ("credit_transactions", "credit_transaction_table.csv"),
    )
}

table_paths

{'customer': PosixPath('data/dummy/customer_table.csv'),
 'accounts': PosixPath('data/dummy/accounts_table.csv'),
 'account_ownership': PosixPath('data/dummy/account_ownership_table.csv'),
 'deposit_transactions': PosixPath('data/dummy/deposit_transaction_table.csv'),
 'credit_transactions': PosixPath('data/dummy/credit_transaction_table.csv')}

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
from dateutil.relativedelta import relativedelta

# One seed for every random source so that re-running the notebook
# reproduces the same dummy data.
SEED = 42

fake = Faker()
# Faker is seeded separately from `random`/`numpy`; per the Faker docs the
# seed is set through the class-level call, not on the instance.
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Generate IMAGE_DT values: the last calendar day of each of the most recent months
def generate_month_end_dates(months=14, reference_date=None):
    """Return the month-end dates of the ``months`` most recent calendar months.

    The list is ordered oldest first and ends with the last day of the month
    that contains ``reference_date`` (so the final entry can lie in the future
    relative to ``reference_date``).

    Args:
        months: Number of month-end dates to produce. Values <= 0 yield an
            empty list.
        reference_date: ``date``/``datetime`` (anything exposing ``year`` and
            ``month``) that anchors the newest month. Defaults to the current
            day, which makes the output depend on when the notebook is run;
            pass a fixed value for reproducible data.

    Returns:
        list[datetime.date]: one entry per month, ascending.
    """
    import calendar  # stdlib; imported locally because only this helper needs it

    if reference_date is None:
        reference_date = datetime.today()

    # Place months on one linear axis so stepping back across a year boundary
    # is plain integer arithmetic.
    newest_month_index = reference_date.year * 12 + (reference_date.month - 1)

    month_ends = []
    for months_back in reversed(range(months)):
        year, zero_based_month = divmod(newest_month_index - months_back, 12)
        month = zero_based_month + 1
        # monthrange returns (weekday of day 1, number of days in the month).
        last_day = calendar.monthrange(year, month)[1]
        month_ends.append(datetime(year, month, last_day).date())
    return month_ends

image_dates = generate_month_end_dates()

# 1. Customer Table
def create_customer_table(dates=None):
    """Build the customer snapshot table: one row per (snapshot date, customer).

    Customer ids are the 200 consecutive integers starting at 100000. A
    segment ('P', 'B' or 'S', weighted 0.7 / 0.2 / 0.1) is drawn independently
    for every row with ``random.choices``, so a customer's segment can differ
    between snapshot dates.

    Args:
        dates: Iterable of snapshot dates used for ``IMAGE_DT``. Defaults to
            the module-level ``image_dates``.

    Returns:
        pandas.DataFrame with columns ``CUST_ID``, ``SEGMENT``, ``IMAGE_DT``.
        The columns are present even when ``dates`` is empty.
    """
    if dates is None:
        dates = image_dates

    columns = ["CUST_ID", "SEGMENT", "IMAGE_DT"]
    cust_ids = range(100000, 100000 + 200)
    segments = ['P', 'B', 'S']
    segment_weights = [0.7, 0.2, 0.1]

    # Date-major order with exactly one random draw per row, so the sequence
    # of draws from the seeded `random` stream is fixed by the loop order.
    rows = [
        {
            "CUST_ID": cust_id,
            "SEGMENT": random.choices(segments, weights=segment_weights)[0],
            "IMAGE_DT": snapshot_date,
        }
        for snapshot_date in dates
        for cust_id in cust_ids
    ]

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# 2. Account Table
def create_accounts_table(customer_df, dates=None):
    """Build the account snapshot table from the customer table.

    For every snapshot date and every distinct customer, between 0 and 5
    accounts are generated. Each account gets a random product category
    ('DEP' or 'CC'), a random product code from that category, and the next
    sequential account number starting at 500000.

    ``SEGMENT`` is the customer's segment on the same ``IMAGE_DT`` in
    ``customer_df``; when no row exists for that (customer, date) pair, the
    customer's first listed segment is used instead.

    NOTE(review): account numbers are never reused across snapshot dates, so
    each date holds a completely new set of accounts -- confirm this is the
    intended shape for the downstream ETL tests.

    Args:
        customer_df: DataFrame with ``CUST_ID`` and ``SEGMENT`` columns and,
            optionally, ``IMAGE_DT``.
        dates: Iterable of snapshot dates. Defaults to the module-level
            ``image_dates``.

    Returns:
        pandas.DataFrame with columns ``CUST_ID``, ``ACNO``, ``PRODUCT_CODE``,
        ``PRODUCT_CATEGORY``, ``SEGMENT``, ``IMAGE_DT`` (present even when no
        account was generated).
    """
    if dates is None:
        dates = image_dates

    columns = ["CUST_ID", "ACNO", "PRODUCT_CODE", "PRODUCT_CATEGORY", "SEGMENT", "IMAGE_DT"]
    product_categories = {
        'DEP': [1001, 1002, 1003, 1004, 1005],
        'CC': [2001, 2002, 2003, 2004, 2005]
    }
    categories = list(product_categories)

    if 'IMAGE_DT' in customer_df.columns:
        customer_dates = customer_df['IMAGE_DT']
    else:
        customer_dates = [None] * len(customer_df)

    # One pass over the customer table replaces a boolean-mask scan per account.
    # setdefault keeps the first occurrence; dict insertion order gives the
    # customers in order of first appearance.
    first_segment = {}
    segment_on_date = {}
    for cust_id, snapshot_date, segment in zip(
        customer_df['CUST_ID'], customer_dates, customer_df['SEGMENT']
    ):
        first_segment.setdefault(cust_id, segment)
        segment_on_date.setdefault((cust_id, snapshot_date), segment)

    account_data = []
    account_id = 500000
    for snapshot_date in dates:
        for cust_id, fallback_segment in first_segment.items():
            # Upper bound of np.random.randint is exclusive: 0..5 accounts.
            num_accounts = np.random.randint(0, 6)
            segment = segment_on_date.get((cust_id, snapshot_date), fallback_segment)
            for _ in range(num_accounts):
                category = random.choice(categories)
                product_code = random.choice(product_categories[category])
                account_data.append({
                    "CUST_ID": cust_id,
                    "ACNO": account_id,
                    "PRODUCT_CODE": product_code,
                    "PRODUCT_CATEGORY": category,
                    "SEGMENT": segment,
                    "IMAGE_DT": snapshot_date
                })
                account_id += 1

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(account_data, columns=columns)

# 3. Account Ownership Table
def create_account_ownership_table(accounts_df):
    """Build the account ownership table from the accounts table.

    Every account row yields one primary-holder row (``RELATIONSHIP == "P"``)
    for the account's own ``CUST_ID``. With probability 0.3 a secondary-holder
    row (``RELATIONSHIP == "S"``) is added for a customer drawn uniformly from
    the *other* customers present in ``accounts_df``, so an account never
    lists the same customer as both primary and secondary holder. If only one
    customer exists, no secondary rows are produced.

    Both rows carry the account row's ``SEGMENT`` and ``IMAGE_DT``.

    Args:
        accounts_df: DataFrame with ``ACNO``, ``CUST_ID``, ``SEGMENT`` and
            ``IMAGE_DT`` columns.

    Returns:
        pandas.DataFrame with columns ``ACNO``, ``CUST_ID``, ``RELATIONSHIP``,
        ``SEGMENT``, ``IMAGE_DT`` (present even for empty input).
    """
    columns = ["ACNO", "CUST_ID", "RELATIONSHIP", "SEGMENT", "IMAGE_DT"]
    if accounts_df.empty:
        return pd.DataFrame(columns=columns)

    # Computed once instead of on every secondary-holder draw.
    customer_ids = accounts_df['CUST_ID'].unique().tolist()
    position = {cust_id: idx for idx, cust_id in enumerate(customer_ids)}
    other_count = len(customer_ids) - 1

    ownership_data = []
    for acno, primary_holder, segment, image_dt in zip(
        accounts_df['ACNO'],
        accounts_df['CUST_ID'],
        accounts_df['SEGMENT'],
        accounts_df['IMAGE_DT'],
    ):
        ownership_data.append({
            "ACNO": acno,
            "CUST_ID": primary_holder,
            "RELATIONSHIP": "P",
            "SEGMENT": segment,
            "IMAGE_DT": image_dt
        })

        # Optional secondary holder. The probability draw happens for every
        # account, whether or not another customer is available.
        wants_secondary = random.random() < 0.3
        if wants_secondary and other_count > 0:
            # Uniform choice among the other customers: draw from a range one
            # shorter than the list and step over the primary holder's slot.
            pick = random.randrange(other_count)
            if pick >= position[primary_holder]:
                pick += 1
            ownership_data.append({
                "ACNO": acno,
                "CUST_ID": customer_ids[pick],
                "RELATIONSHIP": "S",
                "SEGMENT": segment,
                "IMAGE_DT": image_dt
            })

    return pd.DataFrame(ownership_data, columns=columns)

# 4 & 5. Transaction Tables
def create_transaction_table(accounts_df, category):
    """Build a transaction table for all accounts of one product category.

    Each account row whose ``PRODUCT_CATEGORY`` equals ``category`` receives
    between 0 and 10 transactions. A transaction falls in the month of the
    account's ``IMAGE_DT`` on a day from 1 to 27 (valid in every month), at a
    random hour and minute, with a random type and an amount drawn uniformly
    from 10 to 1000 and rounded to 2 decimals.

    Args:
        accounts_df: DataFrame with ``ACNO``, ``CUST_ID``, ``SEGMENT``,
            ``IMAGE_DT`` and ``PRODUCT_CATEGORY`` columns; ``IMAGE_DT`` values
            must support ``.replace(day=...)``.
        category: Product category to select, e.g. ``'DEP'`` or ``'CC'``.

    Returns:
        pandas.DataFrame with columns ``ACNO``, ``CUST_ID``, ``TRAN_TYPE``,
        ``TRAN_DATE``, ``AMOUNT``, ``SEGMENT``, ``IMAGE_DT``. The columns are
        present even when no transaction was generated (e.g. an unknown
        category), so the saved CSV always has a header row.
    """
    columns = ["ACNO", "CUST_ID", "TRAN_TYPE", "TRAN_DATE", "AMOUNT", "SEGMENT", "IMAGE_DT"]
    tran_types = ['Customer-Initiated Credit', 'Customer-Initiated Debit', 'Bank Fee', 'Self-to-Self Transfer']
    midnight = datetime.min.time()

    filtered = accounts_df[accounts_df['PRODUCT_CATEGORY'] == category]

    transaction_data = []
    # Column-wise iteration avoids building a Series per row as iterrows does.
    for acno, cust_id, segment, image_dt in zip(
        filtered['ACNO'], filtered['CUST_ID'], filtered['SEGMENT'], filtered['IMAGE_DT']
    ):
        # np.random.randint excludes the upper bound: 0..10 transactions.
        for _ in range(np.random.randint(0, 11)):
            # Draw order (day, hour, minute, type, amount) is kept fixed so a
            # given seed always produces the same rows.
            tran_date = image_dt.replace(day=np.random.randint(1, 28))  # day 1..27
            tran_time = timedelta(hours=np.random.randint(0, 24), minutes=np.random.randint(0, 60))
            transaction_data.append({
                "ACNO": acno,
                "CUST_ID": cust_id,
                "TRAN_TYPE": random.choice(tran_types),
                "TRAN_DATE": datetime.combine(tran_date, midnight) + tran_time,
                "AMOUNT": round(np.random.uniform(10, 1000), 2),
                "SEGMENT": segment,
                "IMAGE_DT": image_dt
            })

    return pd.DataFrame(transaction_data, columns=columns)

# Build the tables in dependency order: accounts are derived from customers,
# ownership and both transaction tables are derived from accounts.
customer_df = create_customer_table()
accounts_df = create_accounts_table(customer_df)
ownership_df = create_account_ownership_table(accounts_df)
deposit_df, credit_df = (
    create_transaction_table(accounts_df, product_category)
    for product_category in ('DEP', 'CC')
)

# Write each table to its CSV path, without the DataFrame index.
for table_key, frame in (
    ("customer", customer_df),
    ("accounts", accounts_df),
    ("account_ownership", ownership_df),
    ("deposit_transactions", deposit_df),
    ("credit_transactions", credit_df),
):
    frame.to_csv(table_paths[table_key], index=False)