In [0]:
# %pip install faker  # uncomment and run once if Faker is not already installed on the cluster

In [0]:
# Databricks notebook: 01_ingest_generate_daily.ipynb

from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize
spark = SparkSession.builder.getOrCreate()

# Parameters
SEED = 88  # base seed; combined with the batch date below
# datetime.today() reads the local clock of the machine running the notebook.
TODAY = datetime.today().date() - timedelta(days=1)  # yesterday for daily batch

# Seed every random source the generator draws from -- NumPy (amounts, row
# count), the stdlib `random` module (company/department/category/merchant
# picks) and Faker (employee names) -- rather than NumPy alone.
# Mixing the batch date into the seed makes a re-run for the same date
# reproduce the same batch while letting different dates produce different data.
RUN_SEED = SEED + TODAY.toordinal()
np.random.seed(RUN_SEED)
random.seed(RUN_SEED)
Faker.seed(RUN_SEED)
fake = Faker()

# Rows in this batch: an integer drawn from [400, 600) (upper bound exclusive).
NUM_RECORDS = np.random.randint(400, 600)

# Static Data
companies = [
    'Spendora', 'Expensivus', 'ClarityLedger', 'TrueSpend',
    'Fintrix', 'Procuro', 'LedgrIQ', 'Zentro',
]
departments = [
    'Engineering', 'Marketing', 'Sales',
    'Finance', 'HR', 'Operations',
]

# One entry per spend category: (merchant names, (mean, std) of the amount).
# The (mean, std) pair feeds the normal distribution used when amounts are drawn.
_category_profiles = {
    'Travel': (['Delta', 'Uber', 'Lyft', 'Marriott', 'Hilton'], (500, 150)),
    'Meals': (['Starbucks', 'Chipotle', 'Panera', 'Olive Garden'], (40, 10)),
    'Supplies': (['Staples', 'Office Depot', 'Amazon'], (50, 20)),
    'Entertainment': (['AMC', 'TopGolf', "Dave & Buster's"], (100, 50)),
    'Misc': (['Etsy', 'Other', 'Unknown'], (75, 75)),
}

# Derive the three lookups from the single table so they cannot drift apart.
categories = list(_category_profiles)
merchants = {label: shops for label, (shops, _) in _category_profiles.items()}
category_amounts = {label: dist for label, (_, dist) in _category_profiles.items()}

# Generate synthetic data
def _draw_transaction():
    """Return one synthetic transaction as a list of eight fields.

    Field order: employee, company, department, category, merchant,
    amount, date (string form of TODAY), and the constant 'transaction'.
    """
    firm = random.choice(companies)
    dept = random.choice(departments)
    spend_category = random.choice(categories)
    vendor = random.choice(merchants[spend_category])
    mu, sigma = category_amounts[spend_category]
    # Floor the draw at 1 so the left tail of the normal never yields an
    # amount below 1, then keep two decimal places.
    spend = round(max(1, np.random.normal(loc=mu, scale=sigma)), 2)
    person = fake.name()
    return [
        person, firm, dept, spend_category,
        vendor, spend, str(TODAY), 'transaction'
    ]


data = [_draw_transaction() for _ in range(NUM_RECORDS)]

# Create DataFrame
# Column names are listed in the same order as the fields of each generated row.
column_names = [
    'employee', 'company', 'department', 'category',
    'merchant', 'amount', 'date', 'type',
]
df_pd = pd.DataFrame(data, columns=column_names)

# Hand the pandas frame to Spark; no explicit schema is passed here.
df_spark = spark.createDataFrame(df_pd)

# Write to Bronze Delta table path
# NOTE(review): "your_mount_point" looks like a template placeholder -- confirm
# the real mount before running.
bronze_path = "dbfs:/mnt/your_mount_point/bronze/daily_transactions"

# Append mode adds this batch to whatever is already stored at the path, so
# running the notebook twice for the same date writes that date's rows twice.
bronze_writer = df_spark.write.format("delta").mode("append")
bronze_writer.save(bronze_path)

print(f"✅ {NUM_RECORDS} daily transactions written to {bronze_path} for {TODAY}")
