In [0]:
import json
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta # For easier month iteration

# Base path of the Unity Catalog Volume. The customers/, products/ and orders/
# output directories used further down are all built underneath this path.
UC_VOLUME_BASE_PATH = "/Volumes/dbndev/raw/inbound"

def generate_customers(num_customers):
    """
    Build a list of synthetic customer records.

    Args:
        num_customers: How many records to create. IDs are numbered from 1
            and formatted as CUST0001, CUST0002, ...

    Returns:
        A list of dicts, each with the keys customer_id, first_name,
        last_name, email, city, country and registration_date. The
        registration date is a "YYYY-MM-DD" string lying between 30 and
        365*3 days before the moment the record is generated.
    """
    # (city, country) pairs: picking a pair keeps the country consistent
    # with the city.
    locations = [
        ("New York", "USA"), ("Los Angeles", "USA"), ("Chicago", "USA"),
        ("Houston", "USA"), ("Phoenix", "USA"), ("London", "UK"),
        ("Paris", "France"), ("Berlin", "Germany"), ("Tokyo", "Japan"),
        ("Sydney", "Australia"),
    ]
    given_names = ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Heidi", "Ivan", "Judy"]
    family_names = ["Smith", "Jones", "Williams", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor", "Anderson"]

    records = []
    for seq in range(1, num_customers + 1):
        given = random.choice(given_names)
        family = random.choice(family_names)
        # Numeric suffix makes e-mail collisions less likely (not impossible).
        email_suffix = random.randint(1, 99)
        city, country = random.choice(locations)
        days_since_signup = random.randint(30, 365 * 3)
        signed_up_at = datetime.now() - timedelta(days=days_since_signup)

        records.append({
            "customer_id": f"CUST{seq:04d}",
            "first_name": given,
            "last_name": family,
            "email": f"{given.lower()}.{family.lower()}{email_suffix}@example.com",
            "city": city,
            "country": country,
            "registration_date": signed_up_at.strftime("%Y-%m-%d"),
        })
    return records

def generate_products(num_products):
    """
    Build a list of synthetic product records.

    Product names are drawn from a fixed 20-entry catalog *without*
    replacement, so every product has a distinct name as long as
    num_products does not exceed the catalog size. (Drawing with replacement
    would almost always yield the same name under several product IDs with
    different prices.) If more products than catalog entries are requested,
    the catalog is reused in freshly shuffled passes.

    Args:
        num_products: How many records to create. IDs are numbered from 1
            and formatted as PROD001, PROD002, ...

    Returns:
        A list of dicts, each with the keys product_id, product_name,
        category, price (float rounded to 2 decimals, 20.0-1500.0) and
        stock_quantity (int, 0-500).
    """
    # (name, category) pairs: keeping them together guarantees the category
    # always matches the product.
    catalog = [
        ("Laptop Pro", "Electronics"),
        ("Gaming Mouse", "Electronics"),
        ("Mechanical Keyboard", "Electronics"),
        ("Monitor Ultra", "Electronics"),
        ("Webcam HD", "Electronics"),
        ("Smartphone X", "Mobile & Accessories"),
        ("Wireless Earbuds", "Mobile & Accessories"),
        ("Smartwatch Pro", "Mobile & Accessories"),
        ("Tablet Mini", "Mobile & Accessories"),
        ("E-Reader Lite", "Mobile & Accessories"),
        ("Coffee Maker Deluxe", "Home Appliances"),
        ("Blender Power", "Home Appliances"),
        ("Toaster Oven Smart", "Home Appliances"),
        ("Air Fryer XL", "Home Appliances"),
        ("Electric Kettle", "Home Appliances"),
        ("Desk Chair Ergonomic", "Furniture"),
        ("Standing Desk", "Furniture"),
        ("Bookshelf Modern", "Furniture"),
        ("Table Lamp LED", "Furniture"),
        ("Area Rug Soft", "Home Decor"),
    ]

    # Pick names in shuffled passes over the catalog; a name is only repeated
    # once every catalog entry has been used in the current pass.
    chosen = []
    while len(chosen) < num_products:
        still_needed = num_products - len(chosen)
        chosen.extend(random.sample(catalog, min(still_needed, len(catalog))))

    products = []
    for i, (product_name, category) in enumerate(chosen, start=1):
        products.append({
            "product_id": f"PROD{i:03d}",
            "product_name": product_name,
            "category": category,
            "price": round(random.uniform(20.0, 1500.0), 2),
            "stock_quantity": random.randint(0, 500)
        })
    return products

def generate_orders(num_orders, customers, products, start_date=None, end_date=None):
    """
    Build a list of synthetic order records spread over a time window.

    Args:
        num_orders: Number of orders to create.
        customers: List of customer dicts; only "customer_id" is read.
        products: List of product dicts; "product_id" and "price" are read.
        start_date: Start of the window (datetime). Falls back to
            2024-01-01 when not given.
        end_date: End of the window (datetime). Falls back to
            start_date + 364 days when not given.

    Returns:
        A list of dicts with the keys order_id, customer_id, order_date
        (ISO-8601 string), product_id, quantity, total_price and status.
    """
    status_choices = ["PENDING", "SHIPPED", "DELIVERED", "CANCELLED"]

    window_start = start_date if start_date else datetime(2024, 1, 1)
    window_end = end_date if end_date else window_start + timedelta(days=364)

    # Whole seconds in the window; each order gets a random offset into it.
    window_seconds = int((window_end - window_start).total_seconds())

    result = []
    for seq in range(1, num_orders + 1):
        # Wall-clock timestamp (down to microseconds) plus the running index
        # keeps IDs distinct within and across calls.
        stamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
        buyer = random.choice(customers)
        item = random.choice(products)
        units = random.randint(1, 5)
        amount = round(units * item["price"], 2)
        state = random.choice(status_choices)
        placed_at = window_start + timedelta(seconds=random.randint(0, window_seconds))

        result.append({
            "order_id": f"ORD{stamp}{seq:04d}",
            "customer_id": buyer["customer_id"],
            "order_date": placed_at.isoformat(),
            "product_id": item["product_id"],
            "quantity": units,
            "total_price": amount,
            "status": state,
        })
    return result

# Generate base data (customers and products) - usually static or less frequent updates.
# These two lists are reused by every order-generation call below, so all order
# files reference the same customer and product IDs.
# NOTE(review): no random seed is set in this cell, so each run produces
# different data - confirm whether reproducibility is needed.
num_customers = 50
num_products = 20
customers_data = generate_customers(num_customers)
products_data = generate_products(num_products)

# Define paths for each entity within the UC Volume
# (one sub-directory per entity under UC_VOLUME_BASE_PATH)
customers_output_dir = f"{UC_VOLUME_BASE_PATH}/customers"
products_output_dir = f"{UC_VOLUME_BASE_PATH}/products"
orders_output_dir = f"{UC_VOLUME_BASE_PATH}/orders"

# Helper function to check and create directory
def ensure_directory_exists(path):
    """
    Make sure `path` exists, creating it when listing it fails.

    The directory is probed with dbutils.fs.ls; any exception from that call
    is treated as "directory missing" and triggers dbutils.fs.mkdirs.

    Args:
        path: Directory path to check or create.

    Raises:
        Whatever exception dbutils.fs.mkdirs raised, re-raised after logging,
        because a failed creation is considered fatal.
    """
    try:
        dbutils.fs.ls(path)  # Probe: succeeds only if the path can be listed
    except Exception:
        print(f"Directory '{path}' does not exist. Attempting to create...")
        try:
            dbutils.fs.mkdirs(path)
        except Exception as mkdir_error:
            print(f"Error creating directory '{path}': {mkdir_error}")
            raise  # Creation failing too is a critical error; let it propagate
        print(f"Directory '{path}' created successfully.")
    else:
        print(f"Directory '{path}' already exists.")

# Ensure all necessary directories exist
# (each call creates the directory only if listing it fails; a failed creation raises)
ensure_directory_exists(customers_output_dir)
ensure_directory_exists(products_output_dir)
ensure_directory_exists(orders_output_dir)

# --- Save CUSTOMERS DATA ---
# The whole customer list is written as one pretty-printed JSON array.
customer_file_path = f"{customers_output_dir}/customer_data.json"
# Convert JSON data to a string before writing with dbutils.fs.put
customer_json_string = json.dumps(customers_data, indent=2)
# overwrite=True: re-running the cell replaces the previous file
dbutils.fs.put(customer_file_path, customer_json_string, overwrite=True)
print(f"Customer data saved to: {customer_file_path}")

# --- Save PRODUCTS DATA ---
# Same approach as for customers: one JSON array, replaced on every run.
product_file_path = f"{products_output_dir}/product_data.json"
product_json_string = json.dumps(products_data, indent=2)
dbutils.fs.put(product_file_path, product_json_string, overwrite=True)
print(f"Product data saved to: {product_file_path}")


# --- NEW FUNCTIONS FOR ORDER GENERATION ---

def generate_monthly_full_load_orders(year: int, month: int, num_orders: int,
                                       customers_data: list, products_data: list):
    """
    Generate one month's worth of orders and write them as a single
    "full load" JSON file into the orders directory of the UC Volume.

    Filename: monthly_full_load_YYYY_MM.json (an existing file is overwritten).

    Args:
        year: Calendar year of the month to generate.
        month: Calendar month (1-12).
        num_orders: Number of orders to place inside that month.
        customers_data: Customer records to pick buyers from.
        products_data: Product records to pick items from.

    Returns:
        None. If year/month do not form a valid date, an error is printed
        and nothing is written.
    """
    try:
        first_instant = datetime(year, month, 1)
        # The month ends one microsecond before the first day of the next
        # month; December rolls over into January of the following year.
        next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)
        last_instant = datetime(next_year, next_month, 1) - timedelta(microseconds=1)
    except ValueError:
        print(f"Error: Invalid month or year provided for full load: {year}-{month}")
        return

    # The target directory may not exist yet when called on demand.
    ensure_directory_exists(orders_output_dir)

    orders_for_month = generate_orders(
        num_orders, customers_data, products_data, first_instant, last_instant
    )

    month_suffix = first_instant.strftime("%Y_%m")  # YYYY_MM
    target_path = f"{orders_output_dir}/monthly_full_load_{month_suffix}.json"

    payload = json.dumps(orders_for_month, indent=2)
    dbutils.fs.put(target_path, payload, overwrite=True)
    print(f"Generated and saved {len(orders_for_month)} full load orders for {month_suffix} to: {target_path}")

def generate_daily_incremental_orders(date_obj: datetime, num_orders: int,
                                      customers_data: list, products_data: list):
    """
    Generate one day's worth of orders and write them as a single
    "incremental" JSON file into the orders directory of the UC Volume.

    Filename: orders_YYYY_MM_DD.json (an existing file is overwritten).

    Args:
        date_obj: The day to generate orders for; only its year, month and
            day are used, any time-of-day component is ignored.
        num_orders: Number of orders to place inside that day.
        customers_data: Customer records to pick buyers from.
        products_data: Product records to pick items from.

    Returns:
        None.
    """
    # The target directory may not exist yet when called on demand.
    ensure_directory_exists(orders_output_dir)

    # Window runs from midnight up to the last microsecond of the same day.
    midnight = datetime(date_obj.year, date_obj.month, date_obj.day)
    next_midnight = midnight + timedelta(days=1)
    last_instant = next_midnight - timedelta(microseconds=1)

    orders_for_day = generate_orders(
        num_orders, customers_data, products_data, midnight, last_instant
    )

    day_suffix = date_obj.strftime("%Y_%m_%d")  # YYYY_MM_DD
    target_path = f"{orders_output_dir}/orders_{day_suffix}.json"

    payload = json.dumps(orders_for_day, indent=2)
    dbutils.fs.put(target_path, payload, overwrite=True)
    print(f"Generated and saved {len(orders_for_day)} daily orders for {day_suffix} to: {target_path}")


# --- MAIN EXECUTION BLOCK ---

# --- 1. Generate Monthly Full Load Orders (Jan to May 2025) ---
# One file per month is written; each call overwrites that month's file if it exists.
print("\n--- Generating Monthly Full Load Orders (Jan 2025 - May 2025) ---")
start_month_full_load = datetime(2025, 1, 1)
end_month_full_load = datetime(2025, 5, 1) # Last month to generate (inclusive - the loop below uses <=)
orders_per_month_full_load = 500 # Adjust as needed

current_month_date = start_month_full_load
while current_month_date <= end_month_full_load: # Loop up to and including May
    generate_monthly_full_load_orders(current_month_date.year, current_month_date.month,
                                      orders_per_month_full_load, customers_data, products_data)
    current_month_date += relativedelta(months=1) # Step to the 1st of the next month

# --- 2. Generate Daily Incremental Orders for June 2025 (from 1st until today) ---
# NOTE(review): the loop below is bounded by the current date, not by the end of
# June 2025. When this cell is run after June 2025 it also generates daily files
# for every later day up to today - confirm that this is intended, since the
# heading and log message mention June 2025 only.
print("\n--- Generating Daily Incremental Orders for June 2025 (from 1st until today) ---")
start_day_incremental = datetime(2025, 6, 1)
today = datetime.now()
orders_per_day_incremental = 25 # Adjust as needed

current_day_date = start_day_incremental
# current_day_date is always at midnight, so the current day itself is included.
while current_day_date <= today:
    generate_daily_incremental_orders(current_day_date, orders_per_day_incremental,
                                     customers_data, products_data)
    current_day_date += timedelta(days=1)

# --- How to call on-demand: ---
# The lines below only print example calls as text; nothing is generated here.
print("\n--- On-Demand Generation Examples (Run these in separate cells if needed) ---")
print("# Example: Generate a monthly full load for a specific month (e.g., July 2024)")
print("# generate_monthly_full_load_orders(2024, 7, 1000, customers_data, products_data)")
print("\n# Example: Generate a daily incremental file for a specific date (e.g., today's date)")
print("# generate_daily_incremental_orders(datetime.now(), 50, customers_data, products_data)")
print("\n# Example: Generate a daily incremental file for a past date (e.g., July 1, 2024)")
print("# generate_daily_incremental_orders(datetime(2024, 7, 1), 30, customers_data, products_data)")


In [0]:
from datetime import datetime

# This cell relies on state from the data-generation cell: customers_data,
# products_data and generate_daily_incremental_orders must already exist in the
# session, so run that cell first.

# Define the specific date you want to generate orders for
specific_date = datetime(2025, 6, 15) # June 15, 2025

# Define the number of orders for that day
num_orders_for_day = 40

# Call the function to generate and save the daily file
# (overwrites the file for that date if it already exists)
generate_daily_incremental_orders(specific_date, num_orders_for_day, customers_data, products_data)

print(f"Daily order file for {specific_date.strftime('%Y-%m-%d')} generated successfully.")