
## Data creation

The following cell generates three sample datasets (books, customers, and orders) as Python lists of dictionaries, which we then use within a Databricks notebook to create the initial Delta tables.

In [0]:

import random
from datetime import datetime, timedelta

# --- 1. Generate Books Data (10 books) ---
# Seed the generator so that a fresh "Run All" yields the same categories and
# prices here, and the same draws for any later code using the `random` module.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

book_titles = [
    "The Hitchhiker's Guide to the Galaxy", "Pride and Prejudice", "1984",
    "To Kill a Mockingbird", "The Great Gatsby", "Moby Dick",
    "War and Peace", "The Catcher in the Rye", "Lord of the Rings",
    "Dune"
]
book_authors = [
    "Douglas Adams", "Jane Austen", "George Orwell", "Harper Lee",
    "F. Scott Fitzgerald", "Herman Melville", "Leo Tolstoy",
    "J.D. Salinger", "J.R.R. Tolkien", "Frank Herbert"
]
book_categories = [
    "Science Fiction", "Classic", "Dystopian", "Fiction", "Fantasy", "Adventure"
]

# Titles and authors are parallel lists; fail loudly if they ever drift apart
# instead of silently dropping or mis-pairing entries.
if len(book_titles) != len(book_authors):
    raise ValueError("book_titles and book_authors must have the same length")

# One record per (title, author) pair. book_id is the 1-based position in the
# lists; category is picked at random and price is a random value in
# [9.99, 39.99] rounded to 2 decimals.
books_data = [
    {
        "book_id": book_id,
        "title": title,
        "author": author,
        "category": random.choice(book_categories),
        "price": round(random.uniform(9.99, 39.99), 2),
    }
    for book_id, (title, author) in enumerate(zip(book_titles, book_authors), start=1)
]

print("--- Books Data ---")
for book in books_data:
    print(book)
print("\n")


# --- 2. Generate Customers Data (12 clients) ---
domains = ["example.com", "mail.org", "service.net", "test.io"]


def _build_customer(cid):
    """Return one customer record for the given 1-based customer id.

    The e-mail domain is drawn at random from `domains`; `updated` is the
    current time minus a random 1-365 days, formatted as a string.
    """
    # Draw order is kept stable: domain first, then the age in days.
    mail_domain = random.choice(domains)
    reference_time = datetime.now()
    age = timedelta(days=random.randint(1, 365))
    return {
        "customer_id": cid,
        "email": f"customer_{cid}@{mail_domain}",
        "profile": f"Profile details for customer {cid}.",
        "updated": (reference_time - age).strftime("%Y-%m-%d %H:%M:%S"),
    }


customers_data = [_build_customer(cid) for cid in range(1, 13)]  # 12 clients

print("--- Customers Data ---")
print(*customers_data, sep="\n")
print("\n")


# --- 3. Generate Orders Data (20 orders) ---
orders_data = []

# IDs that an order may reference, taken from the customer and book records
available_customer_ids = [c["customer_id"] for c in customers_data]
available_book_ids = [b["book_id"] for b in books_data]

# Build the price lookup once instead of re-scanning books_data for every
# line item of every order (book_ids are unique in books_data).
price_by_book_id = {b["book_id"]: b["price"] for b in books_data}

for order_id in range(1, 21):  # 20 orders
    # Order time: now minus a random 1-30 days, 0-23 hours and 0-59 minutes,
    # stored as a formatted string.
    placed_at = datetime.now() - timedelta(
        days=random.randint(1, 30),
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
    )
    timestamp = placed_at.strftime("%Y-%m-%d %H:%M:%S")

    customer_id = random.choice(available_customer_ids)

    # Randomly select 1 to 3 distinct books for the order (capped at the
    # number of books that exist).
    num_books_in_order = random.randint(1, 3)
    selected_book_ids = random.sample(
        available_book_ids, min(num_books_in_order, len(available_book_ids))
    )

    # Accumulate the order totals while building the per-book line items.
    total_quantity = 0
    total_price = 0.0
    ordered_books_details = []
    for book_id in selected_book_ids:
        item_quantity = random.randint(1, 2)  # Each book in the order can have 1 or 2 copies
        total_quantity += item_quantity
        total_price += price_by_book_id[book_id] * item_quantity
        ordered_books_details.append({"book_id": book_id, "quantity": item_quantity})

    orders_data.append({
        "order_id": order_id,
        "timestamp": timestamp,
        "customer_id": customer_id,
        "quantity": total_quantity,  # Total items in this order
        "total": round(total_price, 2),
        # List of {"book_id": x, "quantity": y} dicts, one per distinct book.
        # NOTE(review): if this list is passed to spark.createDataFrame without
        # an explicit schema, PySpark's default inference turns nested dicts
        # into map<string, bigint> rather than struct - confirm which type
        # downstream code expects.
        "books": ordered_books_details,
    })

print("--- Orders Data ---")
for order in orders_data:
    print(order)
print("\n")


## Create Spark DataFrame

Use Spark to create DataFrames from the Python lists generated above.

In [0]:
# Create one Spark DataFrame per generated Python list (schemas are inferred)
df_books = spark.createDataFrame(books_data)
df_customers = spark.createDataFrame(customers_data)
df_orders = spark.createDataFrame(orders_data)

# (heading, frame) pairs; every heading after the first starts with a blank line
schema_reports = (
    ("Schema for Books DataFrame:", df_books),
    ("\nSchema for Customers DataFrame:", df_customers),
    ("\nSchema for Orders DataFrame:", df_orders),
)
for heading, frame in schema_reports:
    print(heading)
    frame.printSchema()

# Show the contents only after all three schemas have been printed
sample_reports = (
    ("\nSample Books Data:", df_books),
    ("\nSample Customers Data:", df_customers),
    ("\nSample Orders Data:", df_orders),
)
for heading, frame in sample_reports:
    print(heading)
    frame.display()


## Save as Delta Tables

In [0]:
# Define your Unity Catalog target
catalog_name = "default"
schema_name = "default"

# Fully qualified prefix shared by every table written below
target_namespace = f"{catalog_name}.{schema_name}"

# Table name -> DataFrame, in the order they are written and reported
raw_tables = {
    "books_raw": df_books,
    "customers_raw": df_customers,
    "orders_raw": df_orders,
}

# Save as Delta tables in Unity Catalog, replacing any existing contents
for table_name, frame in raw_tables.items():
    frame.write.mode("overwrite").saveAsTable(f"{target_namespace}.{table_name}")

print(f"Data saved to Unity Catalog tables in {target_namespace}:")
for table_name in raw_tables:
    print(f"- {target_namespace}.{table_name}")