In [14]:
import os
import random

import pandas as pd
from faker import Faker

# Shared Faker generator; create_orders_data__xlsx draws its fake names, contact details, countries and dates from it.
fake = Faker()

def create_orders_data__xlsx(num_customers = 500, num_products = 20, num_orders = 10000, base_path_out = "out"):
    """Generate synthetic customers, products and orders tables and write them to CSV.

    Despite the ``__xlsx`` suffix in the name, the tables are written as
    ``customers.csv``, ``products.csv`` and ``orders.csv`` under ``base_path_out``.

    Args:
        num_customers: Number of customer rows; CustomerID runs from 1 to this value.
        num_products: Number of product rows; ProductID runs from 1 to this value.
        num_orders: Number of order rows; OrderID runs from 1 to this value.
            A positive value requires at least one customer and one product
            to sample from.
        base_path_out: Directory that receives the CSV files. It is created
            when it does not exist yet.

    Returns:
        Tuple ``(customers_df, products_df, orders_df)`` of pandas DataFrames.

    Note:
        No random seed is set here, so each call produces different data
        unless the caller seeds ``random`` and Faker beforehand.
    """
    # Make sure the target directory exists so the to_csv calls below cannot
    # fail on a fresh checkout. An empty path is left alone on purpose.
    if base_path_out:
        os.makedirs(base_path_out, exist_ok=True)

    # Pre-generate small pools of fake values; rows sample from them with
    # replacement, so contact details repeat across customers.
    customer_names = [fake.name() for _ in range(num_customers)]
    emails = [fake.email() for _ in range(30)]
    phone_numbers = [fake.phone_number() for _ in range(30)]
    addresses = [fake.address().replace("\n", ", ") for _ in range(30)]
    countries = [fake.country() for _ in range(10)] + ["USA", "Germany"]

    print("Customers")
    customers_data = {
        "CustomerID": range(1, num_customers + 1),
        "CustomerName": random.choices(customer_names, k=num_customers),
        "Email": random.choices(emails, k=num_customers),
        "PhoneNumber": random.choices(phone_numbers, k=num_customers),
        "Address": random.choices(addresses, k=num_customers),
        "Country": random.choices(countries, k=num_customers)
    }
    customers_df = pd.DataFrame(customers_data)
    customers_df.to_csv(f"{base_path_out}/customers.csv", index=False)

    print("Products")
    # Categories with the sample product names that belong to each of them.
    categories = {
        "Electronics": ["Smartphone", "Laptop", "Headphones", "Smartwatch", "Tablet"],
        "Clothing": ["T-Shirt", "Jeans", "Sweater", "Jacket", "Shoes"],
        "Toys": ["Action Figure", "Board Game", "Puzzle", "Doll", "Remote Control Car"],
        "Home": ["Blender", "Vacuum Cleaner", "Microwave", "Coffee Maker", "Toaster"],
        "Sports": ["Basketball", "Soccer Ball", "Tennis Racket", "Yoga Mat", "Dumbbells"]
    }
    category_names = list(categories)

    # The product count comes from the num_products argument (it used to be
    # silently reset to 20 here, which made the parameter useless).
    products_data = {
        "ProductID": range(1, num_products + 1),
        "ProductName": [],
        "Category": [],
        "Price": [],
        "Stock": []
    }

    for _ in range(num_products):
        # Pick the category first so the product name always matches it.
        category = random.choice(category_names)
        products_data["ProductName"].append(random.choice(categories[category]))
        products_data["Category"].append(category)
        products_data["Price"].append(round(random.uniform(10, 500), 2))
        products_data["Stock"].append(random.randint(10, 1000))

    products_df = pd.DataFrame(products_data)
    products_df.to_csv(f"{base_path_out}/products.csv", index=False)

    print("Orders")
    # Orders are spread over a pool of 50 dates from the current year.
    order_dates = [fake.date_this_year() for _ in range(50)]
    orders_data = {
        "OrderID": range(1, num_orders + 1),
        "CustomerID": random.choices(customers_data["CustomerID"], k=num_orders),
        "ProductID": random.choices(products_data["ProductID"], k=num_orders),
        "OrderDate": random.choices(order_dates, k=num_orders),
        "Quantity": random.choices(range(1, 20), k=num_orders),
    }
    orders_df = pd.DataFrame(orders_data)

    # TotalAmount = unit price * quantity. The price is looked up by ProductID
    # (not by row position) and the whole column is computed in one vectorized
    # step, which also works when there are no orders at all.
    price_by_product = products_df.set_index("ProductID")["Price"]
    unit_prices = orders_df["ProductID"].map(price_by_product)
    orders_df["TotalAmount"] = (unit_prices * orders_df["Quantity"]).round(2)
    orders_df.to_csv(f"{base_path_out}/orders.csv", index=False)

    return customers_df, products_df, orders_df


# Build the dataset with the default sizes and output directory; the three frames are kept for inspection below.
customers_df, products_df, orders_df = create_orders_data__xlsx()

Customers
Products
Orders


In [11]:
# Show the column layout of each generated table, one table per line.
print(
    f"Columns Customer: {list(customers_df.columns)}",
    f"Columns Products: {list(products_df.columns)}",
    f"Columns Orders: {list(orders_df.columns)}",
    sep="\n",
)

Columns Customer: ['CustomerID', 'CustomerName', 'Email', 'PhoneNumber', 'Address', 'Country']
Columns Products: ['ProductID', 'ProductName', 'Category', 'Price', 'Stock']
Columns Orders: ['OrderID', 'CustomerID', 'ProductID', 'OrderDate', 'Quantity', 'TotalAmount']
