In [None]:
!pip install faker pandas numpy timedelta

import random
from faker import Faker
import pandas as pd
import numpy as np
from datetime import timedelta

In [None]:
fake = Faker()

# Seed every random source the notebook draws from.
Faker.seed(42)
random.seed(42)
# NumPy's global RNG backs the np.random.* draws (and pandas' default
# sampling) used in later cells; without this they differ on every run.
np.random.seed(42)

# ========================
# CONFIG
# ========================
# Number of rows to generate for each table.
NUM_CUSTOMERS = 10_000
NUM_SUPPLIERS = 100
NUM_PRODUCTS = 2_500
NUM_ORDERS = 120_000
NUM_ORDER_ITEMS = 450_000

# Inclusive calendar window from which order dates are drawn.
START_DATE = pd.Timestamp("2022-01-01")
END_DATE = pd.Timestamp("2024-12-31")

def generate_10_digit_phone():
    """Return a random 10-character string of digits starting with 7, 8 or 9.

    Draws from the module-level ``random`` generator: one ``choice`` for the
    leading digit, then nine ``randint(0, 9)`` calls for the rest.
    """
    digits = [random.choice(['7', '8', '9'])]
    for _ in range(9):
        digits.append(str(random.randint(0, 9)))
    return ''.join(digits)

In [None]:
# ========================
# CUSTOMERS
# ========================
def _build_customer_row(cid):
    """Return one customer record as a list, in the column order of customers_df."""
    # Drawn first: the last-purchase date below is bounded from below by it.
    # Both bounds are relative offsets ("-4y" / "-1y"), not fixed dates.
    reg_date = fake.date_between(start_date="-4y", end_date="-1y")
    return [
        cid,
        fake.first_name(),
        fake.last_name(),
        fake.unique.email(),
        generate_10_digit_phone(),
        fake.country(),
        fake.state(),
        fake.city(),
        fake.postcode(),
        random.choice(["Premium", "Standard", "Budget"]),
        reg_date,
        fake.date_between(start_date=reg_date, end_date="today"),
        random.choice([1, 1, 1, 0]),  # three of the four options are 1 (active)
    ]


customers = [_build_customer_row(cid) for cid in range(1, NUM_CUSTOMERS + 1)]

customers_df = pd.DataFrame(
    customers,
    columns=[
        "customer_id", "first_name", "last_name", "email", "phone",
        "country", "state", "city", "postal_code", "customer_segment",
        "registration_date", "last_purchase_date", "is_active",
    ],
)

In [None]:
# ========================
# SUPPLIERS
# ========================
def _build_supplier_row(sid):
    """Return one supplier record as a list, in the column order of suppliers_df."""
    return [
        sid,
        fake.company(),
        fake.name(),
        fake.company_email(),
        generate_10_digit_phone(),
        fake.country(),
        fake.city(),
        fake.address(),
        random.choice(["Net 30","Net 45","Net 60"]),
        random.randint(3, 30),  # lead_time_days
        1,                      # is_active: every supplier is written as active
        fake.date_between(start_date="-6y", end_date="-1y"),
    ]


suppliers = [_build_supplier_row(sid) for sid in range(1, NUM_SUPPLIERS + 1)]

suppliers_df = pd.DataFrame(
    suppliers,
    columns=[
        "supplier_id", "supplier_name", "contact_person", "email", "phone",
        "country", "city", "address", "payment_terms", "lead_time_days",
        "is_active", "registration_date",
    ],
)

In [None]:
# ========================
# PRODUCTS
# ========================
# Category -> sub-categories a product can be assigned to.
categories = {
    "Electronics": ["Phones","Laptops","Accessories"],
    "Clothing": ["Men","Women","Kids"],
    "Home": ["Furniture","Kitchen","Decor"],
    "Sports": ["Outdoor","Indoor","Fitness"]
}
category_names = list(categories)


def _build_product_row(pid):
    """Return one product record as a list, in the column order of products_df."""
    category = random.choice(category_names)
    sub_category = random.choice(categories[category])
    cost = round(random.uniform(5, 500), 2)
    # Price is the cost marked up by a random factor between 1.3 and 2.2.
    markup = random.uniform(1.3, 2.2)
    price = round(cost * markup, 2)
    return [
        pid,
        f"{fake.word().title()} {sub_category}",
        category,
        sub_category,
        fake.company(),
        fake.text(50),
        price,
        cost,
        random.randint(0, 500),            # stock_quantity
        random.randint(20, 100),           # reorder_level
        random.randint(1, NUM_SUPPLIERS),  # supplier_id within the generated range
        random.choice([0, 0, 0, 1]),       # is_discontinued: one option in four is 1
        fake.date_between(start_date="-4y", end_date="-1y"),
        fake.date_time_this_year(),
    ]


products = [_build_product_row(pid) for pid in range(1, NUM_PRODUCTS + 1)]

products_df = pd.DataFrame(
    products,
    columns=[
        "product_id", "product_name", "category", "sub_category", "brand",
        "description", "price", "cost", "stock_quantity", "reorder_level",
        "supplier_id", "is_discontinued", "created_date", "last_updated",
    ],
)

In [None]:
# ========================
# ORDERS
# ========================
# One order date per order, drawn with replacement from every day in the
# configured START_DATE..END_DATE window.
order_dates = pd.to_datetime(
    np.random.choice(pd.date_range(START_DATE, END_DATE), NUM_ORDERS)
)


def _build_order_row(oid, order_date):
    """Return one order record as a list, in the column order of orders_df."""
    # Delivery falls 2 to 10 days after the order date.
    delivery_date = order_date + timedelta(days=random.randint(2, 10))
    return [
        oid,
        random.randint(1, NUM_CUSTOMERS),
        order_date.date(),
        delivery_date.date(),
        0,  # total_amount placeholder
        round(random.uniform(0, 30), 2),   # discount_applied
        round(random.uniform(5, 25), 2),   # tax_amount
        round(random.uniform(5, 20), 2),   # shipping_cost
        random.choice(["Pending","Shipped","Delivered","Cancelled","Returned"]),
        random.choice(["Credit Card","Debit Card","PayPal","Cash"]),
        fake.sentence(),
    ]


orders = [
    _build_order_row(oid, order_date)
    for oid, order_date in enumerate(order_dates, start=1)
]

orders_df = pd.DataFrame(
    orders,
    columns=[
        "order_id", "customer_id", "order_date", "delivery_date",
        "total_amount", "discount_applied", "tax_amount", "shipping_cost",
        "order_status", "payment_method", "notes",
    ],
)

In [None]:
# ========================
# ORDER ITEMS
# ========================
# Each item is attached to an order id drawn with replacement, so an order
# can receive any number of items (including none).
order_ids = np.random.choice(range(1, NUM_ORDERS + 1), NUM_ORDER_ITEMS)

# Pick the product for every item in a single vectorised draw instead of
# calling products_df.sample(1) once per row (NUM_ORDER_ITEMS calls).
# Each product row is still chosen uniformly at random with replacement.
product_positions = np.random.randint(0, len(products_df), size=NUM_ORDER_ITEMS)
product_ids = products_df["product_id"].to_numpy()[product_positions].tolist()
unit_prices = products_df["price"].to_numpy()[product_positions].tolist()

order_items = []
for oid, product_id, unit_price in zip(order_ids, product_ids, unit_prices):
    qty = random.randint(1, 5)
    # line_total is quantity x unit price; the discount and tax percentages
    # below are stored alongside it and are not applied to it.
    line_total = round(qty * unit_price, 2)

    order_items.append([
        oid,
        product_id,
        qty,
        unit_price,
        line_total,
        round(random.uniform(0, 15), 2),   # discount_percent
        round(random.uniform(5, 18), 2),   # tax_percent
        random.choice(["Pending","Shipped","Delivered","Cancelled"]),
        fake.word()
    ])

order_items_df = pd.DataFrame(order_items, columns=[
    "order_id","product_id","quantity","unit_price",
    "line_total","discount_percent","tax_percent",
    "item_status","notes"
])

# Sequential surrogate key as the first column.
order_items_df.insert(0, "order_item_id", range(1, NUM_ORDER_ITEMS + 1))

In [None]:
# ========================
# FIX ORDER TOTALS
# ========================
# Replace the placeholder total on every order with the sum of its item
# line totals; orders that received no items get 0.
totals = order_items_df.groupby("order_id", as_index=False)["line_total"].sum()
orders_df = (
    orders_df
    .merge(totals, how="left", on="order_id")
    .assign(total_amount=lambda merged: merged["line_total"].fillna(0))
    .drop(columns=["line_total"])
)

In [None]:
# ========================
# EXPORT
# ========================
# Write each table to a CSV file in the current working directory,
# leaving out the DataFrame index.
customers_df.to_csv("customers.csv", index=False)
suppliers_df.to_csv("suppliers.csv", index=False)
products_df.to_csv("products.csv", index=False)
orders_df.to_csv("orders.csv", index=False)
order_items_df.to_csv("order_items.csv", index=False)

# The check mark was stored as mojibake (UTF-8 bytes read as cp1252).
print("✅ Retail datasets generated successfully")

In [None]:
!ls -lh /content

import os
for f in ["customers.csv","suppliers.csv","products.csv","orders.csv","order_items.csv"]:
    if os.path.exists(f):
        print(f"Ready: {f}")

!zip retail_dataset.zip *.csv
from google.colab import files
files.download("retail_dataset.zip")