# Sales Performance Dashboard - Data Generation

**Purpose:** Generate a realistic retail sales dataset with 50,000+ transactions for comprehensive business analysis

**Dataset Features:**
- 50,000 transaction records (Jan 2023 - Dec 2025)
- 12 product categories
- Multi-channel sales (online/offline)
- Customer segmentation data
- Regional distribution

---

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Define business parameters
NUM_RECORDS = 50000

# Product categories (12 categories as per requirement)
CATEGORIES = [
    'Electronics', 'Clothing', 'Home & Kitchen', 'Sports & Outdoors',
    'Beauty & Personal Care', 'Books', 'Toys & Games', 'Grocery',
    'Automotive', 'Health & Wellness', 'Office Supplies', 'Pet Supplies'
]

# Regions
REGIONS = ['North', 'South', 'East', 'West', 'Central']

# Sales channels
CHANNELS = ['Online', 'Offline']

# Customer types
CUSTOMER_TYPES = ['New', 'Returning']

print(f"Generating {NUM_RECORDS:,} transaction records...")

Generating 50,000 transaction records...


In [3]:
# Generate date range (Jan 2023 - Dec 2025)
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 12, 31)
date_range = (end_date - start_date).days

# Generate transaction dates with realistic patterns (more recent transactions)
dates = []
for _ in range(NUM_RECORDS):
    # Weighted towards recent dates
    random_days = int(np.random.beta(2, 5) * date_range)
    dates.append(start_date + timedelta(days=random_days))

print(f"Date range: {min(dates).date()} to {max(dates).date()}")

Date range: 2023-01-01 to 2025-09-02


In [4]:
# Generate customer IDs (simulating 10,000 unique customers)
NUM_CUSTOMERS = 10000
customer_ids = np.random.randint(1001, 1001 + NUM_CUSTOMERS, NUM_RECORDS)

# Generate product IDs (500 unique products)
NUM_PRODUCTS = 500
product_ids = np.random.randint(5001, 5001 + NUM_PRODUCTS, NUM_RECORDS)

print(f"Unique customers: {len(np.unique(customer_ids)):,}")
print(f"Unique products: {len(np.unique(product_ids)):,}")

Unique customers: 9,939
Unique products: 500


In [5]:
# Generate product categories with realistic distribution
category_weights = [0.15, 0.12, 0.10, 0.08, 0.09, 0.07, 0.08, 0.09, 0.06, 0.07, 0.05, 0.04]
categories = np.random.choice(CATEGORIES, NUM_RECORDS, p=category_weights)

# Generate regions
regions = np.random.choice(REGIONS, NUM_RECORDS, p=[0.22, 0.20, 0.20, 0.20, 0.18])

# Generate sales channels (60% online, 40% offline)
sales_channels = np.random.choice(CHANNELS, NUM_RECORDS, p=[0.60, 0.40])

# Generate customer types (30% new, 70% returning)
customer_types = np.random.choice(CUSTOMER_TYPES, NUM_RECORDS, p=[0.30, 0.70])

print("Categorical variables generated")

Categorical variables generated


In [6]:
# Generate quantities (1-10 items per transaction)
quantities = np.random.choice(range(1, 11), NUM_RECORDS, p=[0.35, 0.25, 0.15, 0.10, 0.06, 0.04, 0.02, 0.01, 0.01, 0.01])

# Generate unit prices based on category
category_price_ranges = {
    'Electronics': (50, 2000),
    'Clothing': (15, 200),
    'Home & Kitchen': (20, 500),
    'Sports & Outdoors': (25, 800),
    'Beauty & Personal Care': (10, 150),
    'Books': (8, 60),
    'Toys & Games': (10, 150),
    'Grocery': (5, 100),
    'Automotive': (30, 1000),
    'Health & Wellness': (15, 300),
    'Office Supplies': (5, 200),
    'Pet Supplies': (10, 150)
}

unit_prices = []
for cat in categories:
    price_range = category_price_ranges[cat]
    price = round(np.random.uniform(price_range[0], price_range[1]), 2)
    unit_prices.append(price)

unit_prices = np.array(unit_prices)

print(f"Price range: ${unit_prices.min():.2f} - ${unit_prices.max():.2f}")

Price range: $5.00 - $1999.93


In [7]:
# Generate discounts (0%, 5%, 10%, 15%, 20%, 25%)
discount_options = [0, 0.05, 0.10, 0.15, 0.20, 0.25]
discount_weights = [0.40, 0.25, 0.15, 0.10, 0.07, 0.03]
discounts = np.random.choice(discount_options, NUM_RECORDS, p=discount_weights)

# Calculate revenue
revenue = quantities * unit_prices * (1 - discounts)
revenue = np.round(revenue, 2)

print(f"Total revenue: ${revenue.sum():,.2f}")
print(f"Average order value: ${revenue.mean():.2f}")

Total revenue: $36,754,735.77
Average order value: $735.09


In [8]:
# Create the master dataset
sales_data = pd.DataFrame({
    'transaction_id': range(100001, 100001 + NUM_RECORDS),
    'order_date': dates,
    'customer_id': customer_ids,
    'product_id': product_ids,
    'product_category': categories,
    'quantity': quantities,
    'unit_price': unit_prices,
    'discount': discounts,
    'revenue': revenue,
    'region': regions,
    'sales_channel': sales_channels,
    'customer_type': customer_types
})

# Sort by date
sales_data = sales_data.sort_values('order_date').reset_index(drop=True)

print(f"\nDataset created: {sales_data.shape[0]:,} rows Ã— {sales_data.shape[1]} columns")
sales_data.head(10)


Dataset created: 50,000 rows Ã— 12 columns


Unnamed: 0,transaction_id,order_date,customer_id,product_id,product_category,quantity,unit_price,discount,revenue,region,sales_channel,customer_type
0,106173,2023-01-01,9243,5417,Health & Wellness,2,266.26,0.1,479.27,Central,Online,Returning
1,128560,2023-01-03,10369,5388,Sports & Outdoors,2,559.82,0.0,1119.64,Central,Online,Returning
2,147363,2023-01-03,1059,5499,Books,1,52.48,0.2,41.98,South,Online,New
3,137886,2023-01-03,5532,5146,Clothing,1,182.1,0.0,182.1,North,Offline,Returning
4,118475,2023-01-04,6144,5368,Beauty & Personal Care,5,121.21,0.15,515.14,West,Online,Returning
5,133074,2023-01-04,8784,5355,Books,6,9.79,0.0,58.74,East,Online,Returning
6,138465,2023-01-04,7989,5427,Electronics,4,923.06,0.05,3507.63,North,Online,New
7,115657,2023-01-04,8979,5323,Beauty & Personal Care,1,123.96,0.05,117.76,Central,Offline,Returning
8,130152,2023-01-04,10108,5437,Health & Wellness,4,64.86,0.05,246.47,North,Online,Returning
9,107128,2023-01-04,4470,5043,Books,2,45.7,0.1,82.26,Central,Online,Returning


In [9]:
# Data quality check
print("=" * 60)
print("DATA QUALITY REPORT")
print("=" * 60)
print(f"\nTotal Records: {len(sales_data):,}")
print(f"Missing Values: {sales_data.isnull().sum().sum()}")
print(f"Duplicate Transactions: {sales_data['transaction_id'].duplicated().sum()}")
print(f"\nDate Range: {sales_data['order_date'].min().date()} to {sales_data['order_date'].max().date()}")
print(f"Unique Customers: {sales_data['customer_id'].nunique():,}")
print(f"Unique Products: {sales_data['product_id'].nunique():,}")
print(f"\nTotal Revenue: ${sales_data['revenue'].sum():,.2f}")
print(f"Average Order Value: ${sales_data['revenue'].mean():.2f}")
print(f"Median Order Value: ${sales_data['revenue'].median():.2f}")

DATA QUALITY REPORT

Total Records: 50,000
Missing Values: 0
Duplicate Transactions: 0

Date Range: 2023-01-01 to 2025-09-02
Unique Customers: 9,939
Unique Products: 500

Total Revenue: $36,754,735.77
Average Order Value: $735.09
Median Order Value: $253.31


In [10]:
# Display summary statistics
print("\n" + "=" * 60)
print("CATEGORY DISTRIBUTION")
print("=" * 60)
print(sales_data['product_category'].value_counts())

print("\n" + "=" * 60)
print("CHANNEL DISTRIBUTION")
print("=" * 60)
print(sales_data['sales_channel'].value_counts())

print("\n" + "=" * 60)
print("CUSTOMER TYPE DISTRIBUTION")
print("=" * 60)
print(sales_data['customer_type'].value_counts())


CATEGORY DISTRIBUTION
product_category
Electronics               7426
Clothing                  6155
Home & Kitchen            5008
Beauty & Personal Care    4441
Grocery                   4428
Sports & Outdoors         4074
Toys & Games              3975
Books                     3489
Health & Wellness         3477
Automotive                3091
Office Supplies           2522
Pet Supplies              1914
Name: count, dtype: int64

CHANNEL DISTRIBUTION
sales_channel
Online     30030
Offline    19970
Name: count, dtype: int64

CUSTOMER TYPE DISTRIBUTION
customer_type
Returning    35110
New          14890
Name: count, dtype: int64


In [11]:
# Save the raw dataset
output_path = '../data/sales_data_raw.csv'
sales_data.to_csv(output_path, index=False)
print(f"\nâœ“ Raw dataset saved to: {output_path}")
print(f"âœ“ File size: {sales_data.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print("\nðŸŽ¯ Dataset ready for SQL analysis and Python EDA!")


âœ“ Raw dataset saved to: ../data/sales_data_raw.csv
âœ“ File size: 13.49 MB

ðŸŽ¯ Dataset ready for SQL analysis and Python EDA!
