# Sample Data Generation Strategy

## Overview
This notebook outlines the strategic approach for generating sample data for all 14 retail tables, ensuring referential integrity and realistic business scenarios.

## Data Generation Order
Tables must be populated in dependency order to maintain foreign key relationships and data integrity.

---

In [None]:
# Sample Data Generation Strategy
print("🎯 RETAIL SAMPLE DATA GENERATION STRATEGY")
print("=" * 60)


def _phase(order, description, sample_counts):
    """Build one phase entry; the table list mirrors the keys of sample_counts."""
    return {
        "order": order,
        "tables": list(sample_counts),
        "description": description,
        "sample_counts": sample_counts,
    }


# Phases are listed in dependency order: every table appears after the
# tables it references, so foreign keys can always be resolved.
generation_order = {
    "Phase 1 - Foundation Tables (No Dependencies)": _phase(
        1,
        "Lookup/reference tables with no foreign key dependencies",
        {
            "CustomerRelationshipType": 5,  # VIP, Premium, Standard, New, Inactive
            "BrandCategory": 8,             # Electronics, Clothing, Home, etc.
        },
    ),
    "Phase 2 - Core Entity Tables": _phase(
        2,
        "Core business entities that other tables reference",
        {
            "Location": 15,  # Mix of customer addresses, stores, warehouses
            "Brand": 20,     # Various product brands
            "Customer": 50,  # Your approved customer names
        },
    ),
    "Phase 3 - Dependent Entity Tables": _phase(
        3,
        "Entities that depend on Phase 2 tables",
        {
            "Product": 100,           # Your product sample data
            "CustomerTradeName": 10,  # Business customers only
            "CustomerLocation": 75,   # Customers can have multiple addresses
            "CustomerAccount": 60,    # Some customers have multiple accounts
        },
    ),
    "Phase 4 - Order Tables": _phase(
        4,
        "Order headers that reference customers and accounts",
        {
            "Order": 200,  # Mix of completed, pending, cancelled orders
        },
    ),
    "Phase 5 - Order Detail Tables": _phase(
        5,
        "Tables that depend on orders and other entities",
        {
            "OrderLine": 500,               # Multiple products per order
            "OrderStatus": 400,             # Status changes for orders
            "OrderPayment": 180,            # Most orders have payments
            "CustomerAccountLocation": 40,  # Account-specific locations
        },
    ),
}

# Display the strategy: one report per phase, emitted in a single print call
for title, spec in generation_order.items():
    report = [
        f"\n{spec['order']}. {title}",
        f"   Purpose: {spec['description']}",
        f"   Tables: {', '.join(spec['tables'])}",
        "   Sample Counts:",
    ]
    report.extend(
        f"     • {name}: {rows} records"
        for name, rows in spec["sample_counts"].items()
    )
    print("\n".join(report))

# Grand total across every table in every phase
grand_total = sum(
    rows
    for spec in generation_order.values()
    for rows in spec["sample_counts"].values()
)
print(f"\n📊 Total Sample Records: {grand_total}")

## Detailed Table Generation Guidelines

### Phase 1: Foundation Tables

In [None]:
print("📋 PHASE 1 - FOUNDATION TABLES")
print("=" * 40)

# Reference/lookup tables: purpose, example rows and key columns for each.
phase1_details = {
    "CustomerRelationshipType": {
        "purpose": "Customer loyalty/tier classifications",
        "sample_data": [
            "VIP - Premium customers with highest value",
            "Premium - High-value repeat customers",
            "Standard - Regular customers",
            "New - Recently acquired customers",
            "Inactive - Dormant customers",
        ],
        "key_fields": [
            "CustomerRelationshipTypeId",
            "CustomerRelationshipTypeName",
            "CustomerRelationshipTypeDescription",
        ],
    },
    "BrandCategory": {
        "purpose": "Product brand classifications",
        "sample_data": [
            "Electronics - Consumer electronics brands",
            "Clothing - Apparel and fashion brands",
            "Home & Garden - Home improvement brands",
            "Sports & Outdoors - Athletic and outdoor brands",
            "Health & Beauty - Personal care brands",
            "Automotive - Car and vehicle brands",
            "Books & Media - Publishing and media brands",
            "Toys & Games - Children's entertainment brands",
        ],
        "key_fields": [
            "BrandCategoryId",
            "BrandCategoryName",
            "BrandCategoryDescription",
        ],
    },
}

# Print a short summary per table; only the first three examples are shown.
for table_name, spec in phase1_details.items():
    preview = spec["sample_data"][:3]
    summary = [
        f"\n🗂️ {table_name}",
        f"   Purpose: {spec['purpose']}",
        f"   Key Fields: {', '.join(spec['key_fields'])}",
        "   Sample Data Examples:",
        *(f"     • {row}" for row in preview),
        "   💡 Note: Use simple incremental IDs (1, 2, 3, etc.)",
    ]
    print("\n".join(summary))

### Phase 2: Core Entities

In [None]:
print("📋 PHASE 2 - CORE ENTITY TABLES")
print("=" * 40)

# Core entities: purpose, intended record mix and generation notes per table.
phase2_details = {
    "Location": {
        "purpose": "Physical addresses for customers, stores, warehouses",
        "data_mix": {
            "Customer addresses": "60% (9 records)",
            "Store locations": "27% (4 records)",
            "Warehouse/Distribution": "13% (2 records)",
        },
        "key_considerations": [
            "Include realistic addresses with proper coordinates",
            "Mix of residential and commercial locations",
            "Use IsActive = true for all initially",
        ],
    },
    "Brand": {
        "purpose": "Product manufacturers and brand owners",
        "data_mix": {
            "Electronics brands": "25% (5 records)",
            "Clothing brands": "25% (5 records)",
            "Other categories": "50% (10 records)",
        },
        "key_considerations": [
            "Link to BrandCategory via BrandCategoryId",
            "Use realistic brand names or generic equivalents",
            "Set IsActive = true for active brands",
        ],
    },
    "Customer": {
        "purpose": "Individual and business customers",
        "data_mix": {
            "Individual customers": "80% (40 records)",
            "Business customers": "20% (10 records)",
        },
        "key_considerations": [
            "Use your approved customer names",
            "Set CustomerTypeId to distinguish individual vs business",
            "Business customers will get entries in CustomerTradeName",
            "Include realistic contact information",
            "Set IsActive = true, CreatedBy = 'DATA_LOAD'",
        ],
    },
}

# Render each table as: header, purpose, data-mix bullets, consideration bullets.
for table_name, spec in phase2_details.items():
    mix_bullets = [
        f"     • {label}: {share}" for label, share in spec["data_mix"].items()
    ]
    note_bullets = [f"     • {note}" for note in spec["key_considerations"]]
    section = [
        f"\n🗂️ {table_name}",
        f"   Purpose: {spec['purpose']}",
        "   Data Mix:",
        *mix_bullets,
        "   Key Considerations:",
        *note_bullets,
    ]
    print("\n".join(section))

### Phases 3-5: Dependent Tables

In [None]:
print("📋 PHASES 3-5 - DEPENDENT TABLES")
print("="*40)

# Generation notes for every table in phases 3-5. Each entry records:
#   phase        - the generation phase the table belongs to
#   dependencies - tables that must already be populated (foreign key targets)
#   strategy     - short guidance on how to generate the rows
# Entries are listed in phase order so the printed summary reads top-down.
dependency_details = {
    "Product": {
        "phase": 3,
        "dependencies": ["Brand", "BrandCategory"],
        "strategy": "Use your product sample data, link to Brand via BrandId"
    },

    "CustomerTradeName": {
        "phase": 3,
        "dependencies": ["Customer"],
        "strategy": "Only for business customers (20% of Customer records)"
    },

    "CustomerLocation": {
        "phase": 3,
        "dependencies": ["Customer", "Location"],
        "strategy": "Link customers to addresses, some customers have multiple"
    },

    "CustomerAccount": {
        "phase": 3,
        "dependencies": ["Customer"],
        "strategy": "Most customers have 1 account, some have multiple"
    },

    "Order": {
        "phase": 4,
        "dependencies": ["Customer", "CustomerAccount"],
        "strategy": "Mix of order statuses and dates over past 12 months"
    },

    "OrderLine": {
        "phase": 5,
        "dependencies": ["Order", "Product"],
        "strategy": "2-3 products per order on average"
    },

    "OrderStatus": {
        "phase": 5,
        "dependencies": ["Order"],
        "strategy": "Track status changes: Pending → Processing → Shipped → Delivered"
    },

    "OrderPayment": {
        "phase": 5,
        "dependencies": ["Order"],
        "strategy": "90% of orders have payments, 10% pending"
    },

    # Previously missing: the Phase 5 plan lists this table, so it needs an
    # entry here for all dependent tables to be covered.
    # NOTE(review): dependencies are inferred from the table name
    # (account <-> location link) - confirm against the schema.
    "CustomerAccountLocation": {
        "phase": 5,
        "dependencies": ["CustomerAccount", "Location"],
        "strategy": "Link accounts to account-specific locations"
    }
}

# Print one summary per table: phase, prerequisite tables, generation strategy.
for table, details in dependency_details.items():
    print(f"\n📊 {table} (Phase {details['phase']})")
    print(f"   Depends on: {', '.join(details['dependencies'])}")
    print(f"   Strategy: {details['strategy']}")

## Data Generation Tools & Templates

In [None]:
print("🛠️ RECOMMENDED DATA GENERATION APPROACH")
print("=" * 50)

# Guidelines grouped by numbered category; insertion order is display order.
approach = {
    "1. Use Your Existing Data": [
        "✅ Legal approved customer names → Customer table",
        "✅ Product sample data → Product table",
        "✅ Generate realistic IDs using UUID or incremental numbers",
    ],
    "2. Generate Supporting Data": [
        "📍 Realistic addresses for Location table",
        "🏢 Brand names (or use generic: 'Brand_001', 'Brand_002')",
        "📅 Order dates spread over past 12 months",
        "💰 Realistic prices and quantities",
    ],
    "3. Maintain Relationships": [
        "🔗 Every OrderLine must reference valid Order and Product",
        "🔗 Every Order must reference valid Customer",
        "🔗 Every Product must reference valid Brand",
        "🔗 Use consistent date ranges (no future dates)",
    ],
    "4. Quality Guidelines": [
        "✨ Use consistent naming patterns",
        "✨ Include mix of active/inactive records where applicable",
        "✨ Ensure realistic business scenarios",
        "✨ Test data integrity after each phase",
    ],
}

# Each category prints as a heading followed by its indented guidelines.
for heading, items in approach.items():
    print("\n".join([f"\n{heading}", *(f"   {item}" for item in items)]))

# Closing checklist of what a successful data load looks like.
print("\n🎯 SUCCESS CRITERIA:")
for criterion in (
    "All 14 tables populated with sample data",
    "Referential integrity maintained",
    "Realistic business scenarios represented",
    "Ready for analytics and reporting demos",
):
    print(f"   • {criterion}")