# Generate Product Sample Data

## Overview
This notebook generates sample data for the Product table with specific field distributions and business rules.

## Output
- File: `C:\temp\samples\Product_samples.csv`
- Contains selected Product table fields with realistic distributions

---

In [None]:
# filepath: c:\Repos\Code\SampleDataPrep\src\notebooks\data\Generate_Product_Samples.ipynb
import pandas as pd
import numpy as np
import random
import os
from datetime import datetime, date, timedelta

# Pin both random sources so a fresh run reproduces the same dataset
np.random.seed(42)
random.seed(42)

# Configuration
SAMPLE_SIZE = 295  # how many product rows to produce
OUTPUT_FOLDER = "C:\\temp\\samples"
OUTPUT_FILE = "Product_samples.csv"
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)

# Make sure the destination folder is present before anything is written
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Start clean: drop the CSV left behind by a previous run, if there is one
if os.path.exists(output_path):
    os.remove(output_path)
    print(f"🗑️ Removed existing file: {output_path}")

# Run banner, one item per line
print(
    "🎯 GENERATING PRODUCT SAMPLE DATA",
    f"Sample Size: {SAMPLE_SIZE}",
    f"Output: {OUTPUT_FOLDER}\\{OUTPUT_FILE}",
    "=" * 50,
    sep="\n",
)

# Placeholder functions - will be updated based on your field requirements
def generate_product_id(num_records):
    """Build sequential product IDs of the form ``P-001``, ``P-002``, ...

    Numbers are zero-padded to at least three digits and start at 1.
    Returns a list with ``num_records`` strings.
    """
    ids = []
    for sequence_no in range(1, num_records + 1):
        ids.append("P-{:03d}".format(sequence_no))
    return ids

def generate_product_name(num_records):
    """Build placeholder product names ``Product 1`` .. ``Product N``.

    Returns a list with ``num_records`` strings, numbered from 1.
    """
    return list(map("Product {}".format, range(1, num_records + 1)))

# 3. Generate BrandName (70% Fabrikam, 30% Alpine Ski House)
def generate_brand_name(num_records):
    """Draw a BrandName for every record.

    Each record is sampled independently via ``np.random.choice`` with
    weights 0.7 for 'Fabrikam' and 0.3 for 'Alpine Ski House'.
    Returns a NumPy array of ``num_records`` brand strings.
    """
    # Insertion order of the dict fixes the label order handed to the sampler
    brand_weights = {'Fabrikam': 0.7, 'Alpine Ski House': 0.3}
    return np.random.choice(
        list(brand_weights.keys()),
        size=num_records,
        p=list(brand_weights.values()),
    )

# 4. Generate CreatedDate (Jan 1, 2020 to Dec 31, 2020)
def generate_created_date(num_records):
    """Draw a CreatedDate for every record, uniformly over the days of 2020.

    Returns a list of ``num_records`` ``datetime.date`` values between
    Jan 1, 2020 and Dec 31, 2020 (both ends included).
    """
    first_day = date(2020, 1, 1)
    last_day = date(2020, 12, 31)

    # randint excludes its upper bound, so add one to keep Dec 31 reachable
    exclusive_upper = (last_day - first_day).days + 1
    day_offsets = np.random.randint(0, exclusive_upper, num_records)

    # int() turns the NumPy integer into a plain int for timedelta
    return list(map(lambda offset: first_day + timedelta(days=int(offset)), day_offsets))

# 5. Generate SellStartDate (Jan 1, 2020 to Sept 30, 2021, must be later than CreatedDate)
def generate_sell_start_date(created_dates):
    """Generate a SellStartDate that is strictly later than each CreatedDate.

    Every date is drawn uniformly (in whole days) from the window
    Jan 1, 2020 - Sept 30, 2021, restricted to the days *after* the
    record's CreatedDate.

    Args:
        created_dates: iterable of ``datetime.date``, one per product record.

    Returns:
        list of ``datetime.date``, same length and order as ``created_dates``.

    Note:
        When CreatedDate falls on or after Sept 30, 2021 there is no day in
        the window that is later than it; the window end is returned for such
        records, so the ordering rule cannot hold for them.
    """
    sell_start_min = date(2020, 1, 1)
    sell_start_max = date(2021, 9, 30)
    one_day = timedelta(days=1)

    sell_start_dates = []
    for created_date in created_dates:
        # Begin the draw window the day AFTER CreatedDate: using CreatedDate
        # itself allowed an offset of 0 and produced SellStartDate == CreatedDate
        earliest = max(created_date + one_day, sell_start_min)

        if earliest >= sell_start_max:
            # Nothing (or only the last day) left in the window: clamp to its end
            sell_start_dates.append(sell_start_max)
            continue

        days_range = (sell_start_max - earliest).days
        # randint's upper bound is exclusive; +1 keeps sell_start_max reachable
        random_days = np.random.randint(0, days_range + 1)
        sell_start_dates.append(earliest + timedelta(days=int(random_days)))

    return sell_start_dates

# 6. Generate SellEndDate (each product has a 5% chance of being discontinued, so roughly 5% get an end date; Jan 1, 2022 to March 31, 2023)
def generate_sell_end_date(num_records):
    """Draw a SellEndDate for the small share of products that are discontinued.

    Each record independently has a 5% chance of receiving an end date drawn
    uniformly from Jan 1, 2022 - March 31, 2023 (both ends included). All
    other records get an empty string, which marks a still-active product.

    Returns a list of ``num_records`` items, each a ``datetime.date`` or "".
    """
    window_start = date(2022, 1, 1)
    window_end = date(2023, 3, 31)
    window_days = (window_end - window_start).days

    def draw_one():
        # One uniform draw decides the fate; only discontinued products
        # consume a second draw for the actual date
        if np.random.random() >= 0.05:
            return ""
        offset = np.random.randint(0, window_days + 1)
        return window_start + timedelta(days=int(offset))

    return [draw_one() for _ in range(num_records)]

# 7. Generate ProductStatus (96% active, 3% inactive, 1% discontinued)
def generate_product_status(num_records):
    """Draw a ProductStatus for every record.

    Each record is sampled independently via ``np.random.choice`` with
    weights 0.96 'active', 0.03 'inactive' and 0.01 'discontinued'.
    Returns a NumPy array of ``num_records`` status strings.
    """
    weighted_statuses = (
        ('active', 0.96),
        ('inactive', 0.03),
        ('discontinued', 0.01),
    )
    labels = [label for label, _ in weighted_statuses]
    weights = [weight for _, weight in weighted_statuses]
    return np.random.choice(labels, size=num_records, p=weights)

print("🔄 Generating field data...")

# Deterministic fields: these generators draw nothing from the random stream
product_ids = generate_product_id(SAMPLE_SIZE)
product_names = generate_product_name(SAMPLE_SIZE)

# Random fields: keep this call order. Every generator below pulls from the
# same seeded NumPy stream, so reordering the calls would change the values.
brand_names = generate_brand_name(SAMPLE_SIZE)
created_dates = generate_created_date(SAMPLE_SIZE)
sell_start_dates = generate_sell_start_date(created_dates)
sell_end_dates = generate_sell_end_date(SAMPLE_SIZE)
product_statuses = generate_product_status(SAMPLE_SIZE)

# Assemble the table; keyword order determines the column order
df = pd.DataFrame(dict(
    ProductId=product_ids,
    ProductName=product_names,
    BrandName=brand_names,
    CreatedDate=created_dates,
    SellStartDate=sell_start_dates,
    SellEndDate=sell_end_dates,
    ProductStatus=product_statuses,
))

print("✅ Data generation complete!")

🗑️ Removed existing file: C:\temp\samples\Product_samples.csv
🎯 GENERATING PRODUCT SAMPLE DATA
Sample Size: 295
Output: C:\temp\samples\Product_samples.csv
🔄 Generating field data...
✅ Data generation complete!


In [None]:
# filepath: c:\Repos\Code\SampleDataPrep\src\notebooks\data\Generate_Product_Samples.ipynb
# Display distributions and statistics
def _print_distribution(counts, percents, categories):
    """Print one aligned ``label: count (pct%)`` line per category.

    Args:
        counts: Series of absolute counts indexed by category label.
        percents: Series of percentage shares indexed by category label.
        categories: labels to report, in display order; a label missing from
            the Series is reported as 0.
    """
    # Pad to the longest label so the colons line up; a fixed width shorter
    # than 'Alpine Ski House' used to push that row out of alignment
    label_width = max(len(label) for label in categories)
    for label in categories:
        count = counts.get(label, 0)
        percent = percents.get(label, 0)
        print(f"  {label:{label_width}}: {count:3d} ({percent:5.1f}%)")


print("\n📊 DATA DISTRIBUTION ANALYSIS")
print("="*50)

# BrandName Distribution
print("\n🎯 BrandName Distribution:")
brand_dist = df['BrandName'].value_counts()
brand_pct = df['BrandName'].value_counts(normalize=True) * 100
_print_distribution(brand_dist, brand_pct, ['Fabrikam', 'Alpine Ski House'])

# ProductStatus Distribution
print("\n🎯 ProductStatus Distribution:")
status_dist = df['ProductStatus'].value_counts()
status_pct = df['ProductStatus'].value_counts(normalize=True) * 100
_print_distribution(status_dist, status_pct, ['active', 'inactive', 'discontinued'])

# Date Range Analysis
created_col = df['CreatedDate']
min_created, max_created = created_col.min(), created_col.max()
print("\n🎯 CreatedDate Range:")
print(f"  Earliest: {min_created}")
print(f"  Latest  : {max_created}")

sell_start_col = df['SellStartDate']
min_sell_start, max_sell_start = sell_start_col.min(), sell_start_col.max()
print("\n🎯 SellStartDate Range:")
print(f"  Earliest: {min_sell_start}")
print(f"  Latest  : {max_sell_start}")

# Business rule check: a product cannot start selling on or before the day it
# was created, so every row selected here breaks the rule
violates_order = sell_start_col <= created_col
invalid_dates = df[violates_order]
print(f"  Invalid dates (SellStart <= Created): {len(invalid_dates)}")

print("\n🎯 SellEndDate Distribution:")
# An empty string in SellEndDate marks a product that is still being sold
has_end_date = df['SellEndDate'] != ""
discontinued_count = int(has_end_date.sum())
active_count = int((~has_end_date).sum())
total_rows = len(df)
print(f"  Discontinued: {discontinued_count:3d} ({discontinued_count/total_rows*100:5.1f}%)")
print(f"  Active      : {active_count:3d} ({active_count/total_rows*100:5.1f}%)")

# The end-date range only makes sense when at least one product has one
if discontinued_count > 0:
    discontinued_df = df[has_end_date]
    end_dates = discontinued_df['SellEndDate']
    min_end, max_end = end_dates.min(), end_dates.max()
    print(f"  End Date Range: {min_end} to {max_end}")

# Plain-text preview of the first rows
preview_rows = df.head(10)
print("\n📋 First 10 Sample Records:")
print(preview_rows.to_string(index=False))

# Save to CSV (row index is not written)
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
df.to_csv(output_path, index=False)

# Closing summary, one item per line
column_names = ', '.join(df.columns)
print(
    f"\n💾 SAVED TO: {output_path}",
    f"📊 Total Records: {len(df)}",
    f"📈 Columns: {column_names}",
    "\n✅ Product sample data generation complete!",
    sep="\n",
)


📊 DATA DISTRIBUTION ANALYSIS

🎯 BrandName Distribution:
  Fabrikam       : 205 ( 69.5%)
  Alpine Ski House:  90 ( 30.5%)

🎯 CreatedDate Range:
  Earliest: 2020-01-01
  Latest  : 2020-12-30

🎯 SellStartDate Range:
  Earliest: 2020-02-09
  Latest  : 2021-09-27
  Invalid dates (SellStart <= Created): 2

🎯 SellEndDate Distribution:
  Discontinued:  11 (  3.7%)
  Active      : 284 ( 96.3%)
  End Date Range: 2022-01-14 to 2023-03-11

📋 First 10 Sample Records:
ProductId ProductName        BrandName CreatedDate SellStartDate SellEndDate
    P-001   Product 1         Fabrikam  2020-09-20    2021-05-17            
    P-002   Product 2 Alpine Ski House  2020-02-22    2021-09-01            
    P-003   Product 3 Alpine Ski House  2020-02-29    2021-08-28            
    P-004   Product 4         Fabrikam  2020-04-17    2020-12-25            
    P-005   Product 5         Fabrikam  2020-01-05    2021-01-28            
    P-006   Product 6         Fabrikam  2020-04-12    2020-12-23            
 