# Generate Customer Sample Data

## Overview
This notebook generates sample data for the Customer table with specific field distributions and business rules.

## Output
- File: `C:\temp\samples\Customer_samples.csv`
- Contains all Customer table fields with realistic distributions

---

In [3]:
import pandas as pd
import numpy as np
import random
import os
from datetime import datetime, date, timedelta

# Set seed for reproducible results
random.seed(42)
np.random.seed(42)

# Configuration
SAMPLE_SIZE = 513  # We have 513 approved customer names generated separately. 
OUTPUT_FOLDER = "C:\\temp\\samples"
OUTPUT_FILE = "Customer_samples.csv"

# Create output directory
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

print(f"🎯 GENERATING CUSTOMER SAMPLE DATA")
print(f"Sample Size: {SAMPLE_SIZE}")
print(f"Output: {OUTPUT_FOLDER}\\{OUTPUT_FILE}")
print("="*50)

# 1. Generate CustomerTypeId (70% Individual, 20% Business, 10% Government)
def generate_customer_type_id(num_records):
    customer_types = ['Individual', 'Business', 'Government']
    probabilities = [0.7, 0.2, 0.1]
    return np.random.choice(customer_types, size=num_records, p=probabilities)

# 2. Generate IsActive (80% True, 20% False)
def generate_is_active(num_records):
    random_values = np.random.random(num_records)
    return random_values < 0.8  # 80% True, 20% False

# 3. Generate DateOfBirth (Age 18-105 at March 31, 2025)
def generate_date_of_birth(num_records):
    reference_date = date(2025, 3, 31)
    birth_dates = []
    
    for _ in range(num_records):
        # 90% between ages 18-75, 10% between ages 76-105
        if np.random.random() < 0.9:
            # Age 18-75
            age = np.random.randint(18, 76)
        else:
            # Age 76-105
            age = np.random.randint(76, 106)
        
        # Calculate birth year
        birth_year = reference_date.year - age
        
        # Random month and day
        birth_month = np.random.randint(1, 13)
        
        # Handle different months' day ranges
        if birth_month in [1, 3, 5, 7, 8, 10, 12]:
            max_day = 31
        elif birth_month in [4, 6, 9, 11]:
            max_day = 30
        else:  # February
            # Simple leap year check
            if birth_year % 4 == 0 and (birth_year % 100 != 0 or birth_year % 400 == 0):
                max_day = 29
            else:
                max_day = 28
        
        birth_day = np.random.randint(1, max_day + 1)
        
        try:
            birth_date = date(birth_year, birth_month, birth_day)
            birth_dates.append(birth_date)
        except ValueError:
            # Fallback to January 1st if date is invalid
            birth_dates.append(date(birth_year, 1, 1))
    
    return birth_dates

# 4. Generate CustomerEstablishedDate (Jan 1, 2020 to March 31, 2025, uniform)
def generate_customer_established_date(num_records):
    start_date = date(2020, 1, 1)
    end_date = date(2025, 3, 31)
    
    # Calculate total days
    total_days = (end_date - start_date).days
    
    # Generate random days offset
    random_days = np.random.randint(0, total_days + 1, num_records)
    
    # Convert to dates
    established_dates = [start_date + timedelta(days=int(days)) for days in random_days]
    
    return established_dates

# 5. Generate CreatedBy (90% Sales, 10% Services)
def generate_created_by(num_records):
    created_by_values = ['Sales', 'Services']
    probabilities = [0.9, 0.1]
    return np.random.choice(created_by_values, size=num_records, p=probabilities)

# 6. Generate CustomerRelationshipTypeId (20% VIP, 35% Premium, 45% Standard)
def generate_customer_relationship_type_id(num_records):
    relationship_types = ['VIP', 'Premium', 'Standard']
    probabilities = [0.2, 0.35, 0.45]
    return np.random.choice(relationship_types, size=num_records, p=probabilities)

# 7. Generate PrimaryPhone (100% of records)
def generate_primary_phone(num_records):
    phone_numbers = []
    
    # Valid North American area codes (excluding 800, 866, 877, 888, 900)
    na_area_codes = [
        201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219,
        224, 225, 228, 229, 231, 234, 239, 240, 248, 251, 252, 253, 254, 256, 260, 262, 267,
        269, 270, 276, 281, 301, 302, 303, 304, 305, 307, 308, 309, 310, 312, 313, 314, 315,
        316, 317, 318, 319, 320, 321, 323, 325, 330, 331, 334, 336, 337, 339, 341, 347, 351,
        352, 360, 361, 364, 380, 385, 386, 401, 402, 404, 405, 406, 407, 408, 409, 410, 412,
        413, 414, 415, 417, 419, 423, 424, 425, 430, 432, 434, 435, 440, 443, 458, 463, 464,
        469, 470, 475, 478, 479, 480, 484, 501, 502, 503, 504, 505, 507, 508, 509, 510, 512,
        513, 515, 516, 517, 518, 520, 530, 540, 541, 551, 559, 561, 562, 563, 564, 567, 570,
        571, 573, 574, 575, 580, 585, 586, 601, 602, 603, 605, 606, 607, 608, 609, 610, 612,
        614, 615, 616, 617, 618, 619, 620, 623, 626, 628, 630, 631, 636, 641, 646, 650, 651,
        657, 660, 661, 662, 667, 678, 682, 701, 702, 703, 704, 706, 707, 708, 712, 713, 714,
        715, 716, 717, 718, 719, 720, 724, 725, 727, 731, 732, 734, 737, 740, 747, 754, 757,
        760, 762, 763, 765, 769, 770, 772, 773, 774, 775, 781, 785, 786, 801, 802, 803, 804,
        805, 806, 808, 810, 812, 813, 814, 815, 816, 817, 818, 828, 830, 831, 832, 843, 845,
        847, 848, 850, 856, 857, 858, 859, 860, 862, 863, 864, 865, 870, 872, 878, 901, 903,
        904, 906, 907, 908, 909, 910, 912, 913, 914, 915, 916, 917, 918, 919, 920, 925, 928,
        929, 930, 931, 934, 936, 937, 940, 941, 947, 949, 951, 952, 954, 956, 959, 970, 971,
        972, 973, 978, 979, 980, 984, 985, 989
    ]
    
    # International country codes for non-US numbers
    intl_country_codes = ['+44', '+33', '+49', '+39', '+34', '+61', '+81', '+86', '+91', '+55']
    
    # Counter for sequential international numbers
    intl_counter = 1000000
    
    for i in range(num_records):
        # 85% US numbers, 15% international
        if np.random.random() < 0.85:
            # US number with area code + 555-01XX format
            area_code = np.random.choice(na_area_codes)
            last_two_digits = np.random.randint(0, 100)  # 00-99 for 555-0100 to 555-0199
            phone = f"({area_code}) 555-01{last_two_digits:02d}"
        else:
            # International number with country code + sequential number
            country_code = np.random.choice(intl_country_codes)
            phone = f"{country_code} {intl_counter}"
            intl_counter += 1
        
        phone_numbers.append(phone)
    
    return phone_numbers

# 8. Generate SecondaryPhone (30% of records have secondary phone)
def generate_secondary_phone(num_records):
    phone_numbers = []
    
    # Valid North American area codes (excluding 800, 866, 877, 888, 900)
    na_area_codes = [
        201, 202, 203, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219,
        224, 225, 228, 229, 231, 234, 239, 240, 248, 251, 252, 253, 254, 256, 260, 262, 267,
        269, 270, 276, 281, 301, 302, 303, 304, 305, 307, 308, 309, 310, 312, 313, 314, 315,
        316, 317, 318, 319, 320, 321, 323, 325, 330, 331, 334, 336, 337, 339, 341, 347, 351,
        352, 360, 361, 364, 380, 385, 386, 401, 402, 404, 405, 406, 407, 408, 409, 410, 412,
        413, 414, 415, 417, 419, 423, 424, 425, 430, 432, 434, 435, 440, 443, 458, 463, 464,
        469, 470, 475, 478, 479, 480, 484, 501, 502, 503, 504, 505, 507, 508, 509, 510, 512,
        513, 515, 516, 517, 518, 520, 530, 540, 541, 551, 559, 561, 562, 563, 564, 567, 570,
        571, 573, 574, 575, 580, 585, 586, 601, 602, 603, 605, 606, 607, 608, 609, 610, 612,
        614, 615, 616, 617, 618, 619, 620, 623, 626, 628, 630, 631, 636, 641, 646, 650, 651,
        657, 660, 661, 662, 667, 678, 682, 701, 702, 703, 704, 706, 707, 708, 712, 713, 714,
        715, 716, 717, 718, 719, 720, 724, 725, 727, 731, 732, 734, 737, 740, 747, 754, 757,
        760, 762, 763, 765, 769, 770, 772, 773, 774, 775, 781, 785, 786, 801, 802, 803, 804,
        805, 806, 808, 810, 812, 813, 814, 815, 816, 817, 818, 828, 830, 831, 832, 843, 845,
        847, 848, 850, 856, 857, 858, 859, 860, 862, 863, 864, 865, 870, 872, 878, 901, 903,
        904, 906, 907, 908, 909, 910, 912, 913, 914, 915, 916, 917, 918, 919, 920, 925, 928,
        929, 930, 931, 934, 936, 937, 940, 941, 947, 949, 951, 952, 954, 956, 959, 970, 971,
        972, 973, 978, 979, 980, 984, 985, 989
    ]
    
    # International country codes for non-US numbers
    intl_country_codes = ['+44', '+33', '+49', '+39', '+34', '+61', '+81', '+86', '+91', '+55']
    
    # Counter for sequential international numbers (start higher to avoid duplicates with primary)
    intl_counter = 2000000
    
    for i in range(num_records):
        # 30% chance of having a secondary phone
        if np.random.random() < 0.3:
            # 85% US numbers, 15% international
            if np.random.random() < 0.85:
                # US number with area code + 555-01XX format
                area_code = np.random.choice(na_area_codes)
                last_two_digits = np.random.randint(0, 100)  # 00-99 for 555-0100 to 555-0199
                phone = f"({area_code}) 555-01{last_two_digits:02d}"
            else:
                # International number with country code + sequential number
                country_code = np.random.choice(intl_country_codes)
                phone = f"{country_code} {intl_counter}"
                intl_counter += 1
            phone_numbers.append(phone)
        else:
            phone_numbers.append(None)  # No secondary phone
    
    return phone_numbers

print("🔄 Generating field data...")

# Generate all fields
customer_type_ids = generate_customer_type_id(SAMPLE_SIZE)
is_active_values = generate_is_active(SAMPLE_SIZE)
birth_dates = generate_date_of_birth(SAMPLE_SIZE)
established_dates = generate_customer_established_date(SAMPLE_SIZE)
created_by_values = generate_created_by(SAMPLE_SIZE)
relationship_type_ids = generate_customer_relationship_type_id(SAMPLE_SIZE)
primary_phones = generate_primary_phone(SAMPLE_SIZE)
secondary_phones = generate_secondary_phone(SAMPLE_SIZE)

# Create DataFrame
df = pd.DataFrame({
    'CustomerTypeId': customer_type_ids,
    'IsActive': is_active_values,
    'DateOfBirth': birth_dates,
    'CustomerEstablishedDate': established_dates,
    'CreatedBy': created_by_values,
    'CustomerRelationshipTypeId': relationship_type_ids,
    'PrimaryPhone': primary_phones,
    'SecondaryPhone': secondary_phones
})

print("✅ Data generation complete!")

🎯 GENERATING CUSTOMER SAMPLE DATA
Sample Size: 513
Output: C:\temp\samples\Customer_samples.csv
🔄 Generating field data...
✅ Data generation complete!


In [4]:
# Display distributions and statistics
print("\n📊 DATA DISTRIBUTION ANALYSIS")
print("="*50)

# CustomerTypeId Distribution
print("\n🎯 CustomerTypeId Distribution:")
type_dist = df['CustomerTypeId'].value_counts()
type_pct = df['CustomerTypeId'].value_counts(normalize=True) * 100
for ctype in ['Individual', 'Business', 'Government']:
    count = type_dist.get(ctype, 0)
    percent = type_pct.get(ctype, 0)
    print(f"  {ctype:12}: {count:3d} ({percent:5.1f}%)")

# IsActive Distribution
print("\n🎯 IsActive Distribution:")
active_dist = df['IsActive'].value_counts()
active_pct = df['IsActive'].value_counts(normalize=True) * 100
for active in [True, False]:
    count = active_dist.get(active, 0)
    percent = active_pct.get(active, 0)
    print(f"  {str(active):5}: {count:3d} ({percent:5.1f}%)")

# Age Distribution Analysis
print("\n🎯 Age Distribution (as of March 31, 2025):")
reference_date = date(2025, 3, 31)
ages = [(reference_date - birth_date).days // 365 for birth_date in df['DateOfBirth']]
df_temp = pd.DataFrame({'Age': ages})

age_18_75 = len([age for age in ages if 18 <= age <= 75])
age_76_105 = len([age for age in ages if 76 <= age <= 105])
print(f"  Age 18-75 : {age_18_75:3d} ({age_18_75/len(ages)*100:5.1f}%)")
print(f"  Age 76-105: {age_76_105:3d} ({age_76_105/len(ages)*100:5.1f}%)")
print(f"  Min Age   : {min(ages)}")
print(f"  Max Age   : {max(ages)}")

# Date Range Analysis
print("\n🎯 CustomerEstablishedDate Range:")
min_date = df['CustomerEstablishedDate'].min()
max_date = df['CustomerEstablishedDate'].max()
print(f"  Earliest: {min_date}")
print(f"  Latest  : {max_date}")

# CreatedBy Distribution
print("\n🎯 CreatedBy Distribution:")
created_dist = df['CreatedBy'].value_counts()
created_pct = df['CreatedBy'].value_counts(normalize=True) * 100
for creator in ['Sales', 'Services']:
    count = created_dist.get(creator, 0)
    percent = created_pct.get(creator, 0)
    print(f"  {creator:8}: {count:3d} ({percent:5.1f}%)")

# CustomerRelationshipTypeId Distribution
print("\n🎯 CustomerRelationshipTypeId Distribution:")
relationship_dist = df['CustomerRelationshipTypeId'].value_counts()
relationship_pct = df['CustomerRelationshipTypeId'].value_counts(normalize=True) * 100
for rel_type in ['VIP', 'Premium', 'Standard']:
    count = relationship_dist.get(rel_type, 0)
    percent = relationship_pct.get(rel_type, 0)
    print(f"  {rel_type:8}: {count:3d} ({percent:5.1f}%)")

# Phone Number Distribution Analysis
print("\n🎯 PrimaryPhone Distribution:")
us_primary = len([p for p in df['PrimaryPhone'] if p.startswith('(')])
intl_primary = len([p for p in df['PrimaryPhone'] if p.startswith('+')])
print(f"  US Numbers      : {us_primary:3d} ({us_primary/len(df)*100:5.1f}%)")
print(f"  International   : {intl_primary:3d} ({intl_primary/len(df)*100:5.1f}%)")

print("\n🎯 SecondaryPhone Distribution:")
secondary_count = len([p for p in df['SecondaryPhone'] if p is not None])
secondary_null = len([p for p in df['SecondaryPhone'] if p is None])
us_secondary = len([p for p in df['SecondaryPhone'] if p is not None and p.startswith('(')])
intl_secondary = len([p for p in df['SecondaryPhone'] if p is not None and p.startswith('+')])
print(f"  Has Secondary   : {secondary_count:3d} ({secondary_count/len(df)*100:5.1f}%)")
print(f"  No Secondary    : {secondary_null:3d} ({secondary_null/len(df)*100:5.1f}%)")
if secondary_count > 0:
    print(f"  US Numbers      : {us_secondary:3d} ({us_secondary/secondary_count*100:5.1f}% of secondary)")
    print(f"  International   : {intl_secondary:3d} ({intl_secondary/secondary_count*100:5.1f}% of secondary)")

# Display sample records
print(f"\n📋 First 10 Sample Records:")
print(df.head(10).to_string(index=False))

# Save to CSV
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
df.to_csv(output_path, index=False)

print(f"\n💾 SAVED TO: {output_path}")
print(f"📊 Total Records: {len(df)}")
print(f"📈 Columns: {', '.join(df.columns)}")
print("\n✅ Customer sample data generation complete!")


📊 DATA DISTRIBUTION ANALYSIS

🎯 CustomerTypeId Distribution:
  Individual  : 353 ( 68.8%)
  Business    : 102 ( 19.9%)
  Government  :  58 ( 11.3%)

🎯 IsActive Distribution:
  True : 415 ( 80.9%)
  False:  98 ( 19.1%)

🎯 Age Distribution (as of March 31, 2025):
  Age 18-75 : 459 ( 89.5%)
  Age 76-105:  51 (  9.9%)
  Min Age   : 17
  Max Age   : 105

🎯 CustomerEstablishedDate Range:
  Earliest: 2020-01-01
  Latest  : 2025-03-24

🎯 CreatedBy Distribution:
  Sales   : 461 ( 89.9%)
  Services:  52 ( 10.1%)

🎯 CustomerRelationshipTypeId Distribution:
  VIP     :  86 ( 16.8%)
  Premium : 177 ( 34.5%)
  Standard: 250 ( 48.7%)

🎯 PrimaryPhone Distribution:
  US Numbers      : 451 ( 87.9%)
  International   :  62 ( 12.1%)

🎯 SecondaryPhone Distribution:
  Has Secondary   : 157 ( 30.6%)
  No Secondary    : 356 ( 69.4%)
  US Numbers      : 140 ( 89.2% of secondary)
  International   :  17 ( 10.8% of secondary)

📋 First 10 Sample Records:
CustomerTypeId  IsActive DateOfBirth CustomerEstablishedDa