# Generate text and numbers

In [6]:
import pandas as pd
import numpy as np
import random
import os

# Set seed for reproducible results
random.seed(42)
np.random.seed(42)

# Generate CustomerTypeId column
def generate_customer_type_id(num_records=100):
    """
    Draw random CustomerTypeId labels.

    Each label is sampled with these weights:
    - 'Individual': 0.7
    - 'Business':   0.2
    - 'Government': 0.1

    Args:
        num_records: How many labels to draw (default 100).

    Returns:
        NumPy array holding the drawn labels.
    """
    # Label -> weight. Insertion order fixes the order handed to NumPy,
    # which keeps seeded output stable.
    weighted_types = {
        'Individual': 0.7,
        'Business': 0.2,
        'Government': 0.1,
    }

    return np.random.choice(
        list(weighted_types),
        size=num_records,
        p=list(weighted_types.values()),
    )

# Generate IsActive column
def generate_is_active(num_records=100):
    """
    Draw random IsActive flags.

    A flag is True when its uniform draw from [0, 1) falls below 0.8,
    so roughly 80% of the flags are True and 20% are False.

    Args:
        num_records: How many flags to draw (default 100).

    Returns:
        Boolean NumPy array holding the drawn flags.
    """
    active_share = 0.8  # target fraction of True values

    # One uniform draw per record; comparing against the share yields the flag.
    uniform_draws = np.random.random(num_records)
    return uniform_draws < active_share

# Example run: draw 500 CustomerTypeId labels and 500 IsActive flags.
# The call order matters for reproducibility: both helpers consume the
# same NumPy global random stream.
sample_size = 500
customer_types = generate_customer_type_id(sample_size)
is_active_values = generate_is_active(sample_size)

# Combine both columns into one DataFrame for reporting and export.
df = pd.DataFrame({
    'CustomerTypeId': customer_types,
    'IsActive': is_active_values
})

# No single count can exceed sample_size, so its digit count is a safe
# column width for every count printed below. This keeps both tables
# aligned (the customer-type table previously used a narrower width than
# the IsActive table and misaligned two-digit counts).
count_width = len(str(sample_size))

# Report how often each customer type was drawn.
print("🎯 CUSTOMER TYPE DISTRIBUTION")
print("="*40)
distribution = df['CustomerTypeId'].value_counts()
percentages = df['CustomerTypeId'].value_counts(normalize=True) * 100

for customer_type in ['Individual', 'Business', 'Government']:
    # .get(..., 0) covers a type that happened not to be drawn at all.
    count = distribution.get(customer_type, 0)
    percent = percentages.get(customer_type, 0)
    print(f"{customer_type:12}: {count:{count_width}d} records ({percent:5.1f}%)")

# Report the split between active and inactive records.
print("\n🎯 IS ACTIVE DISTRIBUTION")
print("="*30)
active_distribution = df['IsActive'].value_counts()
active_percentages = df['IsActive'].value_counts(normalize=True) * 100

for active_value in [True, False]:
    count = active_distribution.get(active_value, 0)
    percent = active_percentages.get(active_value, 0)
    print(f"{str(active_value):5}: {count:{count_width}d} records ({percent:5.1f}%)")

print(f"\nTotal records: {sample_size}")

# Show the first 15 rows as a sample of the generated data.
print("\n📋 First 15 generated values:")
print(df.head(15)[['CustomerTypeId', 'IsActive']].to_string(index=False))

# Save to CSV for use in Customer table
output_path = "C:\\temp\\retail_headers\\CustomerTypeId_sample.csv"

# Create the target directory if it doesn't exist. The dirname can be empty
# (for example when the path has no separator recognised by the current OS),
# and os.makedirs("") raises FileNotFoundError, so skip it in that case.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"\n💾 Saved to: {output_path}")
# List the columns actually written so the message cannot drift from the data.
print(f"📊 Columns: {', '.join(df.columns)}")

🎯 CUSTOMER TYPE DISTRIBUTION
Individual  : 346 records ( 69.2%)
Business    : 99 records ( 19.8%)
Government  : 55 records ( 11.0%)

🎯 IS ACTIVE DISTRIBUTION
True : 407 records ( 81.4%)
False:  93 records ( 18.6%)

Total records: 500

📋 First 15 generated values:
CustomerTypeId  IsActive
    Individual      True
    Government      True
      Business      True
    Individual     False
    Individual      True
    Individual      True
    Individual     False
      Business     False
    Individual     False
      Business      True
    Individual      True
    Government      True
      Business     False
    Individual     False
    Individual      True

💾 Saved to: C:\temp\retail_headers\CustomerTypeId_sample.csv
📊 Columns: CustomerTypeId, IsActive
