# Assignment 6: Data Generator

This notebook generates the datasets you'll use for Assignment 6 (Data Wrangling).

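**Run this notebook ONCE, top to bottom (Kernel → Restart & Run All), to create the data files**, then work on `assignment.ipynb`. The random seed is set only in the Setup cell, so re-running individual cells or running them out of order will produce different data.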

---

## Generated Files

This notebook creates:
- `data/customers.csv` - Customer information (100 customers)
- `data/purchases.csv` - Purchase transactions (2,000 purchases)
- `data/products.csv` - Product catalog (50 products)

---

## Setup

In [1]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Seed NumPy's global random generator; every np.random call in the cells
# below draws from it, so a top-to-bottom run reproduces the same data.
SEED = 42
np.random.seed(SEED)
print("✓ Libraries imported")

✓ Libraries imported


---

## Generate Customer Data

In [2]:
# Number of customers to generate; drives the IDs, cities, names and signup dates below
num_customers = 100

# Customer IDs: C001, C002, ... (zero-padded to three digits)
customer_ids = [f'C{i:03d}' for i in range(1, num_customers + 1)]

# Name pools: each customer gets one uniformly random first name and last name,
# so duplicate full names are possible
first_names = ['Alice', 'Bob', 'Charlie', 'Diana', 'Eric', 'Fiona', 'George', 'Hannah',
               'Ian', 'Julia', 'Kevin', 'Laura', 'Michael', 'Nina', 'Oscar', 'Patricia',
               'Quinn', 'Rachel', 'Steve', 'Teresa']
last_names = ['Chen', 'Martinez', 'Kim', 'Patel', 'Thompson', 'Garcia', 'Lee', 'Wilson',
              'Anderson', 'Jackson', 'Brown', 'Davis', 'Miller', 'Rodriguez', 'Singh']

# Cities drawn with fixed, unequal probabilities (p lines up with the city list)
cities = np.random.choice(
    ['Seattle', 'Portland', 'San Francisco', 'Los Angeles', 'San Diego', 'Sacramento'],
    size=num_customers,
    p=[0.25, 0.20, 0.20, 0.15, 0.10, 0.10]
)

# Names are drawn after the cities, first name then last name for each customer.
# Keep this draw order: reordering the np.random calls changes the seeded output.
names = [
    f"{np.random.choice(first_names)} {np.random.choice(last_names)}"
    for _ in range(num_customers)
]

# Assemble the customer table; signup dates are evenly spaced 3 days apart
customers = pd.DataFrame({
    'customer_id': customer_ids,
    'name': names,
    'city': cities,
    'signup_date': pd.date_range('2023-01-01', periods=num_customers, freq='3D')
})

print(f"✓ Generated {len(customers)} customers")
customers.head()

✓ Generated 100 customers


Unnamed: 0,customer_id,name,city,signup_date
0,C001,George Chen,Portland,2023-01-01
1,C002,Teresa Miller,Sacramento,2023-01-04
2,C003,Diana Rodriguez,Los Angeles,2023-01-07
3,C004,Eric Lee,San Francisco,2023-01-10
4,C005,George Miller,Seattle,2023-01-13


---

## Generate Product Catalog

In [3]:
# Product names grouped by category (10 items per category)
categories = {
    'Electronics': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Tablet', 'Smartphone',
                    'Headphones', 'Webcam', 'USB Cable', 'Power Bank'],
    'Clothing': ['T-Shirt', 'Jeans', 'Sweater', 'Jacket', 'Shoes', 'Hat', 'Socks',
                 'Dress', 'Shorts', 'Scarf'],
    'Home & Garden': ['Coffee Maker', 'Blender', 'Vacuum', 'Lamp', 'Plant Pot',
                      'Rug', 'Curtains', 'Pillow', 'Candle', 'Picture Frame'],
    'Books': ['Fiction Novel', 'Cookbook', 'Biography', 'Textbook', 'Magazine',
              'Comic Book', 'Travel Guide', 'Self-Help', 'Poetry', 'Reference Book'],
    'Sports': ['Yoga Mat', 'Dumbbells', 'Tennis Racket', 'Soccer Ball', 'Running Shoes',
               'Water Bottle', 'Resistance Bands', 'Jump Rope', 'Bicycle', 'Skateboard']
}

# (min, max) price bounds per category. Defined once, outside the loop, because
# it does not change per item. Prices are drawn uniformly within the bounds, so
# an individual item's price is random rather than tied to what the item is.
base_prices = {
    'Electronics': (50, 1500),
    'Clothing': (20, 150),
    'Home & Garden': (15, 300),
    'Books': (10, 50),
    'Sports': (15, 500)
}

# Flatten to (category, item) pairs in catalog order so IDs can be numbered P001, P002, ...
catalog_items = [
    (category, item)
    for category, items in categories.items()
    for item in items
]

# Build product catalog: exactly one uniform price draw per product, in catalog
# order (this keeps the sequence of random draws, and so the seeded prices, stable)
product_list = []
for product_number, (category, item) in enumerate(catalog_items, start=1):
    min_price, max_price = base_prices[category]
    product_list.append({
        'product_id': f'P{product_number:03d}',
        'product_name': item,
        'category': category,
        'price': round(np.random.uniform(min_price, max_price), 2)
    })

products = pd.DataFrame(product_list)

print(f"✓ Generated {len(products)} products across {len(categories)} categories")
products.head(10)

✓ Generated 50 products across 5 categories


Unnamed: 0,product_id,product_name,category,price
0,P001,Laptop,Electronics,1097.56
1,P002,Mouse,Electronics,457.12
2,P003,Keyboard,Electronics,85.26
3,P004,Monitor,Electronics,985.93
4,P005,Tablet,Electronics,306.81
5,P006,Smartphone,Electronics,1413.66
6,P007,Headphones,Electronics,1433.2
7,P008,Webcam,Electronics,1376.55
8,P009,USB Cable,Electronics,586.73
9,P010,Power Bank,Electronics,72.41


---

## Generate Purchase Transactions

In [4]:
# Generate 2,000 purchases
num_purchases = 2000

# Skewed customer selection (some customers buy more): weights rise exponentially
# from e^0 to e^2 across customer_ids, then are normalized to sum to 1 so they
# can be used as probabilities. Sized from customer_ids so the two always match.
customer_weights = np.exp(np.linspace(0, 2, len(customer_ids)))
customer_weights = customer_weights / customer_weights.sum()

# The random columns are drawn in the order written (customer, product, quantity,
# store). Keep that order: reordering the np.random calls changes the seeded output.
purchases = pd.DataFrame({
    'purchase_id': [f'T{i:04d}' for i in range(1, num_purchases + 1)],
    'customer_id': np.random.choice(customer_ids, size=num_purchases, p=customer_weights),
    'product_id': np.random.choice(products['product_id'], size=num_purchases),
    'quantity': np.random.choice([1, 2, 3, 4, 5], size=num_purchases, p=[0.5, 0.25, 0.15, 0.07, 0.03]),
    # One purchase every 4 hours; lowercase 'h' replaces the deprecated 'H' alias
    'purchase_date': pd.date_range('2023-01-01', periods=num_purchases, freq='4h'),
    'store': np.random.choice(['Store A', 'Store B', 'Store C'], size=num_purchases)
})

print(f"✓ Generated {len(purchases)} purchase transactions")
purchases.head(10)

✓ Generated 2000 purchase transactions


  'purchase_date': pd.date_range('2023-01-01', periods=num_purchases, freq='4H'),


Unnamed: 0,purchase_id,customer_id,product_id,quantity,purchase_date,store
0,T0001,C011,P042,1,2023-01-01 00:00:00,Store C
1,T0002,C070,P001,2,2023-01-01 04:00:00,Store A
2,T0003,C075,P029,1,2023-01-01 08:00:00,Store A
3,T0004,C053,P045,1,2023-01-01 12:00:00,Store C
4,T0005,C079,P049,3,2023-01-01 16:00:00,Store B
5,T0006,C010,P035,1,2023-01-01 20:00:00,Store C
6,T0007,C011,P040,3,2023-01-02 00:00:00,Store C
7,T0008,C092,P014,1,2023-01-02 04:00:00,Store C
8,T0009,C060,P035,1,2023-01-02 08:00:00,Store B
9,T0010,C030,P048,1,2023-01-02 12:00:00,Store B


---

## Verify Data Relationships

In [5]:
# Summarize the three tables and how purchases map onto customers and products
# (many purchases per customer / per product).
print("Data Quality Checks:")
print(f"  Unique customers: {customers['customer_id'].nunique()}")
print(f"  Unique products: {products['product_id'].nunique()}")
print(f"  Total purchases: {len(purchases)}")
print()

# Customer purchase frequency
purchase_counts = purchases['customer_id'].value_counts()
print(f"  Customers with purchases: {len(purchase_counts)}")
print(f"  Max purchases by one customer: {purchase_counts.max()}")
print(f"  Customers with no purchases: {len(customers) - len(purchase_counts)}")
print()

# Product popularity
product_counts = purchases['product_id'].value_counts()
print(f"  Products sold: {len(product_counts)}")
print(f"  Products never sold: {len(products) - len(product_counts)}")
print()

# Actually verify the relationships before reporting success: keys must be
# unique in their own table, and every key used in purchases must exist in
# the table it refers to.
assert customers['customer_id'].is_unique, "duplicate customer_id in customers"
assert products['product_id'].is_unique, "duplicate product_id in products"
assert purchases['purchase_id'].is_unique, "duplicate purchase_id in purchases"
assert purchases['customer_id'].isin(customers['customer_id']).all(), \
    "purchases reference a customer_id missing from customers"
assert purchases['product_id'].isin(products['product_id']).all(), \
    "purchases reference a product_id missing from products"

print("✓ Data relationships look good for assignment!")

Data Quality Checks:
  Unique customers: 100
  Unique products: 50
  Total purchases: 2000

  Customers with purchases: 99
  Max purchases by one customer: 49
  Customers with no purchases: 1

  Products sold: 50
  Products never sold: 0

✓ Data relationships look good for assignment!


---

## Save to CSV Files

In [6]:
# Write each dataset to data/<name>.csv. index=False leaves the pandas row
# index out of the files, so each CSV holds only the named columns.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists

# Saved in this order: customers, products, purchases
datasets = {
    'customers': customers,
    'products': products,
    'purchases': purchases,
}
for name, frame in datasets.items():
    path = f'{output_dir}/{name}.csv'
    frame.to_csv(path, index=False)
    print(f"✓ Saved {path}")

print()
print("=" * 50)
print("✓ ALL DATA FILES GENERATED SUCCESSFULLY!")
print("=" * 50)
print()
print("Next step: Open assignment.ipynb and complete the questions.")

✓ Saved data/customers.csv
✓ Saved data/products.csv
✓ Saved data/purchases.csv

✓ ALL DATA FILES GENERATED SUCCESSFULLY!

Next step: Open assignment.ipynb and complete the questions.


---

## Preview Generated Data

In [7]:
# Show the first five rows of each generated table under its heading
previews = (
    ("Customers:", customers),
    ("\nProducts:", products),
    ("\nPurchases:", purchases),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

Customers:


Unnamed: 0,customer_id,name,city,signup_date
0,C001,George Chen,Portland,2023-01-01
1,C002,Teresa Miller,Sacramento,2023-01-04
2,C003,Diana Rodriguez,Los Angeles,2023-01-07
3,C004,Eric Lee,San Francisco,2023-01-10
4,C005,George Miller,Seattle,2023-01-13



Products:


Unnamed: 0,product_id,product_name,category,price
0,P001,Laptop,Electronics,1097.56
1,P002,Mouse,Electronics,457.12
2,P003,Keyboard,Electronics,85.26
3,P004,Monitor,Electronics,985.93
4,P005,Tablet,Electronics,306.81



Purchases:


Unnamed: 0,purchase_id,customer_id,product_id,quantity,purchase_date,store
0,T0001,C011,P042,1,2023-01-01 00:00:00,Store C
1,T0002,C070,P001,2,2023-01-01 04:00:00,Store A
2,T0003,C075,P029,1,2023-01-01 08:00:00,Store A
3,T0004,C053,P045,1,2023-01-01 12:00:00,Store C
4,T0005,C079,P049,3,2023-01-01 16:00:00,Store B
