In [25]:
from faker import Faker
import csv
import random
from datetime import datetime, timedelta

# Seed both Faker and the stdlib `random` module so that re-running this cell
# regenerates identical CSV files.
fake = Faker()
Faker.seed(42)
random.seed(42)

# Reference products table: ids PROD00001 .. PROD01499, each paired with a
# Faker catch phrase as its product name.
products = [f"PROD{number:05d}" for number in range(1, 1500)]
with open('products.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["product_id", "product_name"])
    writer.writerows([product_id, fake.catch_phrase()] for product_id in products)

# Column order of transactions.csv; every generated row follows this order.
header = [
    # identifiers and contact details
    "customer_id",
    "order_id",
    "email",
    "phone",
    "state",
    "category",
    # order facts
    "order_date",
    "delivery_date",
    "age",
    "price",
    "original_price",
    "discounted_price",
    "shipping_date",
    "product_id",
    # address
    "street",
    "city",
    "zip_code",
    # free text and payment
    "description",
    "currency",
    "sku",
    "credit_card_number",
    # times of day
    "order_time",
    "delivery_time",
]

# Pools of deliberately bad values used when corrupting records.
invalid_states = ["XX", "YY", "ZZ"]
invalid_categories = ["TOYS", "BOOKS", "MUSIC"]
invalid_skus = ["ABC-1234-567", "123-ABCD-EFG", "SKU-1234-567"]

# Currency pool sampled for every record; note that it contains the
# literal "INVALID" alongside the real currency codes.
currencies = ["USD", "EUR", "GBP", "JPY", "INVALID"]

def generate_record(index):
    """Build one synthetic transaction row for ``transactions.csv``.

    A base record is generated first. Roughly 10% of records then go through
    an "invalid" stage in which individual fields are independently replaced
    by bad values with a small probability each. Finally, two cross-column
    inconsistencies (discounted price above original price, shipping date
    before order date) are injected with 5% probability each.

    Reads the module-level ``fake``, ``products``, ``currencies``,
    ``invalid_states``, ``invalid_categories``, ``invalid_skus`` and
    ``header``.

    Args:
        index: Record number used to derive ``customer_id`` and ``order_id``
            (zero-padded to five digits).

    Returns:
        list: The record's values ordered to match ``header``.
    """
    # Base record. NOTE(review): `currencies` contains "INVALID", so the
    # currency column is bad for about one record in five even here.
    record = {
        "customer_id": f"CUST{str(index).zfill(5)}",
        "order_id": f"ORD{str(index).zfill(5)}",
        "email": fake.email(),
        "phone": f"+1{random.randint(2000000000, 9999999999)}",
        "state": random.choice(["AL", "AK", "AZ", "CA", "NY", "TX"]),
        "category": random.choice(["ELECTRONICS", "CLOTHING", "GROCERY", "FURNITURE"]),
        "age": random.randint(18, 100),
        "price": round(random.uniform(10, 1000), 2),
        "original_price": 0.0,
        "discounted_price": 0.0,
        "product_id": random.choice(products),
        "street": fake.street_address(),
        "city": fake.city(),
        "zip_code": fake.zipcode(),
        "description": fake.text(max_nb_chars=200),
        "currency": random.choice(currencies),
        "sku": f"{random.choice(['ABC', 'DEF', 'GHI'])}-{random.randint(1000, 9999)}-{random.choice(['A1B', 'C2D', 'E3F'])}",
        # NOTE(review): presumably Faker returns the number without spaces,
        # which would make this replace a no-op — confirm the intended format.
        "credit_card_number": fake.credit_card_number(card_type='visa').replace(' ', '-'),
        # Hour is zero-padded so every value is HH:MM:SS, like the "08:30:00"
        # value injected below.
        "order_time": f"{random.randint(9, 20):02d}:{random.randint(0, 59):02d}:00",
    }

    # Order date: any day within the past year.
    try:
        order_date = fake.date_between(start_date='-1y', end_date='today')
    except ValueError:
        order_date = datetime(2023, 1, 1).date()  # Fallback valid date
    record["order_date"] = order_date.strftime('%Y-%m-%d')

    # Delivery date: between today and 30 days from now (independent of the
    # order date).
    try:
        delivery_date = fake.date_between(start_date='today', end_date='+30d')
        record["delivery_date"] = delivery_date.strftime('%Y-%m-%d')
    except ValueError:
        record["delivery_date"] = None

    # Shipping date: zero to three days after the order date. The date object
    # is used directly instead of re-parsing the formatted string.
    try:
        shipping_date = fake.date_between(
            start_date=order_date,
            end_date=order_date + timedelta(days=3)
        )
        record["shipping_date"] = shipping_date.strftime('%Y-%m-%d')
    except ValueError:
        record["shipping_date"] = None

    # Delivery time: a random time of day. Formatting the date-only delivery
    # date with %H:%M:%S would always yield "00:00:00".
    record["delivery_time"] = fake.time() if record["delivery_date"] else None

    # Introduce 10% invalid records; each field is corrupted independently.
    if random.random() < 0.1:
        record.update({
            "customer_id": None if random.random() < 0.05 else record["customer_id"],
            # NOTE(review): only indices below 10 can receive the duplicate
            # id, so duplicates are very rare — confirm this is intended.
            "order_id": "ORD00001" if index < 10 else record["order_id"],  # Duplicate IDs
            "email": "invalid.email" if random.random() < 0.1 else record["email"],
            "phone": "123-456-789" if random.random() < 0.1 else record["phone"],
            "state": random.choice(invalid_states) if random.random() < 0.1 else record["state"],
            "category": random.choice(invalid_categories) if random.random() < 0.1 else record["category"],
            "order_date": "2023-13-32" if random.random() < 0.05 else record["order_date"],  # Invalid date
            "age": random.choice([17, 101]) if random.random() < 0.05 else record["age"],
            # Rounded so the value is invalid because of its sign, not
            # because of stray float digits.
            "price": round(-random.uniform(1, 100), 2) if random.random() < 0.05 else record["price"],
            "product_id": "INVALID_PROD" if random.random() < 0.05 else record["product_id"],
            "street": None if random.random() < 0.05 else record["street"],
            "description": fake.text(max_nb_chars=10) if random.random() < 0.05 else record["description"],
            "sku": random.choice(invalid_skus) if random.random() < 0.1 else record["sku"],
            "credit_card_number": "4111-1111-1111-111" if random.random() < 0.05 else record["credit_card_number"],
            "order_time": "08:30:00" if random.random() < 0.05 else record["order_time"]
        })

    # Derive both prices from the (possibly corrupted) price.
    record["original_price"] = round(record["price"] * random.uniform(1.1, 2.0), 2)
    record["discounted_price"] = round(record["price"] * random.uniform(0.5, 1.1), 2)

    # Cross-column issue 1: discounted price above the original price.
    if random.random() < 0.05:
        record["discounted_price"] = round(record["original_price"] * 1.1, 2)
    # Cross-column issue 2: shipping one day before the order date. The parse
    # fails when order_date was replaced by the invalid "2023-13-32".
    if random.random() < 0.05:
        try:
            record["shipping_date"] = (datetime.strptime(record["order_date"], "%Y-%m-%d") -
                                      timedelta(days=1)).strftime('%Y-%m-%d')
        except ValueError:
            record["shipping_date"] = None

    return [record[field] for field in header]

# Generate 10,000 records (record numbers 1..10000) into transactions.csv
num_records = 10000
with open('transactions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for i in range(num_records):
        # generate_record expects record numbers starting at 1
        writer.writerow(generate_record(i+1))

In [None]:
from faker import Faker
import csv
import random
from datetime import datetime, timedelta

# Seed both Faker and the stdlib `random` module so that re-running this cell
# regenerates identical CSV files.
fake = Faker()
Faker.seed(42)
random.seed(42)

# Reference products table: ids PROD00001 .. PROD01499, each paired with a
# Faker catch phrase as its product name.
products = [f"PROD{number:05d}" for number in range(1, 1500)]
with open('products.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["product_id", "product_name"])
    writer.writerows([product_id, fake.catch_phrase()] for product_id in products)

# Column order of transactions.csv; every generated row follows this order.
header = [
    # identifiers and contact details
    "customer_id",
    "order_id",
    "email",
    "phone",
    "state",
    "category",
    # order facts
    "order_date",
    "delivery_date",
    "age",
    "price",
    "original_price",
    "discounted_price",
    "shipping_date",
    "product_id",
    # address
    "street",
    "city",
    "zip_code",
    # free text and payment
    "description",
    "currency",
    "sku",
    "credit_card_number",
    # times of day
    "order_time",
    "delivery_time",
]

# Pools of deliberately bad values used when corrupting records.
invalid_states = ["XX", "YY", "ZZ"]
invalid_categories = ["TOYS", "BOOKS", "MUSIC"]
invalid_skus = ["ABC-1234-567", "123-ABCD-EFG", "SKU-1234-567"]

# Currency pool sampled for every record; note that it contains the
# literal "INVALID" alongside the real currency codes.
currencies = ["USD", "EUR", "GBP", "JPY", "INVALID"]

def generate_record(index):
    """Build one synthetic transaction row for ``transactions.csv``.

    A base record is generated first. Roughly 10% of records then go through
    an "invalid" stage in which individual fields are independently replaced
    by bad values with a small probability each. Finally, two cross-column
    inconsistencies (discounted price above original price, shipping date
    before order date) are injected with 5% probability each.

    Reads the module-level ``fake``, ``products``, ``currencies``,
    ``invalid_states``, ``invalid_categories``, ``invalid_skus`` and
    ``header``.

    Args:
        index: Record number used to derive ``customer_id`` and ``order_id``
            (zero-padded to five digits).

    Returns:
        list: The record's values ordered to match ``header``.
    """
    # Base record. NOTE(review): `currencies` contains "INVALID", so the
    # currency column is bad for about one record in five even here.
    record = {
        "customer_id": f"CUST{str(index).zfill(5)}",
        "order_id": f"ORD{str(index).zfill(5)}",
        "email": fake.email(),
        "phone": f"+1{random.randint(2000000000, 9999999999)}",
        "state": random.choice(["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","NY","TX"]),
        "category": random.choice(["ELECTRONICS", "CLOTHING", "GROCERY", "FURNITURE"]),
        "age": random.randint(18, 100),
        "price": round(random.uniform(10, 1000), 2),
        "original_price": 0.0,
        "discounted_price": 0.0,
        "product_id": random.choice(products),
        "street": fake.street_address(),
        "city": fake.city(),
        "zip_code": fake.zipcode(),
        "description": fake.text(max_nb_chars=200),
        "currency": random.choice(currencies),
        "sku": f"{random.choice(['ABC', 'DEF', 'GHI'])}-{random.randint(1000, 9999)}-{random.choice(['A1B', 'C2D', 'E3F'])}",
        # NOTE(review): presumably Faker returns the number without spaces,
        # which would make this replace a no-op — confirm the intended format.
        "credit_card_number": fake.credit_card_number(card_type='visa').replace(' ', '-'),
        # Hour is zero-padded so every value is HH:MM:SS, like the "08:30:00"
        # value injected below.
        "order_time": f"{random.randint(9, 20):02d}:{random.randint(0, 59):02d}:00",
    }

    # Order date: any day within the past year.
    try:
        order_date = fake.date_between(start_date='-1y', end_date='today')
    except ValueError:
        order_date = datetime(2023, 1, 1).date()  # Fallback valid date
    record["order_date"] = order_date.strftime('%Y-%m-%d')

    # Delivery date: between today and 30 days from now (independent of the
    # order date).
    try:
        delivery_date = fake.date_between(start_date='today', end_date='+30d')
        record["delivery_date"] = delivery_date.strftime('%Y-%m-%d')
    except ValueError:
        record["delivery_date"] = None

    # Shipping date: zero to three days after the order date. The date object
    # is used directly instead of re-parsing the formatted string.
    try:
        shipping_date = fake.date_between(
            start_date=order_date,
            end_date=order_date + timedelta(days=3)
        )
        record["shipping_date"] = shipping_date.strftime('%Y-%m-%d')
    except ValueError:
        record["shipping_date"] = None

    # Delivery time: a random time of day. Formatting the date-only delivery
    # date with %H:%M:%S would always yield "00:00:00".
    record["delivery_time"] = fake.time() if record["delivery_date"] else None

    # Introduce 10% invalid records; each field is corrupted independently.
    if random.random() < 0.1:
        record.update({
            "customer_id": None if random.random() < 0.05 else record["customer_id"],
            # NOTE(review): only indices below 10 can receive the duplicate
            # id, so duplicates are very rare — confirm this is intended.
            "order_id": "ORD00001" if index < 10 else record["order_id"],  # Duplicate IDs
            "email": "invalid.email" if random.random() < 0.1 else record["email"],
            "phone": "123-456-789" if random.random() < 0.1 else record["phone"],
            "state": random.choice(invalid_states) if random.random() < 0.1 else record["state"],
            "category": random.choice(invalid_categories) if random.random() < 0.1 else record["category"],
            "order_date": "2023-13-32" if random.random() < 0.05 else record["order_date"],  # Invalid date
            "age": random.choice([17, 101]) if random.random() < 0.05 else record["age"],
            # Rounded so the value is invalid because of its sign, not
            # because of stray float digits.
            "price": round(-random.uniform(1, 100), 2) if random.random() < 0.05 else record["price"],
            "product_id": "INVALID_PROD" if random.random() < 0.05 else record["product_id"],
            "street": None if random.random() < 0.05 else record["street"],
            "description": fake.text(max_nb_chars=10) if random.random() < 0.05 else record["description"],
            "sku": random.choice(invalid_skus) if random.random() < 0.1 else record["sku"],
            "credit_card_number": "4111-1111-1111-111" if random.random() < 0.05 else record["credit_card_number"],
            "order_time": "08:30:00" if random.random() < 0.05 else record["order_time"]
        })

    # Derive both prices from the (possibly corrupted) price.
    record["original_price"] = round(record["price"] * random.uniform(1.1, 2.0), 2)
    record["discounted_price"] = round(record["price"] * random.uniform(0.5, 1.1), 2)

    # Cross-column issue 1: discounted price above the original price.
    if random.random() < 0.05:
        record["discounted_price"] = round(record["original_price"] * 1.1, 2)
    # Cross-column issue 2: shipping one day before the order date. The parse
    # fails when order_date was replaced by the invalid "2023-13-32".
    if random.random() < 0.05:
        try:
            record["shipping_date"] = (datetime.strptime(record["order_date"], "%Y-%m-%d") -
                                      timedelta(days=1)).strftime('%Y-%m-%d')
        except ValueError:
            record["shipping_date"] = None

    return [record[field] for field in header]

# Generate 10,000 records (record numbers 1..10000) into transactions.csv
num_records = 10000
with open('transactions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for i in range(num_records):
        # generate_record expects record numbers starting at 1
        writer.writerow(generate_record(i+1))

In [22]:
# Scratch probe of the 5% threshold used inside generate_record. Each run
# draws one value from the shared `random` generator, so the result varies
# with how many draws have already been made.
random.random() < 0.05

False