In [1]:
import random
from faker import Faker
from datetime import datetime, timedelta
import pandas as pd

# Initialize the shared Faker instance. generate_invoice() draws invoice
# dates and random reference numbers from this module-level object.
fake = Faker()

In [2]:
# Define Dataset Fields and Rules
def generate_invoice():
    """Build one synthetic supplier-invoice record.

    Returns:
        dict: A record keyed by column name ("Internal Reference",
        "Invoice Number", "Supplier Name", "Supplier Reference",
        "Invoice Date", "Posting Date", "Due Date", "Payment Date",
        "Gross Amount", "VAT Amount", "Currency Code", "Debit/Credit").
        "Payment Date" is None for invoices generated as unpaid.
    """
    suppliers = [
        "Purple Plumbers Limited",
        "Artic Apples Co.",
        "Global Tech Solutions",
        "Blue Banana Inc.",
    ]
    supplier_name = random.choice(suppliers)

    # Reference is "V_" + first two letters of the supplier's first word
    # (upper-cased) + "001", e.g. "V_PU001".
    first_word = supplier_name.split()[0]
    supplier_reference = "V_" + first_word[:2].upper() + "001"

    # Dates: posted 0-15 days and due 15-60 days after the invoice date.
    invoice_date = fake.date_between(start_date='-1y', end_date='today')
    posting_lag = timedelta(days=random.randint(0, 15))
    payment_terms = timedelta(days=random.randint(15, 60))
    posting_date = invoice_date + posting_lag
    due_date = invoice_date + payment_terms

    # About 80% of invoices are paid, 0-30 days after the due date.
    if random.random() > 0.2:
        payment_date = due_date + timedelta(days=random.randint(0, 30))
    else:
        payment_date = None

    # Amounts: magnitude between 50 and 10000; about 10% are negative.
    magnitude = round(random.uniform(50, 10000), 2)
    sign = 1 if random.random() > 0.1 else -1
    gross_amount = magnitude * sign
    vat_amount = round(gross_amount * 0.2, 2)  # VAT taken as 20% of the gross amount

    currency = random.choice(("USD", "EUR", "GBP"))

    # Positive amounts are debits, negative amounts are credits.
    if gross_amount > 0:
        debit_credit = "Dr"
    else:
        debit_credit = "Cr"

    internal_reference = fake.random_int(min=100000000, max=999999999)
    invoice_number = f"IN{fake.random_int(min=100000, max=999999)}"

    record = {}
    record["Internal Reference"] = internal_reference
    record["Invoice Number"] = invoice_number
    record["Supplier Name"] = supplier_name
    record["Supplier Reference"] = supplier_reference
    record["Invoice Date"] = invoice_date
    record["Posting Date"] = posting_date
    record["Due Date"] = due_date
    record["Payment Date"] = payment_date
    record["Gross Amount"] = gross_amount
    record["VAT Amount"] = vat_amount
    record["Currency Code"] = currency
    record["Debit/Credit"] = debit_credit
    return record

In [3]:
# Generate Dataset
def generate_dataset(num_records=1000):
    """Create a list of synthetic invoice records.

    Args:
        num_records: How many invoices to generate (default 1000).

    Returns:
        list: ``num_records`` dicts, each produced by ``generate_invoice()``.
    """
    invoices = []
    for _ in range(num_records):
        invoices.append(generate_invoice())
    return invoices

In [4]:
# Inject Anomalies
def inject_anomalies(data, anomaly_rate=0.05):
    """Injects anomalies into the dataset at the specified rate.

    Each record is selected independently with probability ``anomaly_rate``.
    A selected record receives exactly one anomaly, picked at random from:

    * ``currency`` - the currency code becomes JPY, AUD or CAD.
    * ``gross_amount`` - the gross amount is multiplied by a factor between
      5 and 10 (rounded to 2 decimals) and the VAT amount is recomputed as
      20% of the new gross amount.
    * ``supplier_name`` - one letter is dropped from the supplier name.
    * ``date_inconsistency`` - the due date is moved to 1-10 days before
      the invoice date.

    Args:
        data: List of invoice record dicts containing the keys
            "Currency Code", "Gross Amount", "VAT Amount", "Supplier Name",
            "Invoice Date" and "Due Date". Records are modified in place.
        anomaly_rate: Probability (0-1) that a given record is altered.

    Returns:
        The same list object that was passed in, after modification.
    """
    for record in data:
        if random.random() >= anomaly_rate:
            continue

        anomaly_type = random.choice(['currency', 'gross_amount', 'supplier_name', 'date_inconsistency'])
        if anomaly_type == 'currency':
            record['Currency Code'] = random.choice(['JPY', 'AUD', 'CAD'])
        elif anomaly_type == 'gross_amount':
            # Keep the inflated amount at 2 decimals like every other amount,
            # and derive VAT from the rounded figure so the two stay consistent.
            inflated = round(record['Gross Amount'] * random.uniform(5, 10), 2)
            record['Gross Amount'] = inflated
            record['VAT Amount'] = round(inflated * 0.2, 2)
        elif anomaly_type == 'supplier_name':
            # Works for every supplier; a fixed 'Apples' -> 'Appes' replacement
            # would leave all other supplier names untouched.
            record['Supplier Name'] = _drop_random_letter(record['Supplier Name'])
        elif anomaly_type == 'date_inconsistency':
            record['Due Date'] = record['Invoice Date'] - timedelta(days=random.randint(1, 10))
    return data


def _drop_random_letter(name):
    """Returns ``name`` with one randomly chosen letter removed (unchanged if it has no letters)."""
    letter_positions = [index for index, char in enumerate(name) if char.isalpha()]
    if not letter_positions:
        return name
    drop_at = random.choice(letter_positions)
    return name[:drop_at] + name[drop_at + 1:]

In [5]:
# Export to CSV
def export_to_csv(data, filename="synthetic_dataset.csv"):
    """Write the records to a CSV file and report where it was saved.

    Args:
        data: Records to export; passed directly to ``pd.DataFrame``.
        filename: Destination path of the CSV file.
    """
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data).to_csv(filename, index=False)
    print(f"Dataset exported to {filename}")

In [6]:
# Run parameters, collected in one place so they are easy to tune
SEED = 42
NUM_RECORDS = 1000
ANOMALY_RATE = 0.05
OUTPUT_FILE = "synthetic_dataset.csv"

# Seed both random sources so re-running this cell reproduces the same draws.
# Invoice dates are still generated relative to 'today', so the exported
# dates shift with the day the notebook is run.
random.seed(SEED)
Faker.seed(SEED)

# Generate the dataset
dataset = generate_dataset(num_records=NUM_RECORDS)
assert len(dataset) == NUM_RECORDS, "unexpected number of generated records"

# Inject anomalies
# NOTE: inject_anomalies() modifies the records in place and returns the same
# list, so `dataset` and `dataset_with_anomalies` refer to the same object.
dataset_with_anomalies = inject_anomalies(dataset, anomaly_rate=ANOMALY_RATE)

# Export to CSV
export_to_csv(dataset_with_anomalies, filename=OUTPUT_FILE)

Dataset exported to synthetic_dataset.csv
