In [9]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [10]:
# Initialize Faker. No locale argument is passed, so Faker's default locale is used.
faker = Faker()
# Seed every random source used by the generators below (Faker, the stdlib
# `random` module, and NumPy's global generator) so repeated runs draw the
# same sequences.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Predefined categories that the generator functions sample from.
# Note: the third `systems` entry is a single comma-joined string, so it is
# selected as one value (a vendor present in both systems), not as two picks.
systems = ['SAP-P49', 'SAP-P49-WF', 'SAP-P49, SAP-P49-WF']
statuses = ['Active', 'Inactive']
divisions = ['Marelli PWT Slovakia', 'Marelli Germany', 'Marelli Poland', 'Marelli EPT Cologne', 'Marelli Italy', 'Marelli Japan']

In [11]:
# Helper functions
def generate_divisions():
    """Pick a random, comma-separated subset of `divisions`, or nothing.

    Returns:
        None in roughly 30% of calls; otherwise a string of one or more
        distinct division names joined by ", ".
    """
    # Draws above 0.7 (about 30% of calls) mean "no division assigned".
    if random.random() <= 0.7:
        # Choose how many divisions first, then sample that many without repeats.
        how_many = random.randint(1, len(divisions))
        chosen = random.sample(divisions, how_many)
        return ", ".join(chosen)
    return None

def generate_status():
    """Return one entry of `statuses`, chosen uniformly at random."""
    picked = random.choice(statuses)
    return picked

# def generate_vendor_no():
#     """Generates a 10-digit Vendor Number."""
#     return str(random.randint(1000000000, 9999999999))

def generate_spend():
    """Generate a vendor's Total Spend.

    A coin flip decides whether the vendor has any spend at all:
    about half of the calls return the integer 0, the rest return a value
    drawn from a normal distribution (mean 1,000,000, std 5,000) rounded to
    two decimals. With that mean and spread, results are effectively always
    positive.
    """
    # The coin flip comes first; the normal draw only happens for spenders.
    if random.random() <= 0.5:
        return 0
    amount = np.random.normal(1000000, 5000)  # mean=1000k, std=5k
    return round(amount, 2)

def generate_invoices(total_spend):
    """Return an invoice count for a vendor with the given total spend.

    Args:
        total_spend: The vendor's Total Spend.

    Returns:
        0 when `total_spend` is zero or negative; otherwise a random integer
        from 1 to 500 inclusive (the count is not scaled by the spend amount).
    """
    # No random number is consumed for vendors without positive spend.
    return 0 if total_spend <= 0 else random.randint(1, 500)

def generate_open_balance(total_spend):
    """Return an Open Balance derived from Total Spend.

    Args:
        total_spend: The vendor's Total Spend.

    Returns:
        `total_spend` multiplied by a factor drawn uniformly from
        [-0.5, 0.5), rounded to two decimals. The sign can therefore be
        negative, and a factor is drawn even when `total_spend` is 0.
    """
    fraction = np.random.uniform(-0.5, 0.5)
    balance = total_spend * fraction
    return round(balance, 2)

def generate_date_added(max_days_back=1460, today=None):
    """Generates a random date within the last `max_days_back` days.

    The default window is 1460 days (roughly four years) ending today.

    Args:
        max_days_back: Size of the look-back window in days; must be >= 0.
        today: Optional `date` or `datetime` marking the end of the window.
            Defaults to the current local date. Pass a fixed value to make
            the result reproducible together with `random.seed`.

    Returns:
        A `datetime.date` between `today - max_days_back` and `today`,
        both inclusive.

    Raises:
        ValueError: If `max_days_back` is negative.
    """
    if max_days_back < 0:
        raise ValueError("max_days_back must be non-negative")
    window_end = datetime.now() if today is None else today
    # Accept datetimes as well as dates; only the calendar date matters.
    if isinstance(window_end, datetime):
        window_end = window_end.date()
    # One draw from [0, max_days_back], counted forward from the window start.
    days_ago = max_days_back - random.randint(0, max_days_back)
    return window_end - timedelta(days=days_ago)

In [12]:
# Generate the dataset
def generate_vendor_data(num_records):
    """Creates a synthetic Vendor Master Database dataset.

    Args:
        num_records: Number of vendor rows to generate.

    Returns:
        A pandas DataFrame with one row per vendor and the columns
        'Status', 'Vendor No.', 'Vendor', 'System', 'Division',
        'Total Spend', 'Invoices', 'Open Balance', 'Avg Day to Pay',
        '% Late', 'Avg Days Paid Late', 'Avg Invoice Value', 'Added On',
        'Vendor Group', 'Telephone', 'Tax ID' and 'Email'.
    """
    records = []
    for i in range(num_records):
        total_spend = generate_spend()
        # Draw the invoice count once so 'Invoices' and 'Avg Invoice Value'
        # describe the same set of invoices.
        invoices = generate_invoices(total_spend)
        # Vendors without spend have no payment history to report.
        has_spend = total_spend != 0
        record = {
            'Status': generate_status(),
            # Sequential numbers keep vendor numbers unique within the dataset.
            'Vendor No.': 1000000001 + i,
            'Vendor': faker.company(),
            'System': random.choice(systems),
            'Division': generate_divisions(),
            'Total Spend': total_spend,
            'Invoices': invoices,
            'Open Balance': generate_open_balance(total_spend),
            'Avg Day to Pay': random.randint(10, 90) if has_spend else None,
            '% Late': round(np.random.uniform(0, 100), 2) if has_spend else None,
            'Avg Days Paid Late': random.randint(0, 30) if has_spend else None,
            # Total spend divided by this row's own invoice count; 0.0 when
            # there are no invoices (avoids dividing by zero).
            'Avg Invoice Value': round(total_spend / invoices, 2) if invoices else 0.0,
            'Added On': generate_date_added(),
            'Vendor Group': None,
            'Telephone': faker.phone_number(),
            # Random base plus the vendor-number offset; not guaranteed unique.
            'Tax ID': str(random.randint(1000000000, 7777777777) + 1000000001 + i),
            'Email': faker.email()
        }
        records.append(record)

    # Build the frame once from the list of dicts rather than growing it row by row.
    return pd.DataFrame(records)

In [13]:
# Generate and save the dataset
# Build 1000 synthetic vendor rows and write them to a CSV file (path is
# relative, so it lands in the current working directory). `index=False`
# keeps the DataFrame's row index out of the file.
num_records = 1000
df = generate_vendor_data(num_records)
df.to_csv('vendor_master_dataset.csv', index=False)

# Confirmation message shown once the file has been written.
print("Synthetic Vendor Master Dataset generated and saved as 'vendor_master_dataset.csv'.")

Synthetic Vendor Master Dataset generated and saved as 'vendor_master_dataset.csv'.
