 Level 4: Business Planning & Logistics

Products & Materials

In [6]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def generate_products_data(num_products=100, output_file="data/products.csv"):
    """
    Generate synthetic data for the Products table from ISA-95 Level 4.
    
    Parameters:
    - num_products: Number of product records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated products data
    """
    # Define product categories and types
    product_categories = {
        "Pharmaceutical": [
            "Tablet", "Capsule", "Liquid", "Injection", "Cream", "Ointment", 
            "Inhaler", "Patch", "Suppository", "Suspension"
        ],
        "Food & Beverage": [
            "Dairy", "Bakery", "Beverage", "Snack", "Frozen", "Canned", 
            "Condiment", "Ready Meal", "Confectionery", "Dried"
        ],
        "Chemical": [
            "Industrial", "Agricultural", "Cleaning", "Coating", "Adhesive", 
            "Solvent", "Catalyst", "Resin", "Pigment", "Additive"
        ],
        "Electronics": [
            "Consumer", "Industrial", "Component", "Circuit Board", "Sensor", 
            "Power Supply", "Display", "Memory", "Processor", "Controller"
        ],
        "Automotive": [
            "Engine", "Transmission", "Chassis", "Interior", "Exterior", 
            "Electrical", "Cooling", "Fuel", "Suspension", "Braking"
        ],
        "Consumer Goods": [
            "Personal Care", "Household", "Paper", "Textile", "Appliance", 
            "Furniture", "Toy", "Sporting", "Tool", "Office"
        ]
    }
    
    # Define units of measurement by category
    units_of_measure = {
        "Pharmaceutical": ["mg", "g", "ml", "tablet", "capsule", "vial", "patch", "unit"],
        "Food & Beverage": ["g", "kg", "ml", "L", "oz", "lb", "piece", "pack"],
        "Chemical": ["g", "kg", "L", "gal", "ton", "drum", "pallet", "IBC"],
        "Electronics": ["unit", "piece", "kit", "set", "assembly", "module"],
        "Automotive": ["unit", "piece", "kit", "set", "assembly"],
        "Consumer Goods": ["unit", "piece", "pack", "box", "case", "pallet"]
    }
    
    # Define status options
    product_statuses = ["Active", "In Development", "Obsolete", "On Hold", "Discontinued"]
    status_weights = [0.7, 0.1, 0.1, 0.05, 0.05]  # Mostly active products
    
    # Generate data structure
    data = {
        "product_id": [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_products)],
        "product_name": [],
        "product_code": [],
        "product_family": [],
        "description": [],
        "unit_of_measure": [],
        "status": [],
        "introduction_date": [],
        "discontinuation_date": [],
        "revision": [],
        "base_cost": [],
        "list_price": [],
        "shelf_life_days": [],
        "storage_requirements": [],
        "parent_product_id": []
    }
    
    # Define storage requirements by category
    storage_requirements = {
        "Pharmaceutical": [
            "Room Temperature (15-25°C)", "Refrigerated (2-8°C)", "Frozen (-20°C)", 
            "Cold Chain", "Controlled Room Temperature (20-25°C)", "Protected from Light",
            "Protected from Moisture", "Controlled Humidity (<60% RH)"
        ],
        "Food & Beverage": [
            "Room Temperature", "Refrigerated", "Frozen", "Cool and Dry", 
            "Protected from Light", "Protected from Moisture", "Ambient"
        ],
        "Chemical": [
            "Room Temperature", "Controlled Temperature", "Flammable Storage", 
            "Acid Cabinet", "Base Cabinet", "Ventilated Area", "Away from Incompatibles",
            "Protected from Moisture", "Protected from Light"
        ],
        "Electronics": [
            "ESD Protected", "Temperature Controlled", "Humidity Controlled", 
            "Dust-Free Environment", "Standard Warehouse", "Clean Room"
        ],
        "Automotive": [
            "Standard Warehouse", "Climate Controlled", "Covered Storage", 
            "Protected from Moisture", "Protected from Corrosives"
        ],
        "Consumer Goods": [
            "Standard Warehouse", "Temperature Controlled", "Humidity Controlled", 
            "Stack Limit", "Heavy Item Storage", "Fragile Item Handling"
        ]
    }
    
    # Track products for potential parent-child relationships
    all_products = data["product_id"].copy()
    potential_parents = random.sample(all_products, int(len(all_products) * 0.2))  # 20% can be parents
    
    # Generate data for each product
    for i in range(num_products):
        # Select product category
        category = random.choice(list(product_categories.keys()))
        
        # Select product type within category
        product_type = random.choice(product_categories[category])
        
        # Set product family (category)
        data["product_family"].append(category)
        
        # Generate product name and code
        product_series = random.choice(["Pro", "Elite", "Standard", "Premium", "Ultra", "Max", "Advanced", "Basic"])
        product_number = random.randint(100, 999)
        
        product_name = f"{product_type} {product_series} {product_number}"
        data["product_name"].append(product_name)
        
        # Generate product code (category abbreviation + type abbreviation + number)
        category_abbr = category[0].upper()
        type_abbr = ''.join([word[0].upper() for word in product_type.split()])
        product_code = f"{category_abbr}{type_abbr}{product_number}"
        data["product_code"].append(product_code)
        
        # Generate description
        descriptions = [
            f"Standard {product_type.lower()} for general use",
            f"Premium quality {product_type.lower()} with enhanced features",
            f"Industrial grade {product_type.lower()} for professional applications",
            f"Cost-effective {product_type.lower()} solution",
            f"High-performance {product_type.lower()} designed for demanding environments"
        ]
        data["description"].append(random.choice(descriptions))
        
        # Select unit of measure
        if category in units_of_measure:
            unit = random.choice(units_of_measure[category])
        else:
            unit = "unit"
        data["unit_of_measure"].append(unit)
        
        # Set status (weighted random)
        status = random.choices(product_statuses, weights=status_weights)[0]
        data["status"].append(status)
        
        # Generate introduction date (between 10 years ago and now)
        intro_days_ago = random.randint(0, 3650)
        intro_date = datetime.now() - timedelta(days=intro_days_ago)
        data["introduction_date"].append(intro_date.strftime("%Y-%m-%d"))
        
        # Generate discontinuation date (only for obsolete or discontinued products)
        if status in ["Obsolete", "Discontinued"]:
            # Discontinuation date is after introduction but before now
            min_disc_days_ago = min(intro_days_ago - 1, 1)  # At least 1 day after intro
            disc_days_ago = random.randint(min_disc_days_ago, intro_days_ago - 1)
            disc_date = datetime.now() - timedelta(days=disc_days_ago)
            data["discontinuation_date"].append(disc_date.strftime("%Y-%m-%d"))
        else:
            data["discontinuation_date"].append("")
        
        # Generate revision (format: 1.0, 1.1, 2.0, etc.)
        major_revision = random.randint(1, 3)
        minor_revision = random.randint(0, 9)
        data["revision"].append(f"{major_revision}.{minor_revision}")
        
        # Generate cost and price
        # Different price ranges for different categories
        if category == "Pharmaceutical":
            base_cost = random.uniform(5, 500)
        elif category == "Food & Beverage":
            base_cost = random.uniform(1, 50)
        elif category == "Chemical":
            base_cost = random.uniform(10, 200)
        elif category == "Electronics":
            base_cost = random.uniform(20, 1000)
        elif category == "Automotive":
            base_cost = random.uniform(15, 800)
        else:  # Consumer Goods
            base_cost = random.uniform(2, 100)
        
        data["base_cost"].append(round(base_cost, 2))
        
        # List price is typically cost + markup
        markup = random.uniform(1.2, 2.5)  # 20% to 150% markup
        list_price = base_cost * markup
        data["list_price"].append(round(list_price, 2))
        
        # Generate shelf life
        # Different shelf life ranges for different categories
        if category == "Pharmaceutical":
            shelf_life = random.randint(365, 1825)  # 1-5 years
        elif category == "Food & Beverage":
            shelf_life = random.randint(30, 730)  # 1 month to 2 years
        elif category == "Chemical":
            shelf_life = random.randint(365, 3650)  # 1-10 years
        elif category in ["Electronics", "Automotive", "Consumer Goods"]:
            shelf_life = random.randint(730, 3650)  # 2-10 years
        else:
            shelf_life = random.randint(365, 1825)  # 1-5 years
        
        data["shelf_life_days"].append(shelf_life)
        
        # Set storage requirements
        if category in storage_requirements:
            data["storage_requirements"].append(random.choice(storage_requirements[category]))
        else:
            data["storage_requirements"].append("Standard Storage")
        
        # Determine parent product (if any)
        # About 15% of products will have a parent
        if (data["product_id"][i] not in potential_parents and 
            random.random() < 0.15 and 
            potential_parents):  # Make sure there are potential parents
            
            parent_id = random.choice(potential_parents)
            data["parent_product_id"].append(parent_id)
        else:
            data["parent_product_id"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} product records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_materials_data(products_df=None, num_materials=150, output_file="data/materials.csv"):
    """
    Generate synthetic data for the Materials table from ISA-95 Level 4.
    
    Parameters:
    - products_df: DataFrame containing products data (optional)
    - num_materials: Number of material records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated materials data
    """
    # Define material types and their probabilities
    material_types = {
        "Raw Material": 0.4,
        "Packaging": 0.25,
        "WIP": 0.15,
        "Intermediate": 0.1,
        "Consumable": 0.1
    }
    
    # Define material categories
    material_categories = {
        "Chemical": [
            "Solvent", "Reagent", "Catalyst", "Acid", "Base", "Salt", 
            "Polymer", "Monomer", "Pigment", "Additive"
        ],
        "Pharmaceutical": [
            "API", "Excipient", "Binder", "Filler", "Coating", "Disintegrant", 
            "Preservative", "Stabilizer", "Colorant", "Flavoring"
        ],
        "Food": [
            "Ingredient", "Additive", "Flavoring", "Coloring", "Preservative", 
            "Thickener", "Sweetener", "Emulsifier", "Stabilizer", "Nutrient"
        ],
        "Packaging": [
            "Container", "Closure", "Label", "Carton", "Insert", "Film", 
            "Foil", "Bottle", "Cap", "Box"
        ],
        "Industrial": [
            "Metal", "Plastic", "Rubber", "Glass", "Ceramic", "Composite", 
            "Textile", "Paper", "Wood", "Lubricant"
        ]
    }
    
    # Define units of measurement by material type
    units_of_measure = {
        "Raw Material": ["kg", "L", "g", "ton", "m³"],
        "Packaging": ["piece", "unit", "roll", "sheet", "box"],
        "WIP": ["kg", "L", "batch", "unit"],
        "Intermediate": ["kg", "L", "batch", "unit"],
        "Consumable": ["piece", "unit", "kg", "L"]
    }
    
    # Define storage requirements by material category
    storage_requirements = {
        "Chemical": [
            "Flammable Storage", "Acid Cabinet", "Base Cabinet", "Ventilated Area", 
            "Temperature Controlled (15-25°C)", "Protected from Light", "Protected from Moisture",
            "Refrigerated (2-8°C)", "Freezer (-20°C)"
        ],
        "Pharmaceutical": [
            "Room Temperature (15-25°C)", "Refrigerated (2-8°C)", "Frozen (-20°C)", 
            "Controlled Room Temperature (20-25°C)", "Protected from Light",
            "Protected from Moisture", "Controlled Humidity (<60% RH)"
        ],
        "Food": [
            "Room Temperature", "Refrigerated", "Frozen", "Cool and Dry", 
            "Protected from Light", "Protected from Moisture", "Ambient"
        ],
        "Packaging": [
            "Standard Warehouse", "Climate Controlled", "Protected from Moisture", 
            "Protected from Dust", "Away from Chemicals"
        ],
        "Industrial": [
            "Standard Warehouse", "Climate Controlled", "Protected from Moisture", 
            "Protected from Corrosives", "Hazardous Materials Storage"
        ]
    }
    
    # Define hazard classifications
    hazard_classifications = [
        "Non-Hazardous", "Flammable", "Corrosive", "Toxic", "Oxidizer", 
        "Explosive", "Environmentally Hazardous", "Irritant", "Pressurized Gas",
        "Carcinogen", "Radioactive", "Biohazard", "None"
    ]
    
    # Define data structure
    data = {
        "material_id": [f"MAT-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_materials)],
        "material_name": [],
        "material_type": [],
        "description": [],
        "unit_of_measure": [],
        "standard_cost": [],
        "lead_time_days": [],
        "minimum_order_quantity": [],
        "approved_suppliers": [],
        "safety_stock": [],
        "reorder_point": [],
        "status": [],
        "storage_requirements": [],
        "hazard_classification": []
    }
    
    # Generate supplier IDs for approved suppliers
    supplier_ids = [f"SUP-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    
    # Generate data for each material
    for i in range(num_materials):
        # Select material type (weighted random)
        material_type = random.choices(
            list(material_types.keys()), 
            weights=list(material_types.values())
        )[0]
        data["material_type"].append(material_type)
        
        # Select material category
        category = random.choice(list(material_categories.keys()))
        
        # Select material subtype within category
        material_subtype = random.choice(material_categories[category])
        
        # Generate material name
        material_grade = random.choice(["Standard", "Premium", "Technical", "USP", "NF", "EP", "BP", "CP", "ACS", "Ultra"])
        material_number = random.randint(100, 999)
        
        material_name = f"{material_subtype} {material_grade} {material_number}"
        data["material_name"].append(material_name)
        
        # Generate description
        descriptions = [
            f"Standard {material_subtype.lower()} for general use",
            f"{material_grade} grade {material_subtype.lower()} for {material_type.lower()} applications",
            f"High-quality {material_subtype.lower()} meeting {material_grade} specifications",
            f"Industrial {material_subtype.lower()} for manufacturing processes",
            f"{material_grade} certified {material_subtype.lower()}"
        ]
        data["description"].append(random.choice(descriptions))
        
        # Select unit of measure
        if material_type in units_of_measure:
            unit = random.choice(units_of_measure[material_type])
        else:
            unit = "kg"
        data["unit_of_measure"].append(unit)
        
        # Generate cost
        # Different cost ranges for different material types
        if material_type == "Raw Material":
            if material_subtype in ["API", "Catalyst"]:
                # High-value materials
                standard_cost = random.uniform(100, 5000)
            else:
                standard_cost = random.uniform(10, 500)
        elif material_type == "Packaging":
            standard_cost = random.uniform(0.5, 50)
        elif material_type in ["WIP", "Intermediate"]:
            standard_cost = random.uniform(20, 1000)
        else:  # Consumable
            standard_cost = random.uniform(5, 200)
        
        data["standard_cost"].append(round(standard_cost, 2))
        
        # Generate lead time
        if material_type == "Raw Material":
            lead_time = random.randint(30, 120)  # 1-4 months
        elif material_type == "Packaging":
            lead_time = random.randint(14, 60)  # 2-8 weeks
        else:
            lead_time = random.randint(7, 30)  # 1-4 weeks
        
        data["lead_time_days"].append(lead_time)
        
        # Generate minimum order quantity
        if unit in ["kg", "L"]:
            if standard_cost > 1000:
                # Expensive materials have lower MOQs
                min_order = random.choice([0.1, 0.25, 0.5, 1, 5])
            else:
                min_order = random.choice([1, 5, 10, 25, 50, 100])
        elif unit in ["g", "ml"]:
            min_order = random.choice([100, 250, 500, 1000, 5000])
        else:
            min_order = random.choice([10, 25, 50, 100, 500, 1000])
        
        data["minimum_order_quantity"].append(min_order)
        
        # Generate approved suppliers
        num_suppliers = random.randint(1, 3)
        approved_suppliers = random.sample(supplier_ids, num_suppliers)
        data["approved_suppliers"].append(str(approved_suppliers))
        
        # Generate safety stock and reorder point
        # Higher for critical materials, lower for consumables
        if material_type in ["Raw Material", "Intermediate"]:
            safety_factor = random.uniform(0.5, 2.0)
            consumption_rate = random.uniform(1, 10) * min_order
            
            # Safety stock based on lead time and consumption rate
            safety_stock = consumption_rate * lead_time * safety_factor / 30  # Normalized to monthly consumption
            reorder_point = safety_stock + (consumption_rate * lead_time / 30)
        else:
            safety_stock = min_order * random.uniform(1, 3)
            reorder_point = min_order * random.uniform(2, 5)
        
        data["safety_stock"].append(round(safety_stock, 2))
        data["reorder_point"].append(round(reorder_point, 2))
        
        # Set status (mostly active)
        statuses = ["Active", "Pending Approval", "Obsolete", "On Hold", "Discontinued"]
        status_weights = [0.8, 0.05, 0.05, 0.05, 0.05]
        data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Set storage requirements
        if category in storage_requirements:
            data["storage_requirements"].append(random.choice(storage_requirements[category]))
        else:
            data["storage_requirements"].append("Standard Storage")
        
        # Set hazard classification
        if category == "Chemical":
            # Chemicals are more likely to be hazardous
            hazard_weights = [0.1, 0.15, 0.15, 0.15, 0.1, 0.05, 0.1, 0.1, 0.05, 0.05, 0.0, 0.0, 0.0]
        elif category == "Pharmaceutical":
            # Pharmaceuticals can be hazardous but less so
            hazard_weights = [0.3, 0.1, 0.05, 0.1, 0.05, 0.0, 0.05, 0.1, 0.0, 0.05, 0.0, 0.05, 0.15]
        else:
            # Other categories are less likely to be hazardous
            hazard_weights = [0.6, 0.05, 0.05, 0.05, 0.0, 0.0, 0.05, 0.05, 0.0, 0.0, 0.0, 0.0, 0.15]
        
        data["hazard_classification"].append(random.choices(hazard_classifications, weights=hazard_weights)[0])
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} material records.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(products_df, materials_df=None):
    """
    Display basic statistics about the generated products and materials data
    
    Parameters:
    - products_df: DataFrame containing products data
    - materials_df: DataFrame containing materials data (optional)
    """
    if products_df is None or len(products_df) == 0:
        print("No products data to analyze.")
        return
    
    print("\nProducts Statistics:")
    print(f"Total products: {len(products_df)}")
    
    # Product family distribution
    print("\nProduct Family Distribution:")
    family_counts = products_df['product_family'].value_counts()
    for family, count in family_counts.items():
        print(f"  {family}: {count} ({count/len(products_df)*100:.1f}%)")
    
    # Status distribution
    print("\nProduct Status Distribution:")
    status_counts = products_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(products_df)*100:.1f}%)")
    
    # Units of measure distribution
    print("\nUnits of Measure Distribution:")
    unit_counts = products_df['unit_of_measure'].value_counts().head(10)
    for unit, count in unit_counts.items():
        print(f"  {unit}: {count} ({count/len(products_df)*100:.1f}%)")
    
    # Price statistics
    print("\nPrice Statistics:")
    print(f"  Average base cost: ${products_df['base_cost'].mean():.2f}")
    print(f"  Average list price: ${products_df['list_price'].mean():.2f}")
    print(f"  Average markup: {(products_df['list_price'] / products_df['base_cost']).mean():.2f}x")
    
    # Shelf life statistics
    print("\nShelf Life Statistics:")
    print(f"  Average shelf life: {products_df['shelf_life_days'].mean():.1f} days ({products_df['shelf_life_days'].mean()/365:.1f} years)")
    print(f"  Minimum shelf life: {products_df['shelf_life_days'].min()} days ({products_df['shelf_life_days'].min()/365:.1f} years)")
    print(f"  Maximum shelf life: {products_df['shelf_life_days'].max()} days ({products_df['shelf_life_days'].max()/365:.1f} years)")
    
    # Product age (days since introduction)
    products_df['introduction_date'] = pd.to_datetime(products_df['introduction_date'])
    products_df['age_days'] = (datetime.now() - products_df['introduction_date']).dt.days
    
    print("\nProduct Age Statistics:")
    print(f"  Average product age: {products_df['age_days'].mean():.1f} days ({products_df['age_days'].mean()/365:.1f} years)")
    print(f"  Newest product: {products_df['age_days'].min()} days old")
    print(f"  Oldest product: {products_df['age_days'].max()} days old ({products_df['age_days'].max()/365:.1f} years)")
    
    # Parent-child relationships
    parent_count = products_df['parent_product_id'].apply(lambda x: x != "").sum()
    print(f"\nProducts with parent: {parent_count} ({parent_count/len(products_df)*100:.1f}%)")
    
    # Materials statistics (if available)
    if materials_df is not None and len(materials_df) > 0:
        print("\nMaterials Statistics:")
        print(f"Total materials: {len(materials_df)}")
        
        # Material type distribution
        print("\nMaterial Type Distribution:")
        type_counts = materials_df['material_type'].value_counts()
        for mat_type, count in type_counts.items():
            print(f"  {mat_type}: {count} ({count/len(materials_df)*100:.1f}%)")
        
        # Status distribution
        print("\nMaterial Status Distribution:")
        status_counts = materials_df['status'].value_counts()
        for status, count in status_counts.items():
            print(f"  {status}: {count} ({count/len(materials_df)*100:.1f}%)")
        
        # Cost statistics
        print("\nCost Statistics:")
        print(f"  Average standard cost: ${materials_df['standard_cost'].mean():.2f}")
        print(f"  Minimum standard cost: ${materials_df['standard_cost'].min():.2f}")
        print(f"  Maximum standard cost: ${materials_df['standard_cost'].max():.2f}")
        
        # Lead time statistics
        print("\nLead Time Statistics:")
        print(f"  Average lead time: {materials_df['lead_time_days'].mean():.1f} days")
        print(f"  Minimum lead time: {materials_df['lead_time_days'].min()} days")
        print(f"  Maximum lead time: {materials_df['lead_time_days'].max()} days")
        
        # Hazard classification distribution
        print("\nHazard Classification Distribution:")
        hazard_counts = materials_df['hazard_classification'].value_counts()
        for hazard, count in hazard_counts.items():
            print(f"  {hazard}: {count} ({count/len(materials_df)*100:.1f}%)")
        
        # Supplier distribution
        avg_suppliers = materials_df['approved_suppliers'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0).mean()
        print(f"\nAverage approved suppliers per material: {avg_suppliers:.1f}")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Generate products data
    products_df = generate_products_data(
        num_products=100,  # Generate 100 product records
        output_file="data/products.csv"
    )
    
    # Generate materials data
    materials_df = generate_materials_data(
        products_df,
        num_materials=150,  # Generate 150 material records
        output_file="data/materials.csv"
    )
    
    # Display statistics
    if products_df is not None:
        display_statistics(products_df, materials_df)
        
        # Display sample data
        print("\nSample products data (first 5 records):")
        print(products_df.head(5))
        
        if materials_df is not None:
            print("\nSample materials data (first 5 records):")
            print(materials_df.head(5))

Successfully generated 100 product records.
Data saved to data/products.csv
Successfully generated 150 material records.
Data saved to data/materials.csv

Products Statistics:
Total products: 100

Product Family Distribution:
  Automotive: 24 (24.0%)
  Consumer Goods: 19 (19.0%)
  Food & Beverage: 16 (16.0%)
  Pharmaceutical: 15 (15.0%)
  Electronics: 15 (15.0%)
  Chemical: 11 (11.0%)

Product Status Distribution:
  Active: 69 (69.0%)
  In Development: 12 (12.0%)
  Obsolete: 11 (11.0%)
  Discontinued: 5 (5.0%)
  On Hold: 3 (3.0%)

Units of Measure Distribution:
  unit: 20 (20.0%)
  pallet: 8 (8.0%)
  assembly: 8 (8.0%)
  piece: 8 (8.0%)
  kit: 7 (7.0%)
  pack: 7 (7.0%)
  set: 6 (6.0%)
  ml: 6 (6.0%)
  g: 4 (4.0%)
  L: 4 (4.0%)

Price Statistics:
  Average base cost: $246.03
  Average list price: $471.71
  Average markup: 1.89x

Shelf Life Statistics:
  Average shelf life: 1673.8 days (4.6 years)
  Minimum shelf life: 64 days (0.2 years)
  Maximum shelf life: 3620 days (9.9 years)

Prod

Bill Of Materials

In [7]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_products_data(products_file="data/products.csv"):
    """
    Load the previously generated products data
    
    Parameters:
    - products_file: CSV file containing products data
    
    Returns:
    - DataFrame containing the products data
    """
    try:
        return pd.read_csv(products_file)
    except FileNotFoundError:
        print(f"Error: Products data file {products_file} not found.")
        print("Please run the products data generation script first.")
        return None

def load_materials_data(materials_file="data/materials.csv"):
    """
    Load the previously generated materials data
    
    Parameters:
    - materials_file: CSV file containing materials data
    
    Returns:
    - DataFrame containing the materials data
    """
    try:
        return pd.read_csv(materials_file)
    except FileNotFoundError:
        print(f"Error: Materials data file {materials_file} not found.")
        print("Please run the materials data generation script first.")
        return None

def generate_bill_of_materials(products_df, materials_df, output_file="data/bill_of_materials.csv"):
    """
    Generate synthetic data for the BillOfMaterials table from ISA-95 Level 4.
    
    Parameters:
    - products_df: DataFrame containing products data
    - materials_df: DataFrame containing materials data
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated bill of materials data
    """
    if products_df is None or len(products_df) == 0:
        print("Error: No products data available.")
        return None
        
    if materials_df is None or len(materials_df) == 0:
        print("Error: No materials data available.")
        return None
    
    # Define data structure
    data = {
        "bom_id": [],
        "product_id": [],
        "material_id": [],
        "quantity": [],
        "unit": [],
        "reference_designator": [],
        "bom_level": [],
        "effective_date": [],
        "obsolete_date": [],
        "alternative_material_ids": [],
        "scrap_factor": []
    }
    
    # Track the number of BOM records we'll create
    total_bom_records = 0
    
    # Process each product to create its bill of materials
    for _, product in products_df.iterrows():
        product_id = product['product_id']
        product_family = product['product_family']
        product_status = product['status']
        
        # Skip creating BOMs for obsolete or discontinued products
        if product_status in ["Obsolete", "Discontinued"]:
            continue
        
        # Determine the number of materials to include in this product's BOM
        # More complex products have more components
        if product_family in ["Pharmaceutical", "Electronics", "Automotive"]:
            num_materials = random.randint(5, 15)
        elif product_family in ["Food & Beverage", "Chemical"]:
            num_materials = random.randint(3, 10)
        else:
            num_materials = random.randint(2, 8)
        
        # Ensure we don't try to select more materials than available
        num_materials = min(num_materials, len(materials_df))
        
        # Select materials for this product's BOM (without replacement for this product)
        # Filter materials by type first to ensure logical combinations
        raw_materials = materials_df[materials_df['material_type'] == 'Raw Material']
        packaging_materials = materials_df[materials_df['material_type'] == 'Packaging']
        other_materials = materials_df[~materials_df['material_type'].isin(['Raw Material', 'Packaging'])]
        
        # Determine mix of material types (usually more raw materials than packaging)
        num_raw = int(num_materials * 0.6)
        num_packaging = int(num_materials * 0.3)
        num_other = num_materials - num_raw - num_packaging
        
        # Adjust if not enough materials of a certain type
        if len(raw_materials) < num_raw:
            num_raw = len(raw_materials)
            num_other += (num_materials - num_raw - num_packaging)
        
        if len(packaging_materials) < num_packaging:
            num_packaging = len(packaging_materials)
            num_other += (num_materials - num_raw - num_packaging)
        
        if len(other_materials) < num_other:
            num_other = len(other_materials)
        
        # Select materials of each type
        selected_raw = raw_materials.sample(num_raw) if num_raw > 0 else pd.DataFrame()
        selected_packaging = packaging_materials.sample(num_packaging) if num_packaging > 0 else pd.DataFrame()
        selected_other = other_materials.sample(num_other) if num_other > 0 else pd.DataFrame()
        
        # Combine selected materials
        selected_materials = pd.concat([selected_raw, selected_packaging, selected_other])
        
        # Create BOM records for this product
        for level, (_, material) in enumerate(selected_materials.iterrows(), 1):
            material_id = material['material_id']
            material_type = material['material_type']
            
            # Generate a unique BOM ID
            bom_id = f"BOM-{uuid.uuid4().hex[:8].upper()}"
            data["bom_id"].append(bom_id)
            data["product_id"].append(product_id)
            data["material_id"].append(material_id)
            
            # Set BOM level
            if material_type == 'Raw Material':
                bom_level = 1  # Raw materials typically at level 1
            elif material_type == 'Packaging':
                bom_level = 2  # Packaging typically at level 2
            else:
                bom_level = random.randint(1, 3)  # Other materials at various levels
                
            data["bom_level"].append(bom_level)
            
            # Set material quantity based on material type and unit
            material_unit = material['unit_of_measure']
            data["unit"].append(material_unit)
            
            if material_type == 'Raw Material':
                if material_unit in ['kg', 'L']:
                    quantity = random.uniform(0.1, 100)
                elif material_unit in ['g', 'ml']:
                    quantity = random.uniform(1, 5000)
                else:
                    quantity = random.uniform(1, 100)
            elif material_type == 'Packaging':
                if material_unit in ['piece', 'unit']:
                    quantity = random.randint(1, 10)
                else:
                    quantity = random.uniform(0.1, 10)
            else:
                quantity = random.uniform(0.1, 50)
                
            # Round to appropriate precision
            if material_unit in ['g', 'ml']:
                quantity = round(quantity, 0)
            else:
                quantity = round(quantity, 2)
                
            data["quantity"].append(quantity)
            
            # Set reference designator (mainly for assembled products)
            if product_family in ["Electronics", "Automotive"] and material_type != 'Packaging':
                reference_designators = [
                    f"POS-{random.randint(1, 100)}",
                    f"COMP-{random.randint(1, 100)}",
                    f"ASY-{random.randint(1, 100)}",
                    f"PCB-{random.randint(1, 100)}",
                    f"MOD-{random.randint(1, 100)}"
                ]
                data["reference_designator"].append(random.choice(reference_designators))
            else:
                data["reference_designator"].append("")
            
            # Set effective and obsolete dates
            # Effective date is typically before product introduction
            if pd.notna(product['introduction_date']):
                intro_date = pd.to_datetime(product['introduction_date'])
                effective_date = intro_date - timedelta(days=random.randint(30, 180))
                data["effective_date"].append(effective_date.strftime("%Y-%m-%d"))
            else:
                effective_date = datetime.now() - timedelta(days=random.randint(30, 365))
                data["effective_date"].append(effective_date.strftime("%Y-%m-%d"))
            
            # Most BOM items don't have obsolete dates
            if random.random() < 0.1:  # 10% chance of having an obsolete date
                obsolete_date = datetime.now() + timedelta(days=random.randint(180, 730))
                data["obsolete_date"].append(obsolete_date.strftime("%Y-%m-%d"))
            else:
                data["obsolete_date"].append("")
            
            # Set alternative materials
            # About 20% of materials have alternatives
            if random.random() < 0.2:
                # Find materials of the same type to use as alternatives
                same_type_materials = materials_df[materials_df['material_type'] == material_type]
                same_type_materials = same_type_materials[same_type_materials['material_id'] != material_id]
                
                if len(same_type_materials) > 0:
                    num_alternatives = random.randint(1, min(3, len(same_type_materials)))
                    alternatives = same_type_materials.sample(num_alternatives)['material_id'].tolist()
                    data["alternative_material_ids"].append(str(alternatives))
                else:
                    data["alternative_material_ids"].append("[]")
            else:
                data["alternative_material_ids"].append("[]")
            
            # Set scrap factor (higher for more complex materials)
            if material_type == 'Raw Material':
                scrap_factor = random.uniform(0.02, 0.1)  # 2-10% scrap
            elif material_type == 'Packaging':
                scrap_factor = random.uniform(0.01, 0.05)  # 1-5% scrap
            else:
                scrap_factor = random.uniform(0.03, 0.15)  # 3-15% scrap
                
            data["scrap_factor"].append(round(scrap_factor, 3))
            
            total_bom_records += 1
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {total_bom_records} bill of materials records for {df['product_id'].nunique()} products.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(bom_df, products_df=None, materials_df=None):
    """
    Display basic statistics about the generated bill of materials data
    
    Parameters:
    - bom_df: DataFrame containing bill of materials data
    - products_df: DataFrame containing products data (optional)
    - materials_df: DataFrame containing materials data (optional)
    """
    if bom_df is None or len(bom_df) == 0:
        print("No bill of materials data to analyze.")
        return
    
    print("\nBill of Materials Statistics:")
    print(f"Total BOM records: {len(bom_df)}")
    print(f"Products with BOMs: {bom_df['product_id'].nunique()}")
    print(f"Materials used in BOMs: {bom_df['material_id'].nunique()}")
    
    # BOM level distribution
    print("\nBOM Level Distribution:")
    level_counts = bom_df['bom_level'].value_counts().sort_index()
    for level, count in level_counts.items():
        print(f"  Level {level}: {count} ({count/len(bom_df)*100:.1f}%)")
    
    # Materials per product statistics
    materials_per_product = bom_df.groupby('product_id')['material_id'].count()
    
    print("\nMaterials Per Product Statistics:")
    print(f"  Average materials per product: {materials_per_product.mean():.1f}")
    print(f"  Minimum materials per product: {materials_per_product.min()}")
    print(f"  Maximum materials per product: {materials_per_product.max()}")
    
    # Material type distribution (if materials_df is available)
    if materials_df is not None:
        # Create a mapping of material_id to material_type
        material_type_map = dict(zip(materials_df['material_id'], materials_df['material_type']))
        
        # Add material type to BOM data
        bom_df['material_type'] = bom_df['material_id'].map(material_type_map)
        
        print("\nMaterial Type Distribution in BOMs:")
        type_counts = bom_df['material_type'].value_counts()
        for mat_type, count in type_counts.items():
            print(f"  {mat_type}: {count} ({count/len(bom_df)*100:.1f}%)")
    
    # Product family distribution (if products_df is available)
    if products_df is not None:
        # Create a mapping of product_id to product_family
        product_family_map = dict(zip(products_df['product_id'], products_df['product_family']))
        
        # Add product family to BOM data
        bom_df['product_family'] = bom_df['product_id'].map(product_family_map)
        
        print("\nProduct Family Distribution in BOMs:")
        family_counts = bom_df.groupby('product_family')['product_id'].nunique()
        total_products = family_counts.sum()
        
        for family, count in family_counts.items():
            print(f"  {family}: {count} products ({count/total_products*100:.1f}%)")
        
        # Materials per product by family
        print("\nAverage Materials Per Product by Family:")
        for family in family_counts.index:
            family_boms = bom_df[bom_df['product_family'] == family]
            avg_materials = family_boms.groupby('product_id')['material_id'].count().mean()
            print(f"  {family}: {avg_materials:.1f} materials per product")
    
    # Alternative materials statistics
    has_alternatives = bom_df['alternative_material_ids'].apply(lambda x: x != "[]").sum()
    print(f"\nBOM items with alternative materials: {has_alternatives} ({has_alternatives/len(bom_df)*100:.1f}%)")
    
    # Scrap factor statistics
    print("\nScrap Factor Statistics:")
    print(f"  Average scrap factor: {bom_df['scrap_factor'].mean():.1%}")
    print(f"  Minimum scrap factor: {bom_df['scrap_factor'].min():.1%}")
    print(f"  Maximum scrap factor: {bom_df['scrap_factor'].max():.1%}")
    
    # Reference designator statistics
    has_designator = bom_df['reference_designator'].apply(lambda x: x != "").sum()
    print(f"\nBOM items with reference designators: {has_designator} ({has_designator/len(bom_df)*100:.1f}%)")
    
    # Effective date distribution
    bom_df['effective_date'] = pd.to_datetime(bom_df['effective_date'])
    
    print("\nEffective Date Distribution:")
    oldest = bom_df['effective_date'].min()
    newest = bom_df['effective_date'].max()
    print(f"  Oldest effective date: {oldest.strftime('%Y-%m-%d')}")
    print(f"  Newest effective date: {newest.strftime('%Y-%m-%d')}")
    
    # Count by year
    bom_df['effective_year'] = bom_df['effective_date'].dt.year
    year_counts = bom_df['effective_year'].value_counts().sort_index()
    
    print("  Effective dates by year:")
    for year, count in year_counts.items():
        print(f"    {year}: {count} ({count/len(bom_df)*100:.1f}%)")
    
    # Obsolete date statistics
    has_obsolete = bom_df['obsolete_date'].apply(lambda x: x != "").sum()
    print(f"\nBOM items with obsolete dates: {has_obsolete} ({has_obsolete/len(bom_df)*100:.1f}%)")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    products_df = load_products_data()
    materials_df = load_materials_data()
    
    if products_df is not None and materials_df is not None:
        # Generate bill of materials data
        bom_df = generate_bill_of_materials(
            products_df,
            materials_df,
            output_file="data/bill_of_materials.csv"
        )
        
        # Display statistics
        if bom_df is not None:
            display_statistics(bom_df, products_df, materials_df)
            
            # Display sample data
            print("\nSample bill of materials data (first 5 records):")
            print(bom_df.head(5))

Successfully generated 680 bill of materials records for 84 products.
Data saved to data/bill_of_materials.csv

Bill of Materials Statistics:
Total BOM records: 680
Products with BOMs: 84
Materials used in BOMs: 145

BOM Level Distribution:
  Level 1: 426 (62.6%)
  Level 2: 205 (30.1%)
  Level 3: 49 (7.2%)

Materials Per Product Statistics:
  Average materials per product: 8.1
  Minimum materials per product: 2
  Maximum materials per product: 15

Material Type Distribution in BOMs:
  Raw Material: 372 (54.7%)
  Packaging: 164 (24.1%)
  WIP: 64 (9.4%)
  Intermediate: 43 (6.3%)
  Consumable: 37 (5.4%)

Product Family Distribution in BOMs:
  Automotive: 19 products (22.6%)
  Chemical: 9 products (10.7%)
  Consumer Goods: 17 products (20.2%)
  Electronics: 14 products (16.7%)
  Food & Beverage: 14 products (16.7%)
  Pharmaceutical: 11 products (13.1%)

Average Materials Per Product by Family:
  Automotive: 10.1 materials per product
  Chemical: 6.4 materials per product
  Consumer Goods: 

Customer Orders

In [8]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def generate_customers_data(num_customers=100, output_file="data/customers.csv"):
    """
    Generate synthetic data for the Customers table from ISA-95 Level 4.
    
    Parameters:
    - num_customers: Number of customer records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated customers data
    """
    # Define customer types and their probabilities
    customer_types = {
        "Distributor": 0.3,
        "Retailer": 0.25,
        "Wholesaler": 0.2,
        "Direct Customer": 0.15,
        "Contract Manufacturer": 0.1
    }
    
    # Define industries by customer type
    industries = {
        "Distributor": [
            "Pharmaceutical Distribution", "Food & Beverage Distribution", 
            "Chemical Distribution", "Industrial Supply", "Electronics Distribution",
            "Automotive Parts Distribution", "Consumer Goods Distribution"
        ],
        "Retailer": [
            "Pharmacy", "Grocery", "Department Store", "Specialty Store", 
            "Online Retail", "Home Improvement", "Electronics Retail"
        ],
        "Wholesaler": [
            "Pharmaceutical Wholesale", "Food Service", "Industrial Equipment", 
            "Building Materials", "Electronic Components", "Automotive Parts"
        ],
        "Direct Customer": [
            "Hospital", "Restaurant Chain", "Manufacturing", "Government", 
            "Educational Institution", "Healthcare Provider"
        ],
        "Contract Manufacturer": [
            "Pharmaceutical Manufacturing", "Food Processing", "Electronics Assembly", 
            "Automotive Manufacturing", "Medical Device Manufacturing"
        ]
    }
    
    # Define credit terms
    credit_terms = ["Net 30", "Net 45", "Net 60", "2/10 Net 30", "COD", "Prepaid"]
    credit_terms_weights = [0.4, 0.2, 0.15, 0.1, 0.1, 0.05]  # Probabilities
    
    # Define regions for address generation
    regions = {
        "North America": ["USA", "Canada"],
        "Europe": ["UK", "Germany", "France", "Italy", "Spain", "Netherlands"],
        "Asia": ["Japan", "China", "South Korea", "India", "Singapore", "Taiwan"],
        "Latin America": ["Mexico", "Brazil", "Colombia", "Argentina", "Chile"],
        "Oceania": ["Australia", "New Zealand"]
    }
    
    # Region probabilities (adjust as needed for your business model)
    region_weights = {
        "North America": 0.4,
        "Europe": 0.3,
        "Asia": 0.15,
        "Latin America": 0.1,
        "Oceania": 0.05
    }
    
    # Generate personnel IDs for account managers
    account_manager_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(10)]
    
    # Generate data structure
    data = {
        "customer_id": [f"CUST-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_customers)],
        "customer_name": [],
        "customer_type": [],
        "industry": [],
        "contact_person": [],
        "email": [],
        "phone": [],
        "address": [],
        "credit_terms": [],
        "credit_limit": [],
        "status": [],
        "account_manager_id": []
    }
    
    # First names for contact generation
    first_names = [
        "James", "Mary", "John", "Patricia", "Robert", "Jennifer", "Michael", "Linda", 
        "William", "Elizabeth", "David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", 
        "Thomas", "Sarah", "Charles", "Karen", "Christopher", "Nancy", "Daniel", "Lisa", 
        "Matthew", "Margaret", "Anthony", "Betty", "Mark", "Sandra", "Donald", "Ashley", 
        "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle", 
        "Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa", 
        "Edward", "Deborah", "Ronald", "Stephanie", "Timothy", "Rebecca", "Jason", "Sharon", 
        "Jeffrey", "Laura", "Ryan", "Cynthia", "Jacob", "Kathleen", "Gary", "Amy", 
        "Nicholas", "Shirley", "Eric", "Angela", "Jonathan", "Helen", "Stephen", "Anna", 
        "Larry", "Brenda", "Justin", "Pamela", "Scott", "Nicole", "Brandon", "Emma", 
        "Benjamin", "Samantha", "Samuel", "Katherine", "Gregory", "Christine", "Frank", "Debra", 
        "Alexander", "Rachel", "Raymond", "Catherine", "Patrick", "Carolyn", "Jack", "Janet", 
        "Dennis", "Ruth", "Jerry", "Maria", "Tyler", "Heather", "Aaron", "Diane", 
        "Jose", "Virginia", "Adam", "Julie", "Henry", "Joyce", "Nathan", "Victoria", 
        "Douglas", "Olivia", "Zachary", "Kelly", "Peter", "Christina", "Kyle", "Lauren", 
        "Walter", "Joan", "Ethan", "Evelyn", "Jeremy", "Judith", "Harold", "Megan", 
        "Keith", "Cheryl", "Christian", "Andrea", "Roger", "Hannah", "Noah", "Martha"
    ]
    
    # Last names for contact generation
    last_names = [
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", 
        "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", 
        "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson", 
        "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker", 
        "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores", 
        "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", 
        "Carter", "Roberts", "Gomez", "Phillips", "Evans", "Turner", "Diaz", "Parker", 
        "Cruz", "Edwards", "Collins", "Reyes", "Stewart", "Morris", "Morales", "Murphy", 
        "Cook", "Rogers", "Gutierrez", "Ortiz", "Morgan", "Cooper", "Peterson", "Bailey", 
        "Reed", "Kelly", "Howard", "Ramos", "Kim", "Cox", "Ward", "Richardson", "Watson", 
        "Brooks", "Chavez", "Wood", "James", "Bennett", "Gray", "Mendoza", "Ruiz", "Hughes", 
        "Price", "Alvarez", "Castillo", "Sanders", "Patel", "Myers", "Long", "Ross", 
        "Foster", "Jimenez", "Powell", "Jenkins", "Perry", "Russell", "Sullivan", "Bell", 
        "Coleman", "Butler", "Henderson", "Barnes", "Gonzales", "Fisher", "Vasquez", "Simmons", 
        "Romero", "Jordan", "Patterson", "Alexander", "Hamilton", "Graham", "Reynolds", "Griffin", 
        "Wallace", "Moreno", "West", "Cole", "Hayes", "Bryant", "Herrera", "Gibson", "Ellis"
    ]
    
    # Company name components for generating realistic company names
    company_prefixes = [
        "Advanced", "Allied", "American", "Asian", "Atlantic", "Best", "Better", "Blue", 
        "Bright", "Central", "Century", "Consolidated", "Continental", "Digital", "Dynamic", 
        "East", "Eastern", "Euro", "European", "Express", "First", "Global", "Golden", 
        "Great", "Green", "International", "Metro", "Modern", "National", "New", "North", 
        "Northern", "Pacific", "Pan", "Premier", "Prime", "Pro", "Professional", "Quality", 
        "Red", "Regional", "Royal", "Select", "South", "Southern", "Standard", "Superior", 
        "Supreme", "Trans", "United", "Universal", "West", "Western", "World"
    ]
    
    company_types = {
        "Distributor": ["Distribution", "Distributors", "Supply", "Supplies", "Logistics"],
        "Retailer": ["Retail", "Stores", "Mart", "Market", "Shop", "Outlet"],
        "Wholesaler": ["Wholesale", "Trading", "Merchandise", "Commerce"],
        "Direct Customer": ["Industries", "Group", "Corporation", "Co.", "Inc.", "Enterprises"],
        "Contract Manufacturer": ["Manufacturing", "Production", "Fabrication", "Industries", "Processors"]
    }
    
    # Generate data for each customer
    for i in range(num_customers):
        # Select customer type (weighted random)
        customer_type = random.choices(
            list(customer_types.keys()), 
            weights=list(customer_types.values())
        )[0]
        data["customer_type"].append(customer_type)
        
        # Generate a realistic company name
        if random.random() < 0.7:  # 70% chance of using prefix
            prefix = random.choice(company_prefixes)
            if customer_type in company_types:
                suffix = random.choice(company_types[customer_type])
            else:
                suffix = random.choice(list(company_types.values())[0])
                
            company_name = f"{prefix} {suffix}"
        else:
            # Use a last name
            last_name = random.choice(last_names)
            if customer_type in company_types:
                suffix = random.choice(company_types[customer_type])
            else:
                suffix = random.choice(list(company_types.values())[0])
                
            company_name = f"{last_name} {suffix}"
        
        data["customer_name"].append(company_name)
        
        # Select industry based on customer type
        if customer_type in industries:
            industry = random.choice(industries[customer_type])
        else:
            # Default to general industry
            industry = "General Manufacturing"
        
        data["industry"].append(industry)
        
        # Generate contact person (random first and last name)
        contact_first = random.choice(first_names)
        contact_last = random.choice(last_names)
        data["contact_person"].append(f"{contact_first} {contact_last}")
        
        # Generate email (company domain based on name)
        company_domain = company_name.lower().replace(" ", "").replace(".", "")
        email_domains = [".com", ".net", ".org", ".co", ".biz"]
        email_domain = random.choice(email_domains)
        data["email"].append(f"{contact_first.lower()}.{contact_last.lower()}@{company_domain}{email_domain}")
        
        # Generate phone
        data["phone"].append(f"+{random.randint(1, 9)}{random.randint(10, 99)} {random.randint(100, 999)} {random.randint(100, 999)} {random.randint(1000, 9999)}")
        
        # Generate address based on region probability
        region = random.choices(
            list(region_weights.keys()), 
            weights=list(region_weights.values())
        )[0]
        
        country = random.choice(regions[region])
        
        # Generate a city name (simplified)
        city_prefixes = ["New", "Old", "East", "West", "North", "South", "Central", "Upper", "Lower", "Port", "Lake", "Mount", "Fort"]
        city_suffixes = ["town", "ville", "burg", "berg", "field", "ford", "port", "mouth", "stad", "furt", "chester", "cester", "bridge", "haven", "minster"]
        
        if random.random() < 0.3:  # 30% chance of using prefix
            city = f"{random.choice(city_prefixes)} {random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        else:
            city = f"{random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        
        # Generate street address
        street_number = random.randint(1, 9999)
        street_types = ["Street", "Avenue", "Boulevard", "Road", "Lane", "Drive", "Way", "Place", "Court", "Terrace"]
        street_name = f"{random.choice(last_names)} {random.choice(street_types)}"
        
        address = f"{street_number} {street_name}, {city}, {country}"
        data["address"].append(address)
        
        # Set credit terms (weighted random)
        credit_term = random.choices(credit_terms, weights=credit_terms_weights)[0]
        data["credit_terms"].append(credit_term)
        
        # Set credit limit based on customer type
        if customer_type in ["Distributor", "Wholesaler"]:
            # Larger customers typically have higher credit limits
            credit_limit = random.randint(50000, 500000)
        elif customer_type == "Retailer":
            credit_limit = random.randint(10000, 100000)
        elif customer_type == "Contract Manufacturer":
            credit_limit = random.randint(100000, 1000000)
        else:
            credit_limit = random.randint(5000, 50000)
            
        data["credit_limit"].append(credit_limit)
        
        # Set status (mostly active)
        statuses = ["Active", "Inactive", "On Hold", "New", "Archived"]
        status_weights = [0.8, 0.05, 0.05, 0.07, 0.03]  # Probabilities
        data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Assign account manager
        data["account_manager_id"].append(random.choice(account_manager_ids))
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} customer records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_customer_orders(customers_df, products_df=None, num_orders=300, 
                           start_time=None, end_time=None, output_file="data/customer_orders.csv"):
    """
    Generate synthetic data for the CustomerOrders table from ISA-95 Level 4.
    
    Parameters:
    - customers_df: DataFrame containing customers data
    - products_df: DataFrame containing products data (optional)
    - num_orders: Number of customer order records to generate
    - start_time: Start time for order dates (defaults to 365 days ago)
    - end_time: End time for order dates (defaults to 30 days in the future)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated customer orders data
    """
    if customers_df is None or len(customers_df) == 0:
        print("Error: No customers data available.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=365)
    if end_time is None:
        end_time = datetime.now() + timedelta(days=30)
    
    # Generate product IDs if products_df is not provided
    if products_df is None or len(products_df) == 0:
        print("Generating synthetic product IDs...")
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    else:
        # Use actual product IDs from products_df
        product_ids = products_df['product_id'].tolist()
    
    # Define order types and their probabilities
    order_types = {
        "Standard": 0.7,
        "Rush": 0.1,
        "Scheduled": 0.1,
        "Blanket": 0.05,
        "Sample": 0.05
    }
    
    # Define priority levels
    priority_levels = [1, 2, 3, 4, 5]  # 1 = highest, 5 = lowest
    priority_weights = [0.1, 0.2, 0.4, 0.2, 0.1]  # Most orders are medium priority
    
    # Define payment terms
    payment_terms = ["Net 30", "Net 45", "Net 60", "2/10 Net 30", "COD", "Prepaid"]
    
    # Define shipping methods
    shipping_methods = ["Truck", "Air", "Sea", "Rail", "Express", "Courier", "Customer Pickup"]
    
    # Generate sales rep IDs
    sales_rep_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(15)]
    
    # Generate data structure
    data = {
        "order_id": [f"CO-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_orders)],
        "customer_id": [],
        "order_date": [],
        "requested_delivery_date": [],
        "promised_delivery_date": [],
        "status": [],
        "order_type": [],
        "priority": [],
        "order_value": [],
        "payment_terms": [],
        "shipping_method": [],
        "sales_rep_id": [],
        "notes": []
    }
    
    # Generate data for each order
    for i in range(num_orders):
        # Select customer (more active customers place more orders)
        active_customers = customers_df[customers_df['status'] == 'Active']
        
        if len(active_customers) > 0:
            # Prefer active customers (80% chance)
            if random.random() < 0.8:
                customer = active_customers.sample(1).iloc[0]
            else:
                customer = customers_df.sample(1).iloc[0]
        else:
            customer = customers_df.sample(1).iloc[0]
            
        data["customer_id"].append(customer['customer_id'])
        
        # Generate order date
        time_range_days = (end_time - start_time).days
        days_from_start = random.randint(0, time_range_days)
        order_date = start_time + timedelta(days=days_from_start)
        data["order_date"].append(order_date.strftime("%Y-%m-%d"))
        
        # Select order type (weighted random)
        order_type = random.choices(
            list(order_types.keys()), 
            weights=list(order_types.values())
        )[0]
        data["order_type"].append(order_type)
        
        # Generate requested delivery date based on order type
        if order_type == "Rush":
            # Rush orders have shorter delivery windows
            delivery_window = random.randint(1, 14)  # 1-14 days
        elif order_type == "Standard":
            delivery_window = random.randint(14, 45)  # 2-6 weeks
        elif order_type == "Scheduled":
            delivery_window = random.randint(30, 90)  # 1-3 months
        elif order_type == "Blanket":
            delivery_window = random.randint(60, 180)  # 2-6 months
        else:  # Sample
            delivery_window = random.randint(7, 30)  # 1-4 weeks
            
        requested_delivery_date = order_date + timedelta(days=delivery_window)
        data["requested_delivery_date"].append(requested_delivery_date.strftime("%Y-%m-%d"))
        
        # Generate promised delivery date (usually close to requested, but can vary)
        promise_variation = random.randint(-5, 10)  # -5 to +10 days from requested
        promised_delivery_date = requested_delivery_date + timedelta(days=promise_variation)
        
        # Ensure promised date is not before order date
        if promised_delivery_date <= order_date:
            promised_delivery_date = order_date + timedelta(days=1)
            
        data["promised_delivery_date"].append(promised_delivery_date.strftime("%Y-%m-%d"))
        
        # Determine order status based on dates
        current_date = datetime.now()
        
        if order_date > current_date:
            # Future orders are typically in Draft or Pending status
            status = random.choice(["Draft", "Pending"])
        elif requested_delivery_date > current_date:
            # Current orders are In Process or Confirmed
            status = random.choice(["In Process", "Confirmed", "Partially Shipped"])
        else:
            # Past orders are Completed, Cancelled, or On Hold
            status_options = ["Completed", "Completed", "Completed", "Cancelled", "On Hold"]  # Weighted for more completed
            status = random.choice(status_options)
            
        data["status"].append(status)
        
        # Set priority (weighted random)
        priority = random.choices(priority_levels, weights=priority_weights)[0]
        
        # Rush orders typically have higher priority
        if order_type == "Rush" and priority > 2:
            priority = random.randint(1, 2)
            
        data["priority"].append(priority)
        
        # Generate order value (based on customer credit limit as a rough guide)
        if 'credit_limit' in customer:
            max_order = customer['credit_limit'] * 0.5  # Typically orders are less than 50% of credit limit
        else:
            max_order = 50000
            
        order_value = random.uniform(1000, max_order)
        data["order_value"].append(round(order_value, 2))
        
        # Set payment terms (use customer terms if available)
        if 'credit_terms' in customer and pd.notna(customer['credit_terms']):
            data["payment_terms"].append(customer['credit_terms'])
        else:
            data["payment_terms"].append(random.choice(payment_terms))
        
        # Set shipping method
        data["shipping_method"].append(random.choice(shipping_methods))
        
        # Assign sales rep
        data["sales_rep_id"].append(random.choice(sales_rep_ids))
        
        # Generate notes (mostly empty)
        if random.random() < 0.2:  # 20% chance of having notes
            notes_options = [
                "Customer requested special packaging",
                "Delivery must be on exact date",
                "Call customer before shipping",
                "Include certificates of analysis",
                "Partial shipments acceptable",
                "Do not substitute products",
                f"Reference PO #{random.randint(10000, 99999)}",
                "Preferred carrier requested",
                "Weekend delivery authorized",
                "Contact warehouse manager upon arrival"
            ]
            data["notes"].append(random.choice(notes_options))
        else:
            data["notes"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} customer order records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_order_lines(customer_orders_df, products_df=None, output_file="data/order_lines.csv"):
    """
    Generate synthetic data for the OrderLines table from ISA-95 Level 4.
    
    Parameters:
    - customer_orders_df: DataFrame containing customer orders data
    - products_df: DataFrame containing products data (optional)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated order lines data
    """
    if customer_orders_df is None or len(customer_orders_df) == 0:
        print("Error: No customer orders data available.")
        return None
    
    # Generate product IDs if products_df is not provided
    if products_df is None or len(products_df) == 0:
        print("Generating synthetic product IDs...")
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
        
        # Create synthetic product prices
        product_prices = {}
        for prod_id in product_ids:
            product_prices[prod_id] = random.uniform(10, 1000)
    else:
        # Use actual product IDs and prices from products_df
        product_ids = products_df['product_id'].tolist()
        
        # Create price mapping
        product_prices = {}
        for _, product in products_df.iterrows():
            if 'list_price' in product and pd.notna(product['list_price']):
                product_prices[product['product_id']] = product['list_price']
            else:
                product_prices[product['product_id']] = random.uniform(10, 1000)
    
    # Generate work order IDs
    work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(100)]
    
    # Generate data structure
    data = {
        "line_id": [],
        "order_id": [],
        "line_number": [],
        "product_id": [],
        "quantity": [],
        "unit_price": [],
        "line_value": [],
        "requested_delivery_date": [],
        "promised_delivery_date": [],
        "status": [],
        "work_order_id": [],
        "shipped_quantity": [],
        "shipping_date": []
    }
    
    # Process each customer order
    for _, order in customer_orders_df.iterrows():
        order_id = order['order_id']
        order_status = order['status']
        
        # Determine number of line items for this order
        num_lines = random.randint(1, 10)  # 1-10 line items per order
        
        # Keep track of selected products for this order to avoid duplicates
        selected_products = []
        
        # Get order dates
        order_date = pd.to_datetime(order['order_date'])
        requested_delivery_date = pd.to_datetime(order['requested_delivery_date'])
        promised_delivery_date = pd.to_datetime(order['promised_delivery_date'])
        
        # Generate line items
        for line_num in range(1, num_lines + 1):
            # Create unique line ID
            line_id = f"LINE-{uuid.uuid4().hex[:8].upper()}"
            data["line_id"].append(line_id)
            data["order_id"].append(order_id)
            data["line_number"].append(line_num)
            
            # Select product (avoid duplicates within same order)
            available_products = [p for p in product_ids if p not in selected_products]
            
            if not available_products:
                # If we've used all products, just pick a random one
                product_id = random.choice(product_ids)
            else:
                product_id = random.choice(available_products)
                selected_products.append(product_id)
                
            data["product_id"].append(product_id)
            
            # Generate quantity
            quantity = random.randint(1, 1000)
            data["quantity"].append(quantity)
            
            # Get unit price
            if product_id in product_prices:
                unit_price = product_prices[product_id]
            else:
                unit_price = random.uniform(10, 1000)
                
            # Apply random discount/markup
            price_adjustment = random.uniform(0.9, 1.1)  # -10% to +10%
            unit_price = unit_price * price_adjustment
            
            data["unit_price"].append(round(unit_price, 2))
            
            # Calculate line value
            line_value = quantity * unit_price
            data["line_value"].append(round(line_value, 2))
            
            # Set delivery dates (can vary slightly from order dates for individual lines)
            line_req_variation = random.randint(-3, 3)  # +/- 3 days
            line_requested_date = requested_delivery_date + timedelta(days=line_req_variation)
            
            line_prom_variation = random.randint(-2, 2)  # +/- 2 days
            line_promised_date = promised_delivery_date + timedelta(days=line_prom_variation)
            
            # Ensure dates make sense
            if line_requested_date < order_date:
                line_requested_date = order_date + timedelta(days=1)
                
            if line_promised_date < order_date:
                line_promised_date = order_date + timedelta(days=1)
                
            data["requested_delivery_date"].append(line_requested_date.strftime("%Y-%m-%d"))
            data["promised_delivery_date"].append(line_promised_date.strftime("%Y-%m-%d"))
            
            # Set line status based on order status
            if order_status == "Draft" or order_status == "Pending":
                line_status = order_status
                data["work_order_id"].append("")
                data["shipped_quantity"].append(0)
                data["shipping_date"].append("")
                
            elif order_status == "Confirmed":
                line_status = "Confirmed"
                
                # Some confirmed orders have work orders
                if random.random() < 0.7:  # 70% chance
                    data["work_order_id"].append(random.choice(work_order_ids))
                else:
                    data["work_order_id"].append("")
                    
                data["shipped_quantity"].append(0)
                data["shipping_date"].append("")
                
            elif order_status == "In Process":
                line_statuses = ["Confirmed", "In Production", "Ready to Ship", "Partially Shipped"]
                line_status = random.choice(line_statuses)
                
                # Most in-process lines have work orders
                if random.random() < 0.9:  # 90% chance
                    data["work_order_id"].append(random.choice(work_order_ids))
                else:
                    data["work_order_id"].append("")
                
                # Some lines may be partially shipped
                if line_status == "Partially Shipped":
                    shipped_qty = random.randint(1, quantity - 1)
                    data["shipped_quantity"].append(shipped_qty)
                    
                    # Shipping date is between order date and current date
                    days_difference = (datetime.now() - order_date).days
                    if days_difference >= 1:
                        ship_days = random.randint(1, days_difference)
                        shipping_date = order_date + timedelta(days=ship_days)
                        data["shipping_date"].append(shipping_date.strftime("%Y-%m-%d"))
                    else:
                        # Handle the case when order_date is today or in the future
                        data["shipping_date"].append("")

                else:
                    data["shipped_quantity"].append(0)
                    data["shipping_date"].append("")
                
            elif order_status == "Partially Shipped":
                # Mix of shipped and unshipped lines
                if random.random() < 0.6:  # 60% chance this line is shipped
                    line_status = "Shipped"
                    data["shipped_quantity"].append(quantity)
                    
                    # Shipping date is between order date and current date
                    days_difference = (datetime.now() - order_date).days
                    if days_difference >= 1:
                        ship_days = random.randint(1, days_difference)
                        shipping_date = order_date + timedelta(days=ship_days)
                        data["shipping_date"].append(shipping_date.strftime("%Y-%m-%d"))
                    else:
                        # Handle the case when order_date is today or in the future
                        data["shipping_date"].append("")
                else:
                    line_status = random.choice(["Confirmed", "In Production", "Ready to Ship"])
                    data["shipped_quantity"].append(0)
                    data["shipping_date"].append("")
                
                # Most lines have work orders
                if random.random() < 0.9:  # 90% chance
                    data["work_order_id"].append(random.choice(work_order_ids))
                else:
                    data["work_order_id"].append("")
                
            elif order_status == "Completed":
                line_status = "Shipped"
                data["shipped_quantity"].append(quantity)
                
                # Shipping date is between order date and promised date
                ship_days = random.randint(1, (promised_delivery_date - order_date).days)
                shipping_date = order_date + timedelta(days=ship_days)
                data["shipping_date"].append(shipping_date.strftime("%Y-%m-%d"))
                
                # Most completed lines have work orders
                if random.random() < 0.95:  # 95% chance
                    data["work_order_id"].append(random.choice(work_order_ids))
                else:
                    data["work_order_id"].append("")
                
            elif order_status == "Cancelled":
                line_status = "Cancelled"
                data["work_order_id"].append("")
                data["shipped_quantity"].append(0)
                data["shipping_date"].append("")
                
            else:  # On Hold
                line_status = "On Hold"
                
                # Some on-hold orders have work orders
                if random.random() < 0.4:  # 40% chance
                    data["work_order_id"].append(random.choice(work_order_ids))
                else:
                    data["work_order_id"].append("")
                    
                data["shipped_quantity"].append(0)
                data["shipping_date"].append("")
            
            data["status"].append(line_status)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} order line records for {df['order_id'].nunique()} orders.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(customers_df, customer_orders_df=None, order_lines_df=None):
    """
    Display basic statistics about the generated customers, orders, and order lines data
    
    Parameters:
    - customers_df: DataFrame containing customers data
    - customer_orders_df: DataFrame containing customer orders data (optional)
    - order_lines_df: DataFrame containing order lines data (optional)
    """
    if customers_df is None or len(customers_df) == 0:
        print("No customers data to analyze.")
        return
    
    print("\nCustomers Statistics:")
    print(f"Total customers: {len(customers_df)}")
    
    # Customer type distribution
    print("\nCustomer Type Distribution:")
    type_counts = customers_df['customer_type'].value_counts()
    for cust_type, count in type_counts.items():
        print(f"  {cust_type}: {count} ({count/len(customers_df)*100:.1f}%)")
    
    # Industry distribution
    print("\nIndustry Distribution (top 10):")
    industry_counts = customers_df['industry'].value_counts().head(10)
    for industry, count in industry_counts.items():
        print(f"  {industry}: {count} ({count/len(customers_df)*100:.1f}%)")
    
    # Status distribution
    print("\nCustomer Status Distribution:")
    status_counts = customers_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(customers_df)*100:.1f}%)")
    
    # Credit terms distribution
    print("\nCredit Terms Distribution:")
    terms_counts = customers_df['credit_terms'].value_counts()
    for terms, count in terms_counts.items():
        print(f"  {terms}: {count} ({count/len(customers_df)*100:.1f}%)")
    
    # Credit limit statistics
    print("\nCredit Limit Statistics:")
    print(f"  Average credit limit: ${customers_df['credit_limit'].mean():.2f}")
    print(f"  Minimum credit limit: ${customers_df['credit_limit'].min():.2f}")
    print(f"  Maximum credit limit: ${customers_df['credit_limit'].max():.2f}")
    
    # Regional distribution (extracted from address)
    if 'address' in customers_df.columns:
        print("\nRegional Distribution:")
        
        # Extract country from address
        customers_df['country'] = customers_df['address'].apply(lambda x: x.split(',')[-1].strip() if isinstance(x, str) else "Unknown")
        
        country_counts = customers_df['country'].value_counts().head(10)
        for country, count in country_counts.items():
            print(f"  {country}: {count} ({count/len(customers_df)*100:.1f}%)")
    
    # Customer orders statistics (if available)
    if customer_orders_df is not None and len(customer_orders_df) > 0:
        print("\nCustomer Orders Statistics:")
        print(f"Total orders: {len(customer_orders_df)}")
        print(f"Customers with orders: {customer_orders_df['customer_id'].nunique()}")
        
        # Order type distribution
        print("\nOrder Type Distribution:")
        order_type_counts = customer_orders_df['order_type'].value_counts()
        for order_type, count in order_type_counts.items():
            print(f"  {order_type}: {count} ({count/len(customer_orders_df)*100:.1f}%)")
        
        # Order status distribution
        print("\nOrder Status Distribution:")
        order_status_counts = customer_orders_df['status'].value_counts()
        for status, count in order_status_counts.items():
            print(f"  {status}: {count} ({count/len(customer_orders_df)*100:.1f}%)")
        
        # Priority distribution
        print("\nOrder Priority Distribution:")
        priority_counts = customer_orders_df['priority'].value_counts().sort_index()
        for priority, count in priority_counts.items():
            print(f"  Priority {priority}: {count} ({count/len(customer_orders_df)*100:.1f}%)")
        
        # Order value statistics
        print("\nOrder Value Statistics:")
        print(f"  Average order value: ${customer_orders_df['order_value'].mean():.2f}")
        print(f"  Minimum order value: ${customer_orders_df['order_value'].min():.2f}")
        print(f"  Maximum order value: ${customer_orders_df['order_value'].max():.2f}")
        print(f"  Total order value: ${customer_orders_df['order_value'].sum():.2f}")
        
        # Shipping method distribution
        print("\nShipping Method Distribution:")
        shipping_counts = customer_orders_df['shipping_method'].value_counts()
        for method, count in shipping_counts.items():
            print(f"  {method}: {count} ({count/len(customer_orders_df)*100:.1f}%)")
        
        # Orders per customer
        orders_per_customer = customer_orders_df.groupby('customer_id').size()
        print("\nOrders Per Customer Statistics:")
        print(f"  Average orders per customer: {orders_per_customer.mean():.1f}")
        print(f"  Maximum orders per customer: {orders_per_customer.max()}")
        
        # Date-based analysis
        customer_orders_df['order_date'] = pd.to_datetime(customer_orders_df['order_date'])
        
        # Orders by month
        customer_orders_df['order_month'] = customer_orders_df['order_date'].dt.to_period('M')
        monthly_orders = customer_orders_df.groupby('order_month').size().sort_index()
        
        print("\nMonthly Order Distribution (last 6 months):")
        for month, count in monthly_orders.tail(6).items():
            print(f"  {month}: {count} orders")
    
    # Order lines statistics (if available)
    if order_lines_df is not None and len(order_lines_df) > 0:
        print("\nOrder Lines Statistics:")
        print(f"Total order lines: {len(order_lines_df)}")
        print(f"Orders with lines: {order_lines_df['order_id'].nunique()}")
        
        # Lines per order
        lines_per_order = order_lines_df.groupby('order_id').size()
        print("\nLines Per Order Statistics:")
        print(f"  Average lines per order: {lines_per_order.mean():.1f}")
        print(f"  Minimum lines per order: {lines_per_order.min()}")
        print(f"  Maximum lines per order: {lines_per_order.max()}")
        
        # Line status distribution
        print("\nLine Status Distribution:")
        line_status_counts = order_lines_df['status'].value_counts()
        for status, count in line_status_counts.items():
            print(f"  {status}: {count} ({count/len(order_lines_df)*100:.1f}%)")
        
        # Line value statistics
        print("\nLine Value Statistics:")
        print(f"  Average line value: ${order_lines_df['line_value'].mean():.2f}")
        print(f"  Total line value: ${order_lines_df['line_value'].sum():.2f}")
        
        # Quantity statistics
        print("\nQuantity Statistics:")
        print(f"  Average order quantity: {order_lines_df['quantity'].mean():.1f}")
        print(f"  Average unit price: ${order_lines_df['unit_price'].mean():.2f}")
        
        # Work order association
        has_wo = order_lines_df['work_order_id'].apply(lambda x: x != "").sum()
        print(f"\nLines with work order association: {has_wo} ({has_wo/len(order_lines_df)*100:.1f}%)")
        
        # Shipping statistics
        shipped_lines = order_lines_df[order_lines_df['shipped_quantity'] > 0]
        if len(shipped_lines) > 0:
            print("\nShipping Statistics:")
            print(f"  Shipped lines: {len(shipped_lines)} ({len(shipped_lines)/len(order_lines_df)*100:.1f}%)")
            print(f"  Total shipped quantity: {shipped_lines['shipped_quantity'].sum()}")
            
            # Calculate ship-to-promise ratio
            shipped_lines['shipping_date'] = pd.to_datetime(shipped_lines['shipping_date'])
            shipped_lines['promised_delivery_date'] = pd.to_datetime(shipped_lines['promised_delivery_date'])
            
            # On-time shipments (shipped on or before promised date)
            on_time = shipped_lines[shipped_lines['shipping_date'] <= shipped_lines['promised_delivery_date']]
            on_time_pct = len(on_time) / len(shipped_lines) * 100
            
            print(f"  On-time shipments: {len(on_time)} ({on_time_pct:.1f}% of shipped lines)")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Try to load products data if available
    try:
        products_df = pd.read_csv("data/products.csv")
    except FileNotFoundError:
        print("Note: Products data file not found.")
        products_df = None
    
    # Generate customers data
    customers_df = generate_customers_data(
        num_customers=100,  # Generate 100 customer records
        output_file="data/customers.csv"
    )
    
    if customers_df is not None:
        # Generate customer orders data
        customer_orders_df = generate_customer_orders(
            customers_df,
            products_df,
            num_orders=300,  # Generate 300 order records
            output_file="data/customer_orders.csv"
        )
        
        if customer_orders_df is not None:
            # Generate order lines data
            order_lines_df = generate_order_lines(
                customer_orders_df,
                products_df,
                output_file="data/order_lines.csv"
            )
        else:
            order_lines_df = None
    else:
        customer_orders_df = None
        order_lines_df = None
    
    # Display statistics
    if customers_df is not None:
        display_statistics(customers_df, customer_orders_df, order_lines_df)
        
        # Display sample data
        print("\nSample customers data (first 5 records):")
        print(customers_df.head(5))
        
        if customer_orders_df is not None:
            print("\nSample customer orders data (first 5 records):")
            print(customer_orders_df.head(5))
            
            if order_lines_df is not None:
                print("\nSample order lines data (first 5 records):")
                print(order_lines_df.head(5))

Successfully generated 100 customer records.
Data saved to data/customers.csv
Successfully generated 300 customer order records.
Data saved to data/customer_orders.csv
Successfully generated 1726 order line records for 300 orders.
Data saved to data/order_lines.csv

Customers Statistics:
Total customers: 100

Customer Type Distribution:
  Retailer: 28 (28.0%)
  Distributor: 25 (25.0%)
  Wholesaler: 21 (21.0%)
  Contract Manufacturer: 14 (14.0%)
  Direct Customer: 12 (12.0%)

Industry Distribution (top 10):
  Food & Beverage Distribution: 8 (8.0%)
  Specialty Store: 8 (8.0%)
  Home Improvement: 6 (6.0%)
  Pharmaceutical Manufacturing: 5 (5.0%)
  Industrial Equipment: 5 (5.0%)
  Food Processing: 5 (5.0%)
  Building Materials: 5 (5.0%)
  Electronics Retail: 5 (5.0%)
  Automotive Parts Distribution: 4 (4.0%)
  Automotive Manufacturing: 4 (4.0%)

Customer Status Distribution:
  Active: 77 (77.0%)
  New: 8 (8.0%)
  On Hold: 5 (5.0%)
  Inactive: 5 (5.0%)
  Archived: 5 (5.0%)

Credit Terms Dis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shipped_lines['shipping_date'] = pd.to_datetime(shipped_lines['shipping_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shipped_lines['promised_delivery_date'] = pd.to_datetime(shipped_lines['promised_delivery_date'])


Suppliers

In [9]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def generate_suppliers_data(num_suppliers=50, output_file="data/suppliers.csv"):
    """
    Generate synthetic data for the Suppliers table from ISA-95 Level 4.
    
    Parameters:
    - num_suppliers: Number of supplier records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated suppliers data
    """
    # Define supplier types and their probabilities
    supplier_types = {
        "Manufacturer": 0.4,
        "Distributor": 0.25,
        "Wholesaler": 0.15,
        "Service Provider": 0.1,
        "Contractor": 0.1
    }
    
    # Define supplier categories by material type
    supplier_categories = {
        "Raw Material": ["Chemical Supplier", "Industrial Raw Material", "Commodity Supplier", 
                      "Mining Company", "Agricultural Supplier", "Petroleum Supplier"],
        "Packaging": ["Packaging Manufacturer", "Container Supplier", "Label Supplier", 
                   "Film Supplier", "Box Manufacturer", "Bottle Supplier"],
        "Equipment": ["Equipment Manufacturer", "Machinery Supplier", "Tool Vendor", 
                   "Instrumentation Supplier", "Automation Provider", "Parts Supplier"],
        "Service": ["Maintenance Service", "Calibration Service", "Cleaning Service", 
                 "Engineering Consultant", "Testing Laboratory", "Transportation Provider"],
        "Consumable": ["Laboratory Supplies", "MRO Supplier", "Office Supplies", 
                    "Safety Equipment", "Utility Provider", "IT Service Provider"]
    }
    
    # Define payment terms
    payment_terms = ["Net 30", "Net 45", "Net 60", "2/10 Net 30", "COD", "Net 15"]
    payment_terms_weights = [0.4, 0.2, 0.15, 0.1, 0.1, 0.05]  # Probabilities
    
    # Define regions for address generation
    regions = {
        "North America": ["USA", "Canada", "Mexico"],
        "Europe": ["UK", "Germany", "France", "Italy", "Spain", "Netherlands"],
        "Asia": ["Japan", "China", "South Korea", "India", "Singapore", "Taiwan"],
        "Latin America": ["Brazil", "Colombia", "Argentina", "Chile", "Peru"],
        "Oceania": ["Australia", "New Zealand"]
    }
    
    # Region probabilities (adjust as needed for your supply chain model)
    region_weights = {
        "North America": 0.35,
        "Europe": 0.25,
        "Asia": 0.25,
        "Latin America": 0.1,
        "Oceania": 0.05
    }
    
    # Generate data structure
    data = {
        "supplier_id": [f"SUP-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_suppliers)],
        "supplier_name": [],
        "supplier_type": [],
        "contact_person": [],
        "email": [],
        "phone": [],
        "address": [],
        "payment_terms": [],
        "lead_time_days": [],
        "quality_rating": [],
        "status": [],
        "primary_materials": [],
        "notes": []
    }
    
    # First names for contact generation
    first_names = [
        "James", "Mary", "John", "Patricia", "Robert", "Jennifer", "Michael", "Linda", 
        "William", "Elizabeth", "David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", 
        "Thomas", "Sarah", "Charles", "Karen", "Christopher", "Nancy", "Daniel", "Lisa", 
        "Matthew", "Margaret", "Anthony", "Betty", "Mark", "Sandra", "Donald", "Ashley", 
        "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle"
    ]
    
    # Last names for contact generation
    last_names = [
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", 
        "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", 
        "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson", 
        "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson", "Walker"
    ]
    
    # Company name components for generating realistic company names
    company_prefixes = [
        "Advanced", "Allied", "American", "Asian", "Atlantic", "Best", "Better", "Blue", 
        "Bright", "Central", "Century", "Consolidated", "Continental", "Digital", "Dynamic", 
        "East", "Eastern", "Euro", "European", "Express", "First", "Global", "Golden", 
        "Great", "Green", "International", "Metro", "Modern", "National", "New", "North"
    ]
    
    company_types = {
        "Manufacturer": ["Manufacturing", "Industries", "Fabrication", "Production", "Processors"],
        "Distributor": ["Distribution", "Distributors", "Supply", "Supplies", "Logistics"],
        "Wholesaler": ["Wholesale", "Trading", "Merchandise", "Commerce", "Exchange"],
        "Service Provider": ["Services", "Solutions", "Consulting", "Associates", "Group"],
        "Contractor": ["Contracting", "Construction", "Engineering", "Installations", "Systems"]
    }
    
    # Generate data for each supplier
    for i in range(num_suppliers):
        # Select supplier type (weighted random)
        supplier_type = random.choices(
            list(supplier_types.keys()), 
            weights=list(supplier_types.values())
        )[0]
        data["supplier_type"].append(supplier_type)
        
        # Generate a realistic company name
        if random.random() < 0.7:  # 70% chance of using prefix
            prefix = random.choice(company_prefixes)
            if supplier_type in company_types:
                suffix = random.choice(company_types[supplier_type])
            else:
                suffix = random.choice(list(company_types.values())[0])
                
            company_name = f"{prefix} {suffix}"
        else:
            # Use a last name
            last_name = random.choice(last_names)
            if supplier_type in company_types:
                suffix = random.choice(company_types[supplier_type])
            else:
                suffix = random.choice(list(company_types.values())[0])
                
            company_name = f"{last_name} {suffix}"
        
        data["supplier_name"].append(company_name)
        
        # Generate contact person (random first and last name)
        contact_first = random.choice(first_names)
        contact_last = random.choice(last_names)
        data["contact_person"].append(f"{contact_first} {contact_last}")
        
        # Generate email (company domain based on name)
        company_domain = company_name.lower().replace(" ", "").replace(".", "")
        email_domains = [".com", ".net", ".org", ".co", ".biz"]
        email_domain = random.choice(email_domains)
        data["email"].append(f"{contact_first.lower()}.{contact_last.lower()}@{company_domain}{email_domain}")
        
        # Generate phone
        data["phone"].append(f"+{random.randint(1, 9)}{random.randint(10, 99)} {random.randint(100, 999)} {random.randint(100, 999)} {random.randint(1000, 9999)}")
        
        # Generate address based on region probability
        region = random.choices(
            list(region_weights.keys()), 
            weights=list(region_weights.values())
        )[0]
        
        country = random.choice(regions[region])
        
        # Generate a city name (simplified)
        city_prefixes = ["New", "Old", "East", "West", "North", "South", "Central", "Upper", "Lower", "Port", "Lake", "Mount", "Fort"]
        city_suffixes = ["town", "ville", "burg", "berg", "field", "ford", "port", "mouth", "stad", "furt", "chester", "cester", "bridge", "haven", "minster"]
        
        if random.random() < 0.3:  # 30% chance of using prefix
            city = f"{random.choice(city_prefixes)} {random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        else:
            city = f"{random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        
        # Generate street address
        street_number = random.randint(1, 9999)
        street_types = ["Street", "Avenue", "Boulevard", "Road", "Lane", "Drive", "Way", "Place", "Court", "Terrace"]
        street_name = f"{random.choice(last_names)} {random.choice(street_types)}"
        
        address = f"{street_number} {street_name}, {city}, {country}"
        data["address"].append(address)
        
        # Set payment terms (weighted random)
        payment_term = random.choices(payment_terms, weights=payment_terms_weights)[0]
        data["payment_terms"].append(payment_term)
        
        # Set lead time based on supplier type and region
        if supplier_type in ["Manufacturer", "Contractor"]:
            base_lead_time = random.randint(30, 90)  # Longer lead times for manufacturers
        else:
            base_lead_time = random.randint(7, 45)   # Shorter for distributors
            
        # Adjust for region (international suppliers have longer lead times)
        if region == "North America":
            lead_time_multiplier = 1.0
        elif region == "Europe" or region == "Oceania":
            lead_time_multiplier = 1.5
        elif region == "Asia":
            lead_time_multiplier = 1.8
        else:
            lead_time_multiplier = 1.3
            
        lead_time = int(base_lead_time * lead_time_multiplier)
        data["lead_time_days"].append(lead_time)
        
        # Set quality rating (1-5 scale, 5 being best)
        # Most suppliers should be good (3-5) with fewer poor suppliers
        quality_weights = [0.05, 0.15, 0.30, 0.35, 0.15]  # Weights for ratings 1-5
        quality_rating = round(random.choices([1, 2, 3, 4, 5], weights=quality_weights)[0], 1)
        
        # Add some random decimal to make it more realistic
        if quality_rating < 5:
            quality_rating += round(random.uniform(0, 0.9), 1)
            
        data["quality_rating"].append(quality_rating)
        
        # Set status (mostly active)
        statuses = ["Active", "Inactive", "On Hold", "New", "Disqualified"]
        status_weights = [0.8, 0.05, 0.05, 0.07, 0.03]  # Probabilities
        data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Set primary materials/categories
        num_categories = random.randint(1, 3)  # 1-3 primary categories per supplier
        categories = []
        
        # Select based on supplier type
        if supplier_type == "Manufacturer":
            category_pool = list(supplier_categories["Raw Material"]) + list(supplier_categories["Equipment"])
        elif supplier_type == "Distributor":
            category_pool = list(supplier_categories["Raw Material"]) + list(supplier_categories["Packaging"]) + list(supplier_categories["Consumable"])
        elif supplier_type == "Service Provider":
            category_pool = list(supplier_categories["Service"])
        else:
            # Mix of all categories
            category_pool = []
            for cat_list in supplier_categories.values():
                category_pool.extend(cat_list)
                
        categories = random.sample(category_pool, min(num_categories, len(category_pool)))
        data["primary_materials"].append(str(categories))
        
        # Generate notes (mostly empty)
        if random.random() < 0.3:  # 30% chance of having notes
            notes_options = [
                "Preferred supplier for critical materials",
                "Requires minimum order quantities",
                "ISO 9001 certified",
                "Long-term contract in place",
                f"Annual review scheduled for Q{random.randint(1, 4)}",
                "Sustainability certified",
                "Offers volume discounts",
                "Approved for regulated materials",
                "Subject to import restrictions",
                "Can provide rush delivery"
            ]
            data["notes"].append(random.choice(notes_options))
        else:
            data["notes"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} supplier records.")
    print(f"Data saved to {output_file}")
    
    return df

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Generate suppliers data
    suppliers_df = generate_suppliers_data(
        num_suppliers=50,  # Generate 50 supplier records
        output_file="data/suppliers.csv"
    )
    
    # Display sample data
    if suppliers_df is not None:
        print("\nSample suppliers data (first 5 records):")
        print(suppliers_df.head(5))

Successfully generated 50 supplier records.
Data saved to data/suppliers.csv

Sample suppliers data (first 5 records):
    supplier_id           supplier_name     supplier_type   contact_person  \
0  SUP-50348391       Robinson Services  Service Provider    Betty Johnson   
1  SUP-7A0426D3  Hernandez Distributors       Distributor  Jennifer Taylor   
2  SUP-5CF5DECA       Central Logistics       Distributor   Michael Walker   
3  SUP-6330D222             Great Group  Service Provider     Daniel Lopez   
4  SUP-9EF5B17E         Best Industries      Manufacturer       Lisa Perez   

                                       email              phone  \
0         betty.johnson@robinsonservices.org  +157 975 903 1133   
1  jennifer.taylor@hernandezdistributors.biz  +981 290 886 5916   
2        michael.walker@centrallogistics.net  +125 108 159 4947   
3                 daniel.lopez@greatgroup.co  +466 556 749 1046   
4              lisa.perez@bestindustries.org  +213 931 721 4937   

         

Purchase Orders & Purchase Order Lines

In [10]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_suppliers_data(suppliers_file="data/suppliers.csv"):
    """
    Load the previously generated suppliers data
    
    Parameters:
    - suppliers_file: CSV file containing suppliers data
    
    Returns:
    - DataFrame containing the suppliers data
    """
    try:
        return pd.read_csv(suppliers_file)
    except FileNotFoundError:
        print(f"Error: Suppliers data file {suppliers_file} not found.")
        print("Please run the suppliers data generation script first.")
        return None

def load_materials_data(materials_file="data/materials.csv"):
    """
    Load the previously generated materials data if available
    
    Parameters:
    - materials_file: CSV file containing materials data
    
    Returns:
    - DataFrame containing the materials data or None if not available
    """
    try:
        return pd.read_csv(materials_file)
    except FileNotFoundError:
        print(f"Note: Materials data file {materials_file} not found.")
        print("Purchase orders will be generated with synthetic material IDs.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Purchase orders will be generated with synthetic buyer IDs.")
        return None

def generate_purchase_orders(suppliers_df, materials_df=None, personnel_df=None, 
                             num_orders=200, start_time=None, end_time=None,
                             output_file="data/purchase_orders.csv"):
    """
    Generate synthetic data for the PurchaseOrders table from ISA-95 Level 4.
    
    Parameters:
    - suppliers_df: DataFrame containing suppliers data
    - materials_df: DataFrame containing materials data (optional)
    - personnel_df: DataFrame containing personnel data (optional)
    - num_orders: Number of purchase order records to generate
    - start_time: Start time for order dates (defaults to 365 days ago)
    - end_time: End time for order dates (defaults to 30 days in the future)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated purchase orders data
    """
    if suppliers_df is None or len(suppliers_df) == 0:
        print("Error: No suppliers data available.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=365)
    if end_time is None:
        end_time = datetime.now() + timedelta(days=30)
    
    # Generate buyer IDs if personnel_df is not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic buyer IDs...")
        buyer_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(10)]
    else:
        # Use actual personnel IDs from personnel_df
        # Typically use personnel with purchasing roles
        buyer_ids = personnel_df['personnel_id'].sample(min(10, len(personnel_df))).tolist()
    
    # Define payment terms (use supplier terms or defaults)
    payment_terms = ["Net 30", "Net 45", "Net 60", "2/10 Net 30", "COD", "Prepaid"]
    
    # Define shipping methods
    shipping_methods = ["Truck", "Air", "Sea", "Rail", "Express", "Courier", "Supplier Delivery"]
    
    # Define approval statuses
    approval_statuses = ["Draft", "Pending Approval", "Approved", "Rejected", "On Hold"]
    
    # Generate data structure
    data = {
        "po_id": [f"PO-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_orders)],
        "supplier_id": [],
        "order_date": [],
        "expected_delivery_date": [],
        "status": [],
        "total_value": [],
        "payment_terms": [],
        "shipping_method": [],
        "buyer_id": [],
        "approval_status": [],
        "notes": []
    }
    
    # Generate data for each purchase order
    for i in range(num_orders):
        # Select supplier (more active suppliers get more orders)
        active_suppliers = suppliers_df[suppliers_df['status'] == 'Active']
        
        if len(active_suppliers) > 0:
            # Prefer active suppliers (80% chance)
            if random.random() < 0.8:
                supplier = active_suppliers.sample(1).iloc[0]
            else:
                supplier = suppliers_df.sample(1).iloc[0]
        else:
            supplier = suppliers_df.sample(1).iloc[0]
            
        supplier_id = supplier['supplier_id']
        data["supplier_id"].append(supplier_id)
        
        # Generate order date
        time_range_days = (end_time - start_time).days
        days_from_start = random.randint(0, time_range_days)
        order_date = start_time + timedelta(days=days_from_start)
        data["order_date"].append(order_date.strftime("%Y-%m-%d"))
        
        # Generate expected delivery date based on supplier lead time
        if 'lead_time_days' in supplier and pd.notna(supplier['lead_time_days']):
            lead_time = supplier['lead_time_days']
        else:
            # Default lead time if not available
            lead_time = random.randint(14, 60)
            
        # Add some variation to the lead time
        lead_time_variation = random.uniform(0.8, 1.2)  # +/- 20%
        adjusted_lead_time = int(lead_time * lead_time_variation)
        
        expected_delivery_date = order_date + timedelta(days=adjusted_lead_time)
        data["expected_delivery_date"].append(expected_delivery_date.strftime("%Y-%m-%d"))
        
        # Determine PO status based on dates
        current_date = datetime.now()
        
        if order_date > current_date:
            # Future POs are typically in Draft or Pending status
            status = random.choice(["Draft", "Pending Approval"])
        elif expected_delivery_date > current_date:
            # Current POs are Approved or In Process
            status = random.choice(["Approved", "In Process", "Partially Received"])
        else:
            # Past POs are Completed, Cancelled, or Closed
            status_options = ["Completed", "Completed", "Completed", "Cancelled", "Closed"]  # Weighted for more completed
            status = random.choice(status_options)
            
        data["status"].append(status)
        
        # Generate order value (based on a reasonable range for purchase orders)
        order_value = random.uniform(1000, 50000)
        data["total_value"].append(round(order_value, 2))
        
        # Set payment terms (use supplier terms if available)
        if 'payment_terms' in supplier and pd.notna(supplier['payment_terms']):
            data["payment_terms"].append(supplier['payment_terms'])
        else:
            data["payment_terms"].append(random.choice(payment_terms))
        
        # Set shipping method
        data["shipping_method"].append(random.choice(shipping_methods))
        
        # Assign buyer
        data["buyer_id"].append(random.choice(buyer_ids))
        
        # Set approval status based on PO status
        if status == "Draft":
            approval_status = "Draft"
        elif status == "Pending Approval":
            approval_status = "Pending Approval"
        elif status in ["Cancelled", "Rejected"]:
            approval_status = "Rejected"
        elif status == "On Hold":
            approval_status = "On Hold"
        else:
            approval_status = "Approved"
            
        data["approval_status"].append(approval_status)
        
        # Generate notes (mostly empty)
        if random.random() < 0.2:  # 20% chance of having notes
            notes_options = [
                "Rush order, critical materials",
                "Partial shipments acceptable",
                "Quality certificates required",
                "Special packaging instructions included",
                "Price negotiated below standard",
                "Consolidated order for multiple projects",
                f"Reference requisition #{random.randint(10000, 99999)}",
                "Schedule delivery with warehouse manager",
                "New supplier, additional quality checks",
                "Replacement for PO cancelled last month"
            ]
            data["notes"].append(random.choice(notes_options))
        else:
            data["notes"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} purchase order records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_purchase_order_lines(purchase_orders_df, materials_df=None, 
                                output_file="data/purchase_order_lines.csv"):
    """
    Generate synthetic data for the PurchaseOrderLines table from ISA-95 Level 4.
    
    Parameters:
    - purchase_orders_df: DataFrame containing purchase orders data
    - materials_df: DataFrame containing materials data (optional)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated purchase order lines data
    """
    if purchase_orders_df is None or len(purchase_orders_df) == 0:
        print("Error: No purchase orders data available.")
        return None
    
    # Generate material IDs if materials_df is not provided
    if materials_df is None or len(materials_df) == 0:
        print("Generating synthetic material IDs...")
        material_ids = [f"MAT-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
        
        # Create synthetic material prices
        material_prices = {}
        for mat_id in material_ids:
            material_prices[mat_id] = random.uniform(10, 1000)
    else:
        # Use actual material IDs from materials_df
        material_ids = materials_df['material_id'].tolist()
        
        # Create price mapping from standard cost
        material_prices = {}
        for _, material in materials_df.iterrows():
            if 'standard_cost' in material and pd.notna(material['standard_cost']):
                material_prices[material['material_id']] = material['standard_cost']
            else:
                material_prices[material['material_id']] = random.uniform(10, 1000)
    
    # Generate data structure
    data = {
        "line_id": [],
        "po_id": [],
        "line_number": [],
        "material_id": [],
        "quantity": [],
        "unit_price": [],
        "line_value": [],
        "expected_delivery_date": [],
        "received_quantity": [],
        "receipt_date": [],
        "status": [],
        "lot_id": []
    }
    
    # Process each purchase order
    for _, po in purchase_orders_df.iterrows():
        po_id = po['po_id']
        po_status = po['status']
        
        # Determine number of line items for this PO
        num_lines = random.randint(1, 10)  # 1-10 line items per PO
        
        # Keep track of selected materials for this PO to avoid duplicates
        selected_materials = []
        
        # Get PO dates
        order_date = pd.to_datetime(po['order_date'])
        expected_delivery_date = pd.to_datetime(po['expected_delivery_date'])
        
        # Generate line items
        for line_num in range(1, num_lines + 1):
            # Create unique line ID
            line_id = f"POLINE-{uuid.uuid4().hex[:8].upper()}"
            data["line_id"].append(line_id)
            data["po_id"].append(po_id)
            data["line_number"].append(line_num)
            
            # Select material (avoid duplicates within same PO)
            available_materials = [m for m in material_ids if m not in selected_materials]
            
            if not available_materials:
                # If we've used all materials, just pick a random one
                material_id = random.choice(material_ids)
            else:
                material_id = random.choice(available_materials)
                selected_materials.append(material_id)
                
            data["material_id"].append(material_id)
            
            # Generate quantity based on material (would depend on unit of measure)
            # For simplicity we'll use generic quantities
            quantity = random.randint(1, 1000)
            data["quantity"].append(quantity)
            
            # Get unit price
            if material_id in material_prices:
                unit_price = material_prices[material_id]
            else:
                unit_price = random.uniform(10, 1000)
                
            # Apply random variation (supplier-specific pricing)
            price_variation = random.uniform(0.9, 1.1)  # +/- 10%
            unit_price = unit_price * price_variation
            
            data["unit_price"].append(round(unit_price, 2))
            
            # Calculate line value
            line_value = quantity * unit_price
            data["line_value"].append(round(line_value, 2))
            
            # Set expected delivery date (can vary slightly from PO date)
            line_delivery_variation = random.randint(-5, 5)  # +/- 5 days
            line_delivery_date = expected_delivery_date + timedelta(days=line_delivery_variation)
            
            # Ensure date makes sense
            if line_delivery_date < order_date:
                line_delivery_date = order_date + timedelta(days=1)
                
            data["expected_delivery_date"].append(line_delivery_date.strftime("%Y-%m-%d"))
            
            # Set line status and receipt info based on PO status
            if po_status in ["Draft", "Pending Approval"]:
                line_status = po_status
                data["received_quantity"].append(0)
                data["receipt_date"].append("")
                data["lot_id"].append("")
                
            elif po_status == "Approved":
                line_status = "Approved"
                data["received_quantity"].append(0)
                data["receipt_date"].append("")
                data["lot_id"].append("")
                
            elif po_status == "In Process":
                line_status = "In Process"
                data["received_quantity"].append(0)
                data["receipt_date"].append("")
                data["lot_id"].append("")
                
            elif po_status == "Partially Received":
                # Mix of received and unreceived lines
                if random.random() < 0.6:  # 60% chance this line is received
                    line_status = "Received"
                    received_qty = quantity
                    
                    # Receipt date is between order date and current date
                    days_difference = (datetime.now() - order_date).days
                    if days_difference >= 1:
                        receipt_days = random.randint(1, days_difference)
                        receipt_date = order_date + timedelta(days=receipt_days)
                        data["receipt_date"].append(receipt_date.strftime("%Y-%m-%d"))
                    else:
                        data["receipt_date"].append(datetime.now().strftime("%Y-%m-%d"))
                    
                    # Generate a lot ID for the received material
                    data["lot_id"].append(f"LOT-{uuid.uuid4().hex[:8].upper()}")
                else:
                    line_status = "In Process"
                    received_qty = 0
                    data["receipt_date"].append("")
                    data["lot_id"].append("")
                    
                data["received_quantity"].append(received_qty)
                
            elif po_status == "Completed":
                line_status = "Received"
                data["received_quantity"].append(quantity)
                
                # Receipt date is between order date and expected delivery date
                receipt_days = random.randint(1, max(1, (expected_delivery_date - order_date).days))
                receipt_date = order_date + timedelta(days=receipt_days)
                data["receipt_date"].append(receipt_date.strftime("%Y-%m-%d"))
                
                # Generate a lot ID for the received material
                data["lot_id"].append(f"LOT-{uuid.uuid4().hex[:8].upper()}")
                
            elif po_status == "Cancelled":
                line_status = "Cancelled"
                data["received_quantity"].append(0)
                data["receipt_date"].append("")
                data["lot_id"].append("")
                
            else:  # Closed or other status
                line_status = po_status
                data["received_quantity"].append(quantity)
                
                # Receipt date is between order date and expected delivery date
                receipt_days = random.randint(1, max(1, (expected_delivery_date - order_date).days))
                receipt_date = order_date + timedelta(days=receipt_days)
                data["receipt_date"].append(receipt_date.strftime("%Y-%m-%d"))
                
                # Generate a lot ID for the received material
                data["lot_id"].append(f"LOT-{uuid.uuid4().hex[:8].upper()}")
            
            data["status"].append(line_status)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} purchase order line records for {df['po_id'].nunique()} purchase orders.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(purchase_orders_df, purchase_order_lines_df=None):
    """
    Display basic statistics about the generated purchase orders and lines data
    
    Parameters:
    - purchase_orders_df: DataFrame containing purchase orders data
    - purchase_order_lines_df: DataFrame containing purchase order lines data (optional)
    """
    if purchase_orders_df is None or len(purchase_orders_df) == 0:
        print("No purchase orders data to analyze.")
        return
    
    print("\nPurchase Orders Statistics:")
    print(f"Total purchase orders: {len(purchase_orders_df)}")
    
    # Order status distribution
    print("\nPurchase Order Status Distribution:")
    status_counts = purchase_orders_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(purchase_orders_df)*100:.1f}%)")
    
    # Approval status distribution
    print("\nApproval Status Distribution:")
    approval_counts = purchase_orders_df['approval_status'].value_counts()
    for status, count in approval_counts.items():
        print(f"  {status}: {count} ({count/len(purchase_orders_df)*100:.1f}%)")
    
    # Order value statistics
    print("\nOrder Value Statistics:")
    print(f"  Average order value: ${purchase_orders_df['total_value'].mean():.2f}")
    print(f"  Minimum order value: ${purchase_orders_df['total_value'].min():.2f}")
    print(f"  Maximum order value: ${purchase_orders_df['total_value'].max():.2f}")
    print(f"  Total order value: ${purchase_orders_df['total_value'].sum():.2f}")
    
    # Shipping method distribution
    print("\nShipping Method Distribution:")
    shipping_counts = purchase_orders_df['shipping_method'].value_counts()
    for method, count in shipping_counts.items():
        print(f"  {method}: {count} ({count/len(purchase_orders_df)*100:.1f}%)")
    
    # Payment terms distribution
    print("\nPayment Terms Distribution:")
    terms_counts = purchase_orders_df['payment_terms'].value_counts()
    for terms, count in terms_counts.items():
        print(f"  {terms}: {count} ({count/len(purchase_orders_df)*100:.1f}%)")
    
    # Date-based analysis
    purchase_orders_df['order_date'] = pd.to_datetime(purchase_orders_df['order_date'])
    purchase_orders_df['expected_delivery_date'] = pd.to_datetime(purchase_orders_df['expected_delivery_date'])
    
    # Lead time analysis
    purchase_orders_df['lead_time_days'] = (purchase_orders_df['expected_delivery_date'] - 
                                         purchase_orders_df['order_date']).dt.days
    
    print("\nLead Time Statistics:")
    print(f"  Average lead time: {purchase_orders_df['lead_time_days'].mean():.1f} days")
    print(f"  Minimum lead time: {purchase_orders_df['lead_time_days'].min()} days")
    print(f"  Maximum lead time: {purchase_orders_df['lead_time_days'].max()} days")
    
    # Orders by month
    purchase_orders_df['order_month'] = purchase_orders_df['order_date'].dt.to_period('M')
    monthly_orders = purchase_orders_df.groupby('order_month').size().sort_index()
    
    print("\nMonthly Order Distribution (last 6 months):")
    for month, count in monthly_orders.tail(6).items():
        print(f"  {month}: {count} orders")
    
    # Supplier distribution
    supplier_counts = purchase_orders_df.groupby('supplier_id').size().sort_values(ascending=False)
    
    print("\nTop 10 Suppliers by Order Count:")
    for supplier, count in supplier_counts.head(10).items():
        print(f"  {supplier}: {count} orders ({count/len(purchase_orders_df)*100:.1f}%)")
    
    # Purchase order lines statistics (if available)
    if purchase_order_lines_df is not None and len(purchase_order_lines_df) > 0:
        print("\nPurchase Order Lines Statistics:")
        print(f"Total purchase order lines: {len(purchase_order_lines_df)}")
        print(f"Purchase orders with lines: {purchase_order_lines_df['po_id'].nunique()}")
        
        # Lines per order
        lines_per_order = purchase_order_lines_df.groupby('po_id').size()
        print("\nLines Per Order Statistics:")
        print(f"  Average lines per order: {lines_per_order.mean():.1f}")
        print(f"  Minimum lines per order: {lines_per_order.min()}")
        print(f"  Maximum lines per order: {lines_per_order.max()}")
        
        # Line status distribution
        print("\nLine Status Distribution:")
        line_status_counts = purchase_order_lines_df['status'].value_counts()
        for status, count in line_status_counts.items():
            print(f"  {status}: {count} ({count/len(purchase_order_lines_df)*100:.1f}%)")
        
        # Line value statistics
        print("\nLine Value Statistics:")
        print(f"  Average line value: ${purchase_order_lines_df['line_value'].mean():.2f}")
        print(f"  Total line value: ${purchase_order_lines_df['line_value'].sum():.2f}")
        
        # Quantity statistics
        print("\nQuantity Statistics:")
        print(f"  Average order quantity: {purchase_order_lines_df['quantity'].mean():.1f}")
        print(f"  Average unit price: ${purchase_order_lines_df['unit_price'].mean():.2f}")
        
        # Receipt statistics
        received_lines = purchase_order_lines_df[purchase_order_lines_df['received_quantity'] > 0]
        if len(received_lines) > 0:
            print("\nReceipt Statistics:")
            print(f"  Received lines: {len(received_lines)} ({len(received_lines)/len(purchase_order_lines_df)*100:.1f}%)")
            print(f"  Total received quantity: {received_lines['received_quantity'].sum()}")
            
            # Calculate on-time delivery
            received_lines['receipt_date'] = pd.to_datetime(received_lines['receipt_date'])
            received_lines['expected_delivery_date'] = pd.to_datetime(received_lines['expected_delivery_date'])
            
            # On-time deliveries (received on or before expected date)
            on_time = received_lines[received_lines['receipt_date'] <= received_lines['expected_delivery_date']]
            on_time_pct = len(on_time) / len(received_lines) * 100
            
            print(f"  On-time deliveries: {len(on_time)} ({on_time_pct:.1f}% of received lines)")
            
            # Material distribution
            material_counts = purchase_order_lines_df.groupby('material_id').size().sort_values(ascending=False)
            
            print("\nTop 10 Materials by Order Frequency:")
            for material, count in material_counts.head(10).items():
                print(f"  {material}: {count} order lines ({count/len(purchase_order_lines_df)*100:.1f}%)")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    suppliers_df = load_suppliers_data()
    materials_df = load_materials_data()
    personnel_df = load_personnel_data()
    
    if suppliers_df is not None:
        # Generate purchase orders data
        purchase_orders_df = generate_purchase_orders(
            suppliers_df,
            materials_df,
            personnel_df,
            num_orders=200,  # Generate 200 purchase order records
            output_file="data/purchase_orders.csv"
        )
        
        if purchase_orders_df is not None:
            # Generate purchase order lines data
            purchase_order_lines_df = generate_purchase_order_lines(
                purchase_orders_df,
                materials_df,
                output_file="data/purchase_order_lines.csv"
            )
        else:
            purchase_order_lines_df = None
    else:
        purchase_orders_df = None
        purchase_order_lines_df = None
    
    # Display statistics
    if purchase_orders_df is not None:
        display_statistics(purchase_orders_df, purchase_order_lines_df)
        
        # Display sample data
        print("\nSample purchase orders data (first 5 records):")
        print(purchase_orders_df.head(5))
        
        if purchase_order_lines_df is not None:
            print("\nSample purchase order lines data (first 5 records):")
            print(purchase_order_lines_df.head(5))

Note: Personnel data file data/personnel.csv not found.
Purchase orders will be generated with synthetic buyer IDs.
Generating synthetic buyer IDs...
Successfully generated 200 purchase order records.
Data saved to data/purchase_orders.csv
Successfully generated 1005 purchase order line records for 200 purchase orders.
Data saved to data/purchase_order_lines.csv

Purchase Orders Statistics:
Total purchase orders: 200

Purchase Order Status Distribution:
  Completed: 85 (42.5%)
  Closed: 37 (18.5%)
  Cancelled: 34 (17.0%)
  In Process: 12 (6.0%)
  Partially Received: 11 (5.5%)
  Pending Approval: 8 (4.0%)
  Draft: 8 (4.0%)
  Approved: 5 (2.5%)

Approval Status Distribution:
  Approved: 150 (75.0%)
  Rejected: 34 (17.0%)
  Pending Approval: 8 (4.0%)
  Draft: 8 (4.0%)

Order Value Statistics:
  Average order value: $24663.36
  Minimum order value: $1044.11
  Maximum order value: $49739.18
  Total order value: $4932671.10

Shipping Method Distribution:
  Sea: 38 (19.0%)
  Truck: 32 (16.0%)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  received_lines['receipt_date'] = pd.to_datetime(received_lines['receipt_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  received_lines['expected_delivery_date'] = pd.to_datetime(received_lines['expected_delivery_date'])


Production Schedules & Scheduled Production

In [11]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_products_data(products_file="data/products.csv"):
    """
    Load the previously generated products data
    
    Parameters:
    - products_file: CSV file containing products data
    
    Returns:
    - DataFrame containing the products data
    """
    try:
        return pd.read_csv(products_file)
    except FileNotFoundError:
        print(f"Error: Products data file {products_file} not found.")
        print("Please run the products data generation script first.")
        return None

def load_customer_orders_data(customer_orders_file="data/customer_orders.csv"):
    """
    Load the previously generated customer orders data if available
    
    Parameters:
    - customer_orders_file: CSV file containing customer orders data
    
    Returns:
    - DataFrame containing the customer orders data or None if not available
    """
    try:
        return pd.read_csv(customer_orders_file)
    except FileNotFoundError:
        print(f"Note: Customer orders data file {customer_orders_file} not found.")
        print("Production schedules will be generated without customer order associations.")
        return None

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data if available
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data or None if not available
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Note: Equipment data file {equipment_file} not found.")
        print("Production schedules will be generated without equipment associations.")
        return None

def load_facilities_data(facilities_file="data/facilities.csv"):
    """
    Load the previously generated facilities data if available
    
    Parameters:
    - facilities_file: CSV file containing facilities data
    
    Returns:
    - DataFrame containing the facilities data or None if not available
    """
    try:
        return pd.read_csv(facilities_file)
    except FileNotFoundError:
        print(f"Note: Facilities data file {facilities_file} not found.")
        print("Production schedules will be generated with synthetic facility IDs.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Production schedules will be generated with synthetic creator IDs.")
        return None

def generate_production_schedules(products_df, customer_orders_df=None, equipment_df=None, 
                                facilities_df=None, personnel_df=None, num_schedules=20,
                                start_time=None, end_time=None, 
                                output_file="data/production_schedules.csv"):
    """
    Generate synthetic data for the ProductionSchedules table from ISA-95 Level 4.
    
    Parameters:
    - products_df: DataFrame containing products data
    - customer_orders_df: DataFrame containing customer orders data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    - facilities_df: DataFrame containing facilities data (optional)
    - personnel_df: DataFrame containing personnel data (optional)
    - num_schedules: Number of production schedule records to generate
    - start_time: Start time for schedule dates (defaults to 180 days ago)
    - end_time: End time for schedule dates (defaults to 180 days in the future)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated production schedules data
    """
    if products_df is None or len(products_df) == 0:
        print("Error: No products data available.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=180)
    if end_time is None:
        end_time = datetime.now() + timedelta(days=180)
    
    # Generate facility IDs if facilities_df is not provided
    if facilities_df is None or len(facilities_df) == 0:
        print("Generating synthetic facility IDs...")
        facility_ids = [f"FAC-{uuid.uuid4().hex[:8].upper()}" for _ in range(5)]
    else:
        facility_ids = facilities_df['facility_id'].tolist()
    
    # Generate personnel IDs for creators if personnel_df is not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic creator IDs...")
        creator_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(10)]
    else:
        creator_ids = personnel_df['personnel_id'].tolist()
    
    # Define schedule types
    schedule_types = ["Master", "Detailed", "Production", "Campaign", "Weekly", "Monthly", "Quarterly"]
    
    # Generate data structure
    data = {
        "schedule_id": [f"PS-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_schedules)],
        "schedule_name": [],
        "schedule_type": [],
        "facility_id": [],
        "start_date": [],
        "end_date": [],
        "creation_date": [],
        "created_by": [],
        "status": [],
        "revision": [],
        "freeze_horizon_days": [],
        "notes": []
    }
    
    # Define schedule statuses
    statuses = ["Draft", "Approved", "In Progress", "Completed", "Cancelled", "Superseded"]
    
    # Generate data for each production schedule
    for i in range(num_schedules):
        # Generate schedule name (descriptive)
        schedule_type = random.choice(schedule_types)
        
        # Create a date component for the name
        if schedule_type in ["Weekly", "Production"]:
            # Weekly schedules - named by week number
            week_num = random.randint(1, 52)
            year = random.randint(datetime.now().year - 1, datetime.now().year + 1)
            schedule_name = f"{schedule_type} Schedule - Week {week_num}, {year}"
        elif schedule_type == "Monthly":
            # Monthly schedules - named by month
            month_names = ["January", "February", "March", "April", "May", "June", 
                          "July", "August", "September", "October", "November", "December"]
            month = random.choice(month_names)
            year = random.randint(datetime.now().year - 1, datetime.now().year + 1)
            schedule_name = f"{month} {year} Production Schedule"
        elif schedule_type == "Quarterly":
            # Quarterly schedules
            quarter = random.randint(1, 4)
            year = random.randint(datetime.now().year - 1, datetime.now().year + 1)
            schedule_name = f"Q{quarter} {year} Production Plan"
        elif schedule_type == "Campaign":
            # Campaign schedules - named by product or campaign
            product_name = random.choice(products_df['product_name'].tolist())
            schedule_name = f"{product_name} Production Campaign"
        else:
            # Other schedule types
            time_period = random.choice(["Short-term", "Mid-term", "Long-term"])
            year = random.randint(datetime.now().year - 1, datetime.now().year + 1)
            schedule_name = f"{time_period} {schedule_type} Schedule {year}"
        
        data["schedule_name"].append(schedule_name)
        data["schedule_type"].append(schedule_type)
        
        # Assign facility
        data["facility_id"].append(random.choice(facility_ids))
        
        # Generate schedule dates based on type
        time_range_days = (end_time - start_time).days
        start_offset = random.randint(0, time_range_days - 1)
        schedule_start = start_time + timedelta(days=start_offset)
        
        # Schedule duration depends on type
        if schedule_type == "Weekly":
            duration_days = 7
        elif schedule_type == "Monthly":
            duration_days = 30
        elif schedule_type == "Quarterly":
            duration_days = 90
        elif schedule_type == "Campaign":
            duration_days = random.randint(14, 60)  # 2-8 weeks
        elif schedule_type == "Master":
            duration_days = random.randint(180, 365)  # 6-12 months
        else:
            duration_days = random.randint(30, 120)  # 1-4 months
            
        schedule_end = schedule_start + timedelta(days=duration_days)
        
        # Ensure end date is within range
        if schedule_end > end_time:
            schedule_end = end_time
            
        data["start_date"].append(schedule_start.strftime("%Y-%m-%d"))
        data["end_date"].append(schedule_end.strftime("%Y-%m-%d"))
        
        # Creation date is typically before start date
        creation_offset = random.randint(7, 60)  # 1-8 weeks before
        creation_date = schedule_start - timedelta(days=creation_offset)
        
        # Ensure creation date is not before data range start
        if creation_date < start_time:
            creation_date = start_time
            
        data["creation_date"].append(creation_date.strftime("%Y-%m-%d"))
        
        # Assign creator
        data["created_by"].append(random.choice(creator_ids))
        
        # Determine status based on dates
        current_date = datetime.now()
        
        if schedule_start > current_date:
            # Future schedules are typically Draft or Approved
            status = random.choice(["Draft", "Approved"])
        elif schedule_end < current_date:
            # Past schedules are Completed or Superseded
            status = random.choice(["Completed", "Completed", "Superseded"])  # Weight toward completed
        else:
            # Current schedules are In Progress
            status = "In Progress"
            
        # Some schedules might be cancelled
        if random.random() < 0.05:  # 5% chance
            status = "Cancelled"
            
        data["status"].append(status)
        
        # Set revision
        if status in ["Draft", "Approved"]:
            revision = random.randint(1, 3)  # Newer revisions
        elif status == "In Progress":
            revision = random.randint(1, 5)  # Might have several revisions
        else:
            revision = random.randint(1, 10)  # Completed schedules might have many revisions
            
        data["revision"].append(revision)
        
        # Set freeze horizon (period during which schedule cannot be changed)
        if schedule_type in ["Master", "Quarterly"]:
            freeze_horizon = random.randint(30, 60)  # 1-2 months
        elif schedule_type in ["Monthly", "Campaign"]:
            freeze_horizon = random.randint(14, 30)  # 2-4 weeks
        else:
            freeze_horizon = random.randint(3, 14)  # 3-14 days
            
        data["freeze_horizon_days"].append(freeze_horizon)
        
        # Generate notes (mostly empty)
        if random.random() < 0.3:  # 30% chance of having notes
            notes_options = [
                "Adjusted for material availability",
                "Optimized for equipment efficiency",
                "Consolidated for resource utilization",
                "Modified to accommodate rush orders",
                "Updated based on inventory levels",
                "Revised to match supplier deliveries",
                "Balanced for labor utilization",
                "Coordinated with maintenance schedule",
                "Aligned with quality testing capacity",
                "Considering seasonal demand factors"
            ]
            data["notes"].append(random.choice(notes_options))
        else:
            data["notes"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} production schedule records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_scheduled_production(production_schedules_df, products_df, customer_orders_df=None, 
                                 equipment_df=None, work_orders_df=None,
                                 output_file="data/scheduled_production.csv"):
    """
    Generate synthetic data for the ScheduledProduction table from ISA-95 Level 4.
    
    Parameters:
    - production_schedules_df: DataFrame containing production schedules data
    - products_df: DataFrame containing products data
    - customer_orders_df: DataFrame containing customer orders data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    - work_orders_df: DataFrame containing work orders data (optional)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated scheduled production data
    """
    if production_schedules_df is None or len(production_schedules_df) == 0:
        print("Error: No production schedules data available.")
        return None
        
    if products_df is None or len(products_df) == 0:
        print("Error: No products data available.")
        return None
    
    # Load work order IDs if available
    if work_orders_df is not None and len(work_orders_df) > 0:
        work_order_ids = work_orders_df['work_order_id'].tolist()
    else:
        print("Generating synthetic work order IDs...")
        work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(100)]
    
    # Load equipment IDs if available
    if equipment_df is not None and len(equipment_df) > 0:
        equipment_ids = equipment_df['equipment_id'].tolist()
    else:
        print("Generating synthetic equipment IDs...")
        equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    
    # Load customer order IDs if available
    if customer_orders_df is not None and len(customer_orders_df) > 0:
        order_ids = customer_orders_df['order_id'].tolist()
    else:
        print("Generating synthetic customer order IDs...")
        order_ids = [f"CO-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    
    # Generate data structure
    data = {
        "scheduled_id": [],
        "schedule_id": [],
        "product_id": [],
        "work_order_id": [],
        "scheduled_quantity": [],
        "start_date": [],
        "end_date": [],
        "equipment_id": [],
        "priority": [],
        "status": [],
        "order_id": []
    }
    
    # Generate scheduled production items for each production schedule
    for _, schedule in production_schedules_df.iterrows():
        schedule_id = schedule['schedule_id']
        schedule_start = pd.to_datetime(schedule['start_date'])
        schedule_end = pd.to_datetime(schedule['end_date'])
        schedule_status = schedule['status']
        
        # Determine number of production items for this schedule
        # More detailed schedules have more items
        if schedule['schedule_type'] in ["Detailed", "Weekly"]:
            num_items = random.randint(10, 30)
        elif schedule['schedule_type'] == "Campaign":
            num_items = random.randint(5, 15)
        else:
            num_items = random.randint(5, 20)
        
        # Generate production items
        for _ in range(num_items):
            # Create unique scheduled production ID
            scheduled_id = f"SP-{uuid.uuid4().hex[:8].upper()}"
            data["scheduled_id"].append(scheduled_id)
            data["schedule_id"].append(schedule_id)
            
            # Select product
            product = products_df.sample(1).iloc[0]
            data["product_id"].append(product['product_id'])
            
            # Assign work order (some items might not have work orders yet)
            if schedule_status in ["In Progress", "Completed"] and random.random() < 0.9:
                # Most in-progress or completed schedule items have work orders
                data["work_order_id"].append(random.choice(work_order_ids))
            elif schedule_status == "Approved" and random.random() < 0.5:
                # Some approved schedule items have work orders
                data["work_order_id"].append(random.choice(work_order_ids))
            else:
                data["work_order_id"].append("")
            
            # Generate scheduled quantity
            scheduled_quantity = random.randint(100, 10000)
            data["scheduled_quantity"].append(scheduled_quantity)
            
            # Generate start and end dates
            # Distribute items across the schedule period
            schedule_days = (schedule_end - schedule_start).days
            if schedule_days <= 0:
                schedule_days = 1  # Ensure at least 1 day
                
            item_start_offset = random.randint(0, max(0, schedule_days - 1))
            item_start = schedule_start + timedelta(days=item_start_offset)
            
            # Item duration depends on quantity and complexity
            if 'product_family' in product and product['product_family'] in ["Pharmaceutical", "Chemical"]:
                # Complex products take longer
                item_duration = random.randint(3, 14)  # 3-14 days
            else:
                item_duration = random.randint(1, 7)  # 1-7 days
                
            item_end = item_start + timedelta(days=item_duration)
            
            # Ensure end date is within schedule
            if item_end > schedule_end:
                item_end = schedule_end
                
            data["start_date"].append(item_start.strftime("%Y-%m-%d"))
            data["end_date"].append(item_end.strftime("%Y-%m-%d"))
            
            # Assign equipment
            data["equipment_id"].append(random.choice(equipment_ids))
            
            # Set priority
            data["priority"].append(random.randint(1, 5))  # 1=highest, 5=lowest
            
            # Set status based on schedule status and dates
            current_date = datetime.now()
            
            if schedule_status == "Cancelled":
                item_status = "Cancelled"
            elif schedule_status == "Draft":
                item_status = "Planned"
            elif schedule_status == "Approved":
                if item_start > current_date:
                    item_status = "Scheduled"
                else:
                    item_status = random.choice(["Scheduled", "Released"])
            elif schedule_status == "In Progress":
                if item_start > current_date:
                    item_status = "Scheduled"
                elif item_end < current_date:
                    item_status = random.choice(["Completed", "Completed", "Canceled"])
                else:
                    item_status = random.choice(["Released", "In Progress", "Held"])
            elif schedule_status in ["Completed", "Superseded"]:
                if random.random() < 0.9:  # 90% chance
                    item_status = "Completed"
                else:
                    item_status = random.choice(["Canceled", "Partially Completed"])
            else:
                item_status = "Planned"
                
            data["status"].append(item_status)
            
            # Link to customer order (if applicable)
            if random.random() < 0.7:  # 70% linked to order
                data["order_id"].append(random.choice(order_ids))
            else:
                data["order_id"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} scheduled production records for {len(production_schedules_df)} schedules.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(production_schedules_df, scheduled_production_df=None):
    """
    Display basic statistics about the generated production schedules and scheduled production data
    
    Parameters:
    - production_schedules_df: DataFrame containing production schedules data
    - scheduled_production_df: DataFrame containing scheduled production data (optional)
    """
    if production_schedules_df is None or len(production_schedules_df) == 0:
        print("No production schedules data to analyze.")
        return
    
    print("\nProduction Schedules Statistics:")
    print(f"Total production schedules: {len(production_schedules_df)}")
    
    # Schedule type distribution
    print("\nSchedule Type Distribution:")
    type_counts = production_schedules_df['schedule_type'].value_counts()
    for schedule_type, count in type_counts.items():
        print(f"  {schedule_type}: {count} ({count/len(production_schedules_df)*100:.1f}%)")
    
    # Status distribution
    print("\nSchedule Status Distribution:")
    status_counts = production_schedules_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(production_schedules_df)*100:.1f}%)")
    
    # Duration statistics
    production_schedules_df['start_date'] = pd.to_datetime(production_schedules_df['start_date'])
    production_schedules_df['end_date'] = pd.to_datetime(production_schedules_df['end_date'])
    
    production_schedules_df['duration_days'] = (production_schedules_df['end_date'] - 
                                            production_schedules_df['start_date']).dt.days
    
    print("\nSchedule Duration Statistics:")
    print(f"  Average duration: {production_schedules_df['duration_days'].mean():.1f} days")
    print(f"  Minimum duration: {production_schedules_df['duration_days'].min()} days")
    print(f"  Maximum duration: {production_schedules_df['duration_days'].max()} days")
    
    # Freeze horizon statistics
    print("\nFreeze Horizon Statistics:")
    print(f"  Average freeze horizon: {production_schedules_df['freeze_horizon_days'].mean():.1f} days")
    print(f"  Minimum freeze horizon: {production_schedules_df['freeze_horizon_days'].min()} days")
    print(f"  Maximum freeze horizon: {production_schedules_df['freeze_horizon_days'].max()} days")
    
    # Schedule by facility
    facility_counts = production_schedules_df.groupby('facility_id').size()
    
    print("\nSchedules by Facility:")
    for facility, count in facility_counts.items():
        print(f"  {facility}: {count} schedules")
    
    # Time-based analysis
    production_schedules_df['month_year'] = production_schedules_df['start_date'].dt.to_period('M')
    monthly_schedules = production_schedules_df.groupby('month_year').size().sort_index()
    
    print("\nMonthly Schedule Distribution:")
    for month, count in monthly_schedules.items():
        print(f"  {month}: {count} schedules")
    
    # Scheduled production statistics (if available)
    if scheduled_production_df is not None and len(scheduled_production_df) > 0:
        print("\nScheduled Production Statistics:")
        print(f"Total scheduled production items: {len(scheduled_production_df)}")
        print(f"Items per schedule: {len(scheduled_production_df)/len(production_schedules_df):.1f}")
        
        # Status distribution
        print("\nItem Status Distribution:")
        item_status_counts = scheduled_production_df['status'].value_counts()
        for status, count in item_status_counts.items():
            print(f"  {status}: {count} ({count/len(scheduled_production_df)*100:.1f}%)")
        
        # Priority distribution
        print("\nPriority Distribution:")
        priority_counts = scheduled_production_df['priority'].value_counts().sort_index()
        for priority, count in priority_counts.items():
            print(f"  Priority {priority}: {count} ({count/len(scheduled_production_df)*100:.1f}%)")
        
        # Work order association
        has_wo = scheduled_production_df['work_order_id'].apply(lambda x: x != "").sum()
        print(f"\nItems with work order association: {has_wo} ({has_wo/len(scheduled_production_df)*100:.1f}%)")
        
        # Customer order association
        has_co = scheduled_production_df['order_id'].apply(lambda x: x != "").sum()
        print(f"Items with customer order association: {has_co} ({has_co/len(scheduled_production_df)*100:.1f}%)")
        
        # Quantity statistics
        print("\nQuantity Statistics:")
        print(f"  Average scheduled quantity: {scheduled_production_df['scheduled_quantity'].mean():.1f}")
        print(f"  Total scheduled quantity: {scheduled_production_df['scheduled_quantity'].sum()}")
        
        # Duration statistics
        scheduled_production_df['start_date'] = pd.to_datetime(scheduled_production_df['start_date'])
        scheduled_production_df['end_date'] = pd.to_datetime(scheduled_production_df['end_date'])
        
        scheduled_production_df['duration_days'] = (scheduled_production_df['end_date'] - 
                                                 scheduled_production_df['start_date']).dt.days
        
        print("\nItem Duration Statistics:")
        print(f"  Average duration: {scheduled_production_df['duration_days'].mean():.1f} days")
        print(f"  Minimum duration: {scheduled_production_df['duration_days'].min()} days")
        print(f"  Maximum duration: {scheduled_production_df['duration_days'].max()} days")
        
        # Product distribution
        product_counts = scheduled_production_df.groupby('product_id').size().sort_values(ascending=False)
        
        print("\nTop 10 Products in Production Schedules:")
        for product, count in product_counts.head(10).items():
            print(f"  {product}: {count} scheduled items ({count/len(scheduled_production_df)*100:.1f}%)")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    products_df = load_products_data()
    customer_orders_df = load_customer_orders_data()
    equipment_df = load_equipment_data()
    facilities_df = load_facilities_data()
    personnel_df = load_personnel_data()
    
    try:
        work_orders_df = pd.read_csv("data/work_orders.csv")
    except FileNotFoundError:
        print("Note: Work orders data file not found.")
        work_orders_df = None
    
    if products_df is not None:
        # Generate production schedules data
        production_schedules_df = generate_production_schedules(
            products_df,
            customer_orders_df,
            equipment_df,
            facilities_df,
            personnel_df,
            num_schedules=20,  # Generate 20 production schedule records
            output_file="data/production_schedules.csv"
        )
        
        if production_schedules_df is not None:
            # Generate scheduled production data
            scheduled_production_df = generate_scheduled_production(
                production_schedules_df,
                products_df,
                customer_orders_df,
                equipment_df,
                work_orders_df,
                output_file="data/scheduled_production.csv"
            )
        else:
            scheduled_production_df = None
    else:
        production_schedules_df = None
        scheduled_production_df = None
    
    # Display statistics
    if production_schedules_df is not None:
        display_statistics(production_schedules_df, scheduled_production_df)
        
        # Display sample data
        print("\nSample production schedules data (first 5 records):")
        print(production_schedules_df.head(5))
        
        if scheduled_production_df is not None:
            print("\nSample scheduled production data (first 5 records):")
            print(scheduled_production_df.head(5))

Note: Personnel data file data/personnel.csv not found.
Production schedules will be generated with synthetic creator IDs.
Generating synthetic creator IDs...
Successfully generated 20 production schedule records.
Data saved to data/production_schedules.csv
Successfully generated 264 scheduled production records for 20 schedules.
Data saved to data/scheduled_production.csv

Production Schedules Statistics:
Total production schedules: 20

Schedule Type Distribution:
  Master: 5 (25.0%)
  Quarterly: 4 (20.0%)
  Weekly: 3 (15.0%)
  Campaign: 3 (15.0%)
  Production: 3 (15.0%)
  Detailed: 1 (5.0%)
  Monthly: 1 (5.0%)

Schedule Status Distribution:
  Approved: 7 (35.0%)
  Draft: 5 (25.0%)
  In Progress: 5 (25.0%)
  Superseded: 1 (5.0%)
  Completed: 1 (5.0%)
  Cancelled: 1 (5.0%)

Schedule Duration Statistics:
  Average duration: 77.7 days
  Minimum duration: 7 days
  Maximum duration: 287 days

Freeze Horizon Statistics:
  Average freeze horizon: 27.1 days
  Minimum freeze horizon: 4 days
  

Facilities

In [12]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Facilities will be generated with synthetic manager IDs.")
        return None

def generate_facilities_data(personnel_df=None, num_facilities=15, output_file="data/facilities.csv"):
    """
    Generate synthetic data for the Facilities table from ISA-95 Level 4.
    
    Parameters:
    - personnel_df: DataFrame containing personnel data (optional)
    - num_facilities: Number of facility records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated facilities data
    """
    # Define facility types and their probabilities
    facility_types = {
        "Manufacturing Plant": 0.5,
        "Warehouse": 0.2,
        "Distribution Center": 0.15,
        "R&D Center": 0.1,
        "Administrative Office": 0.05
    }
    
    # Define regions for address generation
    regions = {
        "North America": ["USA", "Canada", "Mexico"],
        "Europe": ["UK", "Germany", "France", "Italy", "Spain", "Netherlands"],
        "Asia": ["Japan", "China", "South Korea", "India", "Singapore", "Taiwan"],
        "Latin America": ["Brazil", "Colombia", "Argentina", "Chile", "Peru"],
        "Oceania": ["Australia", "New Zealand"]
    }
    
    # Region probabilities
    region_weights = {
        "North America": 0.4,
        "Europe": 0.25,
        "Asia": 0.2,
        "Latin America": 0.1,
        "Oceania": 0.05
    }
    
    # Generate manager IDs if personnel_df is not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic manager IDs...")
        manager_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_facilities)]
    else:
        # Use actual personnel IDs, preferably managers if role information is available
        if 'job_title' in personnel_df.columns:
            managers = personnel_df[personnel_df['job_title'].str.contains('Manager|Director|Supervisor', case=False, na=False)]
            if len(managers) >= num_facilities:
                manager_ids = managers['personnel_id'].sample(num_facilities).tolist()
            else:
                # If not enough managers, fill in with other personnel
                manager_ids = managers['personnel_id'].tolist()
                additional_ids = personnel_df[~personnel_df['personnel_id'].isin(manager_ids)]['personnel_id'].sample(num_facilities - len(manager_ids)).tolist()
                manager_ids.extend(additional_ids)
        else:
            # If no job title info, just sample random personnel
            manager_ids = personnel_df['personnel_id'].sample(min(num_facilities, len(personnel_df))).tolist()
            
            # If we don't have enough personnel, generate synthetic IDs for the rest
            if len(manager_ids) < num_facilities:
                additional_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_facilities - len(manager_ids))]
                manager_ids.extend(additional_ids)
    
    # Generate data structure
    data = {
        "facility_id": [f"FAC-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_facilities)],
        "facility_name": [],
        "facility_type": [],
        "address": [],
        "manager_id": [],
        "operating_hours": [],
        "status": [],
        "parent_facility_id": []
    }
    
    # Last names for city generation
    last_names = [
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", 
        "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", 
        "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson"
    ]
    
    # Generate a list of facility IDs for potential parent relationships
    all_facility_ids = data["facility_id"].copy()
    
    # Select some facilities to be headquarters or main facilities (no parent)
    num_main_facilities = min(3, num_facilities)  # Up to 3 main facilities or less if fewer total facilities
    main_facility_indices = random.sample(range(num_facilities), num_main_facilities)
    potential_parent_ids = [all_facility_ids[i] for i in main_facility_indices]
    
    # Generate data for each facility
    for i in range(num_facilities):
        # Select facility type (weighted random)
        facility_type = random.choices(
            list(facility_types.keys()), 
            weights=list(facility_types.values())
        )[0]
        data["facility_type"].append(facility_type)
        
        # Generate facility name
        # For manufacturing plants, use format like "City Manufacturing Plant"
        # For warehouses, use format like "Regional Distribution Center"
        # For offices, use format like "Corporate Headquarters"
        
        # Generate a region and country
        region = random.choices(
            list(region_weights.keys()), 
            weights=list(region_weights.values())
        )[0]
        country = random.choice(regions[region])
        
        # Generate a city name (simplified)
        city_prefixes = ["New", "East", "West", "North", "South", "Central", "Upper", "Lower", "Port", "Lake", "Mount"]
        city_suffixes = ["ville", "burg", "town", "field", "ford", "port", "bridge", "haven", "city"]
        
        if random.random() < 0.3:  # 30% chance of using prefix
            city = f"{random.choice(city_prefixes)} {random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        else:
            city = f"{random.choice(last_names)}{random.choice(['', random.choice(city_suffixes)])}"
        
        # Create facility name based on type
        if facility_type == "Manufacturing Plant":
            facility_name = f"{city} Manufacturing Plant"
            if random.random() < 0.3:  # 30% chance of adding specialty
                specialties = ["Pharmaceutical", "Food", "Electronics", "Automotive", "Chemical", "Consumer Goods"]
                facility_name = f"{city} {random.choice(specialties)} Manufacturing Plant"
        elif facility_type == "Warehouse":
            facility_name = f"{region} Distribution Warehouse"
            if random.random() < 0.5:  # 50% chance of adding location
                facility_name = f"{city} Distribution Warehouse"
        elif facility_type == "Distribution Center":
            facility_name = f"{country} Distribution Center"
            if random.random() < 0.5:  # 50% chance of adding location
                facility_name = f"{city} Distribution Center"
        elif facility_type == "R&D Center":
            facility_name = f"R&D Center {city}"
            if random.random() < 0.3:  # 30% chance of adding specialty
                specialties = ["Pharmaceutical", "Formulation", "Process Development", "Analytical", "Innovation"]
                facility_name = f"{random.choice(specialties)} R&D Center"
        else:  # Administrative Office
            if i in main_facility_indices:  # If this is a main facility
                facility_name = f"Corporate Headquarters - {city}"
            else:
                facility_name = f"{region} Administrative Office"
                if random.random() < 0.5:  # 50% chance of adding location
                    facility_name = f"{city} Administrative Office"
        
        data["facility_name"].append(facility_name)
        
        # Generate street address
        street_number = random.randint(1, 9999)
        street_types = ["Street", "Avenue", "Boulevard", "Road", "Lane", "Drive", "Way", "Place", "Court"]
        street_name = f"{random.choice(last_names)} {random.choice(street_types)}"
        
        address = f"{street_number} {street_name}, {city}, {country}"
        data["address"].append(address)
        
        # Assign manager
        data["manager_id"].append(manager_ids[i % len(manager_ids)])  # Cycle through available managers
        
        # Set operating hours
        operating_hour_options = [
            "24/7", 
            "Mon-Fri: 8AM-5PM", 
            "Mon-Fri: 7AM-7PM, Sat: 8AM-12PM",
            "Mon-Sat: 6AM-10PM",
            "Mon-Fri: 6AM-6PM, Weekends: On-call"
        ]
        
        # Manufacturing plants and warehouses are more likely to have extended hours
        if facility_type in ["Manufacturing Plant", "Warehouse", "Distribution Center"]:
            hours_weights = [0.4, 0.2, 0.2, 0.15, 0.05]  # More 24/7 operations
        else:
            hours_weights = [0.05, 0.6, 0.2, 0.1, 0.05]  # More standard business hours
            
        data["operating_hours"].append(random.choices(operating_hour_options, weights=hours_weights)[0])
        
        # Set status
        statuses = ["Active", "Inactive", "Under Construction", "Under Renovation", "Planned"]
        status_weights = [0.8, 0.05, 0.05, 0.05, 0.05]  # Mostly active facilities
        data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Set parent facility ID (if applicable)
        if i in main_facility_indices:
            # Main facilities have no parent
            data["parent_facility_id"].append("")
        else:
            # Other facilities have a parent with 80% probability
            if random.random() < 0.8:
                data["parent_facility_id"].append(random.choice(potential_parent_ids))
            else:
                data["parent_facility_id"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} facility records.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(facilities_df):
    """
    Display basic statistics about the generated facilities data
    
    Parameters:
    - facilities_df: DataFrame containing facilities data
    """
    if facilities_df is None or len(facilities_df) == 0:
        print("No facilities data to analyze.")
        return
    
    print("\nFacilities Statistics:")
    print(f"Total facilities: {len(facilities_df)}")
    
    # Facility type distribution
    print("\nFacility Type Distribution:")
    type_counts = facilities_df['facility_type'].value_counts()
    for facility_type, count in type_counts.items():
        print(f"  {facility_type}: {count} ({count/len(facilities_df)*100:.1f}%)")
    
    # Status distribution
    print("\nStatus Distribution:")
    status_counts = facilities_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(facilities_df)*100:.1f}%)")
    
    # Operating hours distribution
    print("\nOperating Hours Distribution:")
    hours_counts = facilities_df['operating_hours'].value_counts()
    for hours, count in hours_counts.items():
        print(f"  {hours}: {count} ({count/len(facilities_df)*100:.1f}%)")
    
    # Parent-child relationships
    has_parent = facilities_df['parent_facility_id'].apply(lambda x: x != "").sum()
    print(f"\nFacilities with parent: {has_parent} ({has_parent/len(facilities_df)*100:.1f}%)")
    
    # Facilities by region (extracted from address)
    if 'address' in facilities_df.columns:
        print("\nRegional Distribution:")
        
        # Extract country from address
        facilities_df['country'] = facilities_df['address'].apply(lambda x: x.split(',')[-1].strip() if isinstance(x, str) else "Unknown")
        
        country_counts = facilities_df['country'].value_counts()
        for country, count in country_counts.items():
            print(f"  {country}: {count} ({count/len(facilities_df)*100:.1f}%)")
    
    # Parent facilities analysis
    parent_ids = [pid for pid in facilities_df['parent_facility_id'].unique() if pid != ""]
    if parent_ids:
        parent_facilities = facilities_df[facilities_df['facility_id'].isin(parent_ids)]
        
        print("\nParent Facilities:")
        for _, facility in parent_facilities.iterrows():
            child_count = sum(facilities_df['parent_facility_id'] == facility['facility_id'])
            print(f"  {facility['facility_name']} ({facility['facility_type']}): {child_count} child facilities")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Try to load personnel data if available
    try:
        personnel_df = pd.read_csv("data/personnel.csv")
    except FileNotFoundError:
        print("Note: Personnel data file not found.")
        personnel_df = None
    
    # Generate facilities data
    facilities_df = generate_facilities_data(
        personnel_df,
        num_facilities=15,  # Generate 15 facility records
        output_file="data/facilities.csv"
    )
    
    # Display statistics
    if facilities_df is not None:
        display_statistics(facilities_df)
        
        # Display sample data
        print("\nSample facilities data (first 5 records):")
        print(facilities_df.head(5))

Note: Personnel data file not found.
Generating synthetic manager IDs...
Successfully generated 15 facility records.
Data saved to data/facilities.csv

Facilities Statistics:
Total facilities: 15

Facility Type Distribution:
  Distribution Center: 5 (33.3%)
  Warehouse: 4 (26.7%)
  R&D Center: 2 (13.3%)
  Administrative Office: 2 (13.3%)
  Manufacturing Plant: 2 (13.3%)

Status Distribution:
  Active: 13 (86.7%)
  Under Construction: 2 (13.3%)

Operating Hours Distribution:
  Mon-Fri: 8AM-5PM: 5 (33.3%)
  Mon-Sat: 6AM-10PM: 5 (33.3%)
  24/7: 3 (20.0%)
  Mon-Fri: 7AM-7PM, Sat: 8AM-12PM: 2 (13.3%)

Facilities with parent: 9 (60.0%)

Regional Distribution:
  Canada: 2 (13.3%)
  Singapore: 2 (13.3%)
  Italy: 2 (13.3%)
  India: 1 (6.7%)
  Brazil: 1 (6.7%)
  New Zealand: 1 (6.7%)
  Australia: 1 (6.7%)
  Netherlands: 1 (6.7%)
  Colombia: 1 (6.7%)
  China: 1 (6.7%)
  USA: 1 (6.7%)
  UK: 1 (6.7%)

Parent Facilities:
  Latin America Distribution Warehouse (Warehouse): 2 child facilities
  Jones 

Storage Locations

In [13]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_facilities_data(facilities_file="data/facilities.csv"):
    """
    Load the previously generated facilities data
    
    Parameters:
    - facilities_file: CSV file containing facilities data
    
    Returns:
    - DataFrame containing the facilities data
    """
    try:
        return pd.read_csv(facilities_file)
    except FileNotFoundError:
        print(f"Error: Facilities data file {facilities_file} not found.")
        print("Please run the facilities data generation script first.")
        return None

def generate_storage_locations(facilities_df, num_locations=None, output_file="data/storage_locations.csv"):
    """
    Generate synthetic data for the StorageLocations table from ISA-95 Level 4.
    
    Parameters:
    - facilities_df: DataFrame containing facilities data
    - num_locations: Number of storage location records to generate (default: auto-calculate based on facilities)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated storage locations data
    """
    if facilities_df is None or len(facilities_df) == 0:
        print("Error: No facilities data available.")
        return None
    
    # Auto-calculate number of locations if not specified
    if num_locations is None:
        # Generate more storage locations for manufacturing plants and warehouses
        # and fewer for offices and R&D centers
        num_locations = 0
        for _, facility in facilities_df.iterrows():
            if 'facility_type' in facility:
                if facility['facility_type'] == 'Manufacturing Plant':
                    num_locations += random.randint(10, 30)
                elif facility['facility_type'] in ['Warehouse', 'Distribution Center']:
                    num_locations += random.randint(30, 50)
                elif facility['facility_type'] == 'R&D Center':
                    num_locations += random.randint(5, 15)
                else:  # Administrative Office
                    num_locations += random.randint(1, 5)
            else:
                num_locations += random.randint(10, 20)  # Default if type not available
    
    # Define storage location types
    location_types = {
        "Warehouse": ["Bulk Storage", "Pallet Rack", "Shelf", "Bin", "Cold Storage", "Freezer", "Controlled Substance", "Hazardous Material"],
        "Production": ["Raw Material Staging", "Work In Progress", "Finished Goods", "Line-Side", "Temporary Holding", "Quality Control Hold"],
        "Shipping": ["Shipping Dock", "Receiving Dock", "Staging Area", "Cross-Dock", "Outbound Queue", "Returns Processing"],
        "Special": ["Sample Storage", "Archive", "Quarantine", "QA Lab", "Damaged Goods", "Rejected Material", "Maintenance Supplies"]
    }
    
    # Define storage conditions
    storage_conditions = {
        "Standard": ["Ambient", "Room Temperature (15-25°C)", "Dry", "Standard Warehouse Conditions"],
        "Temperature Controlled": ["Refrigerated (2-8°C)", "Cold Room (8-15°C)", "Freezer (-20°C)", "Deep Freeze (-80°C)", "Heated (25-40°C)"],
        "Environmental Control": ["Humidity Controlled (<40% RH)", "Humidity Controlled (40-60% RH)", "Clean Room ISO Class 8", "Clean Room ISO Class 7", "Clean Room ISO Class 6"],
        "Special Conditions": ["Explosion Proof", "Fire Resistant", "ESD Protected", "Light Protected", "Nitrogen Atmosphere", "Oxygen-Free"]
    }
    
    # Generate data structure
    data = {
        "location_id": [f"LOC-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_locations)],
        "location_name": [],
        "facility_id": [],
        "location_type": [],
        "storage_conditions": [],
        "maximum_capacity": [],
        "current_utilization": [],
        "status": [],
        "parent_location_id": []
    }
    
    # Map to keep track of locations by facility
    locations_by_facility = {facility_id: [] for facility_id in facilities_df['facility_id']}
    
    # Generate root-level storage locations first (ones with no parent)
    facility_ids = facilities_df['facility_id'].tolist()
    root_locations_count = int(num_locations * 0.2)  # 20% of locations are root level
    child_locations_count = num_locations - root_locations_count
    
    # Keep track of all location IDs
    all_location_ids = data["location_id"].copy()
    root_location_ids = all_location_ids[:root_locations_count]
    
    # Generate root locations first
    for i in range(root_locations_count):
        location_id = all_location_ids[i]
        
        # Select a facility (weighted toward warehouses and manufacturing plants)
        if 'facility_type' in facilities_df.columns:
            facility_weights = facilities_df['facility_type'].apply(
                lambda x: 3 if x in ['Warehouse', 'Distribution Center'] else 
                         (2 if x == 'Manufacturing Plant' else 1)
            )
            facility = facilities_df.sample(weights=facility_weights).iloc[0]
        else:
            facility = facilities_df.sample(1).iloc[0]
            
        facility_id = facility['facility_id']
        locations_by_facility[facility_id].append(location_id)
        data["facility_id"].append(facility_id)
        
        # Determine location type based on facility type
        if 'facility_type' in facility:
            if facility['facility_type'] in ['Warehouse', 'Distribution Center']:
                location_category = random.choices(
                    ["Warehouse", "Shipping", "Special"],
                    weights=[0.7, 0.2, 0.1]
                )[0]
            elif facility['facility_type'] == 'Manufacturing Plant':
                location_category = random.choices(
                    ["Warehouse", "Production", "Shipping", "Special"],
                    weights=[0.3, 0.4, 0.2, 0.1]
                )[0]
            elif facility['facility_type'] == 'R&D Center':
                location_category = random.choices(
                    ["Warehouse", "Special"],
                    weights=[0.3, 0.7]
                )[0]
            else:  # Administrative Office
                location_category = "Special"
        else:
            location_category = random.choice(list(location_types.keys()))
            
        location_type = random.choice(location_types[location_category])
        data["location_type"].append(location_type)
        
        # Generate location name
        building = random.choice(["Building", "Block", "Wing", "Area", "Zone"])
        building_num = random.choice(["A", "B", "C", "D", "1", "2", "3", "4"])
        
        # Format location name based on type
        if location_category == "Warehouse":
            location_name = f"{building} {building_num} - {location_type}"
        elif location_category == "Production":
            location_name = f"Production {building} {building_num} - {location_type}"
        elif location_category == "Shipping":
            location_name = f"{location_type} Area {building_num}"
        else:  # Special
            location_name = f"{location_type} {building} {building_num}"
            
        data["location_name"].append(location_name)
        
        # Set storage conditions based on location type
        if location_type in ["Cold Storage", "Freezer", "Refrigerated"]:
            condition_category = "Temperature Controlled"
        elif location_type in ["Clean Room", "QA Lab", "Controlled Substance"]:
            condition_category = "Environmental Control"
        elif location_type in ["Hazardous Material", "Quarantine"]:
            condition_category = "Special Conditions"
        else:
            condition_category = random.choices(
                list(storage_conditions.keys()),
                weights=[0.7, 0.1, 0.1, 0.1]
            )[0]
            
        data["storage_conditions"].append(random.choice(storage_conditions[condition_category]))
        
        # Set capacity based on location type
        if location_type in ["Bulk Storage", "Pallet Rack", "Warehouse"]:
            max_capacity = random.randint(1000, 10000)
        elif location_type in ["Shelf", "Bin", "Line-Side"]:
            max_capacity = random.randint(100, 1000)
        elif location_type in ["Cold Storage", "Freezer", "Controlled Substance"]:
            max_capacity = random.randint(500, 5000)
        else:
            max_capacity = random.randint(200, 3000)
            
        data["maximum_capacity"].append(max_capacity)
        
        # Set current utilization (as a percentage)
        utilization = round(random.uniform(0.3, 0.9) * 100, 1)  # 30-90% utilized
        data["current_utilization"].append(utilization)
        
        # Set status
        statuses = ["Active", "Inactive", "Maintenance", "Full", "Reserved"]
        status_weights = [0.8, 0.05, 0.05, 0.05, 0.05]  # Mostly active
        data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Root locations have no parent
        data["parent_location_id"].append("")
    
    # Generate child locations
    for i in range(child_locations_count):
        location_id = all_location_ids[root_locations_count + i]
        
        # Select a facility and a potential parent location in that facility
        facility_id = random.choice(facility_ids)
        
        # If the facility has root locations, use one as parent
        if locations_by_facility[facility_id]:
            parent_location_id = random.choice(locations_by_facility[facility_id])
            parent_index = all_location_ids.index(parent_location_id)
            parent_location_type = data["location_type"][parent_index]
            
            # Store the facility and parent relationship
            data["facility_id"].append(facility_id)
            data["parent_location_id"].append(parent_location_id)
            
            # Child location types are derived from parent types
            if parent_location_type in ["Bulk Storage", "Pallet Rack", "Warehouse"]:
                child_types = ["Aisle", "Bay", "Rack", "Section", "Zone"]
                location_type = f"{random.choice(child_types)} {random.randint(1, 99):02d}"
            elif parent_location_type in ["Shelf", "Bin"]:
                location_type = f"{parent_location_type} {random.randint(1, 999):03d}"
            elif parent_location_type in ["Cold Storage", "Freezer"]:
                location_type = f"{parent_location_type} Section {random.randint(1, 20):02d}"
            elif parent_location_type in ["Raw Material Staging", "Finished Goods"]:
                location_type = f"{parent_location_type} Area {random.choice(['A', 'B', 'C', 'D'])}{random.randint(1, 20):02d}"
            else:
                # Default child naming for other parent types
                location_type = f"Sub-Location {random.randint(1, 99):02d}"
        else:
            # If no root locations in this facility, create as a root location
            data["facility_id"].append(facility_id)
            data["parent_location_id"].append("")
            
            # Pick a random location type
            location_category = random.choice(list(location_types.keys()))
            location_type = random.choice(location_types[location_category])
        
        data["location_type"].append(location_type)
        
        # Generate location name
        if data["parent_location_id"][-1]:  # If has parent
            parent_index = all_location_ids.index(data["parent_location_id"][-1])
            parent_name = data["location_name"][parent_index]
            location_name = f"{parent_name} - {location_type}"
        else:
            building = random.choice(["Building", "Block", "Wing", "Area", "Zone"])
            building_num = random.choice(["A", "B", "C", "D", "1", "2", "3", "4"])
            location_name = f"{building} {building_num} - {location_type}"
            
        data["location_name"].append(location_name)
        
        # Set storage conditions (inherit from parent or generate new)
        if data["parent_location_id"][-1]:
            # 80% chance to inherit parent's conditions
            if random.random() < 0.8:
                parent_index = all_location_ids.index(data["parent_location_id"][-1])
                data["storage_conditions"].append(data["storage_conditions"][parent_index])
            else:
                condition_category = random.choice(list(storage_conditions.keys()))
                data["storage_conditions"].append(random.choice(storage_conditions[condition_category]))
        else:
            condition_category = random.choice(list(storage_conditions.keys()))
            data["storage_conditions"].append(random.choice(storage_conditions[condition_category]))
        
        # Set capacity (smaller for child locations)
        if data["parent_location_id"][-1]:
            parent_index = all_location_ids.index(data["parent_location_id"][-1])
            parent_capacity = data["maximum_capacity"][parent_index]
            # Child capacity is a fraction of parent capacity
            max_capacity = int(parent_capacity * random.uniform(0.05, 0.2))
        else:
            max_capacity = random.randint(100, 5000)
            
        data["maximum_capacity"].append(max_capacity)
        
        # Set current utilization (as a percentage)
        utilization = round(random.uniform(0.2, 0.95) * 100, 1)  # 20-95% utilized
        data["current_utilization"].append(utilization)
        
        # Set status (inherit from parent with some variation)
        if data["parent_location_id"][-1]:
            parent_index = all_location_ids.index(data["parent_location_id"][-1])
            parent_status = data["status"][parent_index]
            
            if parent_status == "Inactive":
                # Inactive parents have inactive children
                data["status"].append("Inactive")
            elif parent_status == "Maintenance":
                # Maintenance parents likely have maintenance children
                statuses = ["Maintenance", "Inactive", "Active"]
                data["status"].append(random.choices(statuses, weights=[0.7, 0.2, 0.1])[0])
            else:
                # Active parents have mostly active children
                statuses = ["Active", "Inactive", "Maintenance", "Full", "Reserved"]
                status_weights = [0.75, 0.05, 0.05, 0.1, 0.05]
                data["status"].append(random.choices(statuses, weights=status_weights)[0])
        else:
            statuses = ["Active", "Inactive", "Maintenance", "Full", "Reserved"]
            status_weights = [0.8, 0.05, 0.05, 0.05, 0.05]  # Mostly active
            data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Add this location to the facility's location list
        locations_by_facility[facility_id].append(location_id)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} storage location records across {len(facilities_df)} facilities.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(storage_locations_df, facilities_df=None):
    """
    Display basic statistics about the generated storage locations data
    
    Parameters:
    - storage_locations_df: DataFrame containing storage locations data
    - facilities_df: DataFrame containing facilities data (optional)
    """
    if storage_locations_df is None or len(storage_locations_df) == 0:
        print("No storage locations data to analyze.")
        return
    
    print("\nStorage Locations Statistics:")
    print(f"Total storage locations: {len(storage_locations_df)}")
    
    # Location type distribution (top 10)
    print("\nLocation Type Distribution (top 10):")
    type_counts = storage_locations_df['location_type'].value_counts().head(10)
    for loc_type, count in type_counts.items():
        print(f"  {loc_type}: {count} ({count/len(storage_locations_df)*100:.1f}%)")
    
    # Status distribution
    print("\nStatus Distribution:")
    status_counts = storage_locations_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(storage_locations_df)*100:.1f}%)")
    
    # Storage conditions distribution (top 10)
    print("\nStorage Conditions Distribution (top 10):")
    condition_counts = storage_locations_df['storage_conditions'].value_counts().head(10)
    for condition, count in condition_counts.items():
        print(f"  {condition}: {count} ({count/len(storage_locations_df)*100:.1f}%)")
    
    # Parent-child relationships
    has_parent = storage_locations_df['parent_location_id'].apply(lambda x: x != "").sum()
    print(f"\nLocations with parent: {has_parent} ({has_parent/len(storage_locations_df)*100:.1f}%)")
    
    # Distribution by facility
    facility_counts = storage_locations_df.groupby('facility_id').size().sort_values(ascending=False)
    
    print("\nStorage Locations by Facility:")
    if facilities_df is not None:
        # Create a mapping of facility_id to facility_name
        facility_names = dict(zip(facilities_df['facility_id'], facilities_df['facility_name']))
        
        for facility_id, count in facility_counts.items():
            facility_name = facility_names.get(facility_id, facility_id)
            print(f"  {facility_name}: {count} locations ({count/len(storage_locations_df)*100:.1f}%)")
    else:
        for facility_id, count in facility_counts.items():
            print(f"  {facility_id}: {count} locations ({count/len(storage_locations_df)*100:.1f}%)")
    
    # Capacity and utilization statistics
    print("\nCapacity Statistics:")
    print(f"  Total maximum capacity: {storage_locations_df['maximum_capacity'].sum():,}")
    print(f"  Average maximum capacity: {storage_locations_df['maximum_capacity'].mean():.1f}")
    print(f"  Average utilization: {storage_locations_df['current_utilization'].mean():.1f}%")
    
    # Full or near-full locations
    full_locations = storage_locations_df[storage_locations_df['current_utilization'] >= 90]
    print(f"\nLocations at or above 90% capacity: {len(full_locations)} ({len(full_locations)/len(storage_locations_df)*100:.1f}%)")
    
    # Underutilized locations
    under_locations = storage_locations_df[storage_locations_df['current_utilization'] <= 30]
    print(f"Locations at or below 30% capacity: {len(under_locations)} ({len(under_locations)/len(storage_locations_df)*100:.1f}%)")
    
    # Hierarchy depth analysis
    if has_parent > 0:
        print("\nLocation Hierarchy Analysis:")
        
        # Count locations at each hierarchy level
        storage_locations_df['hierarchy_level'] = 1  # Start all at level 1
        
        # Create a copy to avoid SettingWithCopyWarning
        temp_df = storage_locations_df.copy()
        
        # Create a mapping of location_id to index for faster lookup
        location_id_to_index = {loc_id: i for i, loc_id in enumerate(temp_df['location_id'])}
        
        # Iterate to propagate levels
        max_iterations = 10  # Avoid infinite loops
        for _ in range(max_iterations):
            changes = 0
            for i, row in temp_df.iterrows():
                if row['parent_location_id'] != "":
                    parent_idx = location_id_to_index.get(row['parent_location_id'])
                    if parent_idx is not None:
                        parent_level = temp_df.at[parent_idx, 'hierarchy_level']
                        child_level = parent_level + 1
                        if temp_df.at[i, 'hierarchy_level'] != child_level:
                            temp_df.at[i, 'hierarchy_level'] = child_level
                            changes += 1
            if changes == 0:
                break
                
        # Count locations at each level
        level_counts = temp_df['hierarchy_level'].value_counts().sort_index()
        for level, count in level_counts.items():
            print(f"  Level {level}: {count} locations ({count/len(temp_df)*100:.1f}%)")
        
        print(f"  Maximum hierarchy depth: {level_counts.index.max()}")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    facilities_df = load_facilities_data()
    
    if facilities_df is not None:
        # Generate storage locations data
        storage_locations_df = generate_storage_locations(
            facilities_df,
            num_locations=None,  # Auto-calculate based on facilities
            output_file="data/storage_locations.csv"
        )
        
        # Display statistics
        if storage_locations_df is not None:
            display_statistics(storage_locations_df, facilities_df)
            
            # Display sample data
            print("\nSample storage locations data (first 5 records):")
            print(storage_locations_df.head(5))

Successfully generated 444 storage location records across 15 facilities.
Data saved to data/storage_locations.csv

Storage Locations Statistics:
Total storage locations: 444

Location Type Distribution (top 10):
  Hazardous Material: 13 (2.9%)
  Bin: 10 (2.3%)
  Freezer: 9 (2.0%)
  Shelf: 8 (1.8%)
  Sub-Location 29: 8 (1.8%)
  Sub-Location 19: 8 (1.8%)
  Pallet Rack: 8 (1.8%)
  Sub-Location 73: 7 (1.6%)
  Sub-Location 99: 7 (1.6%)
  Damaged Goods: 6 (1.4%)

Status Distribution:
  Active: 315 (70.9%)
  Inactive: 47 (10.6%)
  Full: 31 (7.0%)
  Maintenance: 30 (6.8%)
  Reserved: 21 (4.7%)

Storage Conditions Distribution (top 10):
  Room Temperature (15-25°C): 48 (10.8%)
  Dry: 42 (9.5%)
  Ambient: 41 (9.2%)
  Nitrogen Atmosphere: 34 (7.7%)
  ESD Protected: 33 (7.4%)
  Clean Room ISO Class 7: 31 (7.0%)
  Explosion Proof: 30 (6.8%)
  Freezer (-20°C): 22 (5.0%)
  Humidity Controlled (40-60% RH): 22 (5.0%)
  Standard Warehouse Conditions: 22 (5.0%)

Locations with parent: 355 (80.0%)

Stora

Shifts Table

In [14]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta, time
import random
import os

def load_facilities_data(facilities_file="data/facilities.csv"):
    """
    Load the previously generated facilities data
    
    Parameters:
    - facilities_file: CSV file containing facilities data
    
    Returns:
    - DataFrame containing the facilities data
    """
    try:
        return pd.read_csv(facilities_file)
    except FileNotFoundError:
        print(f"Error: Facilities data file {facilities_file} not found.")
        print("Please run the facilities data generation script first.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Shifts will be generated with synthetic supervisor IDs.")
        return None

def generate_shifts_data(facilities_df, personnel_df=None, output_file="data/shifts.csv"):
    """
    Generate synthetic data for the Shifts table from ISA-95 Level 4.
    
    Parameters:
    - facilities_df: DataFrame containing facilities data
    - personnel_df: DataFrame containing personnel data (optional)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated shifts data
    """
    if facilities_df is None or len(facilities_df) == 0:
        print("Error: No facilities data available.")
        return None
    
    # Generate supervisor IDs if personnel_df is not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic supervisor IDs...")
        supervisor_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        # Try to select supervisors or managers if job title info is available
        if 'job_title' in personnel_df.columns:
            supervisors = personnel_df[personnel_df['job_title'].str.contains('Supervisor|Manager|Lead|Foreman', case=False, na=False)]
            if len(supervisors) > 0:
                supervisor_ids = supervisors['personnel_id'].tolist()
            else:
                supervisor_ids = personnel_df['personnel_id'].sample(min(30, len(personnel_df))).tolist()
        else:
            supervisor_ids = personnel_df['personnel_id'].sample(min(30, len(personnel_df))).tolist()
    
    # Define common shift patterns
    shift_patterns = {
        "Standard": [
            {"name": "Day Shift", "start": "08:00", "end": "16:00", "breaks": [{"start": "12:00", "end": "12:30", "type": "Lunch"}]},
            {"name": "Evening Shift", "start": "16:00", "end": "00:00", "breaks": [{"start": "20:00", "end": "20:30", "type": "Dinner"}]},
            {"name": "Night Shift", "start": "00:00", "end": "08:00", "breaks": [{"start": "04:00", "end": "04:30", "type": "Meal"}]}
        ],
        "Manufacturing": [
            {"name": "First Shift", "start": "06:00", "end": "14:00", "breaks": [{"start": "10:00", "end": "10:15", "type": "Break"}, {"start": "12:00", "end": "12:30", "type": "Lunch"}]},
            {"name": "Second Shift", "start": "14:00", "end": "22:00", "breaks": [{"start": "18:00", "end": "18:15", "type": "Break"}, {"start": "19:00", "end": "19:30", "type": "Dinner"}]},
            {"name": "Third Shift", "start": "22:00", "end": "06:00", "breaks": [{"start": "02:00", "end": "02:15", "type": "Break"}, {"start": "03:00", "end": "03:30", "type": "Meal"}]}
        ],
        "Distribution": [
            {"name": "Morning Shift", "start": "07:00", "end": "15:30", "breaks": [{"start": "10:00", "end": "10:15", "type": "Break"}, {"start": "12:00", "end": "12:30", "type": "Lunch"}]},
            {"name": "Afternoon Shift", "start": "15:00", "end": "23:30", "breaks": [{"start": "18:00", "end": "18:15", "type": "Break"}, {"start": "20:00", "end": "20:30", "type": "Dinner"}]},
            {"name": "Overnight Shift", "start": "23:00", "end": "07:30", "breaks": [{"start": "02:00", "end": "02:15", "type": "Break"}, {"start": "04:00", "end": "04:30", "type": "Meal"}]}
        ],
        "Office": [
            {"name": "Business Hours", "start": "09:00", "end": "17:00", "breaks": [{"start": "12:00", "end": "13:00", "type": "Lunch"}]},
            {"name": "Extended Hours", "start": "08:00", "end": "18:00", "breaks": [{"start": "12:30", "end": "13:30", "type": "Lunch"}]}
        ],
        "Continuous": [
            {"name": "A Shift", "start": "06:00", "end": "18:00", "breaks": [{"start": "10:00", "end": "10:15", "type": "Break"}, {"start": "14:00", "end": "14:30", "type": "Meal"}]},
            {"name": "B Shift", "start": "18:00", "end": "06:00", "breaks": [{"start": "22:00", "end": "22:15", "type": "Break"}, {"start": "02:00", "end": "02:30", "type": "Meal"}]}
        ],
        "Weekend": [
            {"name": "Weekend Day", "start": "07:00", "end": "19:00", "breaks": [{"start": "12:00", "end": "12:45", "type": "Lunch"}]},
            {"name": "Weekend Night", "start": "19:00", "end": "07:00", "breaks": [{"start": "00:00", "end": "00:45", "type": "Meal"}]}
        ]
    }
    
    # Generate data structure
    data = {
        "shift_id": [],
        "shift_name": [],
        "facility_id": [],
        "start_time": [],
        "end_time": [],
        "break_periods": [],
        "supervisor_id": [],
        "notes": []
    }
    
    # Determine which shift patterns to use for each facility
    facility_shift_patterns = {}
    
    for _, facility in facilities_df.iterrows():
        facility_id = facility['facility_id']
        
        # Determine appropriate shift pattern based on facility type
        if 'facility_type' in facility:
            if facility['facility_type'] == 'Manufacturing Plant':
                if 'operating_hours' in facility and '24/7' in str(facility['operating_hours']):
                    pattern_type = random.choice(["Manufacturing", "Continuous"])
                else:
                    pattern_type = "Manufacturing"
            elif facility['facility_type'] in ['Warehouse', 'Distribution Center']:
                if 'operating_hours' in facility and '24/7' in str(facility['operating_hours']):
                    pattern_type = random.choice(["Distribution", "Continuous"])
                else:
                    pattern_type = "Distribution"
            elif facility['facility_type'] == 'R&D Center':
                pattern_type = random.choice(["Standard", "Office"])
            else:  # Administrative Office
                pattern_type = "Office"
        else:
            # Default if facility type not available
            pattern_type = random.choice(list(shift_patterns.keys()))
        
        # Store the selected pattern type
        facility_shift_patterns[facility_id] = pattern_type
        
        # Determine number of shifts for this facility
        if pattern_type == "Office":
            num_shifts = random.randint(1, 2)  # Offices usually have 1-2 shifts
        elif pattern_type == "Continuous":
            num_shifts = 2  # Continuous operations usually have 2 shifts
        else:
            num_shifts = len(shift_patterns[pattern_type])
            
        # Add weekend shifts?
        add_weekend = random.random() < 0.4  # 40% chance of weekend shifts
        
        # Generate shifts for this facility
        for i in range(num_shifts):
            # Create unique shift ID
            shift_id = f"SHIFT-{uuid.uuid4().hex[:8].upper()}"
            data["shift_id"].append(shift_id)
            
            # Set facility ID
            data["facility_id"].append(facility_id)
            
            # Get shift template
            shift_template = shift_patterns[pattern_type][i % len(shift_patterns[pattern_type])]
            
            # Set shift name (append facility name or code for uniqueness)
            if 'facility_name' in facility:
                facility_code = facility['facility_name'].split()[0]  # Use first word of facility name
            else:
                facility_code = facility_id[-4:]  # Use last 4 chars of ID
                
            shift_name = f"{shift_template['name']} ({facility_code})"
            data["shift_name"].append(shift_name)
            
            # Set start and end times
            data["start_time"].append(shift_template['start'])
            data["end_time"].append(shift_template['end'])
            
            # Set break periods as JSON string
            data["break_periods"].append(str(shift_template['breaks']))
            
            # Assign supervisor
            data["supervisor_id"].append(random.choice(supervisor_ids))
            
            # Set notes
            notes_options = [
                "",  # Empty notes most common
                "Cross-trained personnel required",
                "Heavy machinery operation certification needed",
                "Quality inspection responsibilities",
                "Maintenance activities scheduled during this shift",
                "Handover procedures are critical",
                "Specialized training required",
                "High volume production period",
                "Security escort needed for sensitive areas"
            ]
            notes_weights = [0.7, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]  # 70% empty notes
            data["notes"].append(random.choices(notes_options, weights=notes_weights)[0])
        
        # Add weekend shifts if applicable
        if add_weekend:
            for i in range(min(2, len(shift_patterns["Weekend"]))):
                # Create unique shift ID
                shift_id = f"SHIFT-{uuid.uuid4().hex[:8].upper()}"
                data["shift_id"].append(shift_id)
                
                # Set facility ID
                data["facility_id"].append(facility_id)
                
                # Get shift template
                shift_template = shift_patterns["Weekend"][i]
                
                # Set shift name
                if 'facility_name' in facility:
                    facility_code = facility['facility_name'].split()[0]
                else:
                    facility_code = facility_id[-4:]
                    
                shift_name = f"{shift_template['name']} ({facility_code})"
                data["shift_name"].append(shift_name)
                
                # Set start and end times
                data["start_time"].append(shift_template['start'])
                data["end_time"].append(shift_template['end'])
                
                # Set break periods as JSON string
                data["break_periods"].append(str(shift_template['breaks']))
                
                # Assign supervisor
                data["supervisor_id"].append(random.choice(supervisor_ids))
                
                # Set weekend-specific notes
                weekend_notes = [
                    "Weekend coverage",
                    "Reduced staff, cross-training required",
                    "Limited support services available",
                    "On-call maintenance only",
                    "No shipping/receiving on weekends",
                    "Security measures heightened on weekends"
                ]
                data["notes"].append(random.choice(weekend_notes))
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} shift records across {len(facilities_df)} facilities.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(shifts_df, facilities_df=None):
    """
    Display basic statistics about the generated shifts data
    
    Parameters:
    - shifts_df: DataFrame containing shifts data
    - facilities_df: DataFrame containing facilities data (optional)
    """
    if shifts_df is None or len(shifts_df) == 0:
        print("No shifts data to analyze.")
        return
    
    print("\nShifts Statistics:")
    print(f"Total shifts: {len(shifts_df)}")
    
    # Convert time strings to datetime.time objects for analysis
    shifts_df['start_time_obj'] = shifts_df['start_time'].apply(
        lambda x: datetime.strptime(x, '%H:%M').time() if isinstance(x, str) else None
    )
    shifts_df['end_time_obj'] = shifts_df['end_time'].apply(
        lambda x: datetime.strptime(x, '%H:%M').time() if isinstance(x, str) else None
    )
    
    # Calculate shift duration
    def calculate_duration(start, end):
        if start is None or end is None:
            return None
        
        # Convert to datetime for calculation
        start_dt = datetime.combine(datetime.today().date(), start)
        end_dt = datetime.combine(datetime.today().date(), end)
        
        # Adjust for overnight shifts
        if end <= start:
            end_dt += timedelta(days=1)
            
        duration = end_dt - start_dt
        return duration.total_seconds() / 3600  # Return hours
    
    shifts_df['duration_hours'] = shifts_df.apply(
        lambda row: calculate_duration(row['start_time_obj'], row['end_time_obj']), axis=1
    )
    
    # Shift duration statistics
    print("\nShift Duration Statistics:")
    print(f"  Average shift duration: {shifts_df['duration_hours'].mean():.1f} hours")
    print(f"  Minimum shift duration: {shifts_df['duration_hours'].min():.1f} hours")
    print(f"  Maximum shift duration: {shifts_df['duration_hours'].max():.1f} hours")
    
    # Shift distribution by time of day
    def get_time_category(start_time):
        if start_time is None:
            return "Unknown"
        
        hour = start_time.hour
        if 5 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 17:
            return "Afternoon"
        elif 17 <= hour < 22:
            return "Evening"
        else:
            return "Night"
    
    shifts_df['time_category'] = shifts_df['start_time_obj'].apply(get_time_category)
    
    print("\nShift Distribution by Time of Day:")
    time_counts = shifts_df['time_category'].value_counts()
    for category, count in time_counts.items():
        print(f"  {category}: {count} shifts ({count/len(shifts_df)*100:.1f}%)")
    
    # Shift count by facility
    facility_counts = shifts_df.groupby('facility_id').size().sort_values(ascending=False)
    
    print("\nShifts by Facility:")
    if facilities_df is not None:
        # Create a mapping of facility_id to facility_name
        facility_names = dict(zip(facilities_df['facility_id'], facilities_df['facility_name']))
        
        for facility_id, count in facility_counts.items():
            facility_name = facility_names.get(facility_id, facility_id)
            print(f"  {facility_name}: {count} shifts ({count/len(shifts_df)*100:.1f}%)")
    else:
        for facility_id, count in facility_counts.items():
            print(f"  {facility_id}: {count} shifts ({count/len(shifts_df)*100:.1f}%)")
    
    # Break period analysis
    print("\nBreak Period Analysis:")
    
    # Count shifts with break periods
    has_breaks = shifts_df['break_periods'].apply(lambda x: x != "[]" and x != "").sum()
    print(f"  Shifts with breaks: {has_breaks} ({has_breaks/len(shifts_df)*100:.1f}%)")
    
    # Popular shift names
    print("\nCommon Shift Names:")
    name_pattern_counts = shifts_df['shift_name'].apply(lambda x: x.split('(')[0].strip()).value_counts().head(5)
    for pattern, count in name_pattern_counts.items():
        print(f"  {pattern}: {count} shifts ({count/len(shifts_df)*100:.1f}%)")
    
    # Check for 24-hour coverage
    print("\n24-Hour Coverage Analysis:")
    
    # Group by facility and check if shifts cover a full 24 hours
    # This is a simplistic approach; a more thorough analysis would check for overlaps and gaps
    facility_coverage = {}
    
    for facility_id in shifts_df['facility_id'].unique():
        facility_shifts = shifts_df[shifts_df['facility_id'] == facility_id]
        
        # Check if there are night shifts (starting after 20:00 or before 06:00)
        has_night = facility_shifts['start_time_obj'].apply(
            lambda x: x.hour >= 20 or x.hour < 6 if x is not None else False
        ).any()
        
        # Check if there are day shifts (starting between 06:00 and 12:00)
        has_day = facility_shifts['start_time_obj'].apply(
            lambda x: 6 <= x.hour < 12 if x is not None else False
        ).any()
        
        # Check if there are afternoon/evening shifts (starting between 12:00 and 20:00)
        has_evening = facility_shifts['start_time_obj'].apply(
            lambda x: 12 <= x.hour < 20 if x is not None else False
        ).any()
        
        # Determine coverage
        if has_night and has_day and has_evening:
            coverage = "Full 24-hour coverage likely"
        elif has_night and has_day:
            coverage = "Partial coverage (missing afternoon/evening)"
        elif has_night and has_evening:
            coverage = "Partial coverage (missing morning/day)"
        elif has_day and has_evening:
            coverage = "Partial coverage (missing night)"
        elif has_night:
            coverage = "Night coverage only"
        elif has_day:
            coverage = "Day coverage only"
        elif has_evening:
            coverage = "Afternoon/evening coverage only"
        else:
            coverage = "Unknown coverage"
            
        facility_coverage[facility_id] = coverage
    
    coverage_counts = pd.Series(facility_coverage).value_counts()
    for coverage, count in coverage_counts.items():
        print(f"  {coverage}: {count} facilities ({count/len(facility_coverage)*100:.1f}%)")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    facilities_df = load_facilities_data()
    personnel_df = load_personnel_data()
    
    if facilities_df is not None:
        # Generate shifts data
        shifts_df = generate_shifts_data(
            facilities_df,
            personnel_df,
            output_file="data/shifts.csv"
        )
        
        # Display statistics
        if shifts_df is not None:
            display_statistics(shifts_df, facilities_df)
            
            # Display sample data
            print("\nSample shifts data (first 5 records):")
            print(shifts_df.head(5))

Note: Personnel data file data/personnel.csv not found.
Shifts will be generated with synthetic supervisor IDs.
Generating synthetic supervisor IDs...
Successfully generated 58 shift records across 15 facilities.
Data saved to data/shifts.csv

Shifts Statistics:
Total shifts: 58

Shift Duration Statistics:
  Average shift duration: 9.8 hours
  Minimum shift duration: 8.0 hours
  Maximum shift duration: 12.0 hours

Shift Distribution by Time of Day:
  Morning: 25 shifts (43.1%)
  Afternoon: 11 shifts (19.0%)
  Night: 11 shifts (19.0%)
  Evening: 11 shifts (19.0%)

Shifts by Facility:
  R&D Center Jones: 5 shifts (8.6%)
  North America Distribution Warehouse: 5 shifts (8.6%)
  Johnson Distribution Warehouse: 5 shifts (8.6%)
  Miller Manufacturing Plant: 5 shifts (8.6%)
  Europe Distribution Warehouse: 5 shifts (8.6%)
  R&D Center Brownburg: 5 shifts (8.6%)
  West Martinfield Distribution Center: 5 shifts (8.6%)
  Smithhaven Food Manufacturing Plant: 4 shifts (6.9%)
  Latin America Distri

Inventory Transactions Table

In [15]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_materials_data(materials_file="data/materials.csv"):
    """
    Load the previously generated materials data
    
    Parameters:
    - materials_file: CSV file containing materials data
    
    Returns:
    - DataFrame containing the materials data or None if not available
    """
    try:
        return pd.read_csv(materials_file)
    except FileNotFoundError:
        print(f"Note: Materials data file {materials_file} not found.")
        print("Inventory transactions will be generated with synthetic material IDs.")
        return None

def load_storage_locations_data(storage_locations_file="data/storage_locations.csv"):
    """
    Load the previously generated storage locations data
    
    Parameters:
    - storage_locations_file: CSV file containing storage locations data
    
    Returns:
    - DataFrame containing the storage locations data or None if not available
    """
    try:
        return pd.read_csv(storage_locations_file)
    except FileNotFoundError:
        print(f"Note: Storage locations data file {storage_locations_file} not found.")
        print("Inventory transactions will be generated with synthetic location IDs.")
        return None

def load_work_orders_data(work_orders_file="data/work_orders.csv"):
    """
    Load the previously generated work orders data
    
    Parameters:
    - work_orders_file: CSV file containing work orders data
    
    Returns:
    - DataFrame containing the work orders data or None if not available
    """
    try:
        return pd.read_csv(work_orders_file)
    except FileNotFoundError:
        print(f"Note: Work orders data file {work_orders_file} not found.")
        print("Inventory transactions will be generated with synthetic work order IDs where needed.")
        return None

def load_purchase_orders_data(purchase_orders_file="data/purchase_orders.csv"):
    """
    Load the previously generated purchase orders data
    
    Parameters:
    - purchase_orders_file: CSV file containing purchase orders data
    
    Returns:
    - DataFrame containing the purchase orders data or None if not available
    """
    try:
        return pd.read_csv(purchase_orders_file)
    except FileNotFoundError:
        print(f"Note: Purchase orders data file {purchase_orders_file} not found.")
        print("Inventory transactions will be generated with synthetic purchase order IDs where needed.")
        return None

def load_material_lots_data(material_lots_file="data/material_lots.csv"):
    """
    Load the previously generated material lots data
    
    Parameters:
    - material_lots_file: CSV file containing material lots data
    
    Returns:
    - DataFrame containing the material lots data or None if not available
    """
    try:
        return pd.read_csv(material_lots_file)
    except FileNotFoundError:
        print(f"Note: Material lots data file {material_lots_file} not found.")
        print("Inventory transactions will be generated with synthetic lot IDs.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Inventory transactions will be generated with synthetic operator IDs.")
        return None

def generate_inventory_transactions(materials_df=None, storage_locations_df=None, 
                                  work_orders_df=None, purchase_orders_df=None, 
                                  material_lots_df=None, personnel_df=None,
                                  num_transactions=1000, start_time=None, end_time=None,
                                  output_file="data/inventory_transactions.csv"):
    """
    Generate synthetic data for the InventoryTransactions table from ISA-95 Level 4.
    
    Parameters:
    - materials_df: DataFrame containing materials data (optional)
    - storage_locations_df: DataFrame containing storage locations data (optional)
    - work_orders_df: DataFrame containing work orders data (optional)
    - purchase_orders_df: DataFrame containing purchase orders data (optional)
    - material_lots_df: DataFrame containing material lots data (optional)
    - personnel_df: DataFrame containing personnel data (optional)
    - num_transactions: Number of transaction records to generate
    - start_time: Start time for transaction dates (defaults to 365 days ago)
    - end_time: End time for transaction dates (defaults to current date)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated inventory transactions data
    """
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=365)
    if end_time is None:
        end_time = datetime.now()
    
    # Generate material IDs if materials_df is not provided
    if materials_df is None or len(materials_df) == 0:
        print("Generating synthetic material IDs...")
        material_ids = [f"MAT-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    else:
        material_ids = materials_df['material_id'].tolist()
    
    # Generate location IDs if storage_locations_df is not provided
    if storage_locations_df is None or len(storage_locations_df) == 0:
        print("Generating synthetic storage location IDs...")
        location_ids = [f"LOC-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        location_ids = storage_locations_df['location_id'].tolist()
    
    # Generate lot IDs if material_lots_df is not provided
    if material_lots_df is None or len(material_lots_df) == 0:
        print("Generating synthetic lot IDs...")
        lot_ids = [f"LOT-{uuid.uuid4().hex[:8].upper()}" for _ in range(100)]
        
        # Create a mapping of material_id to lot_ids
        material_to_lots = {}
        for material_id in material_ids:
            # Assign 2-5 lots to each material
            num_lots = random.randint(2, 5)
            material_to_lots[material_id] = random.sample(lot_ids, min(num_lots, len(lot_ids)))
    else:
        lot_ids = material_lots_df['lot_id'].tolist()
        
        # Create a mapping of material_id to lot_ids if possible
        material_to_lots = {}
        if 'material_id' in material_lots_df.columns:
            for material_id in material_ids:
                material_lots = material_lots_df[material_lots_df['material_id'] == material_id]
                if len(material_lots) > 0:
                    material_to_lots[material_id] = material_lots['lot_id'].tolist()
                else:
                    # Assign random lots if no specific lots found for this material
                    num_lots = random.randint(1, 3)
                    material_to_lots[material_id] = random.sample(lot_ids, min(num_lots, len(lot_ids)))
        else:
            # If no material_id in material_lots_df, create random assignments
            for material_id in material_ids:
                num_lots = random.randint(1, 3)
                material_to_lots[material_id] = random.sample(lot_ids, min(num_lots, len(lot_ids)))
    
    # Generate work order IDs if work_orders_df is not provided
    if work_orders_df is None or len(work_orders_df) == 0:
        print("Generating synthetic work order IDs...")
        work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        work_order_ids = work_orders_df['work_order_id'].tolist()
    
    # Generate purchase order IDs if purchase_orders_df is not provided
    if purchase_orders_df is None or len(purchase_orders_df) == 0:
        print("Generating synthetic purchase order IDs...")
        purchase_order_ids = [f"PO-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        purchase_order_ids = purchase_orders_df['po_id'].tolist()
    
    # Generate operator IDs if personnel_df is not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic operator IDs...")
        operator_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        operator_ids = personnel_df['personnel_id'].tolist()
    
    # Define transaction types and their probabilities
    transaction_types = {
        "Receipt": 0.2,       # Receiving materials from suppliers
        "Issue": 0.3,         # Issuing materials to production
        "Transfer": 0.25,     # Moving materials between locations
        "Adjustment": 0.05,   # Inventory count adjustments
        "Return": 0.05,       # Returns to inventory
        "Scrap": 0.05,        # Scrapping materials
        "Quality Hold": 0.05, # Placing materials on quality hold
        "Release": 0.05       # Releasing materials from hold
    }
    
    # Define reason codes by transaction type
    reason_codes = {
        "Receipt": ["Purchase Order Receipt", "Production Return", "Transfer In", "Inventory Correction"],
        "Issue": ["Production Issue", "Transfer Out", "Sample Issue", "QC Testing", "R&D Use"],
        "Transfer": ["Inventory Optimization", "Storage Consolidation", "Move to Production", "Staging for Shipping"],
        "Adjustment": ["Cycle Count", "Physical Inventory", "Damaged in Storage", "Expiration", "System Reconciliation"],
        "Return": ["Production Excess", "QC Rejection", "Customer Return", "Unused Material"],
        "Scrap": ["Expired", "Damaged", "Failed QC", "Contaminated", "Obsolete"],
        "Quality Hold": ["Out of Specification", "Pending Test Results", "Supplier Investigation", "Process Deviation"],
        "Release": ["QC Approval", "Investigation Complete", "Deviation Approved", "Rework Complete"]
    }
    
    # Generate data structure
    data = {
        "transaction_id": [f"TRX-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_transactions)],
        "transaction_type": [],
        "material_id": [],
        "lot_id": [],
        "timestamp": [],
        "quantity": [],
        "from_location_id": [],
        "to_location_id": [],
        "work_order_id": [],
        "reference_document": [],
        "operator_id": [],
        "transaction_reason": [],
        "unit_cost": []
    }
    
    # Keep track of material-location inventory for realistic transactions
    inventory = {}  # (material_id, location_id, lot_id) -> quantity
    
    # Generate data for each transaction
    for i in range(num_transactions):
        # Generate timestamp within the specified range
        time_range_seconds = int((end_time - start_time).total_seconds())
        random_seconds = random.randint(0, time_range_seconds)
        timestamp = start_time + timedelta(seconds=random_seconds)
        data["timestamp"].append(timestamp.strftime("%Y-%m-%d %H:%M:%S"))
        
        # Select transaction type (weighted random)
        transaction_type = random.choices(
            list(transaction_types.keys()), 
            weights=list(transaction_types.values())
        )[0]
        data["transaction_type"].append(transaction_type)
        
        # Select material
        material_id = random.choice(material_ids)
        data["material_id"].append(material_id)
        
        # Select lot based on material
        if material_id in material_to_lots and material_to_lots[material_id]:
            lot_id = random.choice(material_to_lots[material_id])
        else:
            lot_id = random.choice(lot_ids)
        data["lot_id"].append(lot_id)
        
        # Handle from_location and to_location based on transaction type
        if transaction_type == "Receipt":
            # Receipts come from outside (no from_location) to a storage location
            from_location_id = ""
            to_location_id = random.choice(location_ids)
            
            # Reference document is typically a purchase order
            if random.random() < 0.8:  # 80% of receipts have a PO reference
                reference_document = f"PO:{random.choice(purchase_order_ids)}"
            else:
                reference_document = ""
                
        elif transaction_type == "Issue":
            # Issues go from a storage location to production (no to_location)
            from_location_id = random.choice(location_ids)
            to_location_id = ""
            
            # Reference document is typically a work order
            if random.random() < 0.8:  # 80% of issues have a WO reference
                reference_document = f"WO:{random.choice(work_order_ids)}"
            else:
                reference_document = ""
                
        elif transaction_type == "Transfer":
            # Transfers go from one storage location to another
            available_locations = list(location_ids)
            from_location_id = random.choice(available_locations)
            
            # Ensure to_location is different from from_location
            remaining_locations = [loc for loc in available_locations if loc != from_location_id]
            if remaining_locations:
                to_location_id = random.choice(remaining_locations)
            else:
                # If only one location available, use it for both (not ideal but prevents errors)
                to_location_id = from_location_id
                
            # Reference document could be various things
            ref_types = ["", f"WO:{random.choice(work_order_ids)}", "Transfer Order:TO-" + str(random.randint(10000, 99999))]
            reference_document = random.choice(ref_types)
            
        elif transaction_type in ["Adjustment", "Scrap"]:
            # Adjustments and scraps occur at a specific location (from_location)
            from_location_id = random.choice(location_ids)
            to_location_id = ""
            
            # Reference document could be various things
            ref_types = ["", "Count Sheet:CS-" + str(random.randint(10000, 99999)), "QC Report:QC-" + str(random.randint(10000, 99999))]
            reference_document = random.choice(ref_types)
            
        elif transaction_type == "Return":
            # Returns go from production (no from_location) to a storage location
            from_location_id = ""
            to_location_id = random.choice(location_ids)
            
            # Reference document is typically a work order
            if random.random() < 0.7:  # 70% of returns have a WO reference
                reference_document = f"WO:{random.choice(work_order_ids)}"
            else:
                reference_document = ""
                
        elif transaction_type == "Quality Hold":
            # Quality holds change the status of inventory at a location
            from_location_id = random.choice(location_ids)
            to_location_id = from_location_id  # Same location, just changing status
            
            # Reference document is typically a QC document
            reference_document = "QC Hold:" + str(random.randint(10000, 99999))
            
        else:  # Release
            # Releases change the status of inventory at a location
            from_location_id = random.choice(location_ids)
            to_location_id = from_location_id  # Same location, just changing status
            
            # Reference document is typically a QC document
            reference_document = "QC Release:" + str(random.randint(10000, 99999))
        
        data["from_location_id"].append(from_location_id)
        data["to_location_id"].append(to_location_id)
        data["reference_document"].append(reference_document)
        
        # Determine quantity based on transaction type and maintain inventory
        inventory_key = (material_id, from_location_id, lot_id)
        
        if transaction_type in ["Receipt", "Return"]:
            # Incoming transactions can have any quantity
            quantity = round(random.uniform(10, 1000), 2)
            
            # Update inventory
            destination_key = (material_id, to_location_id, lot_id)
            if destination_key in inventory:
                inventory[destination_key] = inventory[destination_key] + quantity
            else:
                inventory[destination_key] = quantity
                
        elif transaction_type in ["Issue", "Transfer", "Scrap"]:
            # Outgoing transactions need available inventory
            if inventory_key in inventory and inventory[inventory_key] > 0:
                # Use up to 80% of available inventory
                max_quantity = inventory[inventory_key] * 0.8
                quantity = round(random.uniform(1, max_quantity), 2)
                
                # Update inventory at source
                inventory[inventory_key] = inventory[inventory_key] - quantity
                
                # Update inventory at destination if applicable
                if transaction_type == "Transfer" and to_location_id:
                    destination_key = (material_id, to_location_id, lot_id)
                    if destination_key in inventory:
                        inventory[destination_key] = inventory[destination_key] + quantity
                    else:
                        inventory[destination_key] = quantity
            else:
                # No inventory available, create a small quantity
                quantity = round(random.uniform(1, 100), 2)
                
                # Add to inventory first (anachronistic but ensures future transactions have inventory)
                if from_location_id:
                    inventory[inventory_key] = quantity
        
        elif transaction_type == "Adjustment":
            # Adjustments can be positive or negative
            if random.random() < 0.5:  # 50% positive adjustments
                quantity = round(random.uniform(1, 100), 2)
                
                # Update inventory
                if inventory_key in inventory:
                    inventory[inventory_key] = inventory[inventory_key] + quantity
                else:
                    inventory[inventory_key] = quantity
            else:
                # Negative adjustment
                if inventory_key in inventory and inventory[inventory_key] > 0:
                    # Use up to 30% of available inventory
                    max_quantity = inventory[inventory_key] * 0.3
                    quantity = -round(random.uniform(1, max_quantity), 2)
                    
                    # Update inventory
                    inventory[inventory_key] = inventory[inventory_key] + quantity  # Adding negative
                else:
                    # No inventory available, create a small negative quantity
                    quantity = -round(random.uniform(1, 50), 2)
        
        else:  # Quality Hold or Release
            # These don't change quantity, just status
            if inventory_key in inventory and inventory[inventory_key] > 0:
                quantity = inventory[inventory_key]  # Use the full amount in inventory
            else:
                quantity = round(random.uniform(10, 500), 2)  # Create some quantity if none exists
        
        data["quantity"].append(quantity)
        
        # Set work order ID (for certain transaction types)
        if transaction_type in ["Issue", "Return"] and random.random() < 0.8:
            data["work_order_id"].append(random.choice(work_order_ids))
        else:
            data["work_order_id"].append("")
        
        # Set operator
        data["operator_id"].append(random.choice(operator_ids))
        
        # Set transaction reason
        if transaction_type in reason_codes:
            data["transaction_reason"].append(random.choice(reason_codes[transaction_type]))
        else:
            data["transaction_reason"].append("")
        
        # Set unit cost (for financial tracking)
        if transaction_type in ["Receipt", "Adjustment", "Return"]:
            # These transaction types typically record cost
            unit_cost = round(random.uniform(5, 500), 2)
        else:
            # These use the existing cost basis
            unit_cost = ""
            
        data["unit_cost"].append(unit_cost)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Sort by timestamp to create a chronological history
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    
    # Reset the index after sorting
    df = df.reset_index(drop=True)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} inventory transaction records.")
    print(f"Data saved to {output_file}")
    
    return df

def display_statistics(inventory_transactions_df, materials_df=None, storage_locations_df=None):
    """
    Display basic statistics about the generated inventory transactions data
    
    Parameters:
    - inventory_transactions_df: DataFrame containing inventory transactions data
    - materials_df: DataFrame containing materials data (optional)
    - storage_locations_df: DataFrame containing storage locations data (optional)
    """
    if inventory_transactions_df is None or len(inventory_transactions_df) == 0:
        print("No inventory transactions data to analyze.")
        return
    
    print("\nInventory Transactions Statistics:")
    print(f"Total transactions: {len(inventory_transactions_df)}")
    
    # Transaction type distribution
    print("\nTransaction Type Distribution:")
    type_counts = inventory_transactions_df['transaction_type'].value_counts()
    for trx_type, count in type_counts.items():
        print(f"  {trx_type}: {count} ({count/len(inventory_transactions_df)*100:.1f}%)")
    
    # Transaction volume statistics
    print("\nTransaction Volume Statistics:")
    
    # Convert quantity to numeric, handling any non-numeric values
    inventory_transactions_df['quantity_numeric'] = pd.to_numeric(
        inventory_transactions_df['quantity'], errors='coerce'
    )
    
    positive_transactions = inventory_transactions_df[inventory_transactions_df['quantity_numeric'] > 0]
    negative_transactions = inventory_transactions_df[inventory_transactions_df['quantity_numeric'] < 0]
    
    print(f"  Positive transactions: {len(positive_transactions)} ({len(positive_transactions)/len(inventory_transactions_df)*100:.1f}%)")
    if len(positive_transactions) > 0:
        print(f"  Average positive quantity: {positive_transactions['quantity_numeric'].mean():.2f}")
        print(f"  Total positive quantity: {positive_transactions['quantity_numeric'].sum():.2f}")
        
    print(f"  Negative transactions: {len(negative_transactions)} ({len(negative_transactions)/len(inventory_transactions_df)*100:.1f}%)")
    if len(negative_transactions) > 0:
        print(f"  Average negative quantity: {negative_transactions['quantity_numeric'].mean():.2f}")
        print(f"  Total negative quantity: {negative_transactions['quantity_numeric'].sum():.2f}")
    
    # Time-based analysis
    inventory_transactions_df['timestamp'] = pd.to_datetime(inventory_transactions_df['timestamp'])
    
    print("\nTransaction Timeline:")
    print(f"  First transaction: {inventory_transactions_df['timestamp'].min()}")
    print(f"  Last transaction: {inventory_transactions_df['timestamp'].max()}")
    
    # Transactions by month
    inventory_transactions_df['month'] = inventory_transactions_df['timestamp'].dt.to_period('M')
    monthly_transactions = inventory_transactions_df.groupby('month').size()
    
    print("\nTransactions by Month (latest 6 months):")
    for month, count in monthly_transactions.tail(6).items():
        print(f"  {month}: {count} transactions")
    
    # Material analysis
    material_counts = inventory_transactions_df.groupby('material_id').size().sort_values(ascending=False)
    
    print("\nTop 10 Materials by Transaction Count:")
    if materials_df is not None:
        # Create a mapping of material_id to material_name
        material_names = dict(zip(materials_df['material_id'], materials_df['material_name']))
        
        for material_id, count in material_counts.head(10).items():
            material_name = material_names.get(material_id, material_id)
            print(f"  {material_name}: {count} transactions ({count/len(inventory_transactions_df)*100:.1f}%)")
    else:
        for material_id, count in material_counts.head(10).items():
            print(f"  {material_id}: {count} transactions ({count/len(inventory_transactions_df)*100:.1f}%)")
    
    # Location analysis
    # Combine from and to locations for overall location activity
    from_locations = inventory_transactions_df[inventory_transactions_df['from_location_id'] != ""]
    to_locations = inventory_transactions_df[inventory_transactions_df['to_location_id'] != ""]
    
    from_counts = from_locations.groupby('from_location_id').size()
    to_counts = to_locations.groupby('to_location_id').size()
    
    # Combine the counts
    location_activity = pd.Series(0, index=set(from_counts.index) | set(to_counts.index))
    location_activity = location_activity.add(from_counts, fill_value=0).add(to_counts, fill_value=0)
    location_activity = location_activity.sort_values(ascending=False)
    
    print("\nTop 10 Locations by Transaction Activity:")
    if storage_locations_df is not None:
        # Create a mapping of location_id to location_name
        location_names = dict(zip(storage_locations_df['location_id'], storage_locations_df['location_name']))
        
        for location_id, count in location_activity.head(10).items():
            location_name = location_names.get(location_id, location_id)
            print(f"  {location_name}: {count} transactions")
    else:
        for location_id, count in location_activity.head(10).items():
            print(f"  {location_id}: {count} transactions")
    
    # Reason code analysis
    print("\nTop 10 Transaction Reasons:")
    reason_counts = inventory_transactions_df['transaction_reason'].value_counts().head(10)
    for reason, count in reason_counts.items():
        print(f"  {reason}: {count} ({count/len(inventory_transactions_df)*100:.1f}%)")
    
    # Work order reference analysis
    has_wo = inventory_transactions_df['work_order_id'].apply(lambda x: x != "").sum()
    print(f"\nTransactions with work order reference: {has_wo} ({has_wo/len(inventory_transactions_df)*100:.1f}%)")
    
    # Reference document analysis
    has_ref = inventory_transactions_df['reference_document'].apply(lambda x: x != "").sum()
    print(f"Transactions with reference document: {has_ref} ({has_ref/len(inventory_transactions_df)*100:.1f}%)")
    
    # Cost analysis
    inventory_transactions_df['unit_cost_numeric'] = pd.to_numeric(
        inventory_transactions_df['unit_cost'], errors='coerce'
    )
    
    cost_transactions = inventory_transactions_df[pd.notna(inventory_transactions_df['unit_cost_numeric'])]
    if len(cost_transactions) > 0:
        print("\nCost Statistics:")
        print(f"  Transactions with cost data: {len(cost_transactions)} ({len(cost_transactions)/len(inventory_transactions_df)*100:.1f}%)")
        print(f"  Average unit cost: ${cost_transactions['unit_cost_numeric'].mean():.2f}")
        print(f"  Minimum unit cost: ${cost_transactions['unit_cost_numeric'].min():.2f}")
        print(f"  Maximum unit cost: ${cost_transactions['unit_cost_numeric'].max():.2f}")
        
        # Calculate total value movement
        cost_transactions['transaction_value'] = cost_transactions['quantity_numeric'] * cost_transactions['unit_cost_numeric']
        print(f"  Total transaction value: ${cost_transactions['transaction_value'].sum():.2f}")

if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    materials_df = load_materials_data()
    storage_locations_df = load_storage_locations_data()
    work_orders_df = load_work_orders_data()
    purchase_orders_df = load_purchase_orders_data()
    material_lots_df = load_material_lots_data()
    personnel_df = load_personnel_data()
    
    # Generate inventory transactions data
    inventory_transactions_df = generate_inventory_transactions(
        materials_df,
        storage_locations_df,
        work_orders_df,
        purchase_orders_df,
        material_lots_df,
        personnel_df,
        num_transactions=1000,  # Generate 1000 transaction records
        output_file="data/inventory_transactions.csv"
    )
    
    # Display statistics
    if inventory_transactions_df is not None:
        display_statistics(inventory_transactions_df, materials_df, storage_locations_df)
        
        # Display sample data
        print("\nSample inventory transactions data (first 5 records):")
        print(inventory_transactions_df.head(5))

Note: Personnel data file data/personnel.csv not found.
Inventory transactions will be generated with synthetic operator IDs.
Generating synthetic operator IDs...
Successfully generated 1000 inventory transaction records.
Data saved to data/inventory_transactions.csv

Inventory Transactions Statistics:
Total transactions: 1000

Transaction Type Distribution:
  Issue: 321 (32.1%)
  Transfer: 245 (24.5%)
  Receipt: 196 (19.6%)
  Scrap: 55 (5.5%)
  Return: 51 (5.1%)
  Quality Hold: 46 (4.6%)
  Release: 44 (4.4%)
  Adjustment: 42 (4.2%)

Transaction Volume Statistics:
  Positive transactions: 979 (97.9%)
  Average positive quantity: 177.82
  Total positive quantity: 174082.17
  Negative transactions: 21 (2.1%)
  Average negative quantity: -22.83
  Total negative quantity: -479.52

Transaction Timeline:
  First transaction: 2024-07-16 14:35:47
  Last transaction: 2025-07-16 04:44:39

Transactions by Month (latest 6 months):
  2025-02: 61 transactions
  2025-03: 85 transactions
  2025-04: 87

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cost_transactions['transaction_value'] = cost_transactions['quantity_numeric'] * cost_transactions['unit_cost_numeric']


Pending Tables

In [16]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_work_orders_data(work_orders_file="data/work_orders.csv"):
    """
    Load the previously generated work orders data
    
    Parameters:
    - work_orders_file: CSV file containing work orders data
    
    Returns:
    - DataFrame containing the work orders data or None if not available
    """
    try:
        return pd.read_csv(work_orders_file)
    except FileNotFoundError:
        print(f"Note: Work orders data file {work_orders_file} not found.")
        print("Costs will be generated with synthetic work order IDs where needed.")
        return None

def load_products_data(products_file="data/products.csv"):
    """
    Load the previously generated products data
    
    Parameters:
    - products_file: CSV file containing products data
    
    Returns:
    - DataFrame containing the products data or None if not available
    """
    try:
        return pd.read_csv(products_file)
    except FileNotFoundError:
        print(f"Note: Products data file {products_file} not found.")
        print("Costs will be generated with synthetic product IDs where needed.")
        return None

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data or None if not available
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Note: Equipment data file {equipment_file} not found.")
        print("Costs will be generated with synthetic equipment IDs where needed.")
        return None

def load_batches_data(batches_file="data/batches.csv"):
    """
    Load the previously generated batches data
    
    Parameters:
    - batches_file: CSV file containing batches data
    
    Returns:
    - DataFrame containing the batches data or None if not available
    """
    try:
        return pd.read_csv(batches_file)
    except FileNotFoundError:
        print(f"Note: Batches data file {batches_file} not found.")
        print("Costs will be generated with synthetic batch IDs where needed.")
        return None

def generate_costs_data(work_orders_df=None, products_df=None, equipment_df=None, 
                      batches_df=None, num_costs=500, start_time=None, end_time=None,
                      output_file="data/costs.csv"):
    """
    Generate synthetic data for the Costs table from ISA-95 Level 4.
    
    Parameters:
    - work_orders_df: DataFrame containing work orders data (optional)
    - products_df: DataFrame containing products data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    - batches_df: DataFrame containing batches data (optional)
    - num_costs: Number of cost records to generate
    - start_time: Start time for cost dates (defaults to 365 days ago)
    - end_time: End time for cost dates (defaults to current date)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated costs data
    """
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=365)
    if end_time is None:
        end_time = datetime.now()
    
    # Generate work order IDs if work_orders_df is not provided
    if work_orders_df is None or len(work_orders_df) == 0:
        print("Generating synthetic work order IDs...")
        work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    else:
        work_order_ids = work_orders_df['work_order_id'].tolist()
    
    # Generate product IDs if products_df is not provided
    if products_df is None or len(products_df) == 0:
        print("Generating synthetic product IDs...")
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        product_ids = products_df['product_id'].tolist()
    
    # Generate equipment IDs if equipment_df is not provided
    if equipment_df is None or len(equipment_df) == 0:
        print("Generating synthetic equipment IDs...")
        equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        equipment_ids = equipment_df['equipment_id'].tolist()
    
    # Generate batch IDs if batches_df is not provided
    if batches_df is None or len(batches_df) == 0:
        print("Generating synthetic batch IDs...")
        batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(40)]
    else:
        batch_ids = batches_df['batch_id'].tolist()
    
    # Define cost types and their probabilities
    cost_types = {
        "Labor": 0.3,
        "Material": 0.3,
        "Overhead": 0.15,
        "Energy": 0.05,
        "Maintenance": 0.1,
        "Quality": 0.05,
        "Setup": 0.05
    }
    
    # Define cost categories
    cost_categories = {
        "Labor": ["Direct Labor", "Indirect Labor", "Supervision", "Quality Control", "Engineering"],
        "Material": ["Raw Material", "Packaging", "Consumables", "Spare Parts", "Chemicals"],
        "Overhead": ["Facility", "Depreciation", "Insurance", "Utilities", "IT"],
        "Energy": ["Electricity", "Gas", "Water", "Steam", "Compressed Air"],
        "Maintenance": ["Preventive", "Corrective", "Calibration", "Cleaning", "Inspection"],
        "Quality": ["Testing", "Inspection", "Rework", "Documentation", "Validation"],
        "Setup": ["Machine Setup", "Changeover", "Tooling", "Programming", "Validation"]
    }
    
    # Define cost centers
    cost_centers = ["Production", "Maintenance", "Quality", "Engineering", "Facilities", "Supply Chain", 
                   "Utilities", "R&D", "Administration"]
    
    # Define currencies
    currencies = ["USD", "EUR", "GBP", "JPY", "CAD", "AUD", "CNY"]
    currency_weights = [0.6, 0.15, 0.1, 0.05, 0.05, 0.03, 0.02]  # Mostly USD
    
    # Generate data structure
    data = {
        "cost_id": [f"COST-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_costs)],
        "cost_type": [],
        "work_order_id": [],
        "product_id": [],
        "equipment_id": [],
        "batch_id": [],
        "timestamp": [],
        "amount": [],
        "currency": [],
        "cost_category": [],
        "cost_center": [],
        "planned_cost": [],
        "variance": []
    }
    
    # Generate data for each cost record
    for i in range(num_costs):
        # Select cost type (weighted random)
        cost_type = random.choices(
            list(cost_types.keys()), 
            weights=list(cost_types.values())
        )[0]
        data["cost_type"].append(cost_type)
        
        # Generate timestamp within the specified range
        time_range_seconds = int((end_time - start_time).total_seconds())
        random_seconds = random.randint(0, time_range_seconds)
        timestamp = start_time + timedelta(seconds=random_seconds)
        data["timestamp"].append(timestamp.strftime("%Y-%m-%d %H:%M:%S"))
        
        # Assign work order, product, equipment, and batch based on cost type
        # Not all costs are associated with all entities
        
        # Work order association
        if cost_type in ["Labor", "Material", "Setup", "Quality"] and random.random() < 0.9:
            # These cost types are almost always associated with work orders
            data["work_order_id"].append(random.choice(work_order_ids))
        else:
            # Other cost types may or may not be associated with work orders
            if random.random() < 0.5:
                data["work_order_id"].append(random.choice(work_order_ids))
            else:
                data["work_order_id"].append("")
        
        # Product association
        if cost_type in ["Material", "Quality"] and random.random() < 0.9:
            # These cost types are almost always associated with products
            data["product_id"].append(random.choice(product_ids))
        else:
            # Other cost types may or may not be associated with products
            if random.random() < 0.4:
                data["product_id"].append(random.choice(product_ids))
            else:
                data["product_id"].append("")
        
        # Equipment association
        if cost_type in ["Maintenance", "Energy", "Setup"] and random.random() < 0.9:
            # These cost types are almost always associated with equipment
            data["equipment_id"].append(random.choice(equipment_ids))
        else:
            # Other cost types may or may not be associated with equipment
            if random.random() < 0.4:
                data["equipment_id"].append(random.choice(equipment_ids))
            else:
                data["equipment_id"].append("")
        
        # Batch association
        if data["work_order_id"][-1] and random.random() < 0.7:
            # If associated with a work order, likely associated with a batch
            data["batch_id"].append(random.choice(batch_ids))
        else:
            data["batch_id"].append("")
        
        # Generate cost amount based on cost type
        if cost_type == "Labor":
            # Labor costs are typically higher
            amount = random.uniform(100, 5000)
        elif cost_type == "Material":
            # Material costs vary widely
            amount = random.uniform(50, 10000)
        elif cost_type == "Overhead":
            # Overhead costs can be substantial
            amount = random.uniform(500, 20000)
        elif cost_type == "Energy":
            # Energy costs are moderate
            amount = random.uniform(100, 3000)
        elif cost_type == "Maintenance":
            # Maintenance costs depend on the scope
            amount = random.uniform(200, 8000)
        elif cost_type == "Quality":
            # Quality costs are typically lower
            amount = random.uniform(50, 2000)
        else:  # Setup
            # Setup costs are moderate
            amount = random.uniform(100, 3000)
            
        data["amount"].append(round(amount, 2))
        
        # Set currency (weighted random)
        data["currency"].append(random.choices(currencies, weights=currency_weights)[0])
        
        # Set cost category
        if cost_type in cost_categories:
            data["cost_category"].append(random.choice(cost_categories[cost_type]))
        else:
            data["cost_category"].append("General")
        
        # Set cost center
        if cost_type == "Labor":
            center_weights = [0.6, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0]  # Mostly Production
        elif cost_type == "Material":
            center_weights = [0.5, 0.0, 0.05, 0.05, 0.0, 0.3, 0.0, 0.1, 0.0]  # Production or Supply Chain
        elif cost_type == "Overhead":
            center_weights = [0.2, 0.05, 0.05, 0.05, 0.3, 0.05, 0.1, 0.05, 0.15]  # Varied
        elif cost_type == "Energy":
            center_weights = [0.3, 0.05, 0.05, 0.05, 0.1, 0.0, 0.45, 0.0, 0.0]  # Utilities or Production
        elif cost_type == "Maintenance":
            center_weights = [0.1, 0.7, 0.0, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0]  # Mostly Maintenance
        elif cost_type == "Quality":
            center_weights = [0.1, 0.0, 0.7, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0]  # Mostly Quality
        else:  # Setup
            center_weights = [0.7, 0.1, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0]  # Mostly Production
            
        data["cost_center"].append(random.choices(cost_centers, weights=center_weights)[0])
        
        # Set planned cost and variance
        # About 70% of costs have a planned amount
        if random.random() < 0.7:
            # Planned costs are usually close to actual but can vary
            variation = random.uniform(0.7, 1.3)  # -30% to +30%
            planned_cost = amount / variation
            data["planned_cost"].append(round(planned_cost, 2))
            
            # Calculate variance (actual - planned)
            variance = amount - planned_cost
            data["variance"].append(round(variance, 2))
        else:
            data["planned_cost"].append("")
            data["variance"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} cost records.")
    print(f"Data saved to {output_file}")
    
    return df

#-----------------------------------------------------------------------------------
# Material Lots Data Generation
#-----------------------------------------------------------------------------------

def generate_material_lots_data(materials_df=None, suppliers_df=None, storage_locations_df=None, 
                             num_lots=200, output_file="data/material_lots.csv"):
    """
    Generate synthetic data for the MaterialLots table from ISA-95 Level 4.
    
    Parameters:
    - materials_df: DataFrame containing materials data (optional)
    - suppliers_df: DataFrame containing suppliers data (optional)
    - storage_locations_df: DataFrame containing storage locations data (optional)
    - num_lots: Number of material lot records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated material lots data
    """
    # Generate material IDs if materials_df is not provided
    if materials_df is None or len(materials_df) == 0:
        print("Generating synthetic material IDs...")
        material_ids = [f"MAT-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
        
        # Create synthetic material types for lot assignment
        material_types = {}
        material_units = {}
        for mat_id in material_ids:
            material_types[mat_id] = random.choice(["Raw Material", "Packaging", "WIP", "Intermediate", "Consumable"])
            material_units[mat_id] = random.choice(["kg", "L", "g", "m", "piece", "unit", "box", "roll"])
    else:
        material_ids = materials_df['material_id'].tolist()
        
        # Extract material types and units if available
        if 'material_type' in materials_df.columns:
            material_types = dict(zip(materials_df['material_id'], materials_df['material_type']))
        else:
            material_types = {mat_id: random.choice(["Raw Material", "Packaging", "WIP", "Intermediate", "Consumable"]) 
                             for mat_id in material_ids}
            
        if 'unit_of_measure' in materials_df.columns:
            material_units = dict(zip(materials_df['material_id'], materials_df['unit_of_measure']))
        else:
            material_units = {mat_id: random.choice(["kg", "L", "g", "m", "piece", "unit", "box", "roll"]) 
                             for mat_id in material_ids}
    
    # Generate supplier IDs if suppliers_df is not provided
    if suppliers_df is None or len(suppliers_df) == 0:
        print("Generating synthetic supplier IDs...")
        supplier_ids = [f"SUP-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        supplier_ids = suppliers_df['supplier_id'].tolist()
    
    # Generate storage location IDs if storage_locations_df is not provided
    if storage_locations_df is None or len(storage_locations_df) == 0:
        print("Generating synthetic storage location IDs...")
        storage_location_ids = [f"LOC-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        storage_location_ids = storage_locations_df['location_id'].tolist()
    
    # Generate data structure
    data = {
        "lot_id": [f"LOT-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_lots)],
        "material_id": [],
        "lot_quantity": [],
        "quantity_unit": [],
        "status": [],
        "creation_date": [],
        "expiration_date": [],
        "supplier_id": [],
        "supplier_lot_id": [],
        "receipt_date": [],
        "storage_location_id": [],
        "quality_status": [],
        "cost_per_unit": [],
        "parent_lot_id": []
    }
    
    # Define status options
    statuses = ["Available", "Reserved", "In Use", "Consumed", "On Hold", "Quarantined", "Rejected"]
    status_weights = [0.6, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05]  # Mostly available
    
    # Define quality status options
    quality_statuses = ["Released", "Under Test", "Approved", "Rejected", "Pending Review"]
    quality_weights = [0.7, 0.1, 0.1, 0.05, 0.05]  # Mostly released
    
    # Keep track of created lots by material for parent-child relationships
    lots_by_material = {material_id: [] for material_id in material_ids}
    
    # Generate data for each material lot
    for i in range(num_lots):
        # Select material
        material_id = random.choice(material_ids)
        data["material_id"].append(material_id)
        
        # Determine if this is a split lot (child lot)
        is_child_lot = False
        if i > num_lots * 0.2 and lots_by_material[material_id] and random.random() < 0.2:  # 20% chance for child lots
            is_child_lot = True
            parent_lot_id = random.choice(lots_by_material[material_id])
            data["parent_lot_id"].append(parent_lot_id)
        else:
            data["parent_lot_id"].append("")
        
        # Remember this lot for potential future splits
        lots_by_material[material_id].append(data["lot_id"][i])
        
        # Set quantity based on material type
        material_type = material_types.get(material_id, "Raw Material")
        
        if material_type == "Raw Material":
            if is_child_lot:
                # Child lots are smaller
                quantity = random.uniform(10, 200)
            else:
                quantity = random.uniform(100, 2000)
        elif material_type == "Packaging":
            if is_child_lot:
                quantity = random.uniform(50, 500)
            else:
                quantity = random.uniform(500, 10000)
        elif material_type in ["WIP", "Intermediate"]:
            if is_child_lot:
                quantity = random.uniform(5, 50)
            else:
                quantity = random.uniform(50, 500)
        else:  # Consumable
            if is_child_lot:
                quantity = random.uniform(1, 20)
            else:
                quantity = random.uniform(10, 200)
            
        data["lot_quantity"].append(round(quantity, 2))
        
        # Set unit
        unit = material_units.get(material_id, "kg")
        data["quantity_unit"].append(unit)
        
        # Set status
        if is_child_lot:
            # Child lots typically inherit status from parent, but we don't track that here
            # so just make them mostly available
            data["status"].append(random.choices(statuses, weights=[0.8, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0])[0])
        else:
            data["status"].append(random.choices(statuses, weights=status_weights)[0])
        
        # Generate creation date (within last 2 years)
        days_ago = random.randint(1, 730)
        creation_date = datetime.now() - timedelta(days=days_ago)
        data["creation_date"].append(creation_date.strftime("%Y-%m-%d"))
        
        # Set expiration date based on material type
        if material_type == "Raw Material":
            # Raw materials typically have longer shelf life
            shelf_life_days = random.randint(365, 1825)  # 1-5 years
        elif material_type == "Packaging":
            # Packaging materials have very long shelf life
            shelf_life_days = random.randint(730, 3650)  # 2-10 years
        elif material_type in ["WIP", "Intermediate"]:
            # Intermediate products have shorter shelf life
            shelf_life_days = random.randint(30, 365)  # 1 month to 1 year
        else:  # Consumable
            # Consumables vary widely
            shelf_life_days = random.randint(90, 1095)  # 3 months to 3 years
            
        expiration_date = creation_date + timedelta(days=shelf_life_days)
        data["expiration_date"].append(expiration_date.strftime("%Y-%m-%d"))
        
        # Set supplier info
        if material_type in ["Raw Material", "Packaging", "Consumable"] and not is_child_lot:
            # External materials have supplier info
            data["supplier_id"].append(random.choice(supplier_ids))
            data["supplier_lot_id"].append(f"SUPLOT-{random.randint(10000, 99999)}")
            
            # Receipt date is between creation date and today
            max_receipt_days = min((datetime.now() - creation_date).days, 30)  # Within 30 days of creation
            if max_receipt_days > 0:
                receipt_days = random.randint(0, max_receipt_days)
            else:
                receipt_days = 0
            receipt_date = creation_date + timedelta(days=receipt_days)
            data["receipt_date"].append(receipt_date.strftime("%Y-%m-%d"))
        else:
            # Internally produced materials don't have supplier info
            data["supplier_id"].append("")
            data["supplier_lot_id"].append("")
            data["receipt_date"].append("")
        
        # Set storage location
        data["storage_location_id"].append(random.choice(storage_location_ids))
        
        # Set quality status
        if data["status"][-1] in ["On Hold", "Quarantined", "Rejected"]:
            # Problematic lots have corresponding quality status
            if data["status"][-1] == "On Hold":
                data["quality_status"].append("Under Test")
            elif data["status"][-1] == "Quarantined":
                data["quality_status"].append("Pending Review")
            else:  # Rejected
                data["quality_status"].append("Rejected")
        else:
            # Normal lots have standard quality status
            data["quality_status"].append(random.choices(quality_statuses, weights=quality_weights)[0])
        
        # Set cost per unit
        if material_type == "Raw Material":
            cost = random.uniform(1, 100)
        elif material_type == "Packaging":
            cost = random.uniform(0.1, 10)
        elif material_type in ["WIP", "Intermediate"]:
            cost = random.uniform(5, 200)
        else:  # Consumable
            cost = random.uniform(0.5, 50)
            
        data["cost_per_unit"].append(round(cost, 2))
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} material lot records.")
    print(f"Data saved to {output_file}")
    
    return df

#-----------------------------------------------------------------------------------
# Material Consumption Data Generation
#-----------------------------------------------------------------------------------

def generate_material_consumption_data(material_lots_df=None, batches_df=None, work_orders_df=None,
                                   equipment_df=None, personnel_df=None, num_records=300,
                                   output_file="data/material_consumption.csv"):
    """
    Generate synthetic data for the MaterialConsumption table from ISA-95 Level 4.
    
    Parameters:
    - material_lots_df: DataFrame containing material lots data (optional)
    - batches_df: DataFrame containing batches data (optional)
    - work_orders_df: DataFrame containing work orders data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    - personnel_df: DataFrame containing personnel data (optional)
    - num_records: Number of material consumption records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated material consumption data
    """
    # Load material lots or generate synthetic IDs
    if material_lots_df is None or len(material_lots_df) == 0:
        print("Generating synthetic lot IDs...")
        lot_ids = [f"LOT-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    else:
        # Use only available lots
        available_lots = material_lots_df[material_lots_df['status'].isin(['Available', 'Reserved', 'In Use'])]
        if len(available_lots) > 0:
            lot_ids = available_lots['lot_id'].tolist()
        else:
            lot_ids = material_lots_df['lot_id'].tolist()
    
    # Load batches or generate synthetic IDs
    if batches_df is None or len(batches_df) == 0:
        print("Generating synthetic batch IDs...")
        batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(40)]
    else:
        batch_ids = batches_df['batch_id'].tolist()
    
    # Load work orders or generate synthetic IDs
    if work_orders_df is None or len(work_orders_df) == 0:
        print("Generating synthetic work order IDs...")
        work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        work_order_ids = work_orders_df['work_order_id'].tolist()
    
    # Load equipment or generate synthetic IDs
    if equipment_df is None or len(equipment_df) == 0:
        print("Generating synthetic equipment IDs...")
        equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        equipment_ids = equipment_df['equipment_id'].tolist()
    
    # Load personnel or generate synthetic IDs
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic operator IDs...")
        operator_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(15)]
    else:
        operator_ids = personnel_df['personnel_id'].tolist()
    
    # Generate batch step IDs (these would normally come from BatchStepExecution table)
    step_ids = [f"STEP-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    
    # Generate data structure
    data = {
        "consumption_id": [f"CONS-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_records)],
        "lot_id": [],
        "batch_id": [],
        "work_order_id": [],
        "timestamp": [],
        "quantity": [],
        "unit": [],
        "equipment_id": [],
        "step_id": [],
        "operator_id": [],
        "planned_consumption": [],
        "consumption_variance": []
    }
    
    # Create a mapping of batches to work orders (normally this would come from Batches table)
    batch_to_work_order = {}
    for batch_id in batch_ids:
        batch_to_work_order[batch_id] = random.choice(work_order_ids)
    
    # Keep track of lot consumption to avoid over-consumption
    lot_consumption = {lot_id: 0 for lot_id in lot_ids}
    
    # Get lot quantities and units if available
    lot_quantities = {}
    lot_units = {}
    
    if material_lots_df is not None and 'lot_quantity' in material_lots_df.columns:
        for _, lot in material_lots_df.iterrows():
            if 'lot_id' in lot and 'lot_quantity' in lot:
                lot_quantities[lot['lot_id']] = lot['lot_quantity']
                if 'quantity_unit' in lot:
                    lot_units[lot['lot_id']] = lot['quantity_unit']
    
    # Generate data for each consumption record
    for i in range(num_records):
        # Select lot
        if lot_ids:
            # Try to find a lot that hasn't been fully consumed
            available_lots = [lot_id for lot_id in lot_ids 
                             if lot_id not in lot_consumption or 
                             (lot_id in lot_quantities and lot_consumption[lot_id] < lot_quantities[lot_id])]
            
            if available_lots:
                lot_id = random.choice(available_lots)
            else:
                # If all lots are consumed, just pick a random one
                lot_id = random.choice(lot_ids)
        else:
            lot_id = f"LOT-{uuid.uuid4().hex[:8].upper()}"
            
        data["lot_id"].append(lot_id)
        
        # Select batch
        batch_id = random.choice(batch_ids)
        data["batch_id"].append(batch_id)
        
        # Set work order based on batch
        if batch_id in batch_to_work_order:
            data["work_order_id"].append(batch_to_work_order[batch_id])
        else:
            data["work_order_id"].append(random.choice(work_order_ids))
        
        # Generate timestamp (within last year)
        days_ago = random.randint(1, 365)
        hours_ago = random.randint(0, 23)
        minutes_ago = random.randint(0, 59)
        timestamp = datetime.now() - timedelta(days=days_ago, hours=hours_ago, minutes=minutes_ago)
        data["timestamp"].append(timestamp.strftime("%Y-%m-%d %H:%M:%S"))
        
        # Set quantity
        if lot_id in lot_quantities and lot_quantities[lot_id] > 0:
            # Use a portion of the available lot quantity
            max_available = lot_quantities[lot_id] - lot_consumption.get(lot_id, 0)
            if max_available <= 0:
                # This shouldn't happen with our filtering, but just in case
                quantity = random.uniform(0.1, 10)
            else:
                # Use between 10% and 50% of the available quantity
                quantity = random.uniform(0.1 * max_available, 0.5 * max_available)
        else:
            # No quantity info available, generate a reasonable value
            quantity = random.uniform(1, 100)
            
        data["quantity"].append(round(quantity, 2))
        
        # Update lot consumption tracking
        if lot_id in lot_consumption:
            lot_consumption[lot_id] += quantity
        else:
            lot_consumption[lot_id] = quantity
        
        # Set unit
        if lot_id in lot_units:
            data["unit"].append(lot_units[lot_id])
        else:
            data["unit"].append(random.choice(["kg", "L", "g", "unit", "piece"]))
        
        # Set equipment
        data["equipment_id"].append(random.choice(equipment_ids))
        
        # Set process step
        data["step_id"].append(random.choice(step_ids))
        
        # Set operator
        data["operator_id"].append(random.choice(operator_ids))
        
        # Set planned consumption and variance
        # About 80% of consumption records have planned values
        if random.random() < 0.8:
            # Planned consumption is usually close to actual but can vary
            variation = random.uniform(0.8, 1.2)  # -20% to +20%
            planned = quantity / variation
            data["planned_consumption"].append(round(planned, 2))
            
            # Calculate variance (actual - planned)
            variance = quantity - planned
            data["consumption_variance"].append(round(variance, 2))
        else:
            data["planned_consumption"].append("")
            data["consumption_variance"].append("")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} material consumption records.")
    print(f"Data saved to {output_file}")
    
    return df

#-----------------------------------------------------------------------------------
# Display Statistics Functions
#-----------------------------------------------------------------------------------

def display_costs_statistics(costs_df, work_orders_df=None, products_df=None, equipment_df=None):
    """
    Display basic statistics about the generated costs data
    
    Parameters:
    - costs_df: DataFrame containing costs data
    - work_orders_df: DataFrame containing work orders data (optional)
    - products_df: DataFrame containing products data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    """
    if costs_df is None or len(costs_df) == 0:
        print("No costs data to analyze.")
        return
    
    print("\nCosts Statistics:")
    print(f"Total cost records: {len(costs_df)}")
    
    # Cost type distribution
    print("\nCost Type Distribution:")
    type_counts = costs_df['cost_type'].value_counts()
    for cost_type, count in type_counts.items():
        print(f"  {cost_type}: {count} ({count/len(costs_df)*100:.1f}%)")
    
    # Convert amount to numeric
    costs_df['amount_numeric'] = pd.to_numeric(costs_df['amount'], errors='coerce')
    
    # Amount statistics
    print("\nCost Amount Statistics:")
    print(f"  Total cost amount: ${costs_df['amount_numeric'].sum():,.2f}")
    print(f"  Average cost amount: ${costs_df['amount_numeric'].mean():,.2f}")
    print(f"  Minimum cost amount: ${costs_df['amount_numeric'].min():,.2f}")
    print(f"  Maximum cost amount: ${costs_df['amount_numeric'].max():,.2f}")
    
    # Cost category distribution (top 10)
    print("\nCost Category Distribution (top 10):")
    category_counts = costs_df['cost_category'].value_counts().head(10)
    for category, count in category_counts.items():
        print(f"  {category}: {count} ({count/len(costs_df)*100:.1f}%)")
    
    # Cost center distribution
    print("\nCost Center Distribution:")
    center_counts = costs_df['cost_center'].value_counts()
    for center, count in center_counts.items():
        print(f"  {center}: {count} ({count/len(costs_df)*100:.1f}%)")
    
    # Currency distribution
    print("\nCurrency Distribution:")
    currency_counts = costs_df['currency'].value_counts()
    for currency, count in currency_counts.items():
        print(f"  {currency}: {count} ({count/len(costs_df)*100:.1f}%)")
    
    # Association statistics
    has_wo = costs_df['work_order_id'].apply(lambda x: x != "").sum()
    has_product = costs_df['product_id'].apply(lambda x: x != "").sum()
    has_equipment = costs_df['equipment_id'].apply(lambda x: x != "").sum()
    has_batch = costs_df['batch_id'].apply(lambda x: x != "").sum()
    
    print("\nAssociation Statistics:")
    print(f"  Costs with work order: {has_wo} ({has_wo/len(costs_df)*100:.1f}%)")
    print(f"  Costs with product: {has_product} ({has_product/len(costs_df)*100:.1f}%)")
    print(f"  Costs with equipment: {has_equipment} ({has_equipment/len(costs_df)*100:.1f}%)")
    print(f"  Costs with batch: {has_batch} ({has_batch/len(costs_df)*100:.1f}%)")
    
    # Variance analysis
    costs_df['planned_cost_numeric'] = pd.to_numeric(costs_df['planned_cost'], errors='coerce')
    costs_df['variance_numeric'] = pd.to_numeric(costs_df['variance'], errors='coerce')
    
    has_planned = costs_df['planned_cost'].apply(lambda x: x != "").sum()
    print(f"\nCosts with planned values: {has_planned} ({has_planned/len(costs_df)*100:.1f}%)")
    
    if has_planned > 0:
        # Calculate average variance percentage
        costs_df['variance_percent'] = costs_df.apply(
            lambda row: row['variance_numeric'] / row['planned_cost_numeric'] * 100 if pd.notna(row['variance_numeric']) and pd.notna(row['planned_cost_numeric']) and row['planned_cost_numeric'] != 0 else None,
            axis=1
        )
        
        print("\nVariance Statistics:")
        print(f"  Average variance: ${costs_df['variance_numeric'].mean():,.2f}")
        print(f"  Average variance percentage: {costs_df['variance_percent'].mean():.1f}%")
        
        # Count over/under budget
        over_budget = costs_df[costs_df['variance_numeric'] > 0]
        under_budget = costs_df[costs_df['variance_numeric'] < 0]
        
        print(f"  Costs over budget: {len(over_budget)} ({len(over_budget)/has_planned*100:.1f}% of planned costs)")
        print(f"  Costs under budget: {len(under_budget)} ({len(under_budget)/has_planned*100:.1f}% of planned costs)")
    
    # Time-based analysis
    costs_df['timestamp'] = pd.to_datetime(costs_df['timestamp'])
    
    print("\nCost Timeline:")
    print(f"  First cost recorded: {costs_df['timestamp'].min()}")
    print(f"  Last cost recorded: {costs_df['timestamp'].max()}")
    
    # Costs by month
    costs_df['month'] = costs_df['timestamp'].dt.to_period('M')
    monthly_costs = costs_df.groupby('month')['amount_numeric'].sum()
    
    print("\nCosts by Month (latest 6 months):")
    for month, amount in monthly_costs.tail(6).items():
        print(f"  {month}: ${amount:,.2f}")
    
    # Cost breakdown by association
    if has_wo > 0 and work_orders_df is not None:
        print("\nTop 5 Work Orders by Cost:")
        wo_costs = costs_df.groupby('work_order_id')['amount_numeric'].sum().sort_values(ascending=False)
        for wo_id, amount in wo_costs.head(5).items():
            if wo_id != "":
                print(f"  {wo_id}: ${amount:,.2f}")
    
    if has_product > 0 and products_df is not None:
        print("\nTop 5 Products by Cost:")
        # Create a mapping of product_id to product_name
        product_names = dict(zip(products_df['product_id'], products_df['product_name']))
        
        product_costs = costs_df.groupby('product_id')['amount_numeric'].sum().sort_values(ascending=False)
        for product_id, amount in product_costs.head(5).items():
            if product_id != "":
                product_name = product_names.get(product_id, product_id)
                print(f"  {product_name}: ${amount:,.2f}")
    
    if has_equipment > 0 and equipment_df is not None:
        print("\nTop 5 Equipment by Cost:")
        # Create a mapping of equipment_id to equipment_name
        equipment_names = dict(zip(equipment_df['equipment_id'], equipment_df['equipment_name']))
        
        equipment_costs = costs_df.groupby('equipment_id')['amount_numeric'].sum().sort_values(ascending=False)
        for equipment_id, amount in equipment_costs.head(5).items():
            if equipment_id != "":
                equipment_name = equipment_names.get(equipment_id, equipment_id)
                print(f"  {equipment_name}: ${amount:,.2f}")

def display_material_lots_statistics(material_lots_df, materials_df=None, suppliers_df=None, storage_locations_df=None):
    """
    Display basic statistics about the generated material lots data
    
    Parameters:
    - material_lots_df: DataFrame containing material lots data
    - materials_df: DataFrame containing materials data (optional)
    - suppliers_df: DataFrame containing suppliers data (optional)
    - storage_locations_df: DataFrame containing storage locations data (optional)
    """
    if material_lots_df is None or len(material_lots_df) == 0:
        print("No material lots data to analyze.")
        return
    
    print("\nMaterial Lots Statistics:")
    print(f"Total material lots: {len(material_lots_df)}")
    
    # Status distribution
    print("\nLot Status Distribution:")
    status_counts = material_lots_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(material_lots_df)*100:.1f}%)")
    
    # Quality status distribution
    print("\nQuality Status Distribution:")
    quality_counts = material_lots_df['quality_status'].value_counts()
    for status, count in quality_counts.items():
        print(f"  {status}: {count} ({count/len(material_lots_df)*100:.1f}%)")
    
    # Convert quantity to numeric
    material_lots_df['quantity_numeric'] = pd.to_numeric(material_lots_df['lot_quantity'], errors='coerce')
    
    # Quantity statistics by unit
    print("\nQuantity Statistics by Unit:")
    for unit in material_lots_df['quantity_unit'].unique():
        unit_lots = material_lots_df[material_lots_df['quantity_unit'] == unit]
        print(f"  {unit}:")
        print(f"    Count: {len(unit_lots)} lots")
        print(f"    Total quantity: {unit_lots['quantity_numeric'].sum():,.2f} {unit}")
        print(f"    Average quantity: {unit_lots['quantity_numeric'].mean():,.2f} {unit}")
    
    # Material distribution
    material_counts = material_lots_df.groupby('material_id').size().sort_values(ascending=False)
    
    print("\nTop 10 Materials by Lot Count:")
    if materials_df is not None:
        # Create a mapping of material_id to material_name
        material_names = dict(zip(materials_df['material_id'], materials_df['material_name']))
        
        for material_id, count in material_counts.head(10).items():
            material_name = material_names.get(material_id, material_id)
            print(f"  {material_name}: {count} lots ({count/len(material_lots_df)*100:.1f}%)")
    else:
        for material_id, count in material_counts.head(10).items():
            print(f"  {material_id}: {count} lots ({count/len(material_lots_df)*100:.1f}%)")
    
    # Supplier analysis
    has_supplier = material_lots_df['supplier_id'].apply(lambda x: x != "").sum()
    print(f"\nLots with supplier: {has_supplier} ({has_supplier/len(material_lots_df)*100:.1f}%)")
    
    if has_supplier > 0 and suppliers_df is not None:
        # Create a mapping of supplier_id to supplier_name
        supplier_names = dict(zip(suppliers_df['supplier_id'], suppliers_df['supplier_name']))
        
        supplier_counts = material_lots_df[material_lots_df['supplier_id'] != ""].groupby('supplier_id').size().sort_values(ascending=False)
        
        print("\nTop 5 Suppliers by Lot Count:")
        for supplier_id, count in supplier_counts.head(5).items():
            supplier_name = supplier_names.get(supplier_id, supplier_id)
            print(f"  {supplier_name}: {count} lots ({count/has_supplier*100:.1f}% of supplied lots)")
    
    # Storage location analysis
    location_counts = material_lots_df.groupby('storage_location_id').size().sort_values(ascending=False)
    
    print("\nTop 5 Storage Locations by Lot Count:")
    if storage_locations_df is not None:
        # Create a mapping of location_id to location_name
        location_names = dict(zip(storage_locations_df['location_id'], storage_locations_df['location_name']))
        
        for location_id, count in location_counts.head(5).items():
            location_name = location_names.get(location_id, location_id)
            print(f"  {location_name}: {count} lots ({count/len(material_lots_df)*100:.1f}%)")
    else:
        for location_id, count in location_counts.head(5).items():
            print(f"  {location_id}: {count} lots ({count/len(material_lots_df)*100:.1f}%)")
    
    # Parent-child relationship analysis
    has_parent = material_lots_df['parent_lot_id'].apply(lambda x: x != "").sum()
    print(f"\nLots with parent (split lots): {has_parent} ({has_parent/len(material_lots_df)*100:.1f}%)")
    
    # Cost analysis
    material_lots_df['cost_numeric'] = pd.to_numeric(material_lots_df['cost_per_unit'], errors='coerce')
    material_lots_df['total_cost'] = material_lots_df['quantity_numeric'] * material_lots_df['cost_numeric']
    
    print("\nCost Statistics:")
    print(f"  Average cost per unit: ${material_lots_df['cost_numeric'].mean():,.2f}")
    print(f"  Total inventory value: ${material_lots_df['total_cost'].sum():,.2f}")
    
    # Time-based analysis
    material_lots_df['creation_date'] = pd.to_datetime(material_lots_df['creation_date'])
    material_lots_df['expiration_date'] = pd.to_datetime(material_lots_df['expiration_date'])
    
    print("\nTime-based Analysis:")
    print(f"  Oldest lot created: {material_lots_df['creation_date'].min()}")
    print(f"  Newest lot created: {material_lots_df['creation_date'].max()}")
    
    # Calculate shelf life
    material_lots_df['shelf_life_days'] = (material_lots_df['expiration_date'] - material_lots_df['creation_date']).dt.days
    
    print(f"  Average shelf life: {material_lots_df['shelf_life_days'].mean():.1f} days ({material_lots_df['shelf_life_days'].mean()/365:.1f} years)")
    print(f"  Minimum shelf life: {material_lots_df['shelf_life_days'].min()} days")
    print(f"  Maximum shelf life: {material_lots_df['shelf_life_days'].max()} days ({material_lots_df['shelf_life_days'].max()/365:.1f} years)")
    
    # Expiration analysis
    current_date = datetime.now().date()
    expired_lots = material_lots_df[material_lots_df['expiration_date'].dt.date < current_date]
    expiring_soon = material_lots_df[(material_lots_df['expiration_date'].dt.date >= current_date) & 
                                   (material_lots_df['expiration_date'].dt.date <= current_date + timedelta(days=90))]
    
    print(f"\nExpired lots: {len(expired_lots)} ({len(expired_lots)/len(material_lots_df)*100:.1f}%)")
    print(f"Lots expiring within 90 days: {len(expiring_soon)} ({len(expiring_soon)/len(material_lots_df)*100:.1f}%)")

def display_material_consumption_statistics(consumption_df, material_lots_df=None, batches_df=None, equipment_df=None):
    """
    Display basic statistics about the generated material consumption data
    
    Parameters:
    - consumption_df: DataFrame containing material consumption data
    - material_lots_df: DataFrame containing material lots data (optional)
    - batches_df: DataFrame containing batches data (optional)
    - equipment_df: DataFrame containing equipment data (optional)
    """
    if consumption_df is None or len(consumption_df) == 0:
        print("No material consumption data to analyze.")
        return
    
    print("\nMaterial Consumption Statistics:")
    print(f"Total consumption records: {len(consumption_df)}")
    
    # Convert quantity to numeric
    consumption_df['quantity_numeric'] = pd.to_numeric(consumption_df['quantity'], errors='coerce')
    
    # Quantity statistics by unit
    print("\nQuantity Statistics by Unit:")
    for unit in consumption_df['unit'].unique():
        unit_consumptions = consumption_df[consumption_df['unit'] == unit]
        print(f"  {unit}:")
        print(f"    Count: {len(unit_consumptions)} records")
        print(f"    Total consumed: {unit_consumptions['quantity_numeric'].sum():,.2f} {unit}")
        print(f"    Average consumption: {unit_consumptions['quantity_numeric'].mean():,.2f} {unit}")
    
    # Lot distribution
    lot_counts = consumption_df.groupby('lot_id').size().sort_values(ascending=False)
    
    print("\nTop 10 Lots by Consumption Frequency:")
    if material_lots_df is not None:
        # Create a mapping of lot_id to material_id
        lot_to_material = dict(zip(material_lots_df['lot_id'], material_lots_df['material_id']))
        
        for lot_id, count in lot_counts.head(10).items():
            material_id = lot_to_material.get(lot_id, "Unknown")
            print(f"  {lot_id} (Material: {material_id}): {count} records ({count/len(consumption_df)*100:.1f}%)")
    else:
        for lot_id, count in lot_counts.head(10).items():
            print(f"  {lot_id}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    
    # Batch distribution
    batch_counts = consumption_df.groupby('batch_id').size().sort_values(ascending=False)
    
    print("\nTop 10 Batches by Consumption Records:")
    if batches_df is not None:
        # Use batch_id to look up product_id if available
        batch_to_product = {}
        if 'batch_id' in batches_df.columns and 'product_id' in batches_df.columns:
            batch_to_product = dict(zip(batches_df['batch_id'], batches_df['product_id']))
        
        for batch_id, count in batch_counts.head(10).items():
            product_id = batch_to_product.get(batch_id, "")
            if product_id:
                print(f"  {batch_id} (Product: {product_id}): {count} records ({count/len(consumption_df)*100:.1f}%)")
            else:
                print(f"  {batch_id}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    else:
        for batch_id, count in batch_counts.head(10).items():
            print(f"  {batch_id}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    
    # Equipment distribution
    equipment_counts = consumption_df.groupby('equipment_id').size().sort_values(ascending=False)
    
    print("\nTop 5 Equipment by Consumption:")
    if equipment_df is not None:
        # Create a mapping of equipment_id to equipment_name
        equipment_names = dict(zip(equipment_df['equipment_id'], equipment_df['equipment_name']))
        
        for equipment_id, count in equipment_counts.head(5).items():
            equipment_name = equipment_names.get(equipment_id, equipment_id)
            print(f"  {equipment_name}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    else:
        for equipment_id, count in equipment_counts.head(5).items():
            print(f"  {equipment_id}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    
    # Planned vs. actual analysis
    consumption_df['planned_numeric'] = pd.to_numeric(consumption_df['planned_consumption'], errors='coerce')
    consumption_df['variance_numeric'] = pd.to_numeric(consumption_df['consumption_variance'], errors='coerce')
    
    has_planned = consumption_df['planned_consumption'].apply(lambda x: x != "").sum()
    print(f"\nRecords with planned consumption: {has_planned} ({has_planned/len(consumption_df)*100:.1f}%)")
    
    if has_planned > 0:
        # Calculate average variance percentage
        consumption_df['variance_percent'] = consumption_df.apply(
            lambda row: row['variance_numeric'] / row['planned_numeric'] * 100 if pd.notna(row['variance_numeric']) and pd.notna(row['planned_numeric']) and row['planned_numeric'] != 0 else None,
            axis=1
        )
        
        print("\nConsumption Variance Statistics:")
        print(f"  Average variance: {consumption_df['variance_numeric'].mean():,.2f}")
        print(f"  Average variance percentage: {consumption_df['variance_percent'].mean():.1f}%")
        
        # Count over/under consumption
        over_consumption = consumption_df[consumption_df['variance_numeric'] > 0]
        under_consumption = consumption_df[consumption_df['variance_numeric'] < 0]
        
        print(f"  Over-consumption records: {len(over_consumption)} ({len(over_consumption)/has_planned*100:.1f}% of planned records)")
        print(f"  Under-consumption records: {len(under_consumption)} ({len(under_consumption)/has_planned*100:.1f}% of planned records)")
    
    # Time-based analysis
    consumption_df['timestamp'] = pd.to_datetime(consumption_df['timestamp'])
    
    print("\nConsumption Timeline:")
    print(f"  First consumption: {consumption_df['timestamp'].min()}")
    print(f"  Last consumption: {consumption_df['timestamp'].max()}")
    
    # Consumption by day of week
    consumption_df['day_of_week'] = consumption_df['timestamp'].dt.day_name()
    day_counts = consumption_df.groupby('day_of_week').size()
    
    # Ensure proper day order
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_counts = day_counts.reindex(days_order)
    
    print("\nConsumption by Day of Week:")
    for day, count in day_counts.items():
        print(f"  {day}: {count} records ({count/len(consumption_df)*100:.1f}%)")
    
    # Consumption by hour of day
    consumption_df['hour'] = consumption_df['timestamp'].dt.hour
    hour_counts = consumption_df.groupby('hour').size()
    
    print("\nConsumption by Hour of Day (Top 5):")
    for hour, count in hour_counts.sort_values(ascending=False).head(5).items():
        print(f"  {hour}:00 - {hour}:59: {count} records ({count/len(consumption_df)*100:.1f}%)")

#-----------------------------------------------------------------------------------
# Main Function to Run All Level 4 Data Generation
#-----------------------------------------------------------------------------------

def generate_all_level4_data():
    """
    Generate all remaining ISA-95 Level 4 tables in a single function
    """
    # Create data directory if it doesn't exist
    os.makedirs("data", exist_ok=True)
    
    # Load existing data if available
    try:
        materials_df = pd.read_csv("data/materials.csv")
        print("Loaded existing materials data.")
    except FileNotFoundError:
        materials_df = None
        print("Materials data not found.")
    
    try:
        products_df = pd.read_csv("data/products.csv")
        print("Loaded existing products data.")
    except FileNotFoundError:
        products_df = None
        print("Products data not found.")
        
    try:
        facilities_df = pd.read_csv("data/facilities.csv")
        print("Loaded existing facilities data.")
    except FileNotFoundError:
        facilities_df = None
        print("Facilities data not found.")
    
    try:
        storage_locations_df = pd.read_csv("data/storage_locations.csv")
        print("Loaded existing storage locations data.")
    except FileNotFoundError:
        storage_locations_df = None
        print("Storage locations data not found.")
    
    try:
        equipment_df = pd.read_csv("data/equipment.csv")
        print("Loaded existing equipment data.")
    except FileNotFoundError:
        equipment_df = None
        print("Equipment data not found.")
    
    try:
        work_orders_df = pd.read_csv("data/work_orders.csv")
        print("Loaded existing work orders data.")
    except FileNotFoundError:
        work_orders_df = None
        print("Work orders data not found.")
    
    try:
        batches_df = pd.read_csv("data/batches.csv")
        print("Loaded existing batches data.")
    except FileNotFoundError:
        batches_df = None
        print("Batches data not found.")
        
    try:
        suppliers_df = pd.read_csv("data/suppliers.csv")
        print("Loaded existing suppliers data.")
    except FileNotFoundError:
        suppliers_df = None
        print("Suppliers data not found.")
        
    try:
        personnel_df = pd.read_csv("data/personnel.csv")
        print("Loaded existing personnel data.")
    except FileNotFoundError:
        personnel_df = None
        print("Personnel data not found.")
    
    # 1. Generate Material Lots data
    print("\n1. Generating Material Lots data...")
    material_lots_df = generate_material_lots_data(
        materials_df=materials_df,
        suppliers_df=suppliers_df,
        storage_locations_df=storage_locations_df,
        num_lots=200,
        output_file="data/material_lots.csv"
    )
    
    # Display statistics
    if material_lots_df is not None:
        display_material_lots_statistics(material_lots_df, materials_df, suppliers_df, storage_locations_df)
        
        # Display sample data
        print("\nSample material lots data (first 5 records):")
        print(material_lots_df.head(5))
    
    # 2. Generate Material Consumption data
    print("\n2. Generating Material Consumption data...")
    material_consumption_df = generate_material_consumption_data(
        material_lots_df=material_lots_df,
        batches_df=batches_df,
        work_orders_df=work_orders_df,
        equipment_df=equipment_df,
        personnel_df=personnel_df,
        num_records=300,
        output_file="data/material_consumption.csv"
    )
    
    # Display statistics
    if material_consumption_df is not None:
        display_material_consumption_statistics(material_consumption_df, material_lots_df, batches_df, equipment_df)
        
        # Display sample data
        print("\nSample material consumption data (first 5 records):")
        print(material_consumption_df.head(5))
    
    # 3. Generate Costs data
    print("\n3. Generating Costs data...")
    costs_df = generate_costs_data(
        work_orders_df=work_orders_df,
        products_df=products_df,
        equipment_df=equipment_df,
        batches_df=batches_df,
        num_costs=500,
        output_file="data/costs.csv"
    )
    
    # Display statistics
    if costs_df is not None:
        display_costs_statistics(costs_df, work_orders_df, products_df, equipment_df)
        
        # Display sample data
        print("\nSample costs data (first 5 records):")
        print(costs_df.head(5))
    
    print("\nAll ISA-95 Level 4 data generation complete!")

if __name__ == "__main__":
    generate_all_level4_data()



Loaded existing materials data.
Loaded existing products data.
Loaded existing facilities data.
Loaded existing storage locations data.
Loaded existing equipment data.
Loaded existing work orders data.
Loaded existing batches data.
Loaded existing suppliers data.
Personnel data not found.

1. Generating Material Lots data...
Successfully generated 200 material lot records.
Data saved to data/material_lots.csv

Material Lots Statistics:
Total material lots: 200

Lot Status Distribution:
  Available: 135 (67.5%)
  In Use: 22 (11.0%)
  Reserved: 19 (9.5%)
  Quarantined: 9 (4.5%)
  Consumed: 6 (3.0%)
  Rejected: 5 (2.5%)
  On Hold: 4 (2.0%)

Quality Status Distribution:
  Released: 137 (68.5%)
  Under Test: 19 (9.5%)
  Pending Review: 16 (8.0%)
  Rejected: 15 (7.5%)
  Approved: 13 (6.5%)

Quantity Statistics by Unit:
  unit:
    Count: 21 lots
    Total quantity: 20,904.21 unit
    Average quantity: 995.44 unit
  m³:
    Count: 15 lots
    Total quantity: 15,965.64 m³
    Average quantity: