Level 2: Process Monitoring & Supervision Tables

Equipment

In [8]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_level1_data(sensors_file="data/sensors_data.csv", 
                     actuators_file="data/actuators_data.csv",
                     control_loops_file="data/control_loops.csv"):
    """
    Collect the equipment IDs referenced by previously generated Level 1 data.

    Each file is optional: a missing or empty file is reported with a note
    and skipped, and a file without an 'equipment_id' column contributes
    nothing.

    Parameters:
    - sensors_file: CSV file containing sensor data
    - actuators_file: CSV file containing actuator data
    - control_loops_file: CSV file containing control loops data

    Returns:
    - Set of unique, non-null equipment IDs found across the Level 1 files
    """
    equipment_ids = set()

    for source_file in (sensors_file, actuators_file, control_loops_file):
        try:
            level1_df = pd.read_csv(source_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            print(f"Note: Could not load equipment IDs from {source_file}")
            continue

        if 'equipment_id' in level1_df.columns:
            # Blank cells are read back as NaN; they are not equipment IDs,
            # so drop them before collecting the unique values.
            equipment_ids.update(level1_df['equipment_id'].dropna().unique())

    return equipment_ids

def generate_equipment_data(existing_equipment_ids=None, num_equipment=100, output_file="data/equipment.csv"):
    """
    Generate synthetic data for the Equipment table from ISA-95 Level 2.

    Existing Level 1 equipment IDs are reused first (null and duplicate IDs
    are discarded), then random "EQ-XXXXXXXX" IDs are added until
    num_equipment records exist. The result is written to output_file and
    also returned.

    Parameters:
    - existing_equipment_ids: Set of equipment IDs from Level 1 to maintain relationships
    - num_equipment: Total number of equipment records to generate
    - output_file: CSV file to save the data

    Returns:
    - DataFrame containing the generated equipment data
    """
    # Define possible values for categorical fields
    equipment_types = [
        "Reactor", "Mixer", "Pump", "Compressor", "Heat Exchanger", "Distillation Column",
        "Filter", "Dryer", "Tank", "Furnace", "Conveyor", "Mill", "Centrifuge", "Evaporator",
        "Crystallizer", "Extruder", "Boiler", "Blender", "Separator", "Packaging Machine",
        "CNC Machine", "Robot", "Injection Molder", "Press", "Welder", "Assembly Station",
        "Filling Machine", "Labeling Machine", "Testing Station", "Sterilizer"
    ]

    # Subtypes give each equipment type a more specific, descriptive name
    equipment_subtypes = {
        "Reactor": ["Batch Reactor", "Continuous Stirred Tank Reactor", "Plug Flow Reactor", "Fluidized Bed Reactor"],
        "Mixer": ["Ribbon Blender", "Paddle Mixer", "High Shear Mixer", "Tumble Mixer", "V-Blender"],
        "Pump": ["Centrifugal Pump", "Positive Displacement Pump", "Diaphragm Pump", "Peristaltic Pump"],
        "Compressor": ["Reciprocating Compressor", "Rotary Screw Compressor", "Centrifugal Compressor"],
        "Heat Exchanger": ["Shell and Tube", "Plate", "Spiral", "Air Cooled"],
        "Distillation Column": ["Batch Distillation", "Continuous Distillation", "Vacuum Distillation"],
        "Filter": ["Bag Filter", "Cartridge Filter", "Plate and Frame Filter", "Rotary Drum Filter"],
        "Dryer": ["Spray Dryer", "Rotary Dryer", "Fluidized Bed Dryer", "Vacuum Dryer"],
        "Tank": ["Storage Tank", "Mixing Tank", "Buffer Tank", "Pressure Vessel"],
        "Furnace": ["Electric Furnace", "Gas Furnace", "Induction Furnace"],
        "Conveyor": ["Belt Conveyor", "Roller Conveyor", "Screw Conveyor", "Pneumatic Conveyor"],
        "Mill": ["Ball Mill", "Hammer Mill", "Roller Mill", "Colloid Mill"],
        "Centrifuge": ["Disc Centrifuge", "Basket Centrifuge", "Decanter Centrifuge"],
        "Evaporator": ["Falling Film Evaporator", "Rising Film Evaporator", "Multiple Effect Evaporator"],
        "Crystallizer": ["Batch Crystallizer", "Continuous Crystallizer", "Cooling Crystallizer"],
        "Extruder": ["Single Screw Extruder", "Twin Screw Extruder", "Multi-Screw Extruder"],
        "Boiler": ["Fire Tube Boiler", "Water Tube Boiler", "Electric Boiler"],
        "Blender": ["V-Blender", "Double Cone Blender", "Ribbon Blender", "Paddle Blender"],
        "Separator": ["Cyclone Separator", "Magnetic Separator", "Electrostatic Separator"],
        "Packaging Machine": ["Form-Fill-Seal Machine", "Case Packer", "Wrapping Machine", "Cartoner"],
        "CNC Machine": ["CNC Mill", "CNC Lathe", "CNC Router", "CNC Plasma Cutter"],
        "Robot": ["Articulated Robot", "SCARA Robot", "Delta Robot", "Collaborative Robot"],
        "Injection Molder": ["Hydraulic Injection Molder", "Electric Injection Molder", "Hybrid Injection Molder"],
        "Press": ["Hydraulic Press", "Mechanical Press", "Pneumatic Press"],
        "Welder": ["Arc Welder", "MIG Welder", "TIG Welder", "Spot Welder"],
        "Assembly Station": ["Manual Assembly Station", "Semi-Automated Assembly Station", "Fully Automated Assembly Station"],
        "Filling Machine": ["Volumetric Filler", "Gravity Filler", "Piston Filler", "Vacuum Filler"],
        "Labeling Machine": ["Pressure Sensitive Labeler", "Cut and Stack Labeler", "Sleeve Labeler"],
        "Testing Station": ["Visual Inspection Station", "Dimensional Testing Station", "Functional Testing Station"],
        "Sterilizer": ["Steam Sterilizer", "Ethylene Oxide Sterilizer", "Radiation Sterilizer", "Chemical Sterilizer"]
    }

    manufacturers = [
        "Siemens", "ABB", "Emerson", "Honeywell", "Schneider Electric", "Yokogawa", 
        "Rockwell Automation", "GE", "Mitsubishi Electric", "Omron", "Endress+Hauser",
        "Festo", "SMC", "Bürkert", "Danfoss", "Alfa Laval", "Sulzer", "Andritz", 
        "Atlas Copco", "SPX Flow", "Grundfos", "ITT", "Flowserve", "KSB", "Weir",
        "Metso Outotec", "Thermo Fisher Scientific", "WIKA", "Eaton", "Phoenix Contact"
    ]

    statuses = ["Running", "Idle", "Maintenance", "Fault", "Standby", "Offline", "Startup", "Shutdown"]
    # One weight per entry in `statuses`, skewed towards Running and Idle
    status_weights = [0.6, 0.2, 0.05, 0.05, 0.05, 0.02, 0.02, 0.01]

    columns = [
        "equipment_id", "equipment_name", "equipment_type", "area_id",
        "manufacturer", "model_number", "serial_number", "installation_date",
        "last_maintenance_date", "next_maintenance_date", "equipment_status",
        "parent_equipment_id"
    ]

    # Generate process area IDs (to be used as foreign keys)
    # In a real system, these would come from the ProcessAreas table
    process_area_ids = [f"AREA-{uuid.uuid4().hex[:8].upper()}" for _ in range(10)]

    # Keep only usable Level 1 IDs: a null ID (e.g. NaN from a blank CSV cell)
    # or a repeated ID would otherwise become a row with a missing or
    # duplicated primary key. dict.fromkeys de-duplicates in input order.
    if existing_equipment_ids is not None:
        existing_equipment = [
            eq_id for eq_id in dict.fromkeys(existing_equipment_ids)
            if not pd.isna(eq_id)
        ]
    else:
        existing_equipment = []

    if existing_equipment:
        # Only top up to the requested total
        num_new_equipment = max(0, num_equipment - len(existing_equipment))
        print(f"Found {len(existing_equipment)} existing equipment IDs from Level 1.")
        print(f"Generating {num_new_equipment} additional equipment records.")
    else:
        num_new_equipment = num_equipment
        print(f"No existing equipment IDs found. Generating {num_new_equipment} new equipment records.")

    new_equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_new_equipment)]

    # Existing IDs come first; truncate in case Level 1 alone exceeds the request
    all_equipment_ids = (existing_equipment + new_equipment_ids)[:num_equipment]

    # About 20% of the equipment may act as a parent. Parents never receive a
    # parent themselves, so the hierarchy is at most two levels deep and
    # cannot contain cycles.
    potential_parents = random.sample(all_equipment_ids, int(len(all_equipment_ids) * 0.2))
    parent_lookup = set(potential_parents)  # O(1) membership test inside the loop

    now = datetime.now()
    records = []

    for eq_id in all_equipment_ids:
        eq_type = random.choice(equipment_types)

        # Name the equipment after a subtype when one is defined for its type
        subtypes = equipment_subtypes.get(eq_type)
        name_base = random.choice(subtypes) if subtypes else eq_type
        eq_name = f"{name_base} {random.randint(100, 999)}"

        area_id = random.choice(process_area_ids)
        manufacturer = random.choice(manufacturers)

        # Model number: manufacturer prefix + type prefix + 4 random digits
        model_number = f"{manufacturer[:3].upper()}-{eq_type[:2].upper()}{random.randint(1000, 9999)}"
        serial_number = f"SN{random.randint(100000, 999999)}"

        # Installed 1-10 years ago, last serviced within the past 180 days,
        # next service due within the coming 1-180 days
        install_date = now - timedelta(days=random.randint(365, 3650))
        last_maint_date = now - timedelta(days=random.randint(0, 180))
        next_maint_date = now + timedelta(days=random.randint(1, 180))

        status = random.choices(statuses, weights=status_weights)[0]

        # Roughly 30% of the non-parent equipment is attached to a parent
        if eq_id not in parent_lookup and random.random() < 0.3 and potential_parents:
            parent_id = random.choice(potential_parents)
        else:
            parent_id = ""

        records.append({
            "equipment_id": eq_id,
            "equipment_name": eq_name,
            "equipment_type": eq_type,
            "area_id": area_id,
            "manufacturer": manufacturer,
            "model_number": model_number,
            "serial_number": serial_number,
            "installation_date": install_date.strftime("%Y-%m-%d"),
            "last_maintenance_date": last_maint_date.strftime("%Y-%m-%d"),
            "next_maintenance_date": next_maint_date.strftime("%Y-%m-%d"),
            "equipment_status": status,
            "parent_equipment_id": parent_id
        })

    # Explicit column list keeps the schema stable even when no rows exist
    df = pd.DataFrame(records, columns=columns)

    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Successfully generated {len(df)} equipment records.")
    print(f"Data saved to {output_file}")

    return df

def display_statistics(df):
    """
    Display basic statistics about the generated equipment data

    Prints the record count, the distribution of equipment types (top 10),
    statuses, process areas and manufacturers (top 5), and how many records
    have a parent. A parent ID that is either an empty string or a missing
    value (as produced when the CSV is read back) counts as "no parent".

    Parameters:
    - df: DataFrame containing equipment data
    """
    if df is None or len(df) == 0:
        print("No data to analyze.")
        return

    total = len(df)

    def print_distribution(title, counts):
        # Print one "label: count (percent)" line per entry of a value_counts Series
        print(f"\n{title}")
        for label, count in counts.items():
            print(f"  {label}: {count} ({count/total*100:.1f}%)")

    print("\nEquipment Statistics:")
    print(f"Total equipment: {total}")

    print_distribution("Equipment Type Distribution:", df['equipment_type'].value_counts().head(10))
    print_distribution("Status Distribution:", df['equipment_status'].value_counts())
    print_distribution("Process Area Distribution:", df['area_id'].value_counts())
    print_distribution("Manufacturer Distribution:", df['manufacturer'].value_counts().head(5))

    print("\nHierarchy Statistics:")
    parent_ids = df['parent_equipment_id']
    # NaN != "" evaluates to True, so missing values must be excluded
    # explicitly or every row of a reloaded CSV would count as having a parent.
    parent_count = int((parent_ids.notna() & (parent_ids != "")).sum())
    top_level_count = total - parent_count
    print(f"  Equipment with parent: {parent_count} ({parent_count/total*100:.1f}%)")
    print(f"  Top-level equipment: {top_level_count} ({top_level_count/total*100:.1f}%)")

# Driver for the Equipment table: collect Level 1 equipment IDs, generate the
# equipment records around them, then print a preview and summary statistics.
if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load existing equipment IDs from Level 1 data for consistency
    # (files that are missing or empty are skipped with a note)
    existing_equipment_ids = load_level1_data()
    
    # Generate equipment data; the records are also written to the CSV file
    equipment_df = generate_equipment_data(
        existing_equipment_ids=existing_equipment_ids, 
        num_equipment=150,  # Generate 150 equipment records in total
        output_file="data/equipment.csv"
    )
    
    # Display sample data and statistics
    if equipment_df is not None:
        print("\nSample data (first 5 records):")
        print(equipment_df.head())
        
        # Display statistics
        display_statistics(equipment_df)

Found 20 existing equipment IDs from Level 1.
Generating 130 additional equipment records.
Successfully generated 150 equipment records.
Data saved to data/equipment.csv

Sample data (first 5 records):
  equipment_id               equipment_name     equipment_type        area_id  \
0  EQ-9A644928  Falling Film Evaporator 715         Evaporator  AREA-CF44DDA4   
1  EQ-8953C760              Case Packer 398  Packaging Machine  AREA-B64653E6   
2  EQ-9F6621AF             Vacuum Dryer 654              Dryer  AREA-B64653E6   
3  EQ-D56E7067  Continuous Crystallizer 637       Crystallizer  AREA-7FBCEF28   
4  EQ-422154D2             Paddle Mixer 177              Mixer  AREA-7FBCEF28   

  manufacturer model_number serial_number installation_date  \
0        Eaton   EAT-EV9056      SN516340        2023-10-15   
1     Yokogawa   YOK-PA6751      SN932707        2024-04-01   
2    Flowserve   FLO-DR5102      SN615451        2016-12-31   
3   Alfa Laval   ALF-CR6667      SN923969        2015-10-25

EquipmentStates

In [9]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data

    A missing or empty file is reported on stdout and yields None instead of
    raising, so callers can test the result before generating dependent data.

    Parameters:
    - equipment_file: CSV file containing equipment data

    Returns:
    - DataFrame containing the equipment data, or None if the file is
      missing or empty
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
    except pd.errors.EmptyDataError:
        # An empty file is as unusable as a missing one; handle it the same
        # way as the Level 1 loader does rather than letting it propagate.
        print(f"Error: Equipment data file {equipment_file} is empty.")

    print("Please run the equipment data generation script first.")
    return None

def _choose_next_state(current_state, possible_states, transitions):
    """Pick the state that follows current_state for one piece of equipment.

    Only transitions into states listed in possible_states are considered.
    When the current state has no transition table, or none of its targets is
    valid for this equipment, a state is drawn uniformly from possible_states.
    """
    candidates = {
        state: weight
        for state, weight in transitions.get(current_state, {}).items()
        if state in possible_states
    }
    if not candidates:
        return random.choice(possible_states)
    # random.choices accepts relative weights, so no normalisation is needed
    return random.choices(list(candidates), weights=list(candidates.values()), k=1)[0]


def generate_equipment_states(equipment_df, states_per_equipment=20, 
                            start_time=None, end_time=None, output_file="data/equipment_states.csv"):
    """
    Generate synthetic equipment state data based on the equipment table

    For every equipment row a chain of consecutive states is simulated,
    starting at start_time. Each state lasts a random time within the typical
    range for that state, and the following state is drawn from a weighted
    transition table restricted to the states valid for the equipment type.
    A chain stops after its target number of states or once end_time is
    reached (the last state is truncated at end_time). Records are streamed
    to output_file row by row.

    Parameters:
    - equipment_df: DataFrame containing equipment data
    - states_per_equipment: Average number of state transitions per equipment
    - start_time: Start time for state history (defaults to 7 days ago)
    - end_time: End time for state history (defaults to now)
    - output_file: CSV file to save the states data

    Returns:
    - DataFrame containing a sample of the generated states data (the first
      1000 rows read back from output_file), or None when there is no
      equipment data or the time range is empty or inverted
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None

    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=7)
    if end_time is None:
        end_time = datetime.now()

    # An empty or inverted window would yield zero or negative durations
    if start_time >= end_time:
        print("Error: start_time must be earlier than end_time.")
        return None

    # Create batch IDs and work order IDs to simulate relationships
    batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]

    # Create operator IDs
    operator_ids = [f"OP-{uuid.uuid4().hex[:6].upper()}" for _ in range(15)]

    # States in which an operator is very likely to be involved
    manual_states = ("Setup", "Maintenance", "Cleaning", "Emergency Stop")

    # Define possible equipment states and transitions
    equipment_states = {
        # General states for most equipment
        "general": [
            "Running", "Idle", "Setup", "Cleaning", "Maintenance", "Fault", 
            "Shutdown", "Standby", "Startup", "Emergency Stop"
        ],
        # Specific states for reactors
        "Reactor": [
            "Running", "Idle", "Charging", "Reaction", "Discharging", "Cleaning", 
            "Heating", "Cooling", "Maintenance", "Fault", "Standby"
        ],
        # Specific states for pumps
        "Pump": [
            "Running", "Idle", "Startup", "Shutdown", "Maintenance", "Fault", 
            "Standby", "Cavitation", "Overheating"
        ],
        # Specific states for mixers
        "Mixer": [
            "Running", "Idle", "Loading", "Mixing", "Unloading", "Cleaning", 
            "Maintenance", "Fault", "Standby"
        ],
        # Specific states for tanks
        "Tank": [
            "Filling", "Holding", "Emptying", "Idle", "Cleaning", "Maintenance", 
            "Fault", "Agitating", "Heating", "Cooling"
        ],
        # Specific states for packaging machines
        "Packaging Machine": [
            "Running", "Idle", "Setup", "Loading Material", "Fault", "Maintenance", 
            "Standby", "Adjusting", "Cleaning", "Warming Up"
        ],
        # Specific states for CNC machines
        "CNC Machine": [
            "Running", "Idle", "Setup", "Tool Change", "Program Loading", "Maintenance", 
            "Fault", "Homing", "Material Loading", "Finished"
        ]
    }

    # Define state transition probabilities from current state to next state
    # Higher probability for common transitions, lower for rare ones
    transition_probabilities = {
        "Running": {"Running": 0.7, "Idle": 0.1, "Fault": 0.05, "Maintenance": 0.05, "Cleaning": 0.05, "Shutdown": 0.05},
        "Idle": {"Running": 0.5, "Idle": 0.2, "Setup": 0.1, "Maintenance": 0.1, "Shutdown": 0.1},
        "Setup": {"Running": 0.8, "Idle": 0.1, "Fault": 0.1},
        "Cleaning": {"Idle": 0.5, "Running": 0.3, "Maintenance": 0.2},
        "Maintenance": {"Idle": 0.6, "Running": 0.3, "Fault": 0.1},
        "Fault": {"Maintenance": 0.5, "Idle": 0.3, "Shutdown": 0.2},
        "Shutdown": {"Startup": 0.5, "Idle": 0.5},
        "Standby": {"Running": 0.6, "Idle": 0.3, "Shutdown": 0.1},
        "Startup": {"Running": 0.8, "Fault": 0.1, "Idle": 0.1},
        "Emergency Stop": {"Maintenance": 0.6, "Idle": 0.3, "Shutdown": 0.1},
        # Reactor specific
        "Charging": {"Reaction": 0.8, "Fault": 0.1, "Idle": 0.1},
        "Reaction": {"Discharging": 0.7, "Fault": 0.1, "Cooling": 0.2},
        "Discharging": {"Cleaning": 0.6, "Idle": 0.4},
        "Heating": {"Reaction": 0.7, "Fault": 0.1, "Idle": 0.2},
        "Cooling": {"Discharging": 0.7, "Idle": 0.3},
        # Pump specific
        "Cavitation": {"Fault": 0.6, "Maintenance": 0.4},
        "Overheating": {"Shutdown": 0.5, "Maintenance": 0.5},
        # Mixer specific
        "Loading": {"Mixing": 0.9, "Fault": 0.1},
        "Mixing": {"Unloading": 0.8, "Fault": 0.1, "Idle": 0.1},
        "Unloading": {"Cleaning": 0.5, "Idle": 0.5},
        # Tank specific
        "Filling": {"Holding": 0.8, "Agitating": 0.1, "Fault": 0.1},
        "Holding": {"Emptying": 0.6, "Agitating": 0.2, "Heating": 0.1, "Cooling": 0.1},
        "Emptying": {"Cleaning": 0.5, "Idle": 0.5},
        "Agitating": {"Holding": 0.5, "Emptying": 0.3, "Fault": 0.2},
        # Packaging machine specific
        "Loading Material": {"Running": 0.8, "Fault": 0.2},
        "Adjusting": {"Running": 0.9, "Fault": 0.1},
        "Warming Up": {"Running": 0.9, "Fault": 0.1},
        # CNC machine specific
        "Tool Change": {"Running": 0.9, "Fault": 0.1},
        "Program Loading": {"Running": 0.8, "Homing": 0.2},
        "Homing": {"Running": 0.9, "Fault": 0.1},
        "Material Loading": {"Running": 0.9, "Idle": 0.1},
        "Finished": {"Idle": 0.8, "Material Loading": 0.2}
    }

    # Transitions that differ for a specific equipment type. "Heating" exists
    # for both reactors and tanks with different successors; a dict literal
    # cannot hold the key twice (the later entry silently wins), which left
    # reactors with "Fault" as the only reachable state after "Heating".
    transition_overrides = {
        "Tank": {
            "Heating": {"Holding": 0.8, "Fault": 0.2}
        }
    }

    # Define typical state durations in minutes
    state_durations = {
        "Running": (60, 480),        # 1-8 hours
        "Idle": (15, 120),           # 15 min - 2 hours
        "Setup": (20, 90),           # 20-90 min
        "Cleaning": (30, 120),       # 30 min - 2 hours
        "Maintenance": (60, 240),    # 1-4 hours
        "Fault": (15, 240),          # 15 min - 4 hours
        "Shutdown": (5, 30),         # 5-30 min
        "Standby": (10, 120),        # 10 min - 2 hours
        "Startup": (5, 30),          # 5-30 min
        "Emergency Stop": (5, 60),   # 5-60 min
        # Reactor specific
        "Charging": (15, 90),        # 15-90 min
        "Reaction": (60, 360),       # 1-6 hours
        "Discharging": (15, 60),     # 15-60 min
        "Heating": (15, 120),        # 15 min - 2 hours
        "Cooling": (30, 180),        # 30 min - 3 hours
        # Pump specific
        "Cavitation": (5, 30),       # 5-30 min
        "Overheating": (10, 45),     # 10-45 min
        # Mixer specific
        "Loading": (10, 45),         # 10-45 min
        "Mixing": (30, 240),         # 30 min - 4 hours
        "Unloading": (10, 45),       # 10-45 min
        # Tank specific
        "Filling": (20, 120),        # 20 min - 2 hours
        "Holding": (60, 720),        # 1-12 hours
        "Emptying": (20, 120),       # 20 min - 2 hours
        "Agitating": (30, 180),      # 30 min - 3 hours
        # Packaging machine specific
        "Loading Material": (5, 30), # 5-30 min
        "Adjusting": (10, 45),       # 10-45 min
        "Warming Up": (10, 30),      # 10-30 min
        # CNC machine specific
        "Tool Change": (2, 15),      # 2-15 min
        "Program Loading": (1, 5),   # 1-5 min
        "Homing": (1, 5),            # 1-5 min
        "Material Loading": (5, 20), # 5-20 min
        "Finished": (1, 10)          # 1-10 min
    }

    # Define transition reasons
    transition_reasons = {
        "Running": ["Production schedule", "Normal operation", "Process started", "Resumed after break"],
        "Idle": ["Production complete", "Waiting for materials", "Break time", "Shift change", "No orders"],
        "Setup": ["Product changeover", "New batch preparation", "Recipe change", "Tooling change"],
        "Cleaning": ["Scheduled cleaning", "Product changeover", "Contamination prevention", "End of batch"],
        "Maintenance": ["Scheduled maintenance", "Preventative service", "Component replacement", "Calibration"],
        "Fault": ["Error detected", "Component failure", "Safety interlock", "Process deviation", "Power issue"],
        "Shutdown": ["End of shift", "Planned downtime", "Weekend shutdown", "Holiday shutdown", "Energy saving"],
        "Standby": ["Waiting for upstream process", "Energy saving mode", "Temporary pause", "Break time"],
        "Startup": ["Beginning of shift", "Power restored", "After maintenance", "Morning startup"],
        "Emergency Stop": ["Safety button pressed", "Hazard detected", "Operator emergency", "Control system trigger"],
        # Additional specific reasons
        "Charging": ["Raw material loading", "Batch start", "Recipe preparation"],
        "Reaction": ["Process running", "Chemical reaction", "Temperature reached"],
        "Discharging": ["Batch complete", "Transfer to storage", "Moving to next stage"],
        "Heating": ["Temperature ramp-up", "Process requirement", "Preparation phase"],
        "Cooling": ["Temperature reduction", "Process complete", "Preparation for discharge"],
        "Cavitation": ["Insufficient inlet pressure", "Air in system", "Pump issue"],
        "Overheating": ["Excessive load", "Cooling failure", "Friction issue"],
        "Loading": ["Material addition", "Batch preparation", "Recipe start"],
        "Mixing": ["Blending process", "Homogenization", "Formula requirement"],
        "Unloading": ["Process complete", "Transfer operation", "Batch finished"],
        "Filling": ["Inventory replenishment", "Process start", "Batch preparation"],
        "Holding": ["Process requirement", "Waiting for test results", "Stabilization period"],
        "Emptying": ["Transfer to production", "Tank cleaning preparation", "Process complete"],
        "Agitating": ["Prevent settling", "Improve mixing", "Maintain suspension"],
        "Tool Change": ["Different operation", "Tool wear", "Program requirement"],
        "Program Loading": ["New part", "Production change", "Updated program"],
        "Homing": ["Reference position", "Startup procedure", "After emergency stop"],
        "Material Loading": ["New workpiece", "Batch start", "Production run"],
        "Finished": ["Operation complete", "Program end", "Batch finished"]
    }

    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            'state_id', 'equipment_id', 'state_name', 'start_timestamp', 
            'end_timestamp', 'duration_seconds', 'previous_state', 
            'transition_reason', 'operator_id', 'batch_id', 'work_order_id'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        print("Generating equipment state history...")
        states_count = 0

        # Process each equipment
        for _, equipment in equipment_df.iterrows():
            equipment_id = equipment['equipment_id']
            equipment_type = equipment['equipment_type']

            # Get equipment status from equipment table
            current_status = equipment['equipment_status'] if 'equipment_status' in equipment.index else "Running"

            # Equipment types without a dedicated list use the general states
            possible_states = equipment_states.get(equipment_type, equipment_states["general"])

            # Transition table for this equipment: shared table plus any
            # type-specific overrides
            transitions = {**transition_probabilities, **transition_overrides.get(equipment_type, {})}

            # Vary the number of state transitions per equipment to make it more realistic
            # Some equipment changes state frequently, others rarely
            equipment_state_count = max(2, int(random.normalvariate(states_per_equipment, states_per_equipment/4)))

            # Start from the equipment's current status when it is a valid
            # state for this equipment type, otherwise from a random state
            if current_status in possible_states:
                current_state = current_status
            else:
                current_state = random.choice(possible_states)

            current_time = start_time
            previous_state = ""  # the first record of a chain has no predecessor

            # Generate state transitions for this equipment
            for _ in range(equipment_state_count):
                # Create a unique state ID
                state_id = f"STATE-{uuid.uuid4().hex[:12].upper()}"

                # Duration in minutes within the typical range for the state
                # (30 min - 2 hours when the state has no defined range)
                min_duration, max_duration = state_durations.get(current_state, (30, 120))
                duration_seconds = random.uniform(min_duration, max_duration) * 60

                # Calculate end time, truncating at the end of the window
                state_end_time = current_time + timedelta(seconds=duration_seconds)
                if state_end_time > end_time:
                    state_end_time = end_time
                    duration_seconds = (state_end_time - current_time).total_seconds()

                # 70% of the states belong to a batch; 80% of those also
                # carry a work order
                if random.random() < 0.7:
                    batch_id = random.choice(batch_ids)
                    if random.random() < 0.8:
                        work_order_id = random.choice(work_order_ids)
                    else:
                        work_order_id = ""
                else:
                    batch_id = ""
                    work_order_id = ""

                # Determine if an operator was involved
                # Higher chance for manual states like Setup, Maintenance
                operator_chance = 0.9 if current_state in manual_states else 0.3
                if random.random() < operator_chance:
                    operator_id = random.choice(operator_ids)
                else:
                    operator_id = ""

                # Determine transition reason
                if current_state in transition_reasons:
                    reason = random.choice(transition_reasons[current_state])
                else:
                    reason = "Normal operation"

                # Write the state record to the CSV
                writer.writerow({
                    'state_id': state_id,
                    'equipment_id': equipment_id,
                    'state_name': current_state,
                    'start_timestamp': current_time.strftime("%Y-%m-%d %H:%M:%S"),
                    'end_timestamp': state_end_time.strftime("%Y-%m-%d %H:%M:%S"),
                    'duration_seconds': int(duration_seconds),
                    'previous_state': previous_state,
                    'transition_reason': reason,
                    'operator_id': operator_id,
                    'batch_id': batch_id,
                    'work_order_id': work_order_id
                })

                states_count += 1
                if states_count % 10000 == 0:
                    print(f"Generated {states_count} state records so far...")

                # Remember the current state for the next record
                previous_state = current_state

                # Move to the next time point
                current_time = state_end_time

                # If we've reached the end time, stop generating states for this equipment
                if current_time >= end_time:
                    break

                # Determine the next state based on transition probabilities
                current_state = _choose_next_state(current_state, possible_states, transitions)

    print(f"Successfully generated {states_count} equipment state records.")
    print(f"Data saved to {output_file}")

    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Print basic statistics about the generated equipment states data
    
    The file is streamed in chunks so that it never has to be held in memory
    as a whole. Any failure while reading or summarising the file is reported
    with a printed message instead of being raised.
    
    Parameters:
    - output_file: CSV file containing the states data
    """
    def count_present(column):
        # Empty CSV cells are parsed by pandas as NaN, and NaN != "" evaluates
        # to True, so a plain comparison against "" would count every row.
        return int((column.notna() & (column != "")).sum())
    
    try:
        # Read the first chunk to get column information
        sample_df = pd.read_csv(output_file, nrows=1000)
        
        # Running totals, accumulated chunk by chunk to avoid memory issues
        num_rows = 0
        equipment_ids = set()
        state_names = set()
        duration_total = 0.0
        duration_count = 0
        batch_count = 0
        work_order_count = 0
        operator_count = 0
        
        # Process in chunks
        for chunk in pd.read_csv(output_file, chunksize=10000):
            num_rows += len(chunk)
            equipment_ids.update(chunk['equipment_id'].unique())
            state_names.update(chunk['state_name'].unique())
            
            # Keep a running sum/count so the average is exact and needs no
            # per-row storage
            durations = chunk['duration_seconds']
            duration_total += float(durations.sum())
            duration_count += int(durations.count())
            
            # Count associations (rows whose ID column is actually filled in)
            batch_count += count_present(chunk['batch_id'])
            work_order_count += count_present(chunk['work_order_id'])
            operator_count += count_present(chunk['operator_id'])
        
        print("\nEquipment States Statistics:")
        print(f"Total state records: {num_rows}")
        print(f"Unique equipment: {len(equipment_ids)}")
        print(f"Unique state names: {len(state_names)}")
        print(f"State names: {sorted(state_names)}")
        
        if duration_count:
            avg_duration_min = duration_total / duration_count / 60
            print(f"Average state duration: {avg_duration_min:.1f} minutes")
        
        # Calculate percentages
        if num_rows > 0:
            print(f"States with batch association: {batch_count} ({batch_count/num_rows*100:.1f}%)")
            print(f"States with work order association: {work_order_count} ({work_order_count/num_rows*100:.1f}%)")
            print(f"States with operator association: {operator_count} ({operator_count/num_rows*100:.1f}%)")
        
        # Get time range if available (only the first 1000 rows are inspected)
        if 'start_timestamp' in sample_df.columns and 'end_timestamp' in sample_df.columns:
            min_time = pd.to_datetime(sample_df['start_timestamp']).min()
            max_time = pd.to_datetime(sample_df['end_timestamp']).max()
            print(f"Time range (from sample): approximately {min_time} to {max_time}")
            
    except Exception as e:
        # Statistics are a best-effort convenience; report and carry on
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # The output folder must exist before any CSV is written
    os.makedirs("data", exist_ok=True)
    
    # Equipment master data drives everything below
    equipment_df = load_equipment_data()
    
    if equipment_df is not None:
        # History window: the week leading up to the current moment
        end_time = datetime.now()
        start_time = end_time - timedelta(weeks=1)
        
        # Write the state history to disk and get a preview back
        sample_df = generate_equipment_states(
            equipment_df, 
            states_per_equipment=20,  # roughly 20 state changes per equipment in the window
            start_time=start_time,
            end_time=end_time,
            output_file="data/equipment_states.csv"
        )
        
        if sample_df is not None:
            # Show the first few generated rows
            print("\nSample data (first 5 records):")
            print(sample_df.head(5))
            
            # Summarise the full file
            get_sample_statistics("data/equipment_states.csv")

Generating equipment state history...
Successfully generated 2821 equipment state records.
Data saved to data/equipment_states.csv

Sample data (first 5 records):
             state_id equipment_id state_name      start_timestamp  \
0  STATE-7F39D01AE889  EQ-9A644928    Running  2025-07-08 19:15:24   
1  STATE-DF87A7248323  EQ-9A644928       Idle  2025-07-08 23:53:24   
2  STATE-DD6A8152D5C3  EQ-9A644928   Shutdown  2025-07-09 01:08:00   
3  STATE-EA999AAD9A46  EQ-9A644928       Idle  2025-07-09 01:18:22   
4  STATE-FAEB61CA1A68  EQ-9A644928       Idle  2025-07-09 02:40:14   

         end_timestamp  duration_seconds previous_state    transition_reason  \
0  2025-07-08 23:53:24             16680            NaN  Resumed after break   
1  2025-07-09 01:08:00              4475        Running         Shift change   
2  2025-07-09 01:18:22               622           Idle        Energy saving   
3  2025-07-09 02:40:14              4911       Shutdown         Shift change   
4  2025-07-09 03

Alarms

In [10]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data, or None when the file is
      missing or completely empty (a message is printed in both cases)
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse
        print(f"Error: Equipment data file {equipment_file} is empty.")
    
    # Both failure modes have the same remedy
    print("Please run the equipment data generation script first.")
    return None

def load_equipment_states(states_file="data/equipment_states.csv"):
    """
    Load the previously generated equipment states data
    
    Only the first 1000 rows are read, so any time range derived from the
    result reflects that sample rather than the whole file.
    
    Parameters:
    - states_file: CSV file containing equipment states data
    
    Returns:
    - DataFrame containing the equipment states data (sample), or None when
      the file is missing or completely empty
    """
    try:
        # Just load a sample to get the time range and equipment IDs
        return pd.read_csv(states_file, nrows=1000)
    except FileNotFoundError:
        print(f"Note: Equipment states file {states_file} not found.")
    except pd.errors.EmptyDataError:
        # The states file is optional, so an empty one must not abort the run
        print(f"Note: Equipment states file {states_file} is empty.")
    
    print("Alarms will be generated without correlation to equipment states.")
    return None

def generate_alarms(equipment_df, equipment_states_df=None, num_alarms_per_equipment=10, 
                   start_time=None, end_time=None, output_file="data/alarms.csv"):
    """
    Generate synthetic alarm data based on the equipment table
    
    Alarm rows are streamed straight to a UTF-8 encoded CSV file, one row at
    a time, so the full history never has to be held in memory. When
    equipment states are supplied, their earliest start and latest end
    timestamps replace start_time/end_time; the supplied DataFrame itself is
    left unmodified.
    
    Parameters:
    - equipment_df: DataFrame containing equipment data
    - equipment_states_df: DataFrame containing equipment states data (optional)
    - num_alarms_per_equipment: Average number of alarms per equipment
    - start_time: Start time for alarm history (defaults to 7 days ago)
    - end_time: End time for alarm history (defaults to now)
    - output_file: CSV file to save the alarms data
    
    Returns:
    - DataFrame containing a sample of the generated alarms data (first 1000
      rows), or None when no equipment data is available
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=7)
    if end_time is None:
        end_time = datetime.now()
    
    # Try to extract time range from equipment states if available
    if equipment_states_df is not None and len(equipment_states_df) > 0:
        try:
            # Parse into local Series so the caller's DataFrame keeps its
            # original column types
            states_start = pd.to_datetime(equipment_states_df['start_timestamp']).min()
            states_end = pd.to_datetime(equipment_states_df['end_timestamp']).max()
            
            # Use equipment states time range if available
            if not pd.isna(states_start) and not pd.isna(states_end):
                start_time = states_start
                end_time = states_end
                print(f"Using time range from equipment states: {start_time} to {end_time}")
        except Exception as e:
            print(f"Warning: Could not extract time range from equipment states: {e}")
    
    # Create batch IDs and work order IDs to simulate relationships
    batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    
    # Create operator IDs
    operator_ids = [f"OP-{uuid.uuid4().hex[:6].upper()}" for _ in range(15)]
    
    # Define alarm types and details based on equipment types
    alarm_types = {
        # General alarms for most equipment
        "general": [
            "High Temperature", "Low Temperature", "High Pressure", "Low Pressure", 
            "High Flow", "Low Flow", "High Level", "Low Level", "Power Failure", 
            "Communication Error", "Sensor Fault", "Control Deviation", 
            "Emergency Stop", "Safety Interlock", "Maintenance Due"
        ],
        # Specific alarms for reactors
        "Reactor": [
            "High Temperature", "Low Temperature", "High Pressure", "Low Pressure", 
            "Agitator Failure", "Cooling System Failure", "Heating System Failure", 
            "Pressure Relief Valve", "Reaction Rate Deviation", "pH Deviation",
            "Reactor Integrity", "Exothermic Reaction", "Feed Rate Deviation"
        ],
        # Specific alarms for pumps
        "Pump": [
            "Cavitation", "High Vibration", "Low Flow", "No Flow", "Seal Failure", 
            "Motor Overload", "High Temperature", "Discharge Pressure High", 
            "Suction Pressure Low", "Efficiency Low", "Bearing Temperature High"
        ],
        # Specific alarms for mixers
        "Mixer": [
            "High Torque", "Low Torque", "High Temperature", "Vibration High", 
            "Motor Overload", "Shaft Misalignment", "Bearing Failure", 
            "Speed Deviation", "Mixer Imbalance", "Seal Failure"
        ],
        # Specific alarms for tanks
        "Tank": [
            "High Level", "Low Level", "High Temperature", "Low Temperature", 
            "High Pressure", "Leak Detected", "Agitator Failure", "Cooling System Failure", 
            "Heating System Failure", "Overflow Risk", "Empty Tank", "Contamination Risk"
        ],
        # Specific alarms for packaging machines
        "Packaging Machine": [
            "Material Jam", "Label Misalignment", "Out of Material", "Code Reader Failure", 
            "Seal Quality", "Film Break", "Conveyor Failure", "Temperature Deviation", 
            "Print Quality", "Package Count Error", "Machine Stop"
        ],
        # Specific alarms for CNC machines
        "CNC Machine": [
            "Tool Wear", "Axis Error", "Program Error", "Spindle Overload", 
            "Feed Rate Error", "Tool Change Error", "Coolant Low", "Material Error", 
            "Positioning Error", "Collision Risk", "Emergency Stop"
        ]
    }
    
    # Define priority levels and their distribution
    priority_levels = {
        1: "Critical",    # ~5%
        2: "High",        # ~15%
        3: "Medium",      # ~30%
        4: "Low",         # ~50%
    }
    
    # Weights line up positionally with the keys of priority_levels
    priority_choices = list(priority_levels.keys())
    priority_weights = [0.05, 0.15, 0.3, 0.5]
    
    # Define typical alarm durations (time to acknowledgment and resolution)
    # Format: (min_ack_minutes, max_ack_minutes, min_resolve_minutes, max_resolve_minutes)
    alarm_durations = {
        1: (1, 10, 10, 120),     # Critical: Quick ack (1-10min), resolve in 10min-2hrs
        2: (5, 30, 20, 240),     # High: Ack in 5-30min, resolve in 20min-4hrs
        3: (10, 120, 30, 480),   # Medium: Ack in 10min-2hrs, resolve in 30min-8hrs
        4: (30, 240, 60, 720)    # Low: Ack in 30min-4hrs, resolve in 1-12hrs
    }
    
    # Define alarm messages for each alarm type
    alarm_messages = {
        "High Temperature": [
            "Temperature exceeds safe operating limits", 
            "High temperature alarm - check cooling system",
            "Temperature above setpoint by {value}°C",
            "Overheating detected - verify cooling function"
        ],
        "Low Temperature": [
            "Temperature below minimum operating limits",
            "Low temperature alarm - check heating system",
            "Temperature below setpoint by {value}°C",
            "Insufficient heating detected - verify heaters"
        ],
        "High Pressure": [
            "Pressure exceeds maximum safe limit",
            "High pressure alarm - check relief valve",
            "Pressure above setpoint by {value} bar",
            "Excessive pressure detected - risk of damage"
        ],
        "Low Pressure": [
            "Pressure below minimum operating limit",
            "Low pressure alarm - check supply pressure",
            "Pressure below setpoint by {value} bar",
            "Insufficient pressure for operation"
        ],
        "High Flow": [
            "Flow rate exceeds maximum limit",
            "High flow alarm - check control valve",
            "Flow above setpoint by {value} m³/h",
            "Excessive flow detected - verify valve position"
        ],
        "Low Flow": [
            "Flow rate below minimum limit",
            "Low flow alarm - check for blockage",
            "Flow below setpoint by {value} m³/h",
            "Insufficient flow detected - verify pump operation"
        ],
        "High Level": [
            "Level exceeds maximum safe limit",
            "High level alarm - risk of overflow",
            "Level above setpoint by {value}%",
            "Excessive level detected - check outlet valve"
        ],
        "Low Level": [
            "Level below minimum operating limit",
            "Low level alarm - check supply",
            "Level below setpoint by {value}%",
            "Insufficient level detected - verify inlet flow"
        ],
        "Power Failure": [
            "Power supply interruption detected",
            "Power failure alarm - switching to backup",
            "Main power loss - check electrical supply",
            "Power quality issue detected"
        ],
        "Communication Error": [
            "Communication with control system lost",
            "Network communication failure",
            "Data transmission error - check connections",
            "Communication timeout - device not responding"
        ],
        "Sensor Fault": [
            "Sensor reading outside valid range",
            "Sensor calibration error detected",
            "Sensor failure - maintenance required",
            "Invalid sensor data - check wiring"
        ],
        "Control Deviation": [
            "Process variable deviating from setpoint",
            "Control loop unable to maintain setpoint",
            "PID control deviation exceeds {value}%",
            "Sustained control error detected"
        ],
        "Emergency Stop": [
            "Emergency stop button activated",
            "Emergency shutdown initiated",
            "E-stop circuit triggered - check safety devices",
            "Safety system activated emergency stop"
        ],
        "Safety Interlock": [
            "Safety interlock triggered - access violation",
            "Guard door opened during operation",
            "Safety circuit interrupted - check interlocks",
            "Safety barrier breach detected"
        ],
        "Maintenance Due": [
            "Scheduled maintenance overdue",
            "Service interval exceeded by {value} hours",
            "Maintenance required - performance degraded",
            "Preventative maintenance reminder"
        ]
    }
    
    # Define default messages for alarm types not specifically listed
    default_alarm_messages = [
        "{alarm_type} detected - check equipment",
        "{alarm_type} alarm activated",
        "{alarm_type} condition requires attention",
        "Alert: {alarm_type} on {equipment_name}"
    ]
    
    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Messages contain non-ASCII characters (°, ³). Write UTF-8 explicitly so
    # the file can be read back with pandas' default UTF-8 decoding regardless
    # of the platform's locale encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            'alarm_id', 'alarm_name', 'equipment_id', 'alarm_type', 'priority',
            'activation_timestamp', 'acknowledgment_timestamp', 'acknowledgment_operator_id',
            'resolution_timestamp', 'alarm_message', 'alarm_value', 'setpoint_value',
            'batch_id', 'work_order_id'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        print("Generating alarm history...")
        alarms_count = 0
        
        # Process each equipment
        for _, equipment in equipment_df.iterrows():
            equipment_id = equipment['equipment_id']
            equipment_type = equipment['equipment_type']
            
            # Fall back to a generated name when the column is absent or the
            # cell is empty (NaN), so string formatting below never sees a float
            equipment_name = equipment['equipment_name'] if 'equipment_name' in equipment.index else None
            if equipment_name is None or pd.isna(equipment_name):
                equipment_name = f"Equipment {equipment_id}"
            equipment_name = str(equipment_name)
            
            # Determine appropriate alarm types for this equipment type
            if equipment_type in alarm_types:
                possible_alarms = alarm_types[equipment_type]
            else:
                possible_alarms = alarm_types["general"]
            
            # Vary the number of alarms per equipment to make it more realistic
            # Some equipment has more alarms than others
            # Use a Poisson distribution around the average
            equipment_alarm_count = max(0, int(np.random.poisson(num_alarms_per_equipment)))
            
            # Generate alarms for this equipment
            for _ in range(equipment_alarm_count):
                # Create a unique alarm ID
                alarm_id = f"ALARM-{uuid.uuid4().hex[:12].upper()}"
                
                # Select alarm type
                alarm_type = random.choice(possible_alarms)
                
                # Create alarm name
                alarm_name = f"{equipment_name} - {alarm_type}"
                
                # Determine priority (weighted random)
                priority = random.choices(priority_choices, weights=priority_weights)[0]
                
                # Generate activation timestamp, uniformly within the time range
                activation_time = start_time + (end_time - start_time) * random.random()
                
                # Determine acknowledgment and resolution times
                min_ack, max_ack, min_resolve, max_resolve = alarm_durations[priority]
                
                ack_delay_minutes = random.uniform(min_ack, max_ack)
                resolve_delay_minutes = random.uniform(min_resolve, max_resolve)
                
                # Start from "still open" and fill in whatever has happened
                # before end_time
                acknowledgment_time = None
                resolution_time = None
                acknowledgment_operator_id = ""
                
                if random.random() >= 0.05:  # 5% of alarms are never acknowledged
                    candidate_ack = activation_time + timedelta(minutes=ack_delay_minutes)
                    
                    # An acknowledgment past end_time has not happened yet
                    if candidate_ack <= end_time:
                        acknowledgment_time = candidate_ack
                        # Assign operator who acknowledged
                        acknowledgment_operator_id = random.choice(operator_ids)
                        
                        if random.random() >= 0.1:  # 10% of acknowledged alarms not resolved
                            candidate_resolution = acknowledgment_time + timedelta(minutes=resolve_delay_minutes)
                            
                            # A resolution past end_time has not happened yet
                            if candidate_resolution <= end_time:
                                resolution_time = candidate_resolution
                
                # Generate alarm values
                if alarm_type in ["High Temperature", "Low Temperature"]:
                    setpoint = random.uniform(50, 150)
                    deviation = random.uniform(5, 30) * (1 if "High" in alarm_type else -1)
                    alarm_value = setpoint + deviation
                elif alarm_type in ["High Pressure", "Low Pressure"]:
                    setpoint = random.uniform(2, 10)
                    deviation = random.uniform(0.5, 3) * (1 if "High" in alarm_type else -1)
                    alarm_value = setpoint + deviation
                elif alarm_type in ["High Flow", "Low Flow"]:
                    setpoint = random.uniform(10, 100)
                    deviation = random.uniform(5, 30) * (1 if "High" in alarm_type else -1)
                    alarm_value = max(0, setpoint + deviation)
                elif alarm_type in ["High Level", "Low Level"]:
                    setpoint = random.uniform(40, 80)
                    deviation = random.uniform(10, 40) * (1 if "High" in alarm_type else -1)
                    alarm_value = max(0, min(100, setpoint + deviation))
                elif alarm_type == "Control Deviation":
                    setpoint = random.uniform(50, 150)
                    deviation = random.uniform(10, 50)
                    alarm_value = setpoint + deviation
                else:
                    # For other alarm types, no specific value
                    alarm_value = ""
                    setpoint = ""
                
                has_numeric_value = alarm_value != "" and setpoint != ""
                
                # Generate alarm message
                if alarm_type in alarm_messages:
                    templates = alarm_messages[alarm_type]
                    if not has_numeric_value:
                        # Without a measured deviation there is nothing to put
                        # into "{value}", so prefer templates that do not need
                        # it (e.g. for "Maintenance Due")
                        templates = [t for t in templates if "{value}" not in t] or templates
                    message_template = random.choice(templates)
                    
                    if "{value}" in message_template:
                        if has_numeric_value:
                            deviation_val = abs(float(alarm_value) - float(setpoint))
                            message = message_template.replace("{value}", f"{deviation_val:.1f}")
                        else:
                            # Only reachable if every template needs a value
                            message = message_template.replace("{value}", "significant")
                    else:
                        message = message_template
                else:
                    message_template = random.choice(default_alarm_messages)
                    message = message_template.replace("{alarm_type}", alarm_type).replace("{equipment_name}", equipment_name)
                
                # Determine if this alarm is associated with a batch
                batch_id = ""
                work_order_id = ""
                if random.random() < 0.7:  # 70% chance of having a batch
                    batch_id = random.choice(batch_ids)
                    # If there's a batch, higher chance of having a work order
                    if random.random() < 0.8:  # 80% chance of having a work order if there's a batch
                        work_order_id = random.choice(work_order_ids)
                
                # Format timestamps as strings (None remains None and is
                # written as an empty CSV cell)
                activation_timestamp = activation_time.strftime("%Y-%m-%d %H:%M:%S") if activation_time else None
                acknowledgment_timestamp = acknowledgment_time.strftime("%Y-%m-%d %H:%M:%S") if acknowledgment_time else None
                resolution_timestamp = resolution_time.strftime("%Y-%m-%d %H:%M:%S") if resolution_time else None
                
                # Write the alarm record to the CSV
                writer.writerow({
                    'alarm_id': alarm_id,
                    'alarm_name': alarm_name,
                    'equipment_id': equipment_id,
                    'alarm_type': alarm_type,
                    'priority': priority,
                    'activation_timestamp': activation_timestamp,
                    'acknowledgment_timestamp': acknowledgment_timestamp,
                    'acknowledgment_operator_id': acknowledgment_operator_id,
                    'resolution_timestamp': resolution_timestamp,
                    'alarm_message': message,
                    'alarm_value': alarm_value,
                    'setpoint_value': setpoint,
                    'batch_id': batch_id,
                    'work_order_id': work_order_id
                })
                
                alarms_count += 1
                if alarms_count % 10000 == 0:
                    print(f"Generated {alarms_count} alarm records so far...")
    
    print(f"Successfully generated {alarms_count} alarm records.")
    print(f"Data saved to {output_file}")
    
    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Print basic statistics about the generated alarms data
    
    The file is streamed in chunks so that it never has to be held in memory
    as a whole. Any failure while reading or summarising the file is reported
    with a printed message instead of being raised.
    
    Parameters:
    - output_file: CSV file containing the alarms data
    """
    def count_present(column):
        # Empty CSV cells are parsed by pandas as NaN, and NaN != "" evaluates
        # to True, so a plain comparison against "" would count every row.
        return int((column.notna() & (column != "")).sum())
    
    try:
        # Read the first chunk to get column information
        sample_df = pd.read_csv(output_file, nrows=1000)
        
        # Running totals, accumulated chunk by chunk to avoid memory issues
        num_rows = 0
        equipment_ids = set()
        alarm_types = set()
        priority_counts = {1: 0, 2: 0, 3: 0, 4: 0}
        unacknowledged_count = 0
        unresolved_count = 0
        batch_count = 0
        work_order_count = 0
        
        # Process in chunks
        for chunk in pd.read_csv(output_file, chunksize=10000):
            num_rows += len(chunk)
            equipment_ids.update(chunk['equipment_id'].unique())
            alarm_types.update(chunk['alarm_type'].unique())
            
            # Count priorities
            for priority in priority_counts:
                priority_counts[priority] += int((chunk['priority'] == priority).sum())
            
            # Count unacknowledged and unresolved alarms
            not_acknowledged = int(chunk['acknowledgment_timestamp'].isna().sum())
            not_resolved = int(chunk['resolution_timestamp'].isna().sum())
            unacknowledged_count += not_acknowledged
            # Subtracting the unacknowledged rows from the rows with no
            # resolution leaves the "acknowledged but unresolved" count
            unresolved_count += not_resolved - not_acknowledged
            
            # Count associations (rows whose ID column is actually filled in)
            batch_count += count_present(chunk['batch_id'])
            work_order_count += count_present(chunk['work_order_id'])
        
        print("\nAlarms Statistics:")
        print(f"Total alarm records: {num_rows}")
        print(f"Unique equipment with alarms: {len(equipment_ids)}")
        print(f"Unique alarm types: {len(alarm_types)}")
        print(f"Alarm types: {sorted(alarm_types)}")
        
        # Percentages are only meaningful (and only computable) with rows
        if num_rows > 0:
            print("\nPriority Distribution:")
            for priority, count in priority_counts.items():
                print(f"  Priority {priority}: {count} ({count/num_rows*100:.1f}%)")
            
            print(f"\nUnacknowledged alarms: {unacknowledged_count} ({unacknowledged_count/num_rows*100:.1f}%)")
            print(f"Acknowledged but unresolved alarms: {unresolved_count} ({unresolved_count/num_rows*100:.1f}%)")
            print(f"Alarms with batch association: {batch_count} ({batch_count/num_rows*100:.1f}%)")
            print(f"Alarms with work order association: {work_order_count} ({work_order_count/num_rows*100:.1f}%)")
        
        # Get time range if available (only the first 1000 rows are inspected)
        if 'activation_timestamp' in sample_df.columns and not sample_df.empty:
            try:
                activation_times = pd.to_datetime(sample_df['activation_timestamp'])
                min_time = activation_times.min()
                max_time = activation_times.max()
                print(f"Time range (from sample): approximately {min_time} to {max_time}")
            except (ValueError, TypeError) as e:
                # Unparseable timestamps should not hide the rest of the report
                print(f"Note: Could not determine time range: {e}")
            
    except Exception as e:
        # Statistics are a best-effort convenience; report and carry on
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # The output folder must exist before any CSV is written
    os.makedirs("data", exist_ok=True)
    
    # Equipment master data is required for alarm generation
    equipment_df = load_equipment_data()
    
    # Equipment states are optional; when present they supply the time range
    equipment_states_df = load_equipment_states()
    
    if equipment_df is not None:
        # Fallback window (the past week) for when no states time range is used
        end_time = datetime.now()
        start_time = end_time - timedelta(weeks=1)
        
        # Write the alarm history to disk and get a preview back
        sample_df = generate_alarms(
            equipment_df,
            equipment_states_df,
            num_alarms_per_equipment=10,  # roughly 10 alarms per equipment in the window
            start_time=start_time,
            end_time=end_time,
            output_file="data/alarms.csv"
        )
        
        if sample_df is not None:
            # Show the first few generated rows
            print("\nSample data (first 5 records):")
            print(sample_df.head(5))
            
            # Summarise the full file
            get_sample_statistics("data/alarms.csv")

Using time range from equipment states: 2025-07-08 19:15:24 to 2025-07-13 02:18:12
Generating alarm history...
Successfully generated 1561 alarm records.
Data saved to data/alarms.csv

Sample data (first 5 records):
             alarm_id                                         alarm_name  \
0  ALARM-68E9C4871581     Falling Film Evaporator 715 - High Temperature   
1  ALARM-98FBCEB7D2DE  Falling Film Evaporator 715 - Communication Error   
2  ALARM-173B0A5B271A      Falling Film Evaporator 715 - Low Temperature   
3  ALARM-383C19D7D555             Falling Film Evaporator 715 - Low Flow   
4  ALARM-5569DAD55960       Falling Film Evaporator 715 - Emergency Stop   

  equipment_id           alarm_type  priority activation_timestamp  \
0  EQ-9A644928     High Temperature         3  2025-07-11 10:44:03   
1  EQ-9A644928  Communication Error         3  2025-07-11 19:51:49   
2  EQ-9A644928      Low Temperature         2  2025-07-10 05:14:12   
3  EQ-9A644928             Low Flow         4  

ProcessParameters

In [11]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data, or None when the file is
      missing or completely empty (a message is printed in both cases)
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse
        print(f"Error: Equipment data file {equipment_file} is empty.")
    
    # Both failure modes have the same remedy
    print("Please run the equipment data generation script first.")
    return None

def load_recipes_data(recipes_file="data/recipes.csv"):
    """
    Load the previously generated recipes data if available
    
    Parameters:
    - recipes_file: CSV file containing recipes data
    
    Returns:
    - DataFrame containing the recipes data or None if not available
      (file missing or completely empty)
    """
    try:
        return pd.read_csv(recipes_file)
    except FileNotFoundError:
        print(f"Note: Recipes data file {recipes_file} not found.")
    except pd.errors.EmptyDataError:
        # Recipes are optional, so an empty file must not abort the run
        print(f"Note: Recipes data file {recipes_file} is empty.")
    
    print("Process parameters will be generated without recipe context.")
    return None

def load_batches_data(batches_file="data/batches.csv"):
    """
    Load the previously generated batches data if available
    
    Parameters:
    - batches_file: CSV file containing batches data
    
    Returns:
    - DataFrame containing the batches data or None if not available
      (file missing or completely empty)
    """
    try:
        return pd.read_csv(batches_file)
    except FileNotFoundError:
        print(f"Note: Batches data file {batches_file} not found.")
    except pd.errors.EmptyDataError:
        # Batches are optional, so an empty file must not abort the run
        print(f"Note: Batches data file {batches_file} is empty.")
    
    print("Process parameters will be generated without batch context.")
    return None

def _build_setpoint_series(setpoint_base, min_value, max_value, precision, samples_per_parameter):
    """
    Build a piecewise-constant setpoint series with up to five random step changes.
    
    Parameters:
    - setpoint_base: Setpoint used from the first sample until the first change
    - min_value, max_value: Bounds every new setpoint is clamped to
    - precision: Number of decimals new setpoints are rounded to
    - samples_per_parameter: Length of the returned series
    
    Returns:
    - List of setpoint values, one per sample
    """
    range_width = max_value - min_value
    setpoint_values = [setpoint_base] * samples_per_parameter
    
    # Changes can only happen at positions 1..n-1, so cap the number of changes
    # at the number of available positions (random.sample raises otherwise).
    num_changes = random.randint(0, min(5, samples_per_parameter - 1))
    change_points = sorted(random.sample(range(1, samples_per_parameter), num_changes))
    
    current_setpoint = setpoint_base
    for change_point in change_points:
        # Step by -20% to +20% of the range, clamped to the valid range
        change_amount = range_width * random.uniform(-0.2, 0.2)
        new_setpoint = max(min_value, min(max_value, current_setpoint + change_amount))
        new_setpoint = round(new_setpoint, precision)
        
        # The new setpoint holds from this sample onwards
        setpoint_values[change_point:] = [new_setpoint] * (samples_per_parameter - change_point)
        current_setpoint = new_setpoint
    
    return setpoint_values


def _simulate_actual_values(setpoint_values, range_width):
    """
    Simulate measured values around a setpoint series using one random pattern.
    
    The pattern is chosen uniformly from: tracking (setpoint plus noise), drift
    (linearly growing offset), oscillation (sine wave around the setpoint), step
    (damped approach to the setpoint) and spikes (noise with occasional outliers).
    
    Parameters:
    - setpoint_values: Setpoint for each sample
    - range_width: Width of the parameter's value range; scales noise and offsets
    
    Returns:
    - List of unclamped, unrounded actual values, one per sample
    """
    samples = len(setpoint_values)
    pattern = random.choice(["tracking", "drift", "oscillation", "step", "spikes"])
    actual_values = []
    
    if pattern == "tracking":
        # Closely tracks setpoint with normal noise
        for setpoint in setpoint_values:
            noise = random.normalvariate(0, range_width * 0.02)
            actual_values.append(setpoint + noise)
    
    elif pattern == "drift":
        # Offset grows linearly with the sample index
        drift_rate = random.uniform(-0.05, 0.05) * range_width / samples
        for i, setpoint in enumerate(setpoint_values):
            noise = random.normalvariate(0, range_width * 0.01)
            actual_values.append(setpoint + drift_rate * i + noise)
    
    elif pattern == "oscillation":
        # Sine wave around the setpoint; period is expressed in samples
        period = random.uniform(10, 50)
        amplitude = random.uniform(0.02, 0.1) * range_width
        for i, setpoint in enumerate(setpoint_values):
            # float() keeps the values plain Python floats rather than numpy scalars
            oscillation = amplitude * float(np.sin(2 * np.pi * i / period))
            noise = random.normalvariate(0, range_width * 0.01)
            actual_values.append(setpoint + oscillation + noise)
    
    elif pattern == "step":
        # Each sample closes a fixed fraction (1 - damping) of the remaining
        # gap between the current response and the setpoint
        damping = random.uniform(0.7, 0.95)
        current_response = setpoint_values[0]
        for i, setpoint in enumerate(setpoint_values):
            if i > 0:
                current_response = current_response + (setpoint - current_response) * (1 - damping)
            noise = random.normalvariate(0, range_width * 0.01)
            actual_values.append(current_response + noise)
    
    else:  # spikes
        # Normal noise, replaced by a large excursion with 5% probability
        spike_probability = 0.05
        for setpoint in setpoint_values:
            if random.random() < spike_probability:
                spike_dir = 1 if random.random() > 0.5 else -1
                spike_size = random.uniform(0.1, 0.3) * range_width
                noise = spike_dir * spike_size
            else:
                noise = random.normalvariate(0, range_width * 0.02)
            actual_values.append(setpoint + noise)
    
    return actual_values


def generate_process_parameters(equipment_df, recipes_df=None, batches_df=None, 
                               num_parameters_per_equipment=5, samples_per_parameter=100,
                               start_time=None, end_time=None, output_file="data/process_parameters.csv"):
    """
    Generate synthetic process parameters data based on the equipment table
    
    Rows are streamed to output_file one at a time, so the full data set is
    never held in memory.
    
    Parameters:
    - equipment_df: DataFrame containing equipment data; the 'equipment_id' and
      'equipment_type' columns are read
    - recipes_df: DataFrame containing recipes data (optional); when missing or
      empty, 15 synthetic recipe IDs are created
    - batches_df: DataFrame containing batches data (optional); when missing or
      empty, 30 synthetic batch IDs are created
    - num_parameters_per_equipment: Average number of parameters per equipment
    - samples_per_parameter: Number of time-series samples per parameter (>= 1)
    - start_time: Start time for parameter history (defaults to 7 days ago)
    - end_time: End time for parameter history (defaults to now)
    - output_file: CSV file to save the process parameters data (written as UTF-8)
    
    Returns:
    - DataFrame containing a sample of the generated process parameters data
      (at most the first 1000 rows), or None when there is no equipment data
      or samples_per_parameter is less than 1
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None
    
    if samples_per_parameter < 1:
        print("Error: samples_per_parameter must be at least 1.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=7)
    if end_time is None:
        end_time = datetime.now()
    
    # Define parameter types and details based on equipment types
    parameter_types = {
        # General parameters for most equipment
        "general": [
            {"name": "Temperature", "unit": "°C", "range": (20, 150), "precision": 1},
            {"name": "Pressure", "unit": "bar", "range": (1, 15), "precision": 2},
            {"name": "Flow Rate", "unit": "m³/h", "range": (0, 100), "precision": 1},
            {"name": "Level", "unit": "%", "range": (0, 100), "precision": 1},
            {"name": "Speed", "unit": "rpm", "range": (0, 3000), "precision": 0},
            {"name": "Power", "unit": "kW", "range": (0, 500), "precision": 1},
            {"name": "Vibration", "unit": "mm/s", "range": (0, 15), "precision": 2},
            {"name": "Current", "unit": "A", "range": (0, 100), "precision": 1},
            {"name": "Voltage", "unit": "V", "range": (0, 480), "precision": 0},
            {"name": "Efficiency", "unit": "%", "range": (50, 100), "precision": 1}
        ],
        # Specific parameters for reactors
        "Reactor": [
            {"name": "Reactor Temperature", "unit": "°C", "range": (50, 250), "precision": 1},
            {"name": "Reactor Pressure", "unit": "bar", "range": (1, 25), "precision": 2},
            {"name": "Agitator Speed", "unit": "rpm", "range": (0, 500), "precision": 0},
            {"name": "Jacket Temperature", "unit": "°C", "range": (10, 200), "precision": 1},
            {"name": "Reaction Time", "unit": "min", "range": (0, 500), "precision": 0},
            {"name": "pH", "unit": "pH", "range": (2, 12), "precision": 2},
            {"name": "Concentration", "unit": "g/L", "range": (0, 1000), "precision": 1},
            {"name": "Reactant Flow", "unit": "L/min", "range": (0, 100), "precision": 1},
            {"name": "Conversion", "unit": "%", "range": (0, 100), "precision": 1},
            {"name": "Catalyst Level", "unit": "%", "range": (0, 100), "precision": 1}
        ],
        # Specific parameters for pumps
        "Pump": [
            {"name": "Discharge Pressure", "unit": "bar", "range": (1, 25), "precision": 2},
            {"name": "Suction Pressure", "unit": "bar", "range": (0, 5), "precision": 2},
            {"name": "Flow Rate", "unit": "m³/h", "range": (0, 500), "precision": 1},
            {"name": "Pump Speed", "unit": "rpm", "range": (0, 3600), "precision": 0},
            {"name": "Motor Current", "unit": "A", "range": (0, 150), "precision": 1},
            {"name": "Differential Pressure", "unit": "bar", "range": (0, 20), "precision": 2},
            {"name": "Efficiency", "unit": "%", "range": (40, 95), "precision": 1},
            {"name": "Vibration", "unit": "mm/s", "range": (0, 20), "precision": 2},
            {"name": "Bearing Temperature", "unit": "°C", "range": (20, 120), "precision": 1},
            {"name": "NPSH Available", "unit": "m", "range": (0, 15), "precision": 2}
        ],
        # Specific parameters for heat exchangers
        "Heat Exchanger": [
            {"name": "Hot Side Inlet Temp", "unit": "°C", "range": (50, 250), "precision": 1},
            {"name": "Hot Side Outlet Temp", "unit": "°C", "range": (30, 200), "precision": 1},
            {"name": "Cold Side Inlet Temp", "unit": "°C", "range": (5, 30), "precision": 1},
            {"name": "Cold Side Outlet Temp", "unit": "°C", "range": (15, 100), "precision": 1},
            {"name": "Hot Side Flow", "unit": "m³/h", "range": (1, 200), "precision": 1},
            {"name": "Cold Side Flow", "unit": "m³/h", "range": (1, 200), "precision": 1},
            {"name": "Pressure Drop Hot", "unit": "bar", "range": (0, 2), "precision": 2},
            {"name": "Pressure Drop Cold", "unit": "bar", "range": (0, 2), "precision": 2},
            {"name": "Heat Transfer Rate", "unit": "kW", "range": (10, 5000), "precision": 0},
            {"name": "Fouling Factor", "unit": "m²K/kW", "range": (0, 0.5), "precision": 3}
        ],
        # Specific parameters for tanks
        "Tank": [
            {"name": "Level", "unit": "%", "range": (0, 100), "precision": 1},
            {"name": "Temperature", "unit": "°C", "range": (10, 80), "precision": 1},
            {"name": "Pressure", "unit": "bar", "range": (1, 5), "precision": 2},
            {"name": "Inlet Flow", "unit": "m³/h", "range": (0, 50), "precision": 1},
            {"name": "Outlet Flow", "unit": "m³/h", "range": (0, 50), "precision": 1},
            {"name": "Agitator Speed", "unit": "rpm", "range": (0, 200), "precision": 0},
            {"name": "pH", "unit": "pH", "range": (3, 11), "precision": 2},
            {"name": "Density", "unit": "kg/m³", "range": (800, 1200), "precision": 0},
            {"name": "Volume", "unit": "m³", "range": (0, 200), "precision": 1},
            {"name": "Jacket Temperature", "unit": "°C", "range": (5, 95), "precision": 1}
        ],
        # Specific parameters for dryers
        "Dryer": [
            {"name": "Inlet Temperature", "unit": "°C", "range": (100, 300), "precision": 1},
            {"name": "Outlet Temperature", "unit": "°C", "range": (50, 150), "precision": 1},
            {"name": "Moisture Content", "unit": "%", "range": (0, 30), "precision": 1},
            {"name": "Air Flow", "unit": "m³/h", "range": (500, 5000), "precision": 0},
            {"name": "Feed Rate", "unit": "kg/h", "range": (50, 2000), "precision": 0},
            {"name": "Residence Time", "unit": "min", "range": (5, 120), "precision": 0},
            {"name": "Humidity", "unit": "%RH", "range": (10, 90), "precision": 1},
            {"name": "Pressure Drop", "unit": "mbar", "range": (5, 100), "precision": 0},
            {"name": "Energy Consumption", "unit": "kWh", "range": (10, 1000), "precision": 0},
            {"name": "Product Temperature", "unit": "°C", "range": (40, 120), "precision": 1}
        ]
    }
    
    # Define control modes
    control_modes = ["Auto", "Manual", "Cascade", "Supervised", "Off"]
    control_mode_weights = [0.7, 0.15, 0.1, 0.04, 0.01]  # Probability weights
    
    # Create batch IDs and recipe IDs if not provided
    if batches_df is None or len(batches_df) == 0:
        print("Creating synthetic batch IDs for process parameters...")
        batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
    else:
        batch_ids = batches_df['batch_id'].unique().tolist()
        
    if recipes_df is None or len(recipes_df) == 0:
        print("Creating synthetic recipe IDs for process parameters...")
        recipe_ids = [f"RECIPE-{uuid.uuid4().hex[:8].upper()}" for _ in range(15)]
    else:
        recipe_ids = recipes_df['recipe_id'].unique().tolist()
    
    # The sampling grid is identical for every parameter, so build and format
    # it once. A single sample has no spacing to compute and sits at start_time.
    if samples_per_parameter == 1:
        time_points = [start_time]
    else:
        time_points = [
            start_time + (end_time - start_time) * (i / (samples_per_parameter - 1))
            for i in range(samples_per_parameter)
        ]
    timestamps = [point.strftime("%Y-%m-%d %H:%M:%S") for point in time_points]
    
    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Units contain non-ASCII characters (°, ³, ²). Writing UTF-8 explicitly keeps
    # the file readable by pd.read_csv (which defaults to UTF-8) on every platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            'parameter_id', 'parameter_name', 'equipment_id', 'timestamp', 
            'setpoint_value', 'actual_value', 'deviation', 'unit', 
            'upper_control_limit', 'lower_control_limit', 'upper_spec_limit', 
            'lower_spec_limit', 'control_mode', 'recipe_id', 'batch_id'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        print("Generating process parameters data...")
        parameters_count = 0
        progress_interval = 50000
        next_progress_report = progress_interval
        
        # Process each equipment
        for equipment_id, equipment_type in zip(equipment_df['equipment_id'], equipment_df['equipment_type']):
            # Equipment types without a dedicated catalogue use the general one
            possible_parameters = parameter_types.get(equipment_type, parameter_types["general"])
            
            # Vary the number of parameters per equipment to make it more realistic
            equipment_parameter_count = max(1, int(random.normalvariate(num_parameters_per_equipment, num_parameters_per_equipment/4)))
            
            # Select parameters for this equipment (without replacement to avoid duplicates)
            selected_parameters = random.sample(possible_parameters, min(equipment_parameter_count, len(possible_parameters)))
            
            # For each parameter, generate time series data
            for param in selected_parameters:
                param_name = param["name"]
                param_unit = param["unit"]
                min_value, max_value = param["range"]
                precision = param["precision"]
                range_width = max_value - min_value
                
                # Create a base parameter ID; each sample appends its 1-based index
                base_parameter_id = f"PARAM-{uuid.uuid4().hex[:8].upper()}"
                
                # 70% of parameters are tied to a recipe, 60% to a batch
                has_recipe = random.random() < 0.7
                recipe_id = random.choice(recipe_ids) if has_recipe else ""
                
                has_batch = random.random() < 0.6
                batch_id = random.choice(batch_ids) if has_batch else ""
                
                # Control band: 10-30% of the range
                control_width = range_width * random.uniform(0.1, 0.3)
                
                # Spec band: 20-50% of the range, but never narrower than the
                # control band so the control limits always lie inside the spec limits
                spec_width = max(control_width, range_width * random.uniform(0.2, 0.5))
                
                # Setpoint sits in the middle 40% of the range; rounded like
                # every later setpoint so the column has a consistent precision
                setpoint_base = round(min_value + range_width * random.uniform(0.3, 0.7), precision)
                
                # Limits are centred on the initial setpoint and clamped to the range
                lcl = round(max(min_value, setpoint_base - control_width/2), precision)
                ucl = round(min(max_value, setpoint_base + control_width/2), precision)
                lsl = round(max(min_value, setpoint_base - spec_width/2), precision)
                usl = round(min(max_value, setpoint_base + spec_width/2), precision)
                
                # Select control mode (mostly Auto)
                control_mode = random.choices(control_modes, weights=control_mode_weights)[0]
                
                setpoint_values = _build_setpoint_series(
                    setpoint_base, min_value, max_value, precision, samples_per_parameter
                )
                
                # Clamp simulated values to the valid range, then round
                actual_values = [
                    round(max(min_value, min(max_value, val)), precision)
                    for val in _simulate_actual_values(setpoint_values, range_width)
                ]
                
                # Write parameter records to CSV
                for i, (timestamp, setpoint, actual) in enumerate(zip(timestamps, setpoint_values, actual_values)):
                    writer.writerow({
                        'parameter_id': f"{base_parameter_id}-{i+1}",
                        'parameter_name': param_name,
                        'equipment_id': equipment_id,
                        'timestamp': timestamp,
                        'setpoint_value': setpoint,
                        'actual_value': actual,
                        'deviation': round(actual - setpoint, precision),
                        'unit': param_unit,
                        'upper_control_limit': ucl,
                        'lower_control_limit': lcl,
                        'upper_spec_limit': usl,
                        'lower_spec_limit': lsl,
                        'control_mode': control_mode,
                        'recipe_id': recipe_id,
                        'batch_id': batch_id
                    })
                
                parameters_count += samples_per_parameter
                
                # Report each time another 50000 rows have been written, even when
                # the running total never lands exactly on a multiple of 50000
                if parameters_count >= next_progress_report:
                    print(f"Generated {parameters_count} parameter records so far...")
                    while next_progress_report <= parameters_count:
                        next_progress_report += progress_interval
    
    print(f"Successfully generated {parameters_count} process parameter records.")
    print(f"Data saved to {output_file}")
    
    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Get basic statistics about the generated process parameters data
    
    The file is read in chunks of 10000 rows, so only a bounded sample of
    deviations is held in memory. All results are printed; nothing is returned.
    Any error while reading or summarising is caught and printed.
    
    Parameters:
    - output_file: CSV file containing the process parameters data
    """
    try:
        # Read the first rows separately; they are used for the time range below
        sample_df = pd.read_csv(output_file, nrows=1000)
        
        num_rows = 0
        equipment_ids = set()
        parameter_names = set()
        control_modes = set()
        recipe_count = 0
        batch_count = 0
        
        # Deviation statistics are computed on a sample of up to 1000 values per
        # chunk, and sampling stops once max_samples values have been collected
        all_deviations = []
        max_samples = 100000
        
        # Process in chunks
        for chunk in pd.read_csv(output_file, chunksize=10000):
            num_rows += len(chunk)
            equipment_ids.update(chunk['equipment_id'].unique())
            parameter_names.update(chunk['parameter_name'].unique())
            control_modes.update(chunk['control_mode'].unique())
            
            # read_csv turns empty fields into NaN, and NaN != "" is True, so
            # comparing against "" alone would count every row as associated.
            # A row is associated only when the field is neither NaN nor "".
            recipe_col = chunk['recipe_id']
            batch_col = chunk['batch_id']
            recipe_count += int((recipe_col.notna() & (recipe_col != "")).sum())
            batch_count += int((batch_col.notna() & (batch_col != "")).sum())
            
            # Sample deviations (for statistical analysis)
            if len(all_deviations) < max_samples:
                deviation_sample = chunk['deviation'].sample(min(1000, len(chunk)))
                all_deviations.extend(deviation_sample.tolist())
        
        print("\nProcess Parameters Statistics:")
        print(f"Total parameter records: {num_rows}")
        print(f"Unique equipment with parameters: {len(equipment_ids)}")
        print(f"Unique parameter names: {len(parameter_names)}")
        print(f"Parameter names: {sorted(parameter_names)}")
        print(f"Control modes: {sorted(control_modes)}")
        
        # Calculate percentages
        if num_rows > 0:
            print(f"\nParameters with recipe association: {recipe_count} ({recipe_count/num_rows*100:.1f}%)")
            print(f"Parameters with batch association: {batch_count} ({batch_count/num_rows*100:.1f}%)")
        
        # Calculate deviation statistics
        if all_deviations:
            avg_deviation = sum(all_deviations) / len(all_deviations)
            abs_deviations = [abs(d) for d in all_deviations]
            avg_abs_deviation = sum(abs_deviations) / len(abs_deviations)
            max_deviation = max(abs_deviations)
            
            print("\nDeviation Statistics (from sample):")
            print(f"  Average deviation: {avg_deviation:.4f}")
            print(f"  Average absolute deviation: {avg_abs_deviation:.4f}")
            print(f"  Maximum absolute deviation: {max_deviation:.4f}")
        
        # Get time range if available (based on the first 1000 rows only)
        if 'timestamp' in sample_df.columns:
            try:
                parsed_times = pd.to_datetime(sample_df['timestamp'])
                min_time = parsed_times.min()
                max_time = parsed_times.max()
                print(f"\nTime range (from sample): approximately {min_time} to {max_time}")
            except (ValueError, TypeError):
                # Unparseable timestamps only cost us the optional time-range line
                pass
            
    except Exception as e:
        # Statistics are a best-effort report; never let them abort the caller
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # Make sure the output directory is present before anything is read or written
    os.makedirs("data", exist_ok=True)
    
    # Equipment is mandatory; recipes and batches only add optional context
    equipment_df = load_equipment_data()
    recipes_df = load_recipes_data()
    batches_df = load_batches_data()
    
    if equipment_df is not None:
        # History window: the seven days leading up to now
        end_time = datetime.now()
        start_time = end_time - timedelta(days=7)
        
        # Roughly 5 parameters per equipment, 100 time samples each
        sample_df = generate_process_parameters(
            equipment_df,
            recipes_df=recipes_df,
            batches_df=batches_df,
            num_parameters_per_equipment=5,
            samples_per_parameter=100,
            start_time=start_time,
            end_time=end_time,
            output_file="data/process_parameters.csv",
        )
        
        # Preview the first rows, then summarise the whole file
        if sample_df is not None:
            print("\nSample data (first 5 records):")
            print(sample_df.head())
            get_sample_statistics("data/process_parameters.csv")

Note: Recipes data file data/recipes.csv not found.
Process parameters will be generated without recipe context.
Note: Batches data file data/batches.csv not found.
Process parameters will be generated without batch context.
Creating synthetic batch IDs for process parameters...
Creating synthetic recipe IDs for process parameters...
Generating process parameters data...
Generated 50000 parameter records so far...
Successfully generated 68900 process parameter records.
Data saved to data/process_parameters.csv

Sample data (first 5 records):
       parameter_id parameter_name equipment_id            timestamp  \
0  PARAM-D33552D2-1          Speed  EQ-9A644928  2025-07-08 19:15:25   
1  PARAM-D33552D2-2          Speed  EQ-9A644928  2025-07-08 20:57:14   
2  PARAM-D33552D2-3          Speed  EQ-9A644928  2025-07-08 22:39:03   
3  PARAM-D33552D2-4          Speed  EQ-9A644928  2025-07-09 00:20:52   
4  PARAM-D33552D2-5          Speed  EQ-9A644928  2025-07-09 02:02:41   

   setpoint_value  

Recipes & Batch Steps

In [12]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data, or None when the file is
      missing or contains nothing to parse (a message is printed in both cases)
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
        print("Please run the equipment data generation script first.")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file is as unusable as a missing one; report it
        # instead of letting the pandas exception propagate.
        print(f"Error: Equipment data file {equipment_file} is empty.")
        print("Please run the equipment data generation script first.")
        return None
    
def load_products_data(products_file="data/products.csv"):
    """
    Load the previously generated products data if available
    
    Parameters:
    - products_file: CSV file containing products data
    
    Returns:
    - DataFrame containing the products data, or None when the file is
      missing or contains nothing to parse
    """
    try:
        return pd.read_csv(products_file)
    except FileNotFoundError:
        print(f"Note: Products data file {products_file} not found.")
        print("Recipes will be generated with synthetic product IDs.")
        return None
    except pd.errors.EmptyDataError:
        # Products are optional, so an empty file is treated like a missing one.
        print(f"Note: Products data file {products_file} is empty.")
        print("Recipes will be generated with synthetic product IDs.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data, or None when the file is
      missing or contains nothing to parse
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Recipes will be generated with synthetic personnel IDs.")
        return None
    except pd.errors.EmptyDataError:
        # Personnel data is optional, so an empty file is treated like a missing one.
        print(f"Note: Personnel data file {personnel_file} is empty.")
        print("Recipes will be generated with synthetic personnel IDs.")
        return None

def generate_recipes_data(equipment_df, products_df=None, personnel_df=None, num_recipes=50, output_file="data/recipes.csv"):
    """
    Generate synthetic data for the Recipes table from ISA-95 Level 2.
    
    Roughly 80% of the recipes are tied to a product (Batch, Continuous or
    Discrete); the rest are Cleaning, Testing or Validation recipes with an
    empty product_id. The result is written to output_file as CSV.
    
    Parameters:
    - equipment_df: DataFrame containing equipment data
    - products_df: DataFrame containing products data (optional)
    - personnel_df: DataFrame containing personnel data (optional)
    - num_recipes: Number of recipe records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated recipes data, or None when no
      equipment data is available
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None
    
    # Generate product IDs if not provided
    if products_df is None or len(products_df) == 0:
        print("Generating synthetic product IDs...")
        # Keep at least one ID: int(num_recipes * 0.7) is 0 for num_recipes == 1,
        # and random.choice() raises IndexError on an empty pool
        num_products = max(1, int(num_recipes * 0.7))
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_products)]
    else:
        product_ids = products_df['product_id'].unique().tolist()
    
    # Generate personnel IDs if not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic personnel IDs...")
        personnel_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        personnel_ids = personnel_df['personnel_id'].unique().tolist()
    
    # Filter equipment to get only production equipment (not instruments/sensors)
    production_equipment_types = [
        "Reactor", "Mixer", "Pump", "Compressor", "Heat Exchanger", "Distillation Column",
        "Filter", "Dryer", "Tank", "Furnace", "Conveyor", "Mill", "Centrifuge", "Evaporator",
        "Crystallizer", "Extruder", "Boiler", "Blender", "Separator", "Packaging Machine",
        "CNC Machine", "Robot", "Injection Molder", "Press", "Welder", "Assembly Station",
        "Filling Machine", "Labeling Machine", "Testing Station", "Sterilizer"
    ]
    
    # Get equipment classes for recipe requirements (type -> equipment IDs)
    equipment_classes = {}
    for _, equip in equipment_df.iterrows():
        eq_type = equip['equipment_type'] if 'equipment_type' in equip else "Unknown"
        if eq_type in production_equipment_types:
            equipment_classes.setdefault(eq_type, []).append(equip['equipment_id'])
    
    # Define recipe statuses
    recipe_statuses = ["Active", "In Review", "Approved", "Obsolete", "Draft", "Testing"]
    status_weights = [0.6, 0.1, 0.15, 0.05, 0.05, 0.05]  # Mostly active recipes
    approved_statuses = ("Approved", "Active", "Obsolete")
    
    # Preferred equipment classes per recipe type; any other type (Validation)
    # uses the default list
    classes_by_recipe_type = {
        "Batch": ["Reactor", "Mixer", "Tank", "Blender"],
        "Continuous": ["Extruder", "Distillation Column", "Reactor", "Heat Exchanger"],
        "Discrete": ["CNC Machine", "Robot", "Injection Molder", "Assembly Station"],
        "Cleaning": ["Reactor", "Tank", "Mixer", "Filter", "Heat Exchanger"],
        "Testing": ["Testing Station", "Reactor", "Mixer"],
    }
    default_classes = ["Reactor", "Tank", "Mixer", "Testing Station"]
    
    # Expected duration range in minutes per recipe type; Testing and
    # Validation use the default range
    duration_range_by_recipe_type = {
        "Batch": (60, 480),         # 1-8 hours
        "Continuous": (480, 4320),  # 8-72 hours
        "Discrete": (10, 120),      # 10 min - 2 hours
        "Cleaning": (30, 120),      # 30 min - 2 hours
    }
    default_duration_range = (60, 240)  # 1-4 hours
    
    # Generate recipe data
    data = {
        "recipe_id": [f"RECIPE-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_recipes)],
        "recipe_name": [],
        "product_id": [],
        "version": [],
        "status": [],
        "author": [],
        "creation_date": [],
        "approval_date": [],
        "approved_by": [],
        "recipe_type": [],
        "equipment_requirement": [],
        "expected_duration_minutes": [],
        "recipe_description": []
    }
    
    # Product-related recipe names
    product_prefixes = ["Production", "Manufacturing", "Processing", "Assembly", "Synthesis", "Formulation"]
    
    # Cleaning recipe names
    cleaning_prefixes = ["CIP", "Cleaning", "Sanitization", "Sterilization", "Flush", "Purge"]
    cleaning_suffixes = ["Procedure", "Protocol", "Sequence", "Cycle"]
    
    # Testing recipe names
    testing_prefixes = ["Test", "Validation", "Verification", "Qualification", "Calibration"]
    testing_suffixes = ["Protocol", "Procedure", "Method", "Sequence"]
    
    # Recipe description templates
    description_templates = [
        "Standard {type} recipe for {product} production using {equipment} equipment.",
        "{type} recipe designed for optimal {product} quality with defined process parameters.",
        "Validated {type} recipe for consistent {product} manufacturing in {equipment} environment.",
        "Optimized {type} process for {product} with reduced cycle time and improved yield.",
        "Regulatory approved {type} recipe for {product} meeting all quality requirements.",
        "{type} recipe with enhanced process control for premium {product} production."
    ]
    
    # Generate data for each recipe
    for _ in range(num_recipes):
        # Assign a product ID (some recipes may be for cleaning, testing, etc. without a product)
        if random.random() < 0.8:  # 80% of recipes are for products
            product_id = random.choice(product_ids)
            
            # For product recipes, create product-based names
            prefix = random.choice(product_prefixes)
            # Use the ID suffix as a short product code; str() keeps this
            # working when a products file stores numeric IDs
            product_code = str(product_id).split('-')[-1][:4]
            recipe_name = f"{prefix} Recipe {product_code}-{random.randint(100, 999)}"
            
            # Recipe type (mostly batch for product recipes)
            if random.random() < 0.7:
                recipe_type = "Batch"
            else:
                recipe_type = random.choice(["Continuous", "Discrete"])
        else:
            # Non-product recipes (cleaning, testing, etc.)
            product_id = ""
            
            # Determine if it's a cleaning or testing recipe
            if random.random() < 0.5:
                # Cleaning recipe
                prefix = random.choice(cleaning_prefixes)
                suffix = random.choice(cleaning_suffixes)
                recipe_name = f"{prefix} {suffix} {random.randint(100, 999)}"
                recipe_type = random.choice(["Cleaning", "Validation"])
            else:
                # Testing recipe
                prefix = random.choice(testing_prefixes)
                suffix = random.choice(testing_suffixes)
                recipe_name = f"{prefix} {suffix} {random.randint(100, 999)}"
                recipe_type = random.choice(["Testing", "Validation"])
        
        data["product_id"].append(product_id)
        data["recipe_name"].append(recipe_name)
        data["recipe_type"].append(recipe_type)
        
        # Version numbering (major.minor)
        major_version = random.randint(1, 3)
        minor_version = random.randint(0, 9)
        data["version"].append(f"{major_version}.{minor_version}")
        
        # Status (weighted random)
        status = random.choices(recipe_statuses, weights=status_weights)[0]
        data["status"].append(status)
        
        # Author (personnel who created the recipe)
        author = random.choice(personnel_ids)
        data["author"].append(author)
        
        # Creation date (1-18 months ago)
        creation_days_ago = random.randint(30, 540)
        creation_date = datetime.now() - timedelta(days=creation_days_ago)
        data["creation_date"].append(creation_date.strftime("%Y-%m-%d"))
        
        # Approval date and approver
        if status in approved_statuses:
            # Approval falls within the last 30 days and always after creation
            approval_days_ago = random.randint(0, min(creation_days_ago - 1, 30))
            approval_date = datetime.now() - timedelta(days=approval_days_ago)
            data["approval_date"].append(approval_date.strftime("%Y-%m-%d"))
            
            # Approver is a different person than the author whenever possible.
            # With a single-person pool nobody else exists, so fall back to the
            # full pool rather than crash on an empty choice.
            other_personnel = [p for p in personnel_ids if p != author]
            data["approved_by"].append(random.choice(other_personnel or personnel_ids))
        else:
            # Unapproved recipes
            data["approval_date"].append("")
            data["approved_by"].append("")
        
        # Equipment requirement: prefer a class suited to the recipe type that
        # exists in the equipment data, then any available class, then "Any"
        possible_classes = classes_by_recipe_type.get(recipe_type, default_classes)
        candidate_classes = [c for c in possible_classes if c in equipment_classes]
        if not candidate_classes:
            candidate_classes = list(equipment_classes)
        equipment_requirement = random.choice(candidate_classes) if candidate_classes else "Any"
        data["equipment_requirement"].append(equipment_requirement)
        
        # Expected duration
        min_minutes, max_minutes = duration_range_by_recipe_type.get(recipe_type, default_duration_range)
        data["expected_duration_minutes"].append(random.randint(min_minutes, max_minutes))
        
        # Generate recipe description
        template = random.choice(description_templates)
        product_desc = "product" if product_id == "" else f"product {str(product_id).split('-')[-1]}"
        description = template.format(
            type=recipe_type.lower(),
            product=product_desc,
            equipment=equipment_requirement.lower()
        )
        data["recipe_description"].append(description)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} recipe records.")
    print(f"Data saved to {output_file}")
    
    return df

def generate_batch_steps(recipes_df, num_steps_per_recipe=10, output_file="data/batch_steps.csv"):
    """
    Build a synthetic batch steps table from the recipes table and save it as CSV.
    
    Every selected recipe gets a chain of steps that opens with "Setup" and
    closes with "Cleanup"; each step records the step before and after it.
    
    Parameters:
    - recipes_df: DataFrame containing recipes data
    - num_steps_per_recipe: Average number of steps per recipe
    - output_file: CSV file to save the batch steps data
    
    Returns:
    - DataFrame containing the generated batch steps data (None if no recipes)
    """
    if recipes_df is None or len(recipes_df) == 0:
        print("Error: No recipes data available.")
        return None
    
    # Only Batch, Discrete, Testing and Cleaning recipes receive steps; when
    # none of those exist, every recipe is used instead
    stepped_recipe_types = ['Batch', 'Discrete', 'Testing', 'Cleaning']
    batch_recipes = recipes_df[recipes_df['recipe_type'].isin(stepped_recipe_types)]
    if len(batch_recipes) == 0:
        print("Warning: No batch recipes found. Generating steps for all recipes instead.")
        batch_recipes = recipes_df
    
    # Candidate step names for each recipe type
    step_types = {
        "Batch": [
            "Charging", "Mixing", "Heating", "Cooling", "Reaction", "Holding", 
            "Sampling", "pH Adjustment", "Transfer", "Filtration", "Discharge"
        ],
        "Discrete": [
            "Loading", "Processing", "Assembly", "Testing", "Inspection", 
            "Machining", "Welding", "Packaging", "Labeling", "Unloading"
        ],
        "Testing": [
            "Sample Preparation", "Measurement", "Analysis", "Verification", 
            "Calibration", "Data Collection", "Reporting", "Cleanup"
        ],
        "Cleaning": [
            "Pre-rinse", "Detergent Wash", "Rinse", "Sanitization", 
            "Final Rinse", "Drying", "Inspection", "Documentation"
        ],
        "Continuous": [
            "Startup", "Steady State", "Parameter Adjustment", "Sampling", 
            "Monitoring", "Shutdown", "Transition"
        ],
        "Validation": [
            "Setup", "Execution", "Data Collection", "Analysis", 
            "Verification", "Documentation", "Approval"
        ]
    }
    
    # Shared steps; they only widen the cap on how many steps a recipe may have
    common_steps = ["Setup", "Documentation", "Cleanup"]
    
    # Step name -> step_type classification; unlisted names are "Standard"
    names_by_category = {
        "Utility": ["Setup", "Cleanup", "Documentation"],
        "Material Handling": ["Charging", "Loading", "Transfer", "Discharge", "Unloading"],
        "Process": ["Heating", "Cooling", "Reaction", "Holding"],
        "Quality": ["Sampling", "Testing", "Inspection", "Analysis", "Measurement"],
        "Operation": ["Mixing", "Assembly", "Machining", "Welding"],
        "Cleaning": ["Pre-rinse", "Detergent Wash", "Rinse", "Sanitization", "Final Rinse", "Drying"],
    }
    category_by_name = {
        name: category
        for category, names in names_by_category.items()
        for name in names
    }
    
    columns = [
        "step_id", "recipe_id", "step_name", "step_number", "description",
        "expected_duration_minutes", "step_type", "equipment_requirement",
        "predecessor_steps", "successor_steps"
    ]
    rows = []
    
    for _, recipe in batch_recipes.iterrows():
        recipe_id = recipe['recipe_id']
        recipe_type = recipe['recipe_type']
        recipe_duration = recipe['expected_duration_minutes']
        
        # Unknown recipe types borrow the batch step vocabulary
        possible_steps = step_types.get(recipe_type, step_types["Batch"])
        
        # Draw the step count around the requested average, keep it at 3 or
        # more, and cap it at the number of distinct step names available
        drawn_count = int(random.normalvariate(num_steps_per_recipe, num_steps_per_recipe/4))
        num_steps = min(max(3, drawn_count), len(possible_steps) + len(common_steps))
        
        if num_steps <= 3:
            # Minimal recipe: setup, one main operation, cleanup
            middle_steps = [random.choice(possible_steps)]
        else:
            # Sample distinct names first, then top up with repeats if short
            wanted = num_steps - 2
            middle_steps = random.sample(possible_steps, min(wanted, len(possible_steps)))
            shortfall = wanted - len(middle_steps)
            if shortfall > 0:
                middle_steps.extend(random.choices(possible_steps, k=shortfall))
        selected_steps = ["Setup", *middle_steps, "Cleanup"]
        
        # Setup and cleanup each take 5% of the recipe time (min 5 minutes);
        # the remainder is split over the operational steps by random weights
        setup_duration = max(5, int(recipe_duration * 0.05))
        cleanup_duration = max(5, int(recipe_duration * 0.05))
        remaining_duration = recipe_duration - (setup_duration + cleanup_duration)
        
        operational_count = len(selected_steps) - 2
        if operational_count > 0:
            raw_weights = [random.random() for _ in range(operational_count)]
            weight_sum = sum(raw_weights)
            op_durations = [
                max(1, int(remaining_duration * (weight / weight_sum)))
                for weight in raw_weights
            ]
        else:
            op_durations = []
        step_durations = [setup_duration, *op_durations, cleanup_duration]
        
        # All IDs are created up front so neighbours can be linked directly
        step_ids = [f"STEP-{uuid.uuid4().hex[:8].upper()}" for _ in selected_steps]
        last_index = len(selected_steps) - 1
        
        # Steps inherit the equipment requirement of their recipe
        equipment_requirement = recipe['equipment_requirement']
        
        type_label = recipe_type.lower()
        fixed_descriptions = {
            "Setup": f"Prepare equipment and materials for {type_label} process.",
            "Cleanup": f"Clean equipment and dispose of waste materials after {type_label} completion.",
            "Documentation": f"Record process parameters and batch information for {type_label} record.",
        }
        
        for index, (step_name, duration) in enumerate(zip(selected_steps, step_durations)):
            description = fixed_descriptions.get(step_name)
            if description is None:
                lowered_name = step_name.lower()
                description = random.choice([
                    f"Execute {lowered_name} operation according to standard procedure.",
                    f"Perform {lowered_name} step with specified parameters.",
                    f"Complete {lowered_name} phase of the {type_label} process.",
                    f"{step_name} operation with quality verification.",
                    f"Standard {lowered_name} procedure for {type_label} recipe."
                ])
            
            # Interior steps carry their position in the name; the first and
            # last steps keep the plain name
            if 0 < index < last_index:
                detailed_step_name = f"{step_name} {index+1}"
            else:
                detailed_step_name = step_name
            
            rows.append({
                "step_id": step_ids[index],
                "recipe_id": recipe_id,
                "step_name": detailed_step_name,
                "step_number": index + 1,
                "description": description,
                "expected_duration_minutes": duration,
                "step_type": category_by_name.get(step_name, "Standard"),
                "equipment_requirement": equipment_requirement,
                # Blank for the first step / last step respectively
                "predecessor_steps": step_ids[index - 1] if index > 0 else "",
                "successor_steps": step_ids[index + 1] if index < last_index else "",
            })
    
    df = pd.DataFrame(rows, columns=columns)
    
    # Create the target folder if the path has one, then write the CSV
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} batch step records for {len(batch_recipes)} recipes.")
    print(f"Data saved to {output_file}")
    
    return df

def _print_distribution(title, counts, total):
    """Print a value_counts() result as an indented list of counts and percentages."""
    print(f"\n{title}")
    for label, count in counts.items():
        print(f"  {label}: {count} ({count/total*100:.1f}%)")


def display_statistics(recipes_df, batch_steps_df=None):
    """
    Display basic statistics about the generated recipes and batch steps data
    
    Parameters:
    - recipes_df: DataFrame containing recipes data
    - batch_steps_df: DataFrame containing batch steps data (optional)
    
    Returns:
    - None; all statistics are printed
    """
    if recipes_df is None or len(recipes_df) == 0:
        print("No recipes data to analyze.")
        return
    
    total_recipes = len(recipes_df)
    
    print("\nRecipes Statistics:")
    print(f"Total recipes: {total_recipes}")
    
    _print_distribution("Recipe Type Distribution:", recipes_df['recipe_type'].value_counts(), total_recipes)
    _print_distribution("Status Distribution:", recipes_df['status'].value_counts(), total_recipes)
    # Only the ten most common equipment requirements are listed
    _print_distribution(
        "Equipment Requirement Distribution:",
        recipes_df['equipment_requirement'].value_counts().head(10),
        total_recipes
    )
    
    # Product association: a recipe has no product when product_id is "" or
    # missing. The NaN check matters for data read back from CSV, where empty
    # cells come back as NaN and would pass a plain != "" comparison.
    product_ids = recipes_df['product_id']
    product_count = (product_ids.notna() & (product_ids != "")).sum()
    print(f"\nRecipes with product association: {product_count} ({product_count/total_recipes*100:.1f}%)")
    
    # Duration statistics
    durations = recipes_df['expected_duration_minutes'].astype(float)
    print(f"\nDuration Statistics:")
    print(f"  Average duration: {durations.mean():.1f} minutes")
    print(f"  Minimum duration: {durations.min()} minutes")
    print(f"  Maximum duration: {durations.max()} minutes")
    
    # Batch steps statistics
    if batch_steps_df is not None and len(batch_steps_df) > 0:
        print("\nBatch Steps Statistics:")
        print(f"Total batch steps: {len(batch_steps_df)}")
        
        # Steps per recipe
        steps_per_recipe = batch_steps_df.groupby('recipe_id').size()
        print(f"Average steps per recipe: {steps_per_recipe.mean():.1f}")
        print(f"Minimum steps per recipe: {steps_per_recipe.min()}")
        print(f"Maximum steps per recipe: {steps_per_recipe.max()}")
        
        _print_distribution("Step Type Distribution:", batch_steps_df['step_type'].value_counts(), len(batch_steps_df))
        
        # Step duration statistics
        step_durations = batch_steps_df['expected_duration_minutes'].astype(float)
        print(f"\nStep Duration Statistics:")
        print(f"  Average step duration: {step_durations.mean():.1f} minutes")
        print(f"  Minimum step duration: {step_durations.min()} minutes")
        print(f"  Maximum step duration: {step_durations.max()} minutes")

if __name__ == "__main__":
    # The output folder must exist before any CSV is written
    os.makedirs("data", exist_ok=True)
    
    # Equipment is mandatory; products and personnel are optional inputs
    equipment_df = load_equipment_data()
    products_df = load_products_data()
    personnel_df = load_personnel_data()
    
    if equipment_df is not None:
        # Recipes table (50 records)
        recipes_df = generate_recipes_data(
            equipment_df,
            products_df,
            personnel_df,
            num_recipes=50,
            output_file="data/recipes.csv"
        )
        
        if recipes_df is None:
            batch_steps_df = None
        else:
            # Batch steps table (about 10 steps per recipe on average)
            batch_steps_df = generate_batch_steps(
                recipes_df,
                num_steps_per_recipe=10,
                output_file="data/batch_steps.csv"
            )
            
            # Preview the generated tables, then summarize them
            print("\nSample recipes data (first 5 records):")
            print(recipes_df.head())
            
            if batch_steps_df is not None:
                print("\nSample batch steps data (first 5 records):")
                print(batch_steps_df.head())
            
            display_statistics(recipes_df, batch_steps_df)

Note: Products data file data/products.csv not found.
Recipes will be generated with synthetic product IDs.
Note: Personnel data file data/personnel.csv not found.
Recipes will be generated with synthetic personnel IDs.
Generating synthetic product IDs...
Generating synthetic personnel IDs...
Successfully generated 50 recipe records.
Data saved to data/recipes.csv
Successfully generated 366 batch step records for 37 recipes.
Data saved to data/batch_steps.csv

Sample recipes data (first 5 records):
         recipe_id                  recipe_name     product_id version  \
0  RECIPE-5FEEC092  Sterilization Procedure 992                    3.2   
1  RECIPE-DE9EAF5C  Formulation Recipe D406-470  PROD-D40652D0     3.6   
2  RECIPE-45C2D7EF  Formulation Recipe BCB4-619  PROD-BCB4AE01     2.3   
3  RECIPE-70CF4DCE   Processing Recipe D9CE-342  PROD-D9CEBCB2     3.1   
4  RECIPE-7303DA66    Synthesis Recipe 2BC2-577  PROD-2BC251ED     3.1   

     status         author creation_date approval_d

Batches & Step Execution

In [13]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv 

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data, or None if the file is
      missing or has no parsable content
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
        print("Please run the equipment data generation script first.")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file is as unusable as a missing one; report it and
        # return None so callers reach their existing "no data" guard
        print(f"Error: Equipment data file {equipment_file} is empty.")
        print("Please run the equipment data generation script first.")
        return None

def load_recipes_data(recipes_file="data/recipes.csv"):
    """
    Load the previously generated recipes data
    
    Parameters:
    - recipes_file: CSV file containing recipes data
    
    Returns:
    - DataFrame containing the recipes data, or None if the file is
      missing or has no parsable content
    """
    try:
        return pd.read_csv(recipes_file)
    except FileNotFoundError:
        print(f"Error: Recipes data file {recipes_file} not found.")
        print("Please run the recipes data generation script first.")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file is as unusable as a missing one; report it and
        # return None so callers reach their existing "no data" guard
        print(f"Error: Recipes data file {recipes_file} is empty.")
        print("Please run the recipes data generation script first.")
        return None

def load_batch_steps_data(batch_steps_file="data/batch_steps.csv"):
    """
    Load the previously generated batch steps data
    
    Parameters:
    - batch_steps_file: CSV file containing batch steps data
    
    Returns:
    - DataFrame containing the batch steps data, or None if the file is
      missing or has no parsable content
    """
    try:
        return pd.read_csv(batch_steps_file)
    except FileNotFoundError:
        print(f"Error: Batch steps data file {batch_steps_file} not found.")
        print("Please run the batch steps data generation script first.")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file is as unusable as a missing one; report it and
        # return None so callers reach their existing "no data" guard
        print(f"Error: Batch steps data file {batch_steps_file} is empty.")
        print("Please run the batch steps data generation script first.")
        return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data or None if not available
      (file missing or empty)
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
        print("Batches will be generated with synthetic personnel IDs.")
        return None
    except pd.errors.EmptyDataError:
        # Personnel are optional, so an empty file takes the same synthetic-ID
        # fallback as a missing file instead of aborting the run
        print(f"Note: Personnel data file {personnel_file} is empty.")
        print("Batches will be generated with synthetic personnel IDs.")
        return None

def generate_batches(recipes_df, equipment_df, personnel_df=None, num_batches=100, 
                    start_time=None, end_time=None, output_file="data/batches.csv"):
    """
    Generate synthetic batch data based on the recipes table
    
    Every batch is assigned a recipe, an optional work order, a product, a
    size with unit, planned/actual timestamps consistent with its status, a
    piece of equipment, an operator and (occasionally) a parent batch. The
    result is written to output_file as CSV and returned.
    
    Parameters:
    - recipes_df: DataFrame containing recipes data
    - equipment_df: DataFrame containing equipment data
    - personnel_df: DataFrame containing personnel data (optional)
    - num_batches: Number of batch records to generate
    - start_time: Start time for batch history (defaults to 30 days ago)
    - end_time: End time for batch history (defaults to now)
    - output_file: CSV file to save the batches data
    
    Returns:
    - DataFrame containing the generated batches data, or None when the
      recipes/equipment data is missing or empty, or when end_time lies
      before start_time
    """
    if recipes_df is None or len(recipes_df) == 0:
        print("Error: No recipes data available.")
        return None
        
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None
    
    # Set default time range if not provided
    now = datetime.now()
    if start_time is None:
        start_time = now - timedelta(days=30)
    if end_time is None:
        end_time = now
    
    # The window is the same for every batch, so compute it once. A negative
    # window would otherwise surface as an obscure randint() error.
    time_range_minutes = int((end_time - start_time).total_seconds() / 60)
    if time_range_minutes < 0:
        print("Error: end_time must not be earlier than start_time.")
        return None
    
    time_format = "%Y-%m-%d %H:%M:%S"
    
    # Filter recipes to include only batch and discrete recipes
    batch_recipes = recipes_df[recipes_df['recipe_type'].isin(['Batch', 'Discrete'])]
    
    if len(batch_recipes) == 0:
        print("Warning: No batch recipes found. Using all available recipes.")
        batch_recipes = recipes_df
    
    # Active recipes are preferred when picking a recipe; the filter does not
    # depend on the batch, so it is evaluated once up front.
    active_recipes = batch_recipes[batch_recipes['status'] == 'Active']
    
    # Generate work order IDs (to be used as foreign keys). Always create at
    # least one so that very small runs (num_batches < 2) still have a work
    # order to pick from.
    num_work_orders = max(1, int(num_batches * 0.7))
    work_order_ids = [f"WO-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_work_orders)]
    
    # Collect product IDs from the recipes; used for recipes that lack one
    recipe_products = batch_recipes['product_id']
    product_ids = recipe_products[recipe_products.notna() & (recipe_products != "")].tolist()
    
    if not product_ids:
        print("Warning: No product IDs found in recipes. Generating synthetic product IDs.")
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    
    # Generate personnel IDs if not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic personnel IDs...")
        operator_ids = [f"OP-{uuid.uuid4().hex[:8].upper()}" for _ in range(15)]
    else:
        operator_ids = personnel_df['personnel_id'].unique().tolist()
    
    # Define batch statuses and their probabilities
    batch_statuses = [
        "Planned", "In Progress", "Completed", "Aborted", "On Hold", "Rejected"
    ]
    status_weights = [0.15, 0.25, 0.45, 0.05, 0.05, 0.05]  # Higher weights for active and completed
    
    # Index equipment IDs by equipment type
    equipment_by_type = {}
    for eq_type, eq_id in zip(equipment_df['equipment_type'], equipment_df['equipment_id']):
        equipment_by_type.setdefault(eq_type, []).append(eq_id)
    equipment_types = list(equipment_by_type.keys())
    
    batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_batches)]
    
    # 20% of the batches may act as parents. Children are only ever drawn from
    # the remaining batches, so the hierarchy is one level deep and cannot
    # contain cycles.
    potential_parents = random.sample(batch_ids, int(num_batches * 0.2))
    parent_pool = set(potential_parents)
    
    # Units of measurement (picked at random per batch)
    units = ["kg", "L", "units", "gal", "m³", "tons"]
    
    columns = [
        "batch_id", "recipe_id", "work_order_id", "product_id", "batch_size",
        "batch_size_unit", "planned_start_time", "actual_start_time",
        "planned_end_time", "actual_end_time", "batch_status", "equipment_id",
        "operator_id", "parent_batch_id"
    ]
    records = []
    
    # Generate data for each batch
    for batch_id in batch_ids:
        # Select recipe (80% chance of using an active recipe when one exists)
        if len(active_recipes) > 0 and random.random() < 0.8:
            recipe = active_recipes.sample(1).iloc[0]
        else:
            recipe = batch_recipes.sample(1).iloc[0]
        
        # Assign work order (10% of batches are not tied to a work order)
        work_order_id = random.choice(work_order_ids) if random.random() < 0.9 else ""
        
        # Assign product (use product from recipe if available)
        if pd.notna(recipe['product_id']) and recipe['product_id'] != "":
            product_id = recipe['product_id']
        else:
            product_id = random.choice(product_ids)
        
        # Generate batch size (random but realistic)
        if recipe['recipe_type'] == 'Batch':
            # Nominal size with +/- 20% variation
            batch_size = random.choice([50, 100, 200, 500, 1000, 2000, 5000]) * random.uniform(0.8, 1.2)
            batch_size = round(batch_size, 1)
        elif recipe['recipe_type'] == 'Discrete':
            # Discrete manufacturing typically produces in lot sizes
            batch_size = random.choice([10, 25, 50, 100, 250, 500, 1000])
        else:
            # Default size range for other recipe types
            batch_size = random.randint(50, 5000)
        
        unit = random.choice(units)
        
        # Planned start is a random minute within the requested window
        planned_start_time = start_time + timedelta(minutes=random.randint(0, time_range_minutes))
        
        # Get expected duration from recipe
        if pd.notna(recipe['expected_duration_minutes']):
            expected_duration = float(recipe['expected_duration_minutes'])
        else:
            expected_duration = random.randint(60, 480)  # Default 1-8 hours
        
        planned_duration = expected_duration * random.uniform(0.9, 1.1)  # +/- 10%
        planned_end_time = planned_start_time + timedelta(minutes=planned_duration)
        
        # Determine batch status (weighted random)
        batch_status = random.choices(batch_statuses, weights=status_weights)[0]
        
        # Set actual times based on status
        actual_start = ""
        actual_end = ""
        if batch_status in ("In Progress", "On Hold"):
            # Started (within +/- 1 hour of plan) but not finished
            actual_start_time = planned_start_time + timedelta(minutes=random.randint(-60, 60))
            
            # Ensure actual start time is not in the future
            if actual_start_time > end_time:
                actual_start_time = end_time - timedelta(minutes=random.randint(0, 60))
            
            actual_start = actual_start_time.strftime(time_format)
        elif batch_status != "Planned":
            # Completed, Aborted or Rejected: both started and finished
            actual_start_time = planned_start_time + timedelta(minutes=random.randint(-60, 60))
            
            if batch_status == "Completed":
                actual_duration = planned_duration * random.uniform(0.9, 1.2)
            else:
                # Aborted/rejected batches often finish early (20-80% of plan)
                actual_duration = planned_duration * random.uniform(0.2, 0.8)
            
            actual_end_time = actual_start_time + timedelta(minutes=actual_duration)
            
            # Ensure times are within overall time range
            if actual_start_time > end_time:
                actual_start_time = end_time - timedelta(minutes=int(actual_duration) + 1)
            if actual_end_time > end_time:
                actual_end_time = end_time
            
            actual_start = actual_start_time.strftime(time_format)
            actual_end = actual_end_time.strftime(time_format)
        
        # Assign equipment based on recipe requirement, falling back to any type
        equipment_requirement = recipe['equipment_requirement']
        if equipment_requirement in equipment_by_type:
            equipment_id = random.choice(equipment_by_type[equipment_requirement])
        elif equipment_types:
            equipment_id = random.choice(equipment_by_type[random.choice(equipment_types)])
        else:
            equipment_id = ""
        
        operator_id = random.choice(operator_ids)
        
        # About 10% of the non-parent batches get a parent
        parent_id = ""
        if batch_id not in parent_pool and random.random() < 0.1 and potential_parents:
            parent_id = random.choice(potential_parents)
        
        records.append({
            "batch_id": batch_id,
            "recipe_id": recipe['recipe_id'],
            "work_order_id": work_order_id,
            "product_id": product_id,
            "batch_size": batch_size,
            "batch_size_unit": unit,
            "planned_start_time": planned_start_time.strftime(time_format),
            "actual_start_time": actual_start,
            "planned_end_time": planned_end_time.strftime(time_format),
            "actual_end_time": actual_end,
            "batch_status": batch_status,
            "equipment_id": equipment_id,
            "operator_id": operator_id,
            "parent_batch_id": parent_id
        })
    
    # Create DataFrame (explicit columns keep the schema for empty runs too)
    df = pd.DataFrame(records, columns=columns)
    
    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    print(f"Successfully generated {len(df)} batch records.")
    print(f"Data saved to {output_file}")
    
    return df

def _generate_step_parameters(step_type):
    """Return a JSON string of random process parameters for one executed step."""
    import json
    
    # Parameter names recorded for each known step type
    names_by_type = {
        "Material Handling": ["Material ID", "Quantity", "Container ID", "Verification"],
        "Process": ["Temperature", "Pressure", "Time", "Agitation", "pH"],
        "Quality": ["Sample ID", "Result", "Specification", "Deviation"],
        "Operation": ["Speed", "Duration", "Power", "Mode"],
        "Cleaning": ["Agent", "Concentration", "Temperature", "Time"],
    }
    param_names = names_by_type.get(step_type, ["Parameter1", "Parameter2", "Parameter3"])
    
    # The value format is chosen by keyword in the parameter name; the order of
    # the checks matters because the first match wins.
    params = {}
    for param in param_names:
        if "Temperature" in param:
            params[param] = f"{random.uniform(20, 100):.1f} °C"
        elif "Pressure" in param:
            params[param] = f"{random.uniform(1, 10):.2f} bar"
        elif "Time" in param:
            params[param] = f"{random.randint(5, 120)} min"
        elif "Quantity" in param:
            params[param] = f"{random.uniform(10, 1000):.1f} kg"
        elif "Speed" in param:
            params[param] = f"{random.randint(50, 1000)} rpm"
        elif "Concentration" in param:
            params[param] = f"{random.uniform(1, 5):.2f} %"
        elif "pH" in param:
            params[param] = f"{random.uniform(2, 12):.1f}"
        elif "ID" in param:
            params[param] = f"{param[0]}-{random.randint(1000, 9999)}"
        elif "Result" in param or "Verification" in param:
            params[param] = random.choice(["Pass", "Within Spec", "Acceptable"])
        elif "Mode" in param:
            params[param] = random.choice(["Auto", "Manual", "Semi-Auto"])
        elif "Agent" in param:
            params[param] = random.choice(["CIP-100", "Caustic", "Acid", "Water"])
        else:
            params[param] = f"{random.uniform(0, 100):.2f}"
    
    return json.dumps(params)


def generate_batch_execution(batches_df, batch_steps_df, equipment_df, personnel_df=None, 
                           output_file="data/batch_execution.csv"):
    """
    Generate synthetic batch step execution data based on the batches and batch steps tables
    
    For every batch, one execution record is written per step of the batch's
    recipe (ordered by step_number). Step statuses follow the batch status:
    - Planned: all steps Pending
    - In Progress: steps before the current step Completed, exactly one step
      In Progress, the rest Pending
    - Completed: all steps Completed
    - Aborted: Completed steps, then exactly one Aborted step, then Skipped
    - On Hold: Completed steps, then exactly one Paused step, then Pending
    - anything else (e.g. Rejected): all steps executed, some with issues
    Finished steps never end after the batch's actual end time. Records are
    streamed to output_file so the full table is never held in memory.
    
    Parameters:
    - batches_df: DataFrame containing batches data
    - batch_steps_df: DataFrame containing batch steps data
    - equipment_df: DataFrame containing equipment data
    - personnel_df: DataFrame containing personnel data (optional)
    - output_file: CSV file to save the batch execution data
    
    Returns:
    - DataFrame containing a sample of the generated batch execution data
      (first 1000 rows), or None when a required input is missing or empty
    """
    if batches_df is None or len(batches_df) == 0:
        print("Error: No batches data available.")
        return None
        
    if batch_steps_df is None or len(batch_steps_df) == 0:
        print("Error: No batch steps data available.")
        return None
        
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None
    
    # Generate personnel IDs if not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic personnel IDs...")
        operator_ids = [f"OP-{uuid.uuid4().hex[:8].upper()}" for _ in range(15)]
    else:
        operator_ids = personnel_df['personnel_id'].unique().tolist()
    
    time_format = "%Y-%m-%d %H:%M:%S"
    
    deviation_reasons = [
        "Process variation", "Equipment issue", "Material quality", "Operator intervention",
        "Quality check", "Parameter adjustment", "Waiting for upstream process",
        "Environmental factors", "Power fluctuation", "Scheduled break"
    ]
    
    def parse_time(value):
        # Both NaN (CSV round trip) and "" (in-memory frame) mean "no timestamp"
        if pd.notna(value) and value:
            return datetime.strptime(value, time_format)
        return None
    
    def format_time(value):
        return value.strftime(time_format) if value is not None else ""
    
    # Group batch steps by recipe and order them once, not once per batch
    recipe_steps = {}
    for _, step in batch_steps_df.iterrows():
        recipe_steps.setdefault(step['recipe_id'], []).append(step)
    for ordered_steps in recipe_steps.values():
        ordered_steps.sort(key=lambda s: s['step_number'])
    
    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    execution_count = 0
    
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = [
            'execution_id', 'batch_id', 'step_id', 'equipment_id', 
            'start_time', 'end_time', 'status', 'operator_id',
            'actual_duration_minutes', 'deviation_reason', 'step_parameters'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        print(f"Generating batch step execution data...")
        
        # Process each batch
        for _, batch in batches_df.iterrows():
            batch_id = batch['batch_id']
            batch_status = batch['batch_status']
            
            # Skip if no steps for this recipe
            steps = recipe_steps.get(batch['recipe_id'])
            if not steps:
                continue
            
            # Get batch timing
            planned_start_time = parse_time(batch['planned_start_time'])
            if planned_start_time is None:
                planned_start_time = datetime.now() - timedelta(days=random.randint(1, 30))
            batch_start_time = parse_time(batch['actual_start_time'])
            batch_end_time = parse_time(batch['actual_end_time'])
            
            # Get the equipment assigned to this batch
            batch_equipment_id = batch['equipment_id'] if pd.notna(batch['equipment_id']) else None
            
            # Steps run back to back starting at the batch's actual (or planned) start
            current_time = batch_start_time if batch_start_time else planned_start_time
            
            # Index of the step at which the batch currently stands / stopped.
            # It is drawn ONCE per batch so that all steps of the batch agree
            # on where the abort/hold/progress point is.
            step_count = len(steps)
            if batch_status == "In Progress":
                pivot = step_count // 3
            elif batch_status == "Aborted":
                pivot = random.randint(0, step_count - 1)
            elif batch_status == "On Hold":
                # Prefer at least one completed step; a single-step recipe can
                # only be paused on its first step.
                pivot = random.randint(1, step_count - 1) if step_count > 1 else 0
            else:
                pivot = None
            
            for i, step in enumerate(steps):
                step_id = step['step_id']
                expected_duration = step['expected_duration_minutes']
                
                # Create a unique execution ID
                execution_id = f"EXEC-{uuid.uuid4().hex[:12].upper()}"
                
                # 70% chance of same operator as batch, otherwise a random one
                if random.random() < 0.7:
                    operator_id = batch['operator_id']
                else:
                    operator_id = random.choice(operator_ids)
                
                # Determine equipment (use batch equipment if possible)
                if batch_equipment_id:
                    equipment_id = batch_equipment_id
                else:
                    # Try equipment of the required type, else any equipment
                    matching_equipment = equipment_df[equipment_df['equipment_type'] == step['equipment_requirement']]
                    pool = matching_equipment if len(matching_equipment) > 0 else equipment_df
                    equipment_id = pool.sample(1).iloc[0]['equipment_id']
                
                # Defaults describe a step that has not run
                step_start = None
                step_end = None
                actual_duration = 0
                
                # Determine step execution status based on batch status
                if batch_status == "Planned":
                    status = "Pending"
                elif batch_status == "In Progress":
                    if i < pivot:
                        status = "Completed"
                        actual_duration = expected_duration * random.uniform(0.8, 1.2)  # +/- 20%
                        step_start = current_time
                        step_end = current_time + timedelta(minutes=actual_duration)
                    elif i == pivot:
                        status = "In Progress"
                        step_start = current_time
                        # Running time so far; never negative even if the
                        # step start lies ahead of the wall clock
                        actual_duration = max(0.0, (datetime.now() - current_time).total_seconds() / 60)
                    else:
                        status = "Pending"
                elif batch_status == "Completed":
                    status = "Completed"
                    actual_duration = expected_duration * random.uniform(0.8, 1.2)  # +/- 20%
                    step_start = current_time
                    step_end = current_time + timedelta(minutes=actual_duration)
                elif batch_status == "Aborted":
                    if i < pivot:
                        status = "Completed"
                        actual_duration = expected_duration * random.uniform(0.8, 1.2)
                    elif i == pivot:
                        status = "Aborted"
                        actual_duration = expected_duration * random.uniform(0.1, 0.5)  # Aborted early
                    else:
                        status = "Skipped"
                    
                    if i <= pivot:
                        step_start = current_time
                        step_end = current_time + timedelta(minutes=actual_duration)
                elif batch_status == "On Hold":
                    if i < pivot:
                        status = "Completed"
                        actual_duration = expected_duration * random.uniform(0.8, 1.2)
                        step_start = current_time
                        step_end = current_time + timedelta(minutes=actual_duration)
                    elif i == pivot:
                        status = "Paused"
                        actual_duration = expected_duration * random.uniform(0.1, 0.8)  # Partially complete
                        step_start = current_time
                    else:
                        status = "Pending"
                else:  # Rejected or other statuses
                    # Similar to completed but some steps might have issues
                    if random.random() < 0.8:  # 80% chance of normal completion
                        status = "Completed"
                    else:
                        status = random.choice(["Completed with Issues", "Reworked", "Verified"])
                    
                    actual_duration = expected_duration * random.uniform(0.8, 1.2)
                    step_start = current_time
                    step_end = current_time + timedelta(minutes=actual_duration)
                
                if step_end is not None:
                    # A finished step may not run past the batch's actual end.
                    # Clamp both ends so the duration can never go negative.
                    if batch_end_time and step_end > batch_end_time:
                        step_start = min(step_start, batch_end_time)
                        step_end = batch_end_time
                        actual_duration = (step_end - step_start).total_seconds() / 60
                    # The next step starts where this one (possibly clamped) ended
                    current_time = step_end
                
                # Record a deviation reason when the duration is off by more than 10%
                if status in ["Completed", "Aborted", "Paused"] and abs(actual_duration - expected_duration) > expected_duration * 0.1:
                    deviation_reason = random.choice(deviation_reasons)
                else:
                    deviation_reason = ""
                
                # Generate step parameters (JSON string with key process parameters)
                if status in ["Completed", "In Progress", "Paused", "Aborted"]:
                    step_type = step['step_type'] if 'step_type' in step else "Standard"
                    step_parameters = _generate_step_parameters(step_type)
                else:
                    step_parameters = "{}"
                
                # Write the execution record to the CSV
                writer.writerow({
                    'execution_id': execution_id,
                    'batch_id': batch_id,
                    'step_id': step_id,
                    'equipment_id': equipment_id,
                    'start_time': format_time(step_start),
                    'end_time': format_time(step_end),
                    'status': status,
                    'operator_id': operator_id,
                    'actual_duration_minutes': round(actual_duration) if actual_duration else "",
                    'deviation_reason': deviation_reason,
                    'step_parameters': step_parameters
                })
                
                execution_count += 1
                if execution_count % 10000 == 0:
                    print(f"Generated {execution_count} batch execution records so far...")
    
    print(f"Successfully generated {execution_count} batch execution records.")
    print(f"Data saved to {output_file}")
    
    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(batches_file, batch_execution_file):
    """
    Get basic statistics about the generated batches and batch execution data
    
    Prints status distributions, batch size and step duration statistics,
    recipe and equipment usage, and the number of batches with a parent and
    of steps with a deviation reason. Only the first 10000 execution records
    are read. Any failure is reported on stdout instead of being raised.
    
    Parameters:
    - batches_file: CSV file containing the batches data
    - batch_execution_file: CSV file containing the batch execution data
    """
    def count_filled(column):
        # pd.read_csv loads empty CSV cells as NaN and NaN != "" is True, so a
        # plain comparison against "" would count every blank cell as filled.
        filled = column.notna() & (column.astype(str).str.strip() != "")
        return int(filled.sum())
    
    def percent(count, total):
        # Header-only files have zero rows; report 0% instead of dividing by zero
        return count / total * 100 if total else 0.0
    
    try:
        # Read batches data
        batches_df = pd.read_csv(batches_file)
        total_batches = len(batches_df)
        
        print("\nBatches Statistics:")
        print(f"Total batches: {total_batches}")
        
        print("\nBatch Status Distribution:")
        for status, count in batches_df['batch_status'].value_counts().items():
            print(f"  {status}: {count} ({percent(count, total_batches):.1f}%)")
        
        # Batch size statistics
        print("\nBatch Size Statistics:")
        print(f"  Average batch size: {batches_df['batch_size'].mean():.1f}")
        print(f"  Min batch size: {batches_df['batch_size'].min()}")
        print(f"  Max batch size: {batches_df['batch_size'].max()}")
        
        # Recipe usage (five most used recipes)
        print("\nRecipe Usage:")
        for recipe_id, count in batches_df['recipe_id'].value_counts().head(5).items():
            print(f"  {recipe_id}: {count} batches")
        
        # Parent-child relationships
        parent_count = count_filled(batches_df['parent_batch_id'])
        print(f"\nBatches with parent: {parent_count} ({percent(parent_count, total_batches):.1f}%)")
        
        # Read batch execution data (sample for efficiency)
        execution_df = pd.read_csv(batch_execution_file, nrows=10000)
        total_executions = len(execution_df)
        
        print("\nBatch Execution Statistics (from sample):")
        print(f"Sample size: {total_executions} execution records")
        
        print("\nExecution Status Distribution:")
        for status, count in execution_df['status'].value_counts().items():
            print(f"  {status}: {count} ({percent(count, total_executions):.1f}%)")
        
        # Duration statistics (only for records with duration). Blank cells
        # arrive as NaN, so coerce to numbers and drop what is missing.
        durations = pd.to_numeric(execution_df['actual_duration_minutes'], errors='coerce').dropna().astype(float)
        if len(durations) > 0:
            print("\nStep Duration Statistics:")
            print(f"  Average step duration: {durations.mean():.1f} minutes")
            print(f"  Min step duration: {durations.min()} minutes")
            print(f"  Max step duration: {durations.max()} minutes")
        
        # Deviation reasons
        deviation_count = count_filled(execution_df['deviation_reason'])
        print(f"\nSteps with deviation reasons: {deviation_count} ({percent(deviation_count, total_executions):.1f}%)")
        
        # Equipment usage
        print("\nEquipment Usage (top 5):")
        for equipment_id, count in execution_df['equipment_id'].value_counts().head(5).items():
            print(f"  {equipment_id}: {count} steps")
            
    except Exception as e:
        # Reporting helper: surface the problem without aborting the caller
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # The generators write into data/, so make sure it is there up front
    os.makedirs("data", exist_ok=True)
    
    # Upstream tables; each loader yields None when its file is unavailable
    recipes_df = load_recipes_data()
    equipment_df = load_equipment_data()
    batch_steps_df = load_batch_steps_data()
    personnel_df = load_personnel_data()
    
    # Batches need both recipes and equipment
    if not (recipes_df is None or equipment_df is None):
        batches_df = generate_batches(
            recipes_df, equipment_df, personnel_df,
            num_batches=100, output_file="data/batches.csv"
        )
        
        # Step executions additionally need the batches and the batch steps
        if not (batches_df is None or batch_steps_df is None):
            sample_df = generate_batch_execution(
                batches_df, batch_steps_df, equipment_df, personnel_df,
                output_file="data/batch_execution.csv"
            )
            
            # Preview and summarize only when execution data was produced
            if sample_df is not None:
                print("\nSample batch execution data (first 5 records):")
                print(sample_df.head())
                
                # Summary statistics are computed from the files just written
                get_sample_statistics("data/batches.csv", "data/batch_execution.csv")

Note: Personnel data file data/personnel.csv not found.
Batches will be generated with synthetic personnel IDs.
Generating synthetic personnel IDs...
Successfully generated 100 batch records.
Data saved to data/batches.csv
Generating synthetic personnel IDs...
Generating batch step execution data...
Successfully generated 1027 batch execution records.
Data saved to data/batch_execution.csv

Sample batch execution data (first 5 records):
        execution_id        batch_id        step_id equipment_id  \
0  EXEC-D1ED5F702FF4  BATCH-24435A1C  STEP-A62B6D4D  EQ-AC717F77   
1  EXEC-4A4AA96AF9F5  BATCH-24435A1C  STEP-F7B325E1  EQ-AC717F77   
2  EXEC-A1C41BA39017  BATCH-24435A1C  STEP-7BE3E876  EQ-AC717F77   
3  EXEC-57249B7FD9AF  BATCH-24435A1C  STEP-87DEA6AC  EQ-AC717F77   
4  EXEC-45F1552FADD0  BATCH-24435A1C  STEP-538490E3  EQ-AC717F77   

            start_time             end_time     status  operator_id  \
0  2025-06-23 02:06:27  2025-06-23 02:17:55  Completed  OP-4DCFD75A   
1  2025-

Process Areas Table

In [14]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_equipment_data(equipment_file="data/equipment.csv"):
    """
    Load the previously generated equipment data
    
    Parameters:
    - equipment_file: CSV file containing equipment data
    
    Returns:
    - DataFrame containing the equipment data, or None when the file is
      missing or has no content to parse
    """
    try:
        return pd.read_csv(equipment_file)
    except FileNotFoundError:
        print(f"Error: Equipment data file {equipment_file} not found.")
        print("Please run the equipment data generation script first.")
    except pd.errors.EmptyDataError:
        # read_csv raises (instead of returning an empty frame) for a file
        # with no columns, e.g. a zero-byte file left by an aborted run.
        print(f"Error: Equipment data file {equipment_file} is empty.")
        print("Please run the equipment data generation script first.")
    return None

def load_personnel_data(personnel_file="data/personnel.csv"):
    """
    Load the previously generated personnel data if available
    
    Parameters:
    - personnel_file: CSV file containing personnel data
    
    Returns:
    - DataFrame containing the personnel data, or None when the file is
      missing or has no content to parse
    """
    try:
        return pd.read_csv(personnel_file)
    except FileNotFoundError:
        print(f"Note: Personnel data file {personnel_file} not found.")
    except pd.errors.EmptyDataError:
        # A file without any columns is treated like a missing file so the
        # caller can still fall back to synthetic personnel IDs.
        print(f"Note: Personnel data file {personnel_file} is empty.")
    print("Process areas will be generated with synthetic personnel IDs.")
    return None

def generate_facilities_data(num_facilities=5, output_file="data/facilities.csv"):
    """
    Build a synthetic Facilities table and write it to a CSV file.

    Each record gets a random FAC-* identifier, a type, a name derived from
    the type and a city, a street address, operating hours and a weighted
    random status. A small subset of facilities (about 20%, at least one when
    more than one facility is requested) acts as possible parents; every
    other facility is attached to one of them with a 30% probability.

    Parameters:
    - num_facilities: Number of facility records to generate
    - output_file: CSV file to save the data

    Returns:
    - DataFrame containing the generated facilities data
    """
    # Pools of categorical values to draw from
    type_options = ["Manufacturing Plant", "Warehouse", "Distribution Center",
                    "R&D Center", "Processing Plant", "Assembly Plant", "Packaging Facility"]

    status_options = ["Operational", "Under Maintenance", "Expanding", "Reduced Capacity", "Shutdown"]
    status_probabilities = [0.8, 0.1, 0.05, 0.03, 0.02]  # Mostly operational

    hours_options = ["24/7", "Mon-Fri: 06:00-22:00", "Mon-Sat: 08:00-20:00",
                     "Three Shifts: 06:00-14:00, 14:00-22:00, 22:00-06:00",
                     "Two Shifts: 07:00-19:00, 19:00-07:00"]

    city_options = ["Chicago, IL", "Houston, TX", "Phoenix, AZ", "Philadelphia, PA",
                    "San Antonio, TX", "San Diego, CA", "Dallas, TX", "San Jose, CA",
                    "Indianapolis, IN", "Jacksonville, FL", "Columbus, OH", "Charlotte, NC"]

    street_options = ["Main St", "Industrial Pkwy", "Commerce Dr", "Manufacturing Blvd",
                      "Technology Dr", "Innovation Way", "Production Ave", "Enterprise Rd"]

    # Identifiers are created up front so the parent pool can be drawn from them
    facility_ids = [f"FAC-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_facilities)]

    # Roughly one in five facilities may act as a parent; a single facility has none
    parent_pool = []
    if num_facilities > 1:
        parent_pool = random.sample(facility_ids, max(1, int(num_facilities * 0.2)))

    names = []
    kinds = []
    addresses = []
    managers = []
    hours = []
    statuses = []
    parents = []

    for position, fac_id in enumerate(facility_ids):
        ordinal = position + 1

        kind = random.choice(type_options)
        kinds.append(kind)

        # The name uses only the city part of the "City, ST" location
        location = random.choice(city_options)
        town = location.split(',')[0]
        compass = random.choice(['North', 'South', 'East', 'West'])
        candidate_names = [
            f"{town} {kind}",
            f"{town} {kind} {ordinal}",
            f"Plant {ordinal} - {town}",
            f"{compass} {town} {kind}",
            f"{town} Industrial {kind}"
        ]
        names.append(random.choice(candidate_names))

        house_number = random.randint(100, 9999)
        street = random.choice(street_options)
        addresses.append(f"{house_number} {street}, {location}")

        hours.append(random.choice(hours_options))
        statuses.append(random.choices(status_options, weights=status_probabilities)[0])

        # Manager is left blank; it is meant to be filled in later
        managers.append("")

        # Parent candidates stay top-level; others get a parent 30% of the time
        parent = ""
        if fac_id not in parent_pool and len(parent_pool) > 0 and random.random() < 0.3:
            parent = random.choice(parent_pool)
        parents.append(parent)

    df = pd.DataFrame({
        "facility_id": facility_ids,
        "facility_name": names,
        "facility_type": kinds,
        "address": addresses,
        "manager_id": managers,
        "operating_hours": hours,
        "status": statuses,
        "parent_facility_id": parents
    })

    # Make sure the target folder exists before writing
    target_dir = os.path.dirname(output_file)
    os.makedirs(target_dir if target_dir else '.', exist_ok=True)
    df.to_csv(output_file, index=False)

    print(f"Successfully generated {len(df)} facility records.")
    print(f"Data saved to {output_file}")

    return df

def generate_process_areas(equipment_df, facilities_df, personnel_df=None, output_file="data/process_areas.csv"):
    """
    Generate synthetic data for the ProcessAreas table from ISA-95 Level 2.

    Every facility receives a random number of areas (5-15 when its type
    contains "Plant" or "Manufacturing", otherwise 2-8). Within a facility,
    about 30% of the areas (at least one) are picked as parent candidates and
    each remaining area is attached to one of them with a 40% probability, so
    the hierarchy is at most two levels deep and never crosses facilities.

    Parameters:
    - equipment_df: DataFrame containing equipment data. It is only checked
      for being non-empty; equipment is attached to areas separately by
      update_equipment_with_areas
    - facilities_df: DataFrame containing facilities data (the facility_id
      and facility_type columns are read)
    - personnel_df: DataFrame containing personnel data (optional). When
      given and non-empty, area managers are drawn from its personnel_id
      column; otherwise 20 synthetic PERS-* IDs are generated
    - output_file: CSV file to save the data

    Returns:
    - DataFrame containing the generated process areas data, or None when
      equipment_df or facilities_df is missing or empty
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None

    if facilities_df is None or len(facilities_df) == 0:
        print("Error: No facilities data available.")
        return None

    # Area types, split so production-related areas can be favoured
    production_area_types = [
        "Production", "Packaging", "Filling", "Mixing", "Reaction",
        "Distillation", "Fermentation", "Filtration", "Drying",
        "Granulation", "Tableting", "Assembly"
    ]
    support_area_types = [
        "Testing", "Quality Control", "Warehousing", "Utilities", "Maintenance"
    ]

    # Environmental classifications offered per kind of area
    controlled_env = [
        "Clean Room Class 100,000", "Clean Room Class 10,000",
        "Controlled Humidity", "Controlled Temperature"
    ]
    hazardous_env = [
        "Explosion Proof", "Corrosive Environment", "High Temperature",
        "Hazardous Material"
    ]
    packaging_env = [
        "Controlled Humidity", "Controlled Temperature", "General Manufacturing"
    ]
    general_env = [
        "General Manufacturing", "Controlled Humidity", "Controlled Temperature"
    ]
    env_options_by_type = {"Packaging": packaging_env}
    for controlled_type in ["Quality Control", "Testing", "Production", "Filling"]:
        env_options_by_type[controlled_type] = controlled_env
    for hazardous_type in ["Reaction", "Distillation", "Fermentation"]:
        env_options_by_type[hazardous_type] = hazardous_env

    # Generate personnel IDs if not provided
    if personnel_df is None or len(personnel_df) == 0:
        print("Generating synthetic personnel IDs...")
        area_manager_ids = [f"PERS-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    else:
        area_manager_ids = personnel_df['personnel_id'].unique().tolist()

    data = {
        "area_id": [],
        "area_name": [],
        "facility_id": [],
        "area_type": [],
        "area_manager_id": [],
        "environmental_classification": [],
        "parent_area_id": []
    }

    for facility_id, facility_type in zip(facilities_df['facility_id'], facilities_df['facility_type']):
        # Larger (plant / manufacturing) facilities get more areas
        if "Plant" in facility_type or "Manufacturing" in facility_type:
            num_areas = random.randint(5, 15)
        else:
            num_areas = random.randint(2, 8)

        # Position of this facility's first area in the column lists; lets the
        # hierarchy step below address rows directly instead of searching.
        first_row = len(data["area_id"])

        for i in range(num_areas):
            data["area_id"].append(f"AREA-{uuid.uuid4().hex[:8].upper()}")
            data["facility_id"].append(facility_id)

            # 60% chance of a production-related area, otherwise a support area
            if random.random() < 0.6:
                area_type = random.choice(production_area_types)
            else:
                area_type = random.choice(support_area_types)
            data["area_type"].append(area_type)

            # Two naming variants, chosen with equal probability
            area_name = f"{area_type} Area {i+1}" if random.random() < 0.5 else f"{area_type} {i+1}"
            data["area_name"].append(area_name)

            data["area_manager_id"].append(random.choice(area_manager_ids))

            # Area types without a dedicated list fall back to the general one
            env_options = env_options_by_type.get(area_type, general_env)
            data["environmental_classification"].append(random.choice(env_options))

            # Initially no parent area
            data["parent_area_id"].append("")

        # Build the parent/child hierarchy inside this facility only
        facility_areas = data["area_id"][first_row:]
        if len(facility_areas) > 1:
            potential_parents = random.sample(facility_areas, max(1, int(len(facility_areas) * 0.3)))
            parent_lookup = set(potential_parents)

            for offset, area_id in enumerate(facility_areas):
                # Parent candidates stay top-level so no chains or cycles form
                if area_id in parent_lookup:
                    continue

                # 40% chance of having a parent
                if random.random() < 0.4:
                    data["parent_area_id"][first_row + offset] = random.choice(potential_parents)

    # Create DataFrame
    df = pd.DataFrame(data)

    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Successfully generated {len(df)} process area records.")
    print(f"Data saved to {output_file}")

    return df

def update_equipment_with_areas(equipment_df, process_areas_df, output_file="data/equipment_updated.csv"):
    """
    Update equipment data with area assignments

    The equipment table carries no facility reference, so every equipment
    row is first placed in a randomly chosen facility and then in a randomly
    chosen process area of that facility. The input frame is not modified;
    a copy with an ``area_id`` column (added, or overwritten if it already
    exists) is written to ``output_file`` and returned.

    Parameters:
    - equipment_df: DataFrame containing equipment data
    - process_areas_df: DataFrame containing process areas data (the
      facility_id and area_id columns are read)
    - output_file: CSV file to save the updated equipment data

    Returns:
    - DataFrame containing the updated equipment data, or None when
      equipment_df or process_areas_df is missing or empty
    """
    if equipment_df is None or len(equipment_df) == 0:
        print("Error: No equipment data available.")
        return None

    if process_areas_df is None or len(process_areas_df) == 0:
        print("Error: No process areas data available.")
        return None

    # Work on a copy so the caller's frame stays untouched
    updated_equipment = equipment_df.copy()

    # Group areas by facility
    areas_by_facility = {}
    for facility_id, area_id in zip(process_areas_df['facility_id'], process_areas_df['area_id']):
        areas_by_facility.setdefault(facility_id, []).append(area_id)
    facility_ids = list(areas_by_facility)

    # Pick a facility, then one of its areas, for every equipment row. Every
    # facility listed here has at least one area, so each row gets a value.
    assigned_areas = []
    for _ in range(len(updated_equipment)):
        facility_id = random.choice(facility_ids)
        assigned_areas.append(random.choice(areas_by_facility[facility_id]))

    # Assigning the whole column at once avoids per-cell writes into a column
    # that may have been loaded with a non-string dtype (e.g. all-NaN floats).
    updated_equipment['area_id'] = assigned_areas

    # Ensure the directory exists (same behaviour as the other generators)
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    # Save updated equipment data
    updated_equipment.to_csv(output_file, index=False)

    print(f"Successfully updated {len(updated_equipment)} equipment records with area assignments.")
    print(f"Updated data saved to {output_file}")

    return updated_equipment

def _count_non_blank(series):
    """Count entries of a Series that are neither missing (NaN) nor the empty string."""
    return int((series.notna() & (series != "")).sum())


def display_statistics(facilities_df, process_areas_df, updated_equipment_df=None):
    """
    Display basic statistics about the generated data

    Prints facility counts by type and status, area counts per facility and
    by type / environmental classification (top 10 each), the share of
    facilities and areas that have a parent, and - when equipment data with
    an area_id column is supplied - how equipment is spread over areas.
    A blank parent or area reference may be either "" (freshly generated
    frames) or NaN (frames reloaded from CSV); both are treated as "none".

    Parameters:
    - facilities_df: DataFrame containing facilities data
    - process_areas_df: DataFrame containing process areas data
    - updated_equipment_df: DataFrame containing updated equipment data (optional)
    """
    if facilities_df is None or len(facilities_df) == 0:
        print("No facilities data to analyze.")
        return

    if process_areas_df is None or len(process_areas_df) == 0:
        print("No process areas data to analyze.")
        return

    total_facilities = len(facilities_df)
    total_areas = len(process_areas_df)

    print("\nFacilities Statistics:")
    print(f"Total facilities: {total_facilities}")

    print("\nFacility Type Distribution:")
    type_counts = facilities_df['facility_type'].value_counts()
    for facility_type, count in type_counts.items():
        print(f"  {facility_type}: {count} ({count/total_facilities*100:.1f}%)")

    print("\nFacility Status Distribution:")
    status_counts = facilities_df['status'].value_counts()
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/total_facilities*100:.1f}%)")

    # Parent-child relationships
    parent_count = _count_non_blank(facilities_df['parent_facility_id'])
    print(f"\nFacilities with parent: {parent_count} ({parent_count/total_facilities*100:.1f}%)")

    print("\nProcess Areas Statistics:")
    print(f"Total process areas: {total_areas}")

    print("\nAreas per Facility:")
    areas_per_facility = process_areas_df.groupby('facility_id').size()
    print(f"  Average: {areas_per_facility.mean():.1f}")
    print(f"  Minimum: {areas_per_facility.min()}")
    print(f"  Maximum: {areas_per_facility.max()}")

    print("\nArea Type Distribution:")
    area_type_counts = process_areas_df['area_type'].value_counts().head(10)
    for area_type, count in area_type_counts.items():
        print(f"  {area_type}: {count} ({count/total_areas*100:.1f}%)")

    print("\nEnvironmental Classification Distribution:")
    env_counts = process_areas_df['environmental_classification'].value_counts().head(10)
    for env_class, count in env_counts.items():
        print(f"  {env_class}: {count} ({count/total_areas*100:.1f}%)")

    # Parent-child relationships
    area_parent_count = _count_non_blank(process_areas_df['parent_area_id'])
    print(f"\nAreas with parent: {area_parent_count} ({area_parent_count/total_areas*100:.1f}%)")

    # Equipment distribution (if updated equipment data is available)
    if updated_equipment_df is not None and len(updated_equipment_df) > 0:
        print("\nEquipment Distribution:")
        print(f"Total equipment: {len(updated_equipment_df)}")

        # Per-area counts need the area_id column; unassigned rows ("" or NaN)
        # are left out so they are not reported as an area of their own.
        if 'area_id' in updated_equipment_df.columns:
            area_ids = updated_equipment_df['area_id']
            assigned = updated_equipment_df[area_ids.notna() & (area_ids != "")]
            equipment_per_area = assigned.groupby('area_id').size()
            if len(equipment_per_area) > 0:
                print(f"  Areas with equipment: {len(equipment_per_area)} of {total_areas}")
                print(f"  Average equipment per area: {equipment_per_area.mean():.1f}")
                print(f"  Minimum equipment per area: {equipment_per_area.min()}")
                print(f"  Maximum equipment per area: {equipment_per_area.max()}")

# Script entry point: generates the facilities table, the process areas table
# and an area-annotated copy of the equipment table, then prints statistics
# and a few sample rows. Each step only runs if the previous one produced data.
if __name__ == "__main__":
    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load required data
    # equipment_df is None when data/equipment.csv is missing; personnel_df is
    # optional and may also be None.
    equipment_df = load_equipment_data()
    personnel_df = load_personnel_data()
    
    # Nothing is generated without equipment data
    if equipment_df is not None:
        # First, generate facilities data
        facilities_df = generate_facilities_data(
            num_facilities=5,  # Generate 5 facility records
            output_file="data/facilities.csv"
        )
        
        if facilities_df is not None:
            # Generate process areas data
            # (a None personnel_df makes the generator fall back to synthetic
            # manager IDs)
            process_areas_df = generate_process_areas(
                equipment_df,
                facilities_df,
                personnel_df,
                output_file="data/process_areas.csv"
            )
            
            if process_areas_df is not None:
                # Update equipment with area assignments
                # Written to a separate file, so data/equipment.csv is left as is
                updated_equipment_df = update_equipment_with_areas(
                    equipment_df,
                    process_areas_df,
                    output_file="data/equipment_updated.csv"
                )
                
                # Display statistics
                display_statistics(facilities_df, process_areas_df, updated_equipment_df)
                
                # Display sample data
                print("\nSample facilities data (first 3 records):")
                print(facilities_df.head(3))
                
                print("\nSample process areas data (first 3 records):")
                print(process_areas_df.head(3))
                
                # update_equipment_with_areas returns None on empty input
                if updated_equipment_df is not None:
                    print("\nSample updated equipment data (first 3 records):")
                    print(updated_equipment_df.head(3))

Note: Personnel data file data/personnel.csv not found.
Process areas will be generated with synthetic personnel IDs.
Successfully generated 5 facility records.
Data saved to data/facilities.csv
Generating synthetic personnel IDs...
Successfully generated 37 process area records.
Data saved to data/process_areas.csv
Successfully updated 150 equipment records with area assignments.
Updated data saved to data/equipment_updated.csv

Facilities Statistics:
Total facilities: 5

Facility Type Distribution:
  Distribution Center: 2 (40.0%)
  Assembly Plant: 2 (40.0%)
  Warehouse: 1 (20.0%)

Facility Status Distribution:
  Operational: 4 (80.0%)
  Reduced Capacity: 1 (20.0%)

Facilities with parent: 0 (0.0%)

Process Areas Statistics:
Total process areas: 37

Areas per Facility:
  Average: 7.4
  Minimum: 2
  Maximum: 15

Area Type Distribution:
  Quality Control: 5 (13.5%)
  Granulation: 4 (10.8%)
  Tableting: 4 (10.8%)
  Utilities: 4 (10.8%)
  Filling: 3 (8.1%)
  Maintenance: 3 (8.1%)
  Produ