Level 1: Sensing & Manipulation

Sensors

In [7]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import json
import os

def generate_sensors_data(num_records=100, output_file="sensors_data.csv"):
    """
    Generate synthetic data for the Sensors table from ISA-95 Level 1.
    
    Every record gets a random sensor type, manufacturer, model number,
    installation/calibration dates and a 3D location. The measurement unit
    and measurement range are derived from the sensor type; accuracy and
    status are drawn at random. The result is written to output_file
    (parent directory is created if needed) and also returned.
    
    Parameters:
    - num_records: Number of sensor records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated sensor data
    """
    # Define possible values for categorical fields
    sensor_types = ["temperature", "pressure", "flow", "level", "ph", "conductivity", 
                   "vibration", "speed", "torque", "current", "voltage", "weight", 
                   "humidity", "oxygen", "co2", "position", "proximity", "rpm"]
    
    manufacturers = ["Siemens", "ABB", "Emerson", "Honeywell", "Endress+Hauser", 
                    "Yokogawa", "Schneider Electric", "Omron", "Rockwell Automation", 
                    "WIKA", "Vega", "ifm electronic", "Pepperl+Fuchs", "Sick AG"]
    
    statuses = ["Active", "Maintenance", "Calibration Due", "Fault", "Standby", "Offline"]
    
    # Measurement unit for each sensor type
    units = {
        "temperature": "°C",
        "pressure": "bar",
        "flow": "m³/h",
        "level": "%",
        "ph": "pH",
        "conductivity": "µS/cm",
        "vibration": "mm/s",
        "speed": "rpm",
        "torque": "Nm",
        "current": "A",
        "voltage": "V",
        "weight": "kg",
        "humidity": "%RH",
        "oxygen": "%",
        "co2": "ppm",
        "position": "mm",
        "proximity": "mm",
        "rpm": "rpm"
    }
    
    # Nominal (min, max) measurement range for each sensor type
    ranges = {
        "temperature": (0, 150),
        "pressure": (0, 25),
        "flow": (0, 100),
        "level": (0, 100),
        "ph": (0, 14),
        "conductivity": (0, 2000),
        "vibration": (0, 50),
        "speed": (0, 3000),
        "torque": (0, 500),
        "current": (0, 100),
        "voltage": (0, 440),
        "weight": (0, 2000),
        "humidity": (0, 100),
        "oxygen": (0, 25),
        "co2": (0, 5000),
        "position": (0, 1000),
        "proximity": (0, 50),
        "rpm": (0, 5000)
    }
    default_range = (0, 100)
    
    def get_measurement_range(sensor_type):
        """Return the nominal range of sensor_type with random variation applied."""
        min_val, max_val = ranges.get(sensor_type, default_range)
        
        # Add some variation to ranges (lower bound never drops below zero)
        min_val = max(0, min_val - random.uniform(0, min_val/5))
        max_val = max_val + random.uniform(0, max_val/5)
        
        return min_val, max_val
    
    # Generate unique IDs for equipment (to be used as foreign keys)
    # In a real system, these would come from the Equipment table.
    # Roughly one equipment per five sensors, but always at least one so that
    # random.choice below never receives an empty list for small num_records.
    equipment_count = max(1, num_records // 5)
    equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(equipment_count)]
    
    # Generate sensor data
    data = {
        "sensor_id": [f"SEN-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_records)],
        "equipment_id": [random.choice(equipment_ids) for _ in range(num_records)],
        "sensor_type": [random.choice(sensor_types) for _ in range(num_records)],
        "manufacturer": [random.choice(manufacturers) for _ in range(num_records)],
        "model_number": [f"M{random.randint(1000, 9999)}-{random.choice(['A', 'B', 'C', 'D', 'E'])}{random.randint(10, 99)}" for _ in range(num_records)],
        "installation_date": [(datetime.now() - timedelta(days=random.randint(30, 1825))).strftime("%Y-%m-%d") for _ in range(num_records)],
        "calibration_due_date": [(datetime.now() + timedelta(days=random.randint(-30, 365))).strftime("%Y-%m-%d") for _ in range(num_records)],
        "location_x": [round(random.uniform(0, 100), 2) for _ in range(num_records)],
        "location_y": [round(random.uniform(0, 100), 2) for _ in range(num_records)],
        "location_z": [round(random.uniform(0, 10), 2) for _ in range(num_records)],
    }
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    
    # Add measurement units based on sensor type
    df["measurement_unit"] = [units.get(sensor_type, "unit") for sensor_type in df["sensor_type"]]
    
    # Generate one (min, max) pair per sensor and split it into two columns
    range_pairs = [get_measurement_range(sensor_type) for sensor_type in df["sensor_type"]]
    df["measurement_range_min"] = [round(low, 2) for low, _ in range_pairs]
    df["measurement_range_max"] = [round(high, 2) for _, high in range_pairs]
    
    # Add accuracy: a random value between 0.1 and 2.0, independent of sensor type
    df["accuracy"] = [round(random.uniform(0.1, 2.0), 2) for _ in range(num_records)]
    
    # Add status
    df["status"] = [random.choice(statuses) for _ in range(num_records)]
    
    # Ensure the directory exists ('.' when output_file has no directory part)
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    # Also return the DataFrame for further processing
    return df

def generate_and_save_sample(num_records=100, csv_file="data/sensors_data.csv", json_file="data/sensors_data.json"):
    """
    Generate sample data and save to both CSV and JSON formats.
    
    The CSV is written by generate_sensors_data; the same records are then
    written to json_file as a list of objects. Afterwards a short preview
    and a few summary statistics are printed.
    
    Parameters:
    - num_records: Number of sensor records to generate
    - csv_file: CSV file to save the data
    - json_file: JSON file to save the data (pass None or "" to skip the JSON export)
    
    Returns:
    - DataFrame containing the generated sensor data
    """
    print(f"Generating {num_records} sensor records...")
    df = generate_sensors_data(num_records, csv_file)
    print(f"Saved CSV data to {csv_file}")
    
    # Write the JSON copy promised by the signature and docstring
    if json_file:
        os.makedirs(os.path.dirname(json_file) or '.', exist_ok=True)
        df.to_json(json_file, orient="records", indent=2)
        print(f"Saved JSON data to {json_file}")
    
    # Display a sample of the data
    print("\nSample data (first 5 records):")
    print(df.head())
    
    # Show basic statistics
    print("\nBasic statistics:")
    print(f"Total sensors: {len(df)}")
    print(f"Unique equipment IDs: {df['equipment_id'].nunique()}")
    print(f"Sensor type distribution:\n{df['sensor_type'].value_counts()[:5]}")
    
    return df

if __name__ == "__main__":
    # The output folder must be present before any file is written into it
    os.makedirs("data", exist_ok=True)
    
    # Build 100 sample sensors and persist them under data/
    df = generate_and_save_sample(100, "data/sensors_data.csv", "data/sensors_data.json")

Generating 100 sensor records...
Saved CSV data to data/sensors_data.csv

Sample data (first 5 records):
      sensor_id equipment_id sensor_type    manufacturer model_number  \
0  SEN-0C25D1C0  EQ-5C402C11   proximity  ifm electronic    M4036-C78   
1  SEN-A2AC3EED  EQ-099F97DE         co2            WIKA    M8217-C98   
2  SEN-B64619D4  EQ-D47D244B       level         Sick AG    M3938-D65   
3  SEN-D4672B5C  EQ-9F6621AF       speed  Endress+Hauser    M5972-D82   
4  SEN-41DA410B  EQ-F3D6BF0F      torque             ABB    M6491-B75   

  installation_date calibration_due_date  location_x  location_y  location_z  \
0        2022-05-21           2025-12-26       79.42       64.60        5.19   
1        2023-08-08           2026-05-30       41.01       74.17        4.04   
2        2024-05-04           2026-05-16       92.61       14.66        1.63   
3        2023-01-14           2026-04-28       48.76       49.52        4.88   
4        2023-06-19           2025-09-11       60.75    

SensorReadings

In [8]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

def load_sensors_data(sensors_file="data/sensors_data.csv"):
    """
    Read the sensors table written by the sensor generation step.
    
    Parameters:
    - sensors_file: Path of the CSV file holding the sensor records
    
    Returns:
    - DataFrame with the sensors data, or None when the file does not exist
    """
    try:
        sensors = pd.read_csv(sensors_file)
    except FileNotFoundError:
        # A missing file is reported to the user instead of raising
        print(f"Error: Sensors data file {sensors_file} not found.")
        print("Please run the sensors data generation script first.")
        return None
    return sensors

def generate_sensor_readings(sensors_df, num_readings_per_sensor=100, 
                            start_time=None, end_time=None, output_file="data/sensor_readings.csv"):
    """
    Write a synthetic reading series for every sensor in sensors_df to a CSV file.
    
    For each sensor, evenly spaced timestamps are laid out from start_time
    towards end_time, and a sine-shaped trend plus uniform noise is scaled
    into the sensor's measurement range. Each reading also carries a quality
    indicator, a status code and optional batch / equipment-state references.
    Rows are streamed to disk one at a time.
    
    Parameters:
    - sensors_df: DataFrame containing sensor data (sensor_id,
      measurement_range_min and measurement_range_max columns are read)
    - num_readings_per_sensor: Number of readings to generate per sensor
    - start_time: Start time for readings (defaults to 7 days ago)
    - end_time: End time for readings (defaults to now)
    - output_file: CSV file to save the readings data
    
    Returns:
    - DataFrame with the first 1000 rows of the written file, or None when
      no sensor data was supplied
    """
    # Without at least one sensor there is nothing to generate
    if sensors_df is None or len(sensors_df) == 0:
        print("Error: No sensor data available.")
        return None
    
    # Default window: the seven days leading up to now
    if start_time is None:
        start_time = datetime.now() - timedelta(days=7)
    if end_time is None:
        end_time = datetime.now()
    
    # Make sure the destination folder exists ('.' when no folder is given)
    target_dir = os.path.dirname(output_file) or '.'
    os.makedirs(target_dir, exist_ok=True)
    
    # Pools of made-up foreign keys to simulate relationships
    batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    equipment_state_ids = [f"STATE-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    
    total_readings = len(sensors_df) * num_readings_per_sensor
    columns = [
        'reading_id', 'sensor_id', 'timestamp', 'value',
        'quality_indicator', 'status_code', 'batch_id', 'equipment_state_id'
    ]
    window = end_time - start_time
    
    # Rows are written as they are produced so memory use stays flat
    with open(output_file, 'w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(columns)
        
        print(f"Generating {total_readings} sensor readings...")
        readings_count = 0
        
        for _, sensor in sensors_df.iterrows():
            low = sensor['measurement_range_min']
            high = sensor['measurement_range_max']
            
            # Evenly spaced timestamps, kept in chronological order
            time_points = sorted(
                start_time + window * (step / num_readings_per_sensor)
                for step in range(num_readings_per_sensor)
            )
            
            # Smooth sine trend with a random number of half-periods per sensor
            half_periods = random.randint(3, 8)
            base_trend = np.sin(np.linspace(0, half_periods * np.pi, num_readings_per_sensor))
            
            # Noise amplitude is 5-15% of the sensor's span
            noise_level = (high - low) * random.uniform(0.05, 0.15)
            
            for moment, trend in zip(time_points, base_trend):
                reading_id = f"READ-{uuid.uuid4().hex[:12].upper()}"
                
                # Millisecond precision: drop the last three microsecond digits
                timestamp = moment.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                
                # Map the trend from [-1, 1] into the range, add noise, clip, round
                value = low + (trend + 1) / 2 * (high - low)
                value = value + random.uniform(-noise_level, noise_level)
                value = round(max(low, min(high, value)), 2)
                
                # 5% of readings get a reduced quality score
                degraded = random.random() < 0.05
                quality_indicator = round(
                    random.uniform(50, 85) if degraded else random.uniform(85, 100), 1
                )
                
                # 3% of readings carry a non-zero (abnormal) status code
                status_code = random.choice([1, 2, 3, 4]) if random.random() < 0.03 else 0
                
                # 80% of readings belong to a batch, 90% to an equipment state
                batch_id = random.choice(batch_ids) if random.random() < 0.8 else ""
                equipment_state_id = (
                    random.choice(equipment_state_ids) if random.random() < 0.9 else ""
                )
                
                out.writerow([
                    reading_id,
                    sensor['sensor_id'],
                    timestamp,
                    value,
                    quality_indicator,
                    status_code,
                    batch_id,
                    equipment_state_id,
                ])
                
                readings_count += 1
                if readings_count % 10000 == 0:
                    print(f"Generated {readings_count} readings so far...")
    
    print(f"Successfully generated {readings_count} sensor readings.")
    print(f"Data saved to {output_file}")
    
    # Only a preview is read back, so the whole file never sits in memory
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Get basic statistics about the generated data without loading the entire file
    
    Prints the total row count, the number of distinct sensors, batches and
    equipment states, and the timestamps of the first and last rows.
    
    Parameters:
    - output_file: CSV file containing the readings data
    """
    # Sample statistics on chunks to avoid memory issues
    num_rows = 0
    sensor_ids = set()
    batch_ids = set()
    equipment_state_ids = set()
    
    # Process in chunks
    for chunk in pd.read_csv(output_file, chunksize=10000):
        num_rows += len(chunk)
        sensor_ids.update(chunk['sensor_id'].unique())
        # Empty CSV cells come back as NaN, which is truthy; drop them
        # explicitly so "no batch" / "no state" is not counted as a value.
        batch_ids.update(chunk['batch_id'].dropna().unique())
        equipment_state_ids.update(chunk['equipment_state_id'].dropna().unique())
    
    print("\nData Statistics:")
    print(f"Total readings: {num_rows}")
    print(f"Unique sensors: {len(sensor_ids)}")
    print(f"Unique batches: {len(batch_ids)}")
    print(f"Unique equipment states: {len(equipment_state_ids)}")
    
    # Get time range safely (best effort: failures are reported, not raised)
    try:
        first_row = pd.read_csv(output_file, nrows=1)
        # Skip data lines 1..num_rows-1 but keep line 0 (the header) so the
        # last row is parsed with the proper column names.
        last_row = pd.read_csv(output_file, skiprows=range(1, num_rows), nrows=1)
        
        if not first_row.empty and not last_row.empty and 'timestamp' in first_row.columns and 'timestamp' in last_row.columns:
            first_time = first_row['timestamp'].iloc[0]
            last_time = last_row['timestamp'].iloc[0]
            print(f"Time range: {first_time} to {last_time}")
    except Exception as e:
        print(f"Could not determine time range: {e}")

if __name__ == "__main__":
    # Read back the sensor table produced by the sensor generation step
    sensors_df = load_sensors_data()
    
    if sensors_df is not None:
        # Readings cover the seven days leading up to the present moment
        end_time = datetime.now()
        start_time = end_time - timedelta(days=7)
        
        # 1000 points per sensor gives a dense series, as is typical
        # for manufacturing time-series data
        sample_df = generate_sensor_readings(
            sensors_df,
            1000,
            start_time,
            end_time,
            "data/sensor_readings.csv",
        )
        
        if sample_df is not None:
            # Preview the head of the file, then summarize the whole file
            print("\nSample data (first 5 records):")
            print(sample_df.head())
            get_sample_statistics("data/sensor_readings.csv")

Generating 100000 sensor readings...
Generated 10000 readings so far...
Generated 20000 readings so far...
Generated 30000 readings so far...
Generated 40000 readings so far...
Generated 50000 readings so far...
Generated 60000 readings so far...
Generated 70000 readings so far...
Generated 80000 readings so far...
Generated 90000 readings so far...
Generated 100000 readings so far...
Successfully generated 100000 sensor readings.
Data saved to data/sensor_readings.csv

Sample data (first 5 records):
          reading_id     sensor_id                timestamp  value  \
0  READ-D3482C00B685  SEN-0C25D1C0  2025-07-08 19:12:07.601  22.18   
1  READ-199809D38D09  SEN-0C25D1C0  2025-07-08 19:22:12.401  30.55   
2  READ-BB78FDC49B66  SEN-0C25D1C0  2025-07-08 19:32:17.201  23.36   
3  READ-2922724C0A3C  SEN-0C25D1C0  2025-07-08 19:42:22.001  31.57   
4  READ-C753F315886C  SEN-0C25D1C0  2025-07-08 19:52:26.801  35.32   

   quality_indicator  status_code        batch_id equipment_state_id  
0 

Actuators

In [9]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def generate_actuators_data(num_records=100, equipment_ids=None, output_file="data/actuators_data.csv"):
    """
    Generate synthetic data for the Actuators table from ISA-95 Level 1.
    
    Every record gets a random actuator type, manufacturer, model number,
    installation date and 3D location. The control range and control unit
    are derived from the actuator type. On/off actuators (relay, switch)
    keep an exact 0..1 range; all other types get up to 10% random headroom
    added to their upper bound. The result is written to output_file
    (parent directory is created if needed) and also returned.
    
    Parameters:
    - num_records: Number of actuator records to generate
    - equipment_ids: Optional list of equipment IDs to use (if None, new ones will be generated)
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated actuator data
    """
    # Define possible values for categorical fields
    actuator_types = ["valve", "motor", "pump", "heater", "fan", "agitator", "conveyor", 
                      "damper", "cylinder", "positioner", "relay", "switch", "mixer", 
                      "doser", "compressor", "blower"]
    
    manufacturers = ["Siemens", "ABB", "Emerson", "Honeywell", "Schneider Electric", 
                    "Festo", "SMC", "Bürkert", "Danfoss", "Asco", "Parker", "Rotork", 
                    "Auma", "Allen-Bradley", "SEW-Eurodrive", "WEG"]
    
    statuses = ["Active", "Maintenance", "Fault", "Standby", "Offline", "Reserved"]
    
    # Nominal (min, max) control range for each actuator type
    ranges = {
        "valve": (0, 100),             # Percent open
        "motor": (0, 3000),            # RPM
        "pump": (0, 500),              # Flow rate
        "heater": (0, 500),            # Temperature
        "fan": (0, 100),               # Percent speed
        "agitator": (0, 100),          # Percent speed
        "conveyor": (0, 10),           # m/s
        "damper": (0, 100),            # Percent open
        "cylinder": (0, 1000),         # mm extension
        "positioner": (0, 360),        # Degrees
        "relay": (0, 1),               # On/Off
        "switch": (0, 1),              # On/Off
        "mixer": (0, 100),             # Percent speed
        "doser": (0, 50),              # L/min
        "compressor": (0, 200),        # Bar
        "blower": (0, 100),            # Percent speed
    }
    default_range = (0, 100)
    
    # Control unit for each actuator type
    units = {
        "valve": "%",
        "motor": "rpm",
        "pump": "m³/h",
        "heater": "°C",
        "fan": "%",
        "agitator": "%",
        "conveyor": "m/s",
        "damper": "%",
        "cylinder": "mm",
        "positioner": "°",
        "relay": "binary",
        "switch": "binary",
        "mixer": "%",
        "doser": "L/min",
        "compressor": "bar",
        "blower": "%"
    }
    
    def get_control_range(actuator_type):
        """Return the (min, max) control range for one actuator of actuator_type."""
        min_val, max_val = ranges.get(actuator_type, default_range)
        min_val = max(0, min_val)
        
        # Add some variation to the upper bound, except for on/off actuators:
        # jitter would turn their range into e.g. 0..1.07, which is no longer
        # recognizable as binary by consumers of this table.
        if units.get(actuator_type) != "binary":
            max_val = max_val + random.uniform(0, max_val/10)
        
        return min_val, max_val
    
    # Generate unique IDs for equipment (to be used as foreign keys) if not provided.
    # Roughly one equipment per five actuators, but always at least one so that
    # random.choice below never receives an empty list for small num_records.
    if equipment_ids is None or len(equipment_ids) == 0:
        equipment_count = max(1, num_records // 5)
        equipment_ids = [f"EQ-{uuid.uuid4().hex[:8].upper()}" for _ in range(equipment_count)]
    
    # Generate actuator data
    data = {
        "actuator_id": [f"ACT-{uuid.uuid4().hex[:8].upper()}" for _ in range(num_records)],
        "equipment_id": [random.choice(equipment_ids) for _ in range(num_records)],
        "actuator_type": [random.choice(actuator_types) for _ in range(num_records)],
        "manufacturer": [random.choice(manufacturers) for _ in range(num_records)],
        "model_number": [f"A{random.randint(1000, 9999)}-{random.choice(['X', 'Y', 'Z', 'S', 'P'])}{random.randint(10, 99)}" for _ in range(num_records)],
        "installation_date": [(datetime.now() - timedelta(days=random.randint(30, 1825))).strftime("%Y-%m-%d") for _ in range(num_records)],
        "location_x": [round(random.uniform(0, 100), 2) for _ in range(num_records)],
        "location_y": [round(random.uniform(0, 100), 2) for _ in range(num_records)],
        "location_z": [round(random.uniform(0, 10), 2) for _ in range(num_records)],
    }
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    
    # Generate one (min, max) pair per actuator and split it into two columns
    range_pairs = [get_control_range(actuator_type) for actuator_type in df["actuator_type"]]
    df["control_range_min"] = [round(low, 2) for low, _ in range_pairs]
    df["control_range_max"] = [round(high, 2) for _, high in range_pairs]
    
    # Add control units based on actuator type
    df["control_unit"] = [units.get(actuator_type, "unit") for actuator_type in df["actuator_type"]]
    
    # Add status
    df["status"] = [random.choice(statuses) for _ in range(num_records)]
    
    # Ensure the directory exists ('.' when output_file has no directory part)
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    # Also return the DataFrame for further processing
    return df

def load_sensors_data(sensors_file="data/sensors_data.csv"):
    """
    Collect the distinct equipment IDs referenced by the sensors table.
    
    Parameters:
    - sensors_file: Path of the CSV file holding the sensor records
    
    Returns:
    - List of unique equipment IDs, or None when the file is missing or has
      no equipment_id column
    """
    try:
        equipment_column = pd.read_csv(sensors_file)["equipment_id"]
    except FileNotFoundError:
        # No sensors file yet: signal the caller to fall back to fresh IDs
        print(f"Warning: Sensors data file {sensors_file} not found. Generating new equipment IDs.")
        return None
    except KeyError:
        # File exists but lacks the column we need
        print(f"Warning: No equipment_id column found in {sensors_file}. Generating new equipment IDs.")
        return None
    return equipment_column.unique().tolist()

def generate_and_save_sample(num_records=100, output_file="data/actuators_data.csv"):
    """
    Build a sample actuators table, write it to CSV and print a short summary.
    
    Equipment IDs are taken from the sensors file when it can be read, so
    that actuators and sensors refer to the same equipment.
    
    Parameters:
    - num_records: Number of actuator records to generate
    - output_file: CSV file to save the data
    
    Returns:
    - DataFrame containing the generated actuator data
    """
    # May be None, in which case the generator invents its own equipment IDs
    known_equipment = load_sensors_data()
    
    print(f"Generating {num_records} actuator records...")
    df = generate_actuators_data(num_records, known_equipment, output_file)
    print(f"Saved CSV data to {output_file}")
    
    # Preview of the generated rows
    print("\nSample data (first 5 records):")
    print(df.head())
    
    # Headline numbers about the generated table
    print("\nBasic statistics:")
    print(f"Total actuators: {len(df)}")
    print(f"Unique equipment IDs: {df['equipment_id'].nunique()}")
    print(f"Actuator type distribution:\n{df['actuator_type'].value_counts()[:5]}")
    
    return df

if __name__ == "__main__":
    # The output folder must be present before any file is written into it
    os.makedirs("data", exist_ok=True)
    
    # Build 100 sample actuators and persist them under data/
    df = generate_and_save_sample(100, "data/actuators_data.csv")

Generating 100 actuator records...
Saved CSV data to data/actuators_data.csv

Sample data (first 5 records):
    actuator_id equipment_id actuator_type   manufacturer model_number  \
0  ACT-EF54959F  EQ-8953C760        switch            WEG    A3880-S83   
1  ACT-9B4081FB  EQ-F3D6BF0F      cylinder  Allen-Bradley    A8717-Y78   
2  ACT-5BEB536A  EQ-38DD265D        switch          Festo    A2688-Y94   
3  ACT-AC6E4A22  EQ-F4696348    positioner           Auma    A3981-X78   
4  ACT-0FB6FE15  EQ-9A644928         motor        Danfoss    A9798-Y21   

  installation_date  location_x  location_y  location_z  control_range_min  \
0        2022-08-15       55.20       42.07        8.34                  0   
1        2023-05-16        9.98       79.25        0.93                  0   
2        2023-05-02       88.33       73.43        5.67                  0   
3        2024-07-02        7.20        4.39        5.87                  0   
4        2023-02-07       59.23       37.31        5.11 

ActuatorCommands

In [10]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

def load_actuators_data(actuators_file="data/actuators_data.csv"):
    """
    Read the actuators table written by the actuator generation step.
    
    Parameters:
    - actuators_file: Path of the CSV file holding the actuator records
    
    Returns:
    - DataFrame with the actuators data, or None when the file does not exist
    """
    try:
        actuators = pd.read_csv(actuators_file)
    except FileNotFoundError:
        # A missing file is reported to the user instead of raising
        print(f"Error: Actuators data file {actuators_file} not found.")
        print("Please run the actuators data generation script first.")
        return None
    return actuators

def generate_actuator_commands(actuators_df, num_commands_per_actuator=100, 
                            start_time=None, end_time=None, output_file="data/actuator_commands.csv"):
    """
    Generate synthetic actuator commands data based on the actuators table
    
    For each actuator, randomly placed timestamps are drawn inside the time
    window and one command is written per timestamp. The command type is
    chosen from a set that fits the actuator type, and the command value is
    derived from the command type and the actuator's control range. Relays
    and switches (and any actuator whose range is exactly 0..1) are treated
    as binary and only ever receive the values 0 or 1. Rows are streamed to
    disk one at a time.
    
    Parameters:
    - actuators_df: DataFrame containing actuator data (actuator_id,
      actuator_type, control_range_min and control_range_max columns are read)
    - num_commands_per_actuator: Number of commands to generate per actuator
    - start_time: Start time for commands (defaults to 7 days ago)
    - end_time: End time for commands (defaults to now)
    - output_file: CSV file to save the commands data
    
    Returns:
    - DataFrame containing a sample of the generated commands data (first
      1000 rows), or None when no actuator data was supplied
    """
    if actuators_df is None or len(actuators_df) == 0:
        print("Error: No actuator data available.")
        return None
    
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=7)
    if end_time is None:
        end_time = datetime.now()
    
    # Create a set of batch IDs, step IDs and operator IDs to simulate relationships
    batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(20)]
    step_ids = [f"STEP-{uuid.uuid4().hex[:8].upper()}" for _ in range(50)]
    operator_ids = [f"OP-{uuid.uuid4().hex[:6].upper()}" for _ in range(15)]
    
    # Command types and control modes
    command_types = ["position", "speed", "open", "close", "start", "stop", "setpoint", "reset"]
    control_modes = ["Auto", "Manual", "Cascade", "Supervised"]
    
    # Control mode weights (Auto is most common)
    control_mode_weights = [0.7, 0.2, 0.05, 0.05]
    
    # Commands that make sense for each family of actuator types
    command_groups = [
        (('valve', 'damper'), ["open", "close", "position"]),
        (('motor', 'pump', 'fan', 'agitator', 'conveyor', 'mixer', 'blower'), ["start", "stop", "speed"]),
        (('heater', 'compressor'), ["start", "stop", "setpoint"]),
        (('cylinder', 'positioner'), ["position", "reset"]),
        (('relay', 'switch'), ["open", "close"]),
    ]
    commands_by_actuator_type = {
        type_name: commands
        for type_names, commands in command_groups
        for type_name in type_names
    }
    
    # Actuator types that are on/off devices regardless of the stored range
    binary_actuator_types = ('relay', 'switch')
    
    # Calculate total number of commands
    total_commands = len(actuators_df) * num_commands_per_actuator
    
    # Loop-invariant length of the time window
    window = end_time - start_time
    
    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = [
            'command_id', 'actuator_id', 'timestamp', 'command_value', 
            'command_type', 'control_mode', 'operator_id', 'batch_id', 'step_id'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        print(f"Generating {total_commands} actuator commands...")
        commands_count = 0
        
        # Process each actuator
        for _, actuator in actuators_df.iterrows():
            # Get the actuator's control range
            min_val = actuator['control_range_min']
            max_val = actuator['control_range_max']
            actuator_type = actuator['actuator_type']
            
            # Unknown actuator types may receive any command type
            specific_commands = commands_by_actuator_type.get(actuator_type, command_types)
            
            # Binary detection goes by type as well as by range: the stored
            # upper bound of a relay/switch may carry generation jitter
            # (e.g. 1.07), so comparing the range with 0..1 alone can miss it.
            is_binary = actuator_type in binary_actuator_types or (min_val == 0 and max_val == 1)
            
            # Generate timestamps within the specified range, in chronological order
            time_points = sorted(
                start_time + window * random.random()
                for _ in range(num_commands_per_actuator)
            )
            
            # Generate commands for this actuator
            for time_point in time_points:
                # Create a unique command ID
                command_id = f"CMD-{uuid.uuid4().hex[:12].upper()}"
                
                # Format timestamp with millisecond precision
                timestamp = time_point.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                
                # Select command type
                command_type = random.choice(specific_commands)
                
                # Determine command value based on command type and actuator type
                if command_type in ("open", "start"):
                    # For binary/open commands, use max value
                    command_value = max_val
                elif command_type in ("close", "stop"):
                    # For binary/close commands, use min value
                    command_value = min_val
                elif command_type == "reset":
                    # Reset to a default position (0 for binary, 10% otherwise)
                    if is_binary:
                        command_value = 0
                    else:
                        command_value = min_val + (max_val - min_val) * 0.1
                else:
                    # For setpoints, speeds and positions, use a value within range
                    # Create clustering around common setpoints (0%, 25%, 50%, 75%, 100%)
                    if random.random() < 0.6:  # 60% chance of common setpoint
                        common_points = [
                            min_val,
                            min_val + (max_val - min_val) * 0.25,
                            min_val + (max_val - min_val) * 0.5,
                            min_val + (max_val - min_val) * 0.75,
                            max_val
                        ]
                        command_value = random.choice(common_points)
                    else:
                        # Random value within range
                        command_value = min_val + random.random() * (max_val - min_val)
                
                # Round command value appropriately: whole numbers for
                # binary actuators, two decimals for everything else
                if is_binary:
                    command_value = round(command_value)
                else:
                    command_value = round(command_value, 2)
                
                # Determine control mode (weighted selection)
                control_mode = random.choices(control_modes, weights=control_mode_weights)[0]
                
                # Only include operator ID for manual control mode
                if control_mode == "Manual":
                    operator_id = random.choice(operator_ids)
                else:
                    operator_id = ""
                
                # Batch ID and Step ID (some commands may not be associated with a batch/step)
                batch_id = ""
                step_id = ""
                if random.random() < 0.8:  # 80% chance of having a batch
                    batch_id = random.choice(batch_ids)
                    # A step is only ever assigned together with a batch
                    if random.random() < 0.9:  # 90% chance of having a step if there's a batch
                        step_id = random.choice(step_ids)
                
                # Write the command to the CSV
                writer.writerow({
                    'command_id': command_id,
                    'actuator_id': actuator['actuator_id'],
                    'timestamp': timestamp,
                    'command_value': command_value,
                    'command_type': command_type,
                    'control_mode': control_mode,
                    'operator_id': operator_id,
                    'batch_id': batch_id,
                    'step_id': step_id
                })
                
                commands_count += 1
                if commands_count % 10000 == 0:
                    print(f"Generated {commands_count} commands so far...")
    
    print(f"Successfully generated {commands_count} actuator commands.")
    print(f"Data saved to {output_file}")
    
    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Print summary statistics for a generated actuator-commands CSV.

    The file is scanned in 10,000-row chunks so it never has to fit in
    memory at once. The reported time range comes only from the first
    1,000 rows and is therefore approximate. Any exception raised while
    reading or summarising is caught and reported on stdout.

    Parameters:
    - output_file: CSV file containing the commands data
    """
    def _non_empty_strings(column):
        # Blank CSV cells come back as NaN (a float); keep real, non-empty strings only
        return [value for value in column.unique() if isinstance(value, str) and value]

    try:
        # Small preview, used for the (approximate) time range below
        preview = pd.read_csv(output_file, nrows=1000)

        total_rows = 0
        seen = {
            'actuator_id': set(),
            'batch_id': set(),
            'step_id': set(),
            'operator_id': set(),
            'command_type': set(),
            'control_mode': set(),
        }

        # Stream the file and accumulate the distinct values per column
        for part in pd.read_csv(output_file, chunksize=10000):
            total_rows += len(part)
            seen['actuator_id'].update(part['actuator_id'].unique())
            for optional_column in ('batch_id', 'step_id', 'operator_id'):
                seen[optional_column].update(_non_empty_strings(part[optional_column]))
            for required_column in ('command_type', 'control_mode'):
                seen[required_column].update(part[required_column].unique())

        print("\nData Statistics:")
        print(f"Total commands: {total_rows}")
        print(f"Unique actuators: {len(seen['actuator_id'])}")
        print(f"Unique batches: {len(seen['batch_id'])}")
        print(f"Unique steps: {len(seen['step_id'])}")
        print(f"Unique operators: {len(seen['operator_id'])}")
        print(f"Command types: {sorted(seen['command_type'])}")
        print(f"Control modes: {sorted(seen['control_mode'])}")

        # Time range is derived from the preview rows only
        if 'timestamp' in preview.columns:
            parsed_times = pd.to_datetime(preview['timestamp'])
            earliest = parsed_times.min()
            latest = parsed_times.max()
            print(f"Time range (from sample): approximately {earliest} to {latest}")

    except Exception as e:
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # Entry point for this cell: generate one week of actuator commands into
    # data/actuator_commands.csv, then print a preview and summary statistics.

    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load the actuators data
    # NOTE(review): load_actuators_data is defined outside this excerpt; the
    # None check below suggests it returns None when loading fails - confirm.
    actuators_df = load_actuators_data()
    
    if actuators_df is not None:
        # Define the time range (past 7 days)
        end_time = datetime.now()
        start_time = end_time - timedelta(days=7)
        
        # Generate commands (the return value is a preview read back from the
        # CSV, limited to the first 1000 rows)
        sample_df = generate_actuator_commands(
            actuators_df, 
            num_commands_per_actuator=100,  # 100 commands per actuator
            start_time=start_time,
            end_time=end_time,
            output_file="data/actuator_commands.csv"
        )
        
        # Display a sample of the data
        if sample_df is not None:
            print("\nSample data (first 5 records):")
            print(sample_df.head())
            
            # Get statistics (re-reads the full CSV in chunks and prints a summary)
            get_sample_statistics("data/actuator_commands.csv")

Generating 10000 actuator commands...
Generated 10000 commands so far...
Successfully generated 10000 actuator commands.
Data saved to data/actuator_commands.csv

Sample data (first 5 records):
         command_id   actuator_id                timestamp  command_value  \
0  CMD-3B0FE47DA496  ACT-EF54959F  2025-07-08 20:38:42.771            1.0   
1  CMD-AF14DC8DE979  ACT-EF54959F  2025-07-08 20:43:54.456            1.0   
2  CMD-7A7A7DA249D9  ACT-EF54959F  2025-07-08 22:01:24.123            1.0   
3  CMD-10A270E97B11  ACT-EF54959F  2025-07-08 23:11:30.013            1.0   
4  CMD-392AC3FD6B89  ACT-EF54959F  2025-07-09 01:21:39.333            1.0   

  command_type control_mode operator_id        batch_id        step_id  
0         open         Auto         NaN  BATCH-E5574981  STEP-F3AB499C  
1         open         Auto         NaN  BATCH-E50133B1            NaN  
2         open         Auto         NaN  BATCH-E5574981            NaN  
3         open       Manual   OP-B6553C  BATCH-68D3

DeviceDiagnostics

In [11]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os
import csv

# Columns shared by the sensors and actuators tables after normalisation
_DEVICE_COLUMNS = ['device_id', 'device_type', 'device_subtype', 'equipment_id',
                   'manufacturer', 'model_number', 'status']


def _read_device_table(csv_file, device_type, label):
    """Load one device CSV and normalise it to the shared device schema; None if the file is missing."""
    try:
        table = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Warning: {label} data file {csv_file} not found.")
        return None

    # '<type>_id' / '<type>_type' become the generic device_id / device_subtype
    table = table.rename(columns={f'{device_type}_id': 'device_id',
                                  f'{device_type}_type': 'device_subtype'})
    table['device_type'] = device_type
    return table[_DEVICE_COLUMNS]


def load_sensors_and_actuators_data(sensors_file="data/sensors_data.csv", actuators_file="data/actuators_data.csv"):
    """
    Load the previously generated sensors and actuators data to create device diagnostics

    Both files are normalised to a common schema (device_id, device_type,
    device_subtype, equipment_id, manufacturer, model_number, status) and
    stacked, sensors first. A missing file only produces a warning; the
    other file is still used. The combined frame gets a fresh 0..n-1 index
    so that row labels are unique (plain concatenation would repeat the
    labels of the sensors table for the actuators rows).

    Parameters:
    - sensors_file: CSV file containing sensor data
    - actuators_file: CSV file containing actuator data

    Returns:
    - DataFrame containing combined device data, or None when no device
      rows could be loaded

    Raises:
    - KeyError: if a file exists but lacks one of the expected columns
    """
    tables = [
        _read_device_table(sensors_file, 'sensor', 'Sensors'),
        _read_device_table(actuators_file, 'actuator', 'Actuators'),
    ]
    loaded = [table for table in tables if table is not None]

    # Only concatenate real tables; ignore_index avoids duplicate row labels
    devices_df = pd.concat(loaded, ignore_index=True) if loaded else pd.DataFrame()

    if len(devices_df) == 0:
        print("Error: No device data found. Please run sensor and actuator data generation scripts first.")
        return None

    return devices_df

def generate_device_diagnostics(devices_df, num_diagnostics_per_device=10, 
                              start_time=None, end_time=None, output_file="data/device_diagnostics.csv"):
    """
    Generate synthetic device diagnostics data based on the sensors and actuators tables

    For every device a chronologically ordered series of diagnostic records
    is written straight to ``output_file`` (streamed row by row). Device
    health starts at 100% and only deteriorates along the series (floored
    at 50%); severity, status code, message and the maintenance flag are
    derived from it. Whether a device is battery powered (wireless) is
    decided once per device, so ``battery_level`` is either filled for all
    of a device's records or blank for all of them.

    Parameters:
    - devices_df: DataFrame containing device data; must provide the
      columns device_id, device_type and status
    - num_diagnostics_per_device: Number of diagnostic records to generate per device
    - start_time: Start time for diagnostics (defaults to 30 days ago)
    - end_time: End time for diagnostics (defaults to now)
    - output_file: CSV file to save the diagnostics data

    Returns:
    - DataFrame containing a sample of the generated diagnostics data
      (first 1000 rows read back from the CSV), or None when no device
      data was supplied
    """
    if devices_df is None or len(devices_df) == 0:
        print("Error: No device data available.")
        return None

    # Set default time range if not provided (diagnostic data typically spans longer time than readings)
    if start_time is None:
        start_time = datetime.now() - timedelta(days=30)
    if end_time is None:
        end_time = datetime.now()

    # Define diagnostic types
    diagnostic_types = {
        'sensor': [
            'Calibration Check', 'Signal Quality Test', 'Range Verification',
            'Response Time Test', 'Interference Check', 'Power Supply Test',
            'Communication Test', 'Self-Diagnostic', 'Drift Analysis'
        ],
        'actuator': [
            'Movement Test', 'Response Time Test', 'Leak Test', 'Position Verification',
            'Torque Test', 'Speed Test', 'Current Draw Test', 'Self-Diagnostic',
            'Feedback Verification', 'Lubrication Check', 'Wear Analysis'
        ]
    }

    # Define status codes
    # 0 = Normal, 1 = Warning, 2 = Minor Issue, 3 = Moderate Issue, 4 = Major Issue, 5 = Critical Failure
    status_code_messages = {
        0: ["Normal operation", "No issues detected", "All parameters within normal range",
            "Device functioning correctly", "Diagnostic passed"],
        1: ["Minor deviation detected", "Parameter near warning threshold", "Slight performance degradation",
            "Recommend monitoring", "Non-critical warning"],
        2: ["Parameter outside optimal range", "Performance degradation detected", "Maintenance recommended",
            "Minor issue detected", "Device requires attention"],
        3: ["Significant deviation detected", "Multiple parameters out of range", "Performance significantly degraded",
            "Maintenance required soon", "Device operating outside specifications"],
        4: ["Major issue detected", "Device may fail soon", "Immediate maintenance required",
            "Performance severely degraded", "Reliability compromised"],
        5: ["Critical failure detected", "Device non-operational", "Emergency maintenance required",
            "Safety risk possible", "Replace device immediately"]
    }

    # Severity candidates per health band: the first band whose lower bound
    # the health exceeds is used; repeated entries act as weights.
    severity_by_health = [
        (95, [0]),
        (90, [0, 0, 0, 1]),
        (80, [0, 0, 1, 1, 2]),
        (70, [0, 1, 1, 2, 2]),
        (60, [1, 2, 2, 3, 3]),
    ]
    poor_health_severities = [2, 3, 3, 4, 4, 5]

    # Calculate total number of diagnostics
    total_diagnostics = len(devices_df) * num_diagnostics_per_device

    # Prepare the output file with CSV writer for memory efficiency
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = [
            'diagnostic_id', 'device_id', 'timestamp', 'diagnostic_type',
            'status_code', 'diagnostic_message', 'severity_level',
            'battery_level', 'communication_quality', 'internal_temperature',
            'maintenance_required'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        print(f"Generating {total_diagnostics} device diagnostic records...")
        diagnostics_count = 0

        # Process each device
        for _, device in devices_df.iterrows():
            device_id = device['device_id']
            device_type = device['device_type']
            device_status = device['status']

            # Get applicable diagnostic types for this device type
            applicable_diagnostics = diagnostic_types.get(device_type, diagnostic_types['sensor'])

            # Generate timestamps within the specified range (more spread out than readings)
            time_points = sorted(
                start_time + (end_time - start_time) * random.random()
                for _ in range(num_diagnostics_per_device)
            )

            # Current device state (will evolve over time to simulate deterioration)
            device_health = 100.0  # Start at 100% health

            # Rate of deterioration (different for each device)
            deterioration_rate = random.uniform(0.1, 2.0)

            # Devices in "Maintenance" or "Fault" status deteriorate faster
            deterioration_factor = 2.0 if device_status in ["Maintenance", "Fault"] else 1.0

            # Being wireless is a property of the device, not of a single
            # reading, so decide it once per device (30% are wireless).
            has_battery = random.random() < 0.3

            # Generate diagnostic records for this device
            for i, time_point in enumerate(time_points):
                # Create a unique diagnostic ID
                diagnostic_id = f"DIAG-{uuid.uuid4().hex[:12].upper()}"

                # Format timestamp (truncate microseconds to milliseconds)
                timestamp = time_point.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]

                # Determine diagnostic type
                diagnostic_type = random.choice(applicable_diagnostics)

                # Calculate position in time series (0 to 1)
                time_position = i / (num_diagnostics_per_device - 1) if num_diagnostics_per_device > 1 else 0

                # Decrease health over time with some randomness
                health_decrease = deterioration_rate * deterioration_factor * time_position
                health_decrease += random.uniform(-0.5, 0.5) * health_decrease  # Add noise
                device_health = max(50, device_health - health_decrease)  # Don't let it go below 50%

                # Determine severity based on device health and some randomness
                severity_candidates = next(
                    (candidates for floor, candidates in severity_by_health if device_health > floor),
                    poor_health_severities,
                )
                severity_level = random.choice(severity_candidates)

                # Status code mirrors the severity level
                status_code = severity_level

                # Get appropriate diagnostic message
                diagnostic_message = random.choice(status_code_messages[status_code])

                # Generate battery level (only for wireless devices)
                if has_battery:
                    # Battery level decreases over time, with some recharges
                    base_battery = 100 - 50 * time_position  # Decreases from 100% to 50%
                    # Occasionally "recharge" the battery
                    if random.random() < 0.2:  # 20% chance of recent recharge
                        base_battery = min(100, base_battery + random.uniform(30, 80))

                    battery_level = round(base_battery + random.uniform(-10, 10), 1)  # Add some noise
                    battery_level = max(5, min(100, battery_level))  # Keep between 5% and 100%
                else:
                    battery_level = ''  # Blank cell for wired devices

                # Generate communication quality
                base_comm_quality = 100 - 20 * time_position  # Slight degradation over time
                # Add random fluctuations
                comm_quality = round(base_comm_quality + random.uniform(-15, 5), 1)
                comm_quality = max(60, min(100, comm_quality))  # Keep between 60% and 100%

                # Generate internal temperature
                if device_type == 'sensor':
                    # Sensors typically run cooler
                    base_temp = 25 + 5 * time_position  # Gradual increase over time
                    temp_variation = random.uniform(-3, 5)
                else:
                    # Actuators often run hotter
                    base_temp = 30 + 8 * time_position  # More significant increase over time
                    temp_variation = random.uniform(-3, 8)

                internal_temperature = round(base_temp + temp_variation, 1)

                # Determine if maintenance is required
                maintenance_required = severity_level >= 3  # Levels 3, 4, 5 require maintenance

                # Write the diagnostic record to the CSV
                writer.writerow({
                    'diagnostic_id': diagnostic_id,
                    'device_id': device_id,
                    'timestamp': timestamp,
                    'diagnostic_type': diagnostic_type,
                    'status_code': status_code,
                    'diagnostic_message': diagnostic_message,
                    'severity_level': severity_level,
                    'battery_level': battery_level,
                    'communication_quality': comm_quality,
                    'internal_temperature': internal_temperature,
                    'maintenance_required': 1 if maintenance_required else 0
                })

                diagnostics_count += 1
                if diagnostics_count % 10000 == 0:
                    print(f"Generated {diagnostics_count} diagnostic records so far...")

    print(f"Successfully generated {diagnostics_count} device diagnostic records.")
    print(f"Data saved to {output_file}")

    # Return a sample of the data (first 1000 rows) for preview
    return pd.read_csv(output_file, nrows=1000)

def get_sample_statistics(output_file):
    """
    Get basic statistics about the generated diagnostics data

    The CSV is read in 10,000-row chunks to keep memory use bounded and a
    summary is printed to stdout: record and device counts, the diagnostic
    types and status codes seen, the distribution of severity levels 0-5
    and the share of records flagged as requiring maintenance. The time
    range is taken from the first 1,000 rows only and is therefore
    approximate. A file that contains only a header is reported with zero
    counts and 0.0% shares instead of failing on a division by zero.
    Any exception raised while reading or summarising is caught and
    reported on stdout.

    Parameters:
    - output_file: CSV file containing the diagnostics data
    """
    try:
        # Preview rows, used only for the approximate time range
        sample_df = pd.read_csv(output_file, nrows=1000)

        # Sample statistics on chunks to avoid memory issues
        num_rows = 0
        device_ids = set()
        diagnostic_types = set()
        status_codes = set()
        severity_counts = {level: 0 for level in range(6)}
        maintenance_required_count = 0

        # Process in chunks
        for chunk in pd.read_csv(output_file, chunksize=10000):
            num_rows += len(chunk)
            device_ids.update(chunk['device_id'].unique())
            diagnostic_types.update(chunk['diagnostic_type'].unique())
            # tolist() yields plain Python numbers so the printed list is
            # not cluttered with numpy scalar reprs
            status_codes.update(chunk['status_code'].unique().tolist())

            # Count severity levels
            for severity in severity_counts:
                severity_counts[severity] += int((chunk['severity_level'] == severity).sum())

            # Count maintenance required
            maintenance_required_count += int(chunk['maintenance_required'].sum())

        def share(count):
            # Percentage of all rows; 0 for an empty file (avoids 0/0)
            return (count / num_rows) * 100 if num_rows > 0 else 0

        print("\nData Statistics:")
        print(f"Total diagnostic records: {num_rows}")
        print(f"Unique devices: {len(device_ids)}")
        print(f"Diagnostic types: {sorted(diagnostic_types)}")
        print(f"Status codes: {sorted(status_codes)}")

        print("\nSeverity Level Distribution:")
        for severity, count in severity_counts.items():
            print(f"Level {severity}: {count} records ({share(count):.1f}%)")

        print(f"\nMaintenance Required: {maintenance_required_count} records ({share(maintenance_required_count):.1f}%)")

        # Get time range if available (skipped when there are no rows,
        # which would otherwise print "NaT to NaT")
        if 'timestamp' in sample_df.columns and not sample_df.empty:
            timestamps = pd.to_datetime(sample_df['timestamp'])
            min_time = timestamps.min()
            max_time = timestamps.max()
            print(f"Time range (from sample): approximately {min_time} to {max_time}")

    except Exception as e:
        print(f"Error getting statistics: {e}")

if __name__ == "__main__":
    # Entry point for this cell: generate 30 days of device diagnostics into
    # data/device_diagnostics.csv, then print a preview and summary statistics.

    # Create directories if they don't exist
    os.makedirs("data", exist_ok=True)
    
    # Load the device data (combining sensors and actuators); the loader
    # returns None when neither input file yields any rows
    devices_df = load_sensors_and_actuators_data()
    
    if devices_df is not None:
        # Define the time range (past 30 days for diagnostics)
        end_time = datetime.now()
        start_time = end_time - timedelta(days=30)
        
        # Generate diagnostic records (the return value is a preview read
        # back from the CSV, limited to the first 1000 rows)
        sample_df = generate_device_diagnostics(
            devices_df, 
            num_diagnostics_per_device=10,  # 10 diagnostics per device
            start_time=start_time,
            end_time=end_time,
            output_file="data/device_diagnostics.csv"
        )
        
        # Display a sample of the data
        if sample_df is not None:
            print("\nSample data (first 5 records):")
            print(sample_df.head())
            
            # Get statistics (re-reads the full CSV in chunks and prints a summary)
            get_sample_statistics("data/device_diagnostics.csv")

Generating 2000 device diagnostic records...
Successfully generated 2000 device diagnostic records.
Data saved to data/device_diagnostics.csv

Sample data (first 5 records):
       diagnostic_id     device_id                timestamp  \
0  DIAG-078498B17D62  SEN-0C25D1C0  2025-06-15 21:02:37.927   
1  DIAG-86B1F9CFBC73  SEN-0C25D1C0  2025-06-20 08:28:30.712   
2  DIAG-0F953E086CDA  SEN-0C25D1C0  2025-06-21 15:20:21.753   
3  DIAG-B05B63B3D703  SEN-0C25D1C0  2025-06-26 23:12:14.977   
4  DIAG-D1056E3A8636  SEN-0C25D1C0  2025-07-01 20:24:37.738   

      diagnostic_type  status_code                  diagnostic_message  \
0      Drift Analysis            0                    Normal operation   
1  Response Time Test            0                    Normal operation   
2  Communication Test            0        Device functioning correctly   
3      Drift Analysis            0                   Diagnostic passed   
4  Response Time Test            0  All parameters within normal range   

  

ControlLoops

In [12]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_devices_data(sensors_file="data/sensors_data.csv", actuators_file="data/actuators_data.csv"):
    """
    Read the sensors and actuators CSVs that control loops are built from.

    Both files are always attempted, so a warning is printed for each one
    that is missing. Control loops need both tables: if either could not
    be loaded an error is printed and a pair of Nones is returned.

    Parameters:
    - sensors_file: CSV file containing sensor data
    - actuators_file: CSV file containing actuator data

    Returns:
    - Tuple (sensors DataFrame, actuators DataFrame), or (None, None)
      when at least one file is missing
    """
    sources = (("Sensors", sensors_file), ("Actuators", actuators_file))
    frames = {}

    for label, path in sources:
        try:
            frames[label] = pd.read_csv(path)
        except FileNotFoundError:
            # Remember the failure but keep going so every missing file is reported
            frames[label] = None
            print(f"Warning: {label} data file {path} not found.")

    if any(frame is None for frame in frames.values()):
        print("Error: Both sensor and actuator data are required for control loops. Please run sensor and actuator data generation scripts first.")
        return None, None

    return frames["Sensors"], frames["Actuators"]

# sensor type -> (display name, actuator types that make a conventional loop)
_NAMED_LOOP_RULES = {
    'temperature': ("Temperature", ('heater', 'valve')),
    'flow': ("Flow", ('valve', 'pump')),
    'pressure': ("Pressure", ('valve', 'compressor')),
    'level': ("Level", ('valve', 'pump')),
    'ph': ("pH", ('pump', 'valve', 'doser')),
    'speed': ("Speed", ('motor', 'fan')),
    'position': ("Position", ('positioner', 'motor')),
}


def _control_loop_name(sensor_type, actuator_type, number):
    """Return a descriptive loop name for a sensor/actuator type pair."""
    rule = _NAMED_LOOP_RULES.get(sensor_type)
    if rule is not None and actuator_type in rule[1]:
        return f"{rule[0]} Control {number}"
    return f"{sensor_type.capitalize()}-{actuator_type.capitalize()} Control {number}"


def _pid_parameters(controller_type):
    """Return random (P, I, D) tuning values appropriate for the controller type."""
    if controller_type == "PID":
        # Standard PID values
        return (round(random.uniform(0.5, 10.0), 2),
                round(random.uniform(0.05, 2.0), 3),
                round(random.uniform(0, 0.5), 3))
    if controller_type == "On-Off":
        # On-Off controllers don't use P, I, D in the same way
        return 1.0, 0.0, 0.0
    # Other controller types with some variation
    return (round(random.uniform(0.1, 20.0), 2),
            round(random.uniform(0, 5.0), 3),
            round(random.uniform(0, 2.0), 3))


def _pair_sensors_with_actuators(sensors, actuators):
    """Pair every sensor record with a distinct actuator record, preferring shared equipment."""
    # Positions of still-unassigned actuators, grouped by equipment
    unused_by_equipment = {}
    for position, actuator in enumerate(actuators):
        equipment = actuator['equipment_id']
        if pd.notna(equipment):
            unused_by_equipment.setdefault(equipment, []).append(position)

    # Pass 1: give each sensor an unused actuator on its own equipment, if any
    assigned = [None] * len(sensors)
    taken = set()
    for index, sensor in enumerate(sensors):
        equipment = sensor['equipment_id']
        candidates = unused_by_equipment.get(equipment) if pd.notna(equipment) else None
        if candidates:
            assigned[index] = candidates.pop()
            taken.add(assigned[index])

    # Pass 2: remaining sensors receive the remaining actuators (the
    # actuators were sampled randomly, so list order is already random)
    leftovers = [position for position in range(len(actuators)) if position not in taken]
    pairings = []
    for index, sensor in enumerate(sensors):
        if assigned[index] is not None:
            actuator = actuators[assigned[index]]
            equipment_id = sensor['equipment_id']
        else:
            actuator = actuators[leftovers.pop()]
            # Use one of their equipment IDs (prefer the sensor's)
            if random.random() < 0.7:
                equipment_id = sensor['equipment_id']
            else:
                equipment_id = actuator['equipment_id']
        pairings.append((sensor, actuator, equipment_id))
    return pairings


def generate_control_loops_data(sensors_df, actuators_df, num_loops=50, output_file="data/control_loops.csv"):
    """
    Generate synthetic control loops data based on the sensors and actuators tables

    Each loop links one process-variable sensor to one control-output
    actuator. Sensors and actuators are sampled without replacement and
    every actuator is assigned to at most one loop, so both ID columns of
    the result are unique. Actuators on the same equipment as the sensor
    are preferred; sensors without such a partner receive one of the
    remaining actuators. The result is written to ``output_file``.

    Parameters:
    - sensors_df: DataFrame containing sensors data (uses sensor_id,
      sensor_type, equipment_id, measurement_unit, measurement_range_min,
      measurement_range_max)
    - actuators_df: DataFrame containing actuators data (uses actuator_id,
      actuator_type, equipment_id)
    - num_loops: Number of control loops to generate (capped by the number
      of suitable sensors and actuators)
    - output_file: CSV file to save the control loops data

    Returns:
    - DataFrame containing the generated control loops data, or None when
      the inputs are missing/empty or no loop can be formed
    """
    if sensors_df is None or actuators_df is None or len(sensors_df) == 0 or len(actuators_df) == 0:
        print("Error: Valid sensor and actuator data required.")
        return None

    # Filter sensors by type (only certain sensor types are used for process variables)
    pv_sensor_types = ['temperature', 'pressure', 'flow', 'level', 'ph', 'conductivity',
                       'speed', 'position', 'weight', 'humidity', 'oxygen', 'co2']

    pv_sensors_df = sensors_df[sensors_df['sensor_type'].isin(pv_sensor_types)]

    if len(pv_sensors_df) == 0:
        print("Warning: No suitable process variable sensors found. Using all sensors instead.")
        pv_sensors_df = sensors_df

    # Filter actuators by type (only certain actuator types are used for control outputs)
    cv_actuator_types = ['valve', 'motor', 'pump', 'heater', 'fan', 'agitator',
                         'damper', 'positioner', 'doser', 'compressor']

    cv_actuators_df = actuators_df[actuators_df['actuator_type'].isin(cv_actuator_types)]

    if len(cv_actuators_df) == 0:
        print("Warning: No suitable control variable actuators found. Using all actuators instead.")
        cv_actuators_df = actuators_df

    # Make sure we have enough sensors and actuators
    num_loops = min(num_loops, len(pv_sensors_df), len(cv_actuators_df))

    if num_loops == 0:
        print("Error: Not enough sensors and actuators to create control loops.")
        return None

    print(f"Generating {num_loops} control loops...")

    # Define controller types and their probabilities
    controller_types = {
        "PID": 0.6,        # Most common
        "Cascade": 0.15,
        "Feedforward": 0.1,
        "On-Off": 0.05,
        "Ratio": 0.05,
        "Model Predictive": 0.03,
        "Fuzzy Logic": 0.02
    }

    # Define control modes and their probabilities
    control_modes = {
        "Auto": 0.7,       # Most common
        "Manual": 0.15,
        "Cascade": 0.1,
        "Supervised": 0.05
    }

    # Status (mostly active)
    statuses = ["Active", "Tuning", "Inactive", "Fault"]
    status_weights = [0.85, 0.05, 0.07, 0.03]

    # Weighted-choice inputs are loop invariant; build them once
    controller_names = list(controller_types)
    controller_weights = list(controller_types.values())
    mode_names = list(control_modes)
    mode_weights = list(control_modes.values())

    # Sample sensors and actuators without replacement to ensure uniqueness
    sampled_sensors = pv_sensors_df.sample(n=num_loops).to_dict('records')
    sampled_actuators = cv_actuators_df.sample(n=num_loops).to_dict('records')

    pairings = _pair_sensors_with_actuators(sampled_sensors, sampled_actuators)

    columns = [
        'loop_id', 'loop_name', 'process_variable_sensor_id', 'control_output_actuator_id',
        'controller_type', 'control_mode', 'setpoint_value', 'setpoint_unit',
        'p_value', 'i_value', 'd_value', 'equipment_id', 'status'
    ]

    records = []
    for number, (sensor, actuator, equipment_id) in enumerate(pairings, start=1):
        controller_type = random.choices(controller_names, weights=controller_weights)[0]
        control_mode = random.choices(mode_names, weights=mode_weights)[0]

        # Set setpoint somewhere in the middle (30%-70%) of the sensor's range
        range_min = sensor['measurement_range_min']
        range_max = sensor['measurement_range_max']
        setpoint = range_min + (range_max - range_min) * random.uniform(0.3, 0.7)

        p_value, i_value, d_value = _pid_parameters(controller_type)

        records.append({
            'loop_id': f"LOOP-{uuid.uuid4().hex[:8].upper()}",
            'loop_name': _control_loop_name(sensor['sensor_type'], actuator['actuator_type'], number),
            'process_variable_sensor_id': sensor['sensor_id'],
            'control_output_actuator_id': actuator['actuator_id'],
            'controller_type': controller_type,
            'control_mode': control_mode,
            'setpoint_value': round(setpoint, 2),
            'setpoint_unit': sensor['measurement_unit'],
            'p_value': p_value,
            'i_value': i_value,
            'd_value': d_value,
            'equipment_id': equipment_id,
            'status': random.choices(statuses, weights=status_weights)[0],
        })

    # Create DataFrame
    df = pd.DataFrame(records, columns=columns)

    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Successfully generated {num_loops} control loops.")
    print(f"Data saved to {output_file}")

    return df

def display_statistics(df, sensors_file="data/sensors_data.csv",
                       actuators_file="data/actuators_data.csv"):
    """
    Display basic statistics about the generated control loops data

    Parameters:
    - df: DataFrame containing control loops data
    - sensors_file: CSV file providing 'sensor_id' and 'sensor_type' columns,
      used to label each loop's process variable sensor
    - actuators_file: CSV file providing 'actuator_id' and 'actuator_type'
      columns, used to label each loop's control output actuator

    Returns:
    - None; the report is printed to stdout. If either device file cannot be
      read, the sensor-actuator pairing section is skipped with a message
      instead of raising.
    """
    if df is None or len(df) == 0:
        print("No data to analyze.")
        return

    total = len(df)

    print("\nControl Loops Statistics:")
    print(f"Total control loops: {total}")

    # The three categorical breakdowns share one layout, so drive them from data
    distributions = (
        ("Controller Type", "controller_type"),
        ("Control Mode", "control_mode"),
        ("Status", "status"),
    )
    for title, column in distributions:
        print(f"\n{title} Distribution:")
        for value, count in df[column].value_counts().items():
            print(f"  {value}: {count} ({count / total * 100:.1f}%)")

    print("\nPID Parameters (Average):")
    print(f"  P: {df['p_value'].mean():.2f}")
    print(f"  I: {df['i_value'].mean():.3f}")
    print(f"  D: {df['d_value'].mean():.3f}")

    # Check for uniqueness
    print(f"\nUnique process variable sensors: {df['process_variable_sensor_id'].nunique()} of {total}")
    print(f"Unique control output actuators: {df['control_output_actuator_id'].nunique()} of {total}")

    # Analyze the most common sensor-actuator pairings
    print("\nCommon Control Loop Types:")
    try:
        sensors_df = pd.read_csv(sensors_file)
        actuators_df = pd.read_csv(actuators_file)
    except (OSError, pd.errors.EmptyDataError) as exc:
        # The statistics above do not depend on the device files, so a missing
        # or empty file should not turn the whole report into a traceback.
        print(f"  Skipped: could not load device data ({exc})")
        return

    # Left-merge so every loop is kept even when its device is not listed
    merged_df = df.merge(
        sensors_df[['sensor_id', 'sensor_type']],
        left_on='process_variable_sensor_id',
        right_on='sensor_id',
        how='left',
    ).merge(
        actuators_df[['actuator_id', 'actuator_type']],
        left_on='control_output_actuator_id',
        right_on='actuator_id',
        how='left',
    )

    # Count sensor-actuator type pairs, most frequent first
    pair_counts = (
        merged_df.groupby(['sensor_type', 'actuator_type'])
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    # Display top 5 pairs; columns are (sensor_type, actuator_type, count)
    for sensor_type, actuator_type, count in pair_counts.head(5).itertuples(index=False, name=None):
        print(f"  {sensor_type}-{actuator_type}: {count} loops")

if __name__ == "__main__":
    # The generated CSV is written under ./data, so make sure it is there
    os.makedirs("data", exist_ok=True)

    # Pull in the device tables that the control loops reference
    sensors_df, actuators_df = load_devices_data()

    # Both tables are required; if either failed to load there is nothing to do
    if not (sensors_df is None or actuators_df is None):
        # Build 50 control loops and persist them alongside the device data
        control_loops_df = generate_control_loops_data(
            sensors_df,
            actuators_df,
            num_loops=50,
            output_file="data/control_loops.csv",
        )

        # Only report when generation actually produced a DataFrame
        if control_loops_df is not None:
            print("\nSample data (first 5 records):")
            print(control_loops_df.head(5))

            # Summary of distributions, PID averages and device pairings
            display_statistics(control_loops_df)

Generating 50 control loops...
Successfully generated 50 control loops.
Data saved to data/control_loops.csv

Sample data (first 5 records):
         loop_id                    loop_name process_variable_sensor_id  \
0  LOOP-B9F55919  Oxygen-Positioner Control 1               SEN-5108DE00   
1  LOOP-4DCCB066      Humidity-Pump Control 2               SEN-ED322A76   
2  LOOP-F2C8B181       Oxygen-Doser Control 3               SEN-D8485E11   
3  LOOP-744E9C8F           Flow-Fan Control 4               SEN-36198AE5   
4  LOOP-94ACA76F        Flow-Heater Control 5               SEN-D5682186   

  control_output_actuator_id controller_type control_mode  setpoint_value  \
0               ACT-E271F55F           Ratio      Cascade           16.52   
1               ACT-4B0AFAAC             PID         Auto           57.44   
2               ACT-8BB70EA3             PID         Auto           16.61   
3               ACT-B4C149AC             PID      Cascade           80.19   
4               A