In [1]:
# sensor_simulator.py

import random
import time
import json

class Sensor:
    def __init__(self, sensor_id, sensor_type, normal_range, unit):
        self.sensor_id = sensor_id
        self.sensor_type = sensor_type
        self.normal_range = normal_range
        self.unit = unit
        self.current_value = self.generate_normal_value()

    def generate_normal_value(self):
        return round(random.uniform(self.normal_range[0], self.normal_range[1]), 2)

    def read_value(self, under_attack=False, attack_type=None, attack_intensity=1):
        """Generates a sensor reading.
        If under_attack, the value might be anomalous based on attack_type.
        attack_intensity can be used to scale the anomaly (e.g., 1 to 5).
        """
        if under_attack:
            # Simple anomaly generation for demonstration
            # This should be more sophisticated based on specific attack impacts
            if attack_type == "data_manipulation": # e.g., for DoS on sensor or direct tampering
                anomaly_factor = random.choice([-1, 1]) * self.normal_range[1] * 0.5 * attack_intensity
                self.current_value = round(self.generate_normal_value() + anomaly_factor, 2)
            elif attack_type == "sensor_outage": # e.g., DoS making sensor unresponsive or send null
                return None # Or a specific error code
            else:
                # For other attacks, sensor might still report normally, or be indirectly affected
                self.current_value = self.generate_normal_value()
        else:
            # Simulate slight variations around the previous value or generate a new normal one
            if random.random() < 0.7: # 70% chance to stay close to previous
                variation = (self.normal_range[1] - self.normal_range[0]) * 0.05 # 5% of range
                self.current_value = round(max(self.normal_range[0], min(self.normal_range[1], self.current_value + random.uniform(-variation, variation))), 2)
            else:
                self.current_value = self.generate_normal_value()
        return self.current_value

    def get_data_point(self, timestamp, under_attack=False, attack_type=None, attack_intensity=1):
        value = self.read_value(under_attack, attack_type, attack_intensity)
        return {
            "timestamp": timestamp,
            "sensor_id": self.sensor_id,
            "sensor_type": self.sensor_type,
            "value": value,
            "unit": self.unit,
            "status": "error" if value is None else "ok"
        }

# Define sensor types based on the paper
SENSOR_DEFINITIONS = {
    "temperature": {"range": [10, 35], "unit": "Celsius"},
    "humidity": {"range": [30, 70], "unit": "%RH"},
    "water_level": {"range": [0, 100], "unit": "cm"},
    "ph_sensor": {"range": [6.0, 8.0], "unit": "pH"},
    "soil_moisture": {"range": [20, 80], "unit": "%"},
    "flame_sensor": {"range": [0, 1], "unit": "binary"}, # 0 = no flame, 1 = flame
    "ultrasonic_sensor": {"range": [2, 400], "unit": "cm"}, # Distance
    "ir_receiver": {"range": [0, 1], "unit": "binary"}, # Signal detected or not
    "heart_rate_sensor": {"range": [50, 130], "unit": "bpm"},
    # Modbus PLCs can be considered as sources of multiple data points (registers)
    # For simplicity, we can simulate a generic "plc_register_value"
    "plc_register_value": {"range": [0, 65535], "unit": "integer"}
}

def initialize_sensors(num_each_type=1):
    sensors = []
    sensor_id_counter = 0
    for sensor_type, props in SENSOR_DEFINITIONS.items():
        for i in range(num_each_type):
            sensors.append(Sensor(f"{sensor_type}_{i}", sensor_type, props["range"], props["unit"]))
            sensor_id_counter += 1
    return sensors

if __name__ == "__main__":
    sensors = initialize_sensors(num_each_type=2)
    print(f"Initialized {len(sensors)} sensors.")

    for i in range(5):
        print(f"\n--- Simulation Timestep {i+1} ---")
        current_time = time.time()
        for sensor in sensors:
            # Example: Simulate one sensor being under a data manipulation attack
            is_attacked = False
            attack_details = None
            if sensor.sensor_id == "temperature_0" and i >= 2: # Attack temperature_0 from timestep 3
                is_attacked = True
                attack_details = "data_manipulation"

            data = sensor.get_data_point(current_time, under_attack=is_attacked, attack_type=attack_details)
            print(json.dumps(data))
        time.sleep(1)



Initialized 20 sensors.

--- Simulation Timestep 1 ---
{"timestamp": 1746920907.4924839, "sensor_id": "temperature_0", "sensor_type": "temperature", "value": 30.94, "unit": "Celsius", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "temperature_1", "sensor_type": "temperature", "value": 28.41, "unit": "Celsius", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "humidity_0", "sensor_type": "humidity", "value": 46.93, "unit": "%RH", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "humidity_1", "sensor_type": "humidity", "value": 65.18, "unit": "%RH", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "water_level_0", "sensor_type": "water_level", "value": 20.54, "unit": "cm", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "water_level_1", "sensor_type": "water_level", "value": 18.21, "unit": "cm", "status": "ok"}
{"timestamp": 1746920907.4924839, "sensor_id": "ph_sensor_0", "sensor_type": "ph_sensor", "value": 6.86, 

In [2]:
# attack_simulator.py

import random
import time
import json

# Placeholder for network-like log entries
# In a real scenario, this would be more detailed and structured like pcap features
NETWORK_LOG_ENTRIES = []

def log_network_event(timestamp, event_type, src_ip, dst_ip, src_port, dst_port, protocol, details):
    log_entry = {
        "timestamp": timestamp,
        "event_type": event_type, # e.g., "connection_attempt", "data_transfer", "attack_indicator"
        "src_ip": src_ip,
        "dst_ip": dst_ip,
        "src_port": src_port,
        "dst_port": dst_port,
        "protocol": protocol,
        "details": details
    }
    NETWORK_LOG_ENTRIES.append(log_entry)
    # print(json.dumps(log_entry)) # For real-time logging if needed

class AttackSimulator:
    def __init__(self, target_ips, network_segment="192.168.1.0/24"):
        self.target_ips = target_ips # List of IPs that can be targeted
        self.attacker_ips = [f"10.0.0.{i}" for i in range(1, 21)] # Pool of attacker IPs
        self.network_segment = network_segment

    def _get_random_target(self):
        return random.choice(self.target_ips) if self.target_ips else "192.168.1.100"

    def _get_random_attacker(self):
        return random.choice(self.attacker_ips)

    def simulate_ddos_tcp_syn_flood(self, timestamp, duration_seconds=5, intensity=100):
        print(f"INFO: Simulating DDoS TCP SYN Flood at {timestamp} for {duration_seconds}s with intensity {intensity}")
        target_ip = self._get_random_target()
        target_port = random.choice([80, 443, 8080])
        start_time = timestamp
        num_packets = 0
        for _ in range(duration_seconds * intensity):
            attacker_ip = self._get_random_attacker()
            src_port = random.randint(1024, 65535)
            log_network_event(start_time + (_/(intensity)), "attack_indicator",
                              attacker_ip, target_ip, src_port, target_port, "TCP",
                              {"flags": "SYN", "type": "DDoS_TCP_SYN_Flood", "payload_size": 0})
            num_packets +=1
        print(f"INFO: DDoS TCP SYN Flood simulation complete. Target: {target_ip}:{target_port}, Packets: {num_packets}")
        return {"attack_type": "DDoS_TCP_SYN_Flood", "target": f"{target_ip}:{target_port}", "packets_sent": num_packets, "duration": duration_seconds}

    def simulate_port_scan(self, timestamp, intensity=10):
        print(f"INFO: Simulating Port Scan at {timestamp} with intensity {intensity}")
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        scanned_ports_tcp = random.sample(range(1, 1025), min(intensity, 1024)) # Scan up to `intensity` common ports
        scanned_ports_udp = random.sample(range(1, 1025), min(intensity//2, 1024))
        num_probes = 0

        for port in scanned_ports_tcp:
            log_network_event(timestamp, "attack_indicator",
                              attacker_ip, target_ip, random.randint(1024,65535), port, "TCP",
                              {"type": "Port_Scan_TCP", "subtype": "SYN_PROBE"})
            num_probes +=1
        for port in scanned_ports_udp:
            log_network_event(timestamp, "attack_indicator",
                              attacker_ip, target_ip, random.randint(1024,65535), port, "UDP",
                              {"type": "Port_Scan_UDP", "subtype": "UDP_PROBE"})
            num_probes +=1
        print(f"INFO: Port Scan simulation complete. Target: {target_ip}, Probes: {num_probes}")
        return {"attack_type": "Port_Scan", "target": target_ip, "ports_probed": len(scanned_ports_tcp) + len(scanned_ports_udp)}

    def simulate_sql_injection(self, timestamp, target_service_port=80):
        print(f"INFO: Simulating SQL Injection at {timestamp}")
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        payloads = [
            "' OR '1'='1",
            "' UNION SELECT username, password FROM users --",
            "1; DROP TABLE users"
        ]
        chosen_payload = random.choice(payloads)
        log_network_event(timestamp, "attack_indicator",
                          attacker_ip, target_ip, random.randint(1024,65535), target_service_port, "HTTP",
                          {"type": "SQL_Injection", "method": "POST", "uri": "/login", "payload_snippet": chosen_payload[:50]})
        print(f"INFO: SQL Injection simulation complete. Target: {target_ip}:{target_service_port}")
        return {"attack_type": "SQL_Injection", "target": f"{target_ip}:{target_service_port}", "payload_used": chosen_payload}

    # Add more attack simulations based on the 14 attacks from the paper:
    # DDoS UDP Flood, DDoS HTTP Flood, DDoS ICMP Flood
    # OS Fingerprinting, Vulnerability Scanning
    # ARP Spoofing, DNS Spoofing
    # XSS, Uploading Attack
    # Ransomware, Backdoor

    def simulate_generic_attack(self, timestamp, attack_name, protocol="TCP", port=80, details=None):
        print(f"INFO: Simulating Generic Attack: {attack_name} at {timestamp}")
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        event_details = {"type": attack_name}
        if details:
            event_details.update(details)

        log_network_event(timestamp, "attack_indicator",
                          attacker_ip, target_ip, random.randint(1024, 65535), port, protocol,
                          event_details)
        print(f"INFO: Generic Attack {attack_name} simulation complete. Target: {target_ip}:{port}")
        return {"attack_type": attack_name, "target": f"{target_ip}:{port}", "details": event_details}


# Example Usage (can be run as part of a larger simulation script)
if __name__ == "__main__":
    # These IPs would ideally come from a simulated network configuration
    simulated_device_ips = [f"192.168.1.{i}" for i in range(10, 20)]
    attacker = AttackSimulator(target_ips=simulated_device_ips)
    current_sim_time = time.time()

    print("\n--- Running Attack Simulations ---")
    attack_results = []
    attack_results.append(attacker.simulate_ddos_tcp_syn_flood(current_sim_time, duration_seconds=2, intensity=50))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results.append(attacker.simulate_port_scan(current_sim_time, intensity=5))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results.append(attacker.simulate_sql_injection(current_sim_time))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results.append(attacker.simulate_generic_attack(current_sim_time, "DDoS_UDP_Flood", protocol="UDP", port=53, details={"packet_rate": 1000}))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results.append(attacker.simulate_generic_attack(current_sim_time, "ARP_Spoofing", protocol="ARP", port=0, details={"target_pair": ["192.168.1.10", "192.168.1.1"]}))

    print("\n--- Attack Simulation Summary ---")
    for res in attack_results:
        print(json.dumps(res))

    print(f"\n--- Generated Network Log Entries ({len(NETWORK_LOG_ENTRIES)}) ---")
    # In a real run, save NETWORK_LOG_ENTRIES to a file
    # For demonstration, printing first 5 and last 5 if many
    if len(NETWORK_LOG_ENTRIES) > 10:
        for entry in NETWORK_LOG_ENTRIES[:5]:
            print(json.dumps(entry))
        print("...")
        for entry in NETWORK_LOG_ENTRIES[-5:]:
            print(json.dumps(entry))
    else:
        for entry in NETWORK_LOG_ENTRIES:
            print(json.dumps(entry))

    # To save to a file:
    # with open("network_attack_logs.jsonl", "w") as f:
    #     for entry in NETWORK_LOG_ENTRIES:
    #         f.write(json.dumps(entry) + "\n")
    # print("\nNetwork logs saved to network_attack_logs.jsonl")




--- Running Attack Simulations ---
INFO: Simulating DDoS TCP SYN Flood at 1746920954.124338 for 2s with intensity 50
INFO: DDoS TCP SYN Flood simulation complete. Target: 192.168.1.12:80, Packets: 100
INFO: Simulating Port Scan at 1746920954.6254847 with intensity 5
INFO: Port Scan simulation complete. Target: 192.168.1.14, Probes: 7
INFO: Simulating SQL Injection at 1746920955.1260762
INFO: SQL Injection simulation complete. Target: 192.168.1.12:80
INFO: Simulating Generic Attack: DDoS_UDP_Flood at 1746920955.6266522
INFO: Generic Attack DDoS_UDP_Flood simulation complete. Target: 192.168.1.18:53
INFO: Simulating Generic Attack: ARP_Spoofing at 1746920956.127197
INFO: Generic Attack ARP_Spoofing simulation complete. Target: 192.168.1.10:0

--- Attack Simulation Summary ---
{"attack_type": "DDoS_TCP_SYN_Flood", "target": "192.168.1.12:80", "packets_sent": 100, "duration": 2}
{"attack_type": "Port_Scan", "target": "192.168.1.14", "ports_probed": 7}
{"attack_type": "SQL_Injection", "tar

In [5]:
# sensor_simulator.py

import random
import time
import json

OUTPUT_FILE = "sensor_simulation_output.jsonl"

class Sensor:
    def __init__(self, sensor_id, sensor_type, normal_range, unit):
        self.sensor_id = sensor_id
        self.sensor_type = sensor_type
        self.normal_range = normal_range
        self.unit = unit
        self.current_value = self.generate_normal_value()

    def generate_normal_value(self):
        return round(random.uniform(self.normal_range[0], self.normal_range[1]), 2)

    def read_value(self, under_attack=False, attack_type=None, attack_intensity=1):
        """Generates a sensor reading.
        If under_attack, the value might be anomalous based on attack_type.
        attack_intensity can be used to scale the anomaly (e.g., 1 to 5).
        """
        if under_attack:
            if attack_type == "data_manipulation":
                anomaly_factor = random.choice([-1, 1]) * self.normal_range[1] * 0.5 * attack_intensity
                self.current_value = round(self.generate_normal_value() + anomaly_factor, 2)
            elif attack_type == "sensor_outage":
                return None
            else:
                self.current_value = self.generate_normal_value()
        else:
            if random.random() < 0.7:
                variation = (self.normal_range[1] - self.normal_range[0]) * 0.05
                self.current_value = round(max(self.normal_range[0], min(self.normal_range[1], self.current_value + random.uniform(-variation, variation))), 2)
            else:
                self.current_value = self.generate_normal_value()
        return self.current_value

    def get_data_point(self, timestamp, under_attack=False, attack_type=None, attack_intensity=1):
        value = self.read_value(under_attack, attack_type, attack_intensity)
        return {
            "timestamp": timestamp,
            "sensor_id": self.sensor_id,
            "sensor_type": self.sensor_type,
            "value": value,
            "unit": self.unit,
            "status": "error" if value is None else "ok"
        }

SENSOR_DEFINITIONS = {
    "temperature": {"range": [10, 35], "unit": "Celsius"},
    "humidity": {"range": [30, 70], "unit": "%RH"},
    "water_level": {"range": [0, 100], "unit": "cm"},
    "ph_sensor": {"range": [6.0, 8.0], "unit": "pH"},
    "soil_moisture": {"range": [20, 80], "unit": "%"},
    "flame_sensor": {"range": [0, 1], "unit": "binary"},
    "ultrasonic_sensor": {"range": [2, 400], "unit": "cm"},
    "ir_receiver": {"range": [0, 1], "unit": "binary"},
    "heart_rate_sensor": {"range": [50, 130], "unit": "bpm"},
    "plc_register_value": {"range": [0, 65535], "unit": "integer"}
}

def initialize_sensors(num_each_type=1):
    sensors = []
    sensor_id_counter = 0
    for sensor_type, props in SENSOR_DEFINITIONS.items():
        for i in range(num_each_type):
            sensors.append(Sensor(f"{sensor_type}_{i}", sensor_type, props["range"], props["unit"]))
            sensor_id_counter += 1
    return sensors

if __name__ == "__main__":
    sensors = initialize_sensors(num_each_type=2)
    print(f"Initialized {len(sensors)} sensors.")
    print(f"Output will be written to {OUTPUT_FILE}")

    with open(OUTPUT_FILE, "w") as outfile:
        outfile.write(f"Initialized {len(sensors)} sensors.\n\n") # Write header info to file too
        for i in range(5): # Simulate for 5 timesteps
            header_line = f"--- Simulation Timestep {i+1} ---\n"
            print(header_line.strip()) # Print to console
            outfile.write(header_line) # Write to file
            current_time = time.time()
            for sensor in sensors:
                is_attacked = False
                attack_details = None
                if sensor.sensor_id == "temperature_0" and i >= 2: # Attack temperature_0 from timestep 3
                    is_attacked = True
                    attack_details = "data_manipulation"

                data = sensor.get_data_point(current_time, under_attack=is_attacked, attack_type=attack_details)
                json_data = json.dumps(data)
                print(json_data) # Print to console
                outfile.write(json_data + "\n") # Write JSON data to file, one per line
            outfile.write("\n") # Add a newline after each timestep block in the file
            time.sleep(1)
    print(f"Sensor simulation complete. Data written to {OUTPUT_FILE}")



Initialized 20 sensors.
Output will be written to sensor_simulation_output.jsonl
--- Simulation Timestep 1 ---
{"timestamp": 1746921943.3554137, "sensor_id": "temperature_0", "sensor_type": "temperature", "value": 31.16, "unit": "Celsius", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "temperature_1", "sensor_type": "temperature", "value": 31.12, "unit": "Celsius", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "humidity_0", "sensor_type": "humidity", "value": 51.08, "unit": "%RH", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "humidity_1", "sensor_type": "humidity", "value": 50.0, "unit": "%RH", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "water_level_0", "sensor_type": "water_level", "value": 76.19, "unit": "cm", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "water_level_1", "sensor_type": "water_level", "value": 59.62, "unit": "cm", "status": "ok"}
{"timestamp": 1746921943.3554137, "sensor_id": "ph

In [6]:
# attack_simulator.py

import random
import time
import json

OUTPUT_FILE = "attack_simulation_output.jsonl"
NETWORK_LOG_ENTRIES = [] # Global list to store network event logs

def log_network_event(timestamp, event_type, src_ip, dst_ip, src_port, dst_port, protocol, details):
    log_entry = {
        "timestamp": timestamp,
        "event_type": event_type,
        "src_ip": src_ip,
        "dst_ip": dst_ip,
        "src_port": src_port,
        "dst_port": dst_port,
        "protocol": protocol,
        "details": details
    }
    NETWORK_LOG_ENTRIES.append(log_entry)

class AttackSimulator:
    def __init__(self, target_ips, network_segment="192.168.1.0/24"):
        self.target_ips = target_ips
        self.attacker_ips = [f"10.0.0.{i}" for i in range(1, 21)]
        self.network_segment = network_segment
        self.output_lines_for_file = [] # Store console output for file writing

    def _get_random_target(self):
        return random.choice(self.target_ips) if self.target_ips else "192.168.1.100"

    def _get_random_attacker(self):
        return random.choice(self.attacker_ips)

    def _log_to_console_and_file(self, message):
        print(message)
        self.output_lines_for_file.append(message + "\n")

    def simulate_ddos_tcp_syn_flood(self, timestamp, duration_seconds=5, intensity=100):
        log_msg = f"INFO: Simulating DDoS TCP SYN Flood at {timestamp} for {duration_seconds}s with intensity {intensity}"
        self._log_to_console_and_file(log_msg)
        target_ip = self._get_random_target()
        target_port = random.choice([80, 443, 8080])
        start_time = timestamp
        num_packets = 0
        for _ in range(duration_seconds * intensity):
            attacker_ip = self._get_random_attacker()
            src_port = random.randint(1024, 65535)
            log_network_event(start_time + (_/(intensity)), "attack_indicator",
                              attacker_ip, target_ip, src_port, target_port, "TCP",
                              {"flags": "SYN", "type": "DDoS_TCP_SYN_Flood", "payload_size": 0})
            num_packets +=1
        log_msg_complete = f"INFO: DDoS TCP SYN Flood simulation complete. Target: {target_ip}:{target_port}, Packets: {num_packets}"
        self._log_to_console_and_file(log_msg_complete)
        return {"attack_type": "DDoS_TCP_SYN_Flood", "target": f"{target_ip}:{target_port}", "packets_sent": num_packets, "duration": duration_seconds}

    def simulate_port_scan(self, timestamp, intensity=10):
        log_msg = f"INFO: Simulating Port Scan at {timestamp} with intensity {intensity}"
        self._log_to_console_and_file(log_msg)
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        scanned_ports_tcp = random.sample(range(1, 1025), min(intensity, 1024))
        scanned_ports_udp = random.sample(range(1, 1025), min(intensity//2, 1024))
        num_probes = 0
        for port in scanned_ports_tcp:
            log_network_event(timestamp, "attack_indicator",
                              attacker_ip, target_ip, random.randint(1024,65535), port, "TCP",
                              {"type": "Port_Scan_TCP", "subtype": "SYN_PROBE"})
            num_probes +=1
        for port in scanned_ports_udp:
            log_network_event(timestamp, "attack_indicator",
                              attacker_ip, target_ip, random.randint(1024,65535), port, "UDP",
                              {"type": "Port_Scan_UDP", "subtype": "UDP_PROBE"})
            num_probes +=1
        log_msg_complete = f"INFO: Port Scan simulation complete. Target: {target_ip}, Probes: {num_probes}"
        self._log_to_console_and_file(log_msg_complete)
        return {"attack_type": "Port_Scan", "target": target_ip, "ports_probed": len(scanned_ports_tcp) + len(scanned_ports_udp)}

    def simulate_sql_injection(self, timestamp, target_service_port=80):
        log_msg = f"INFO: Simulating SQL Injection at {timestamp}"
        self._log_to_console_and_file(log_msg)
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        payloads = [
            "' OR '1'='1",
            "' UNION SELECT username, password FROM users --",
            "1; DROP TABLE users"
        ]
        chosen_payload = random.choice(payloads)
        log_network_event(timestamp, "attack_indicator",
                          attacker_ip, target_ip, random.randint(1024,65535), target_service_port, "HTTP",
                          {"type": "SQL_Injection", "method": "POST", "uri": "/login", "payload_snippet": chosen_payload[:50]})
        log_msg_complete = f"INFO: SQL Injection simulation complete. Target: {target_ip}:{target_service_port}"
        self._log_to_console_and_file(log_msg_complete)
        return {"attack_type": "SQL_Injection", "target": f"{target_ip}:{target_service_port}", "payload_used": chosen_payload}

    def simulate_generic_attack(self, timestamp, attack_name, protocol="TCP", port=80, details=None):
        log_msg = f"INFO: Simulating Generic Attack: {attack_name} at {timestamp}"
        self._log_to_console_and_file(log_msg)
        target_ip = self._get_random_target()
        attacker_ip = self._get_random_attacker()
        event_details = {"type": attack_name}
        if details:
            event_details.update(details)
        log_network_event(timestamp, "attack_indicator",
                          attacker_ip, target_ip, random.randint(1024, 65535), port, protocol,
                          event_details)
        log_msg_complete = f"INFO: Generic Attack {attack_name} simulation complete. Target: {target_ip}:{port}"
        self._log_to_console_and_file(log_msg_complete)
        return {"attack_type": attack_name, "target": f"{target_ip}:{port}", "details": event_details}

if __name__ == "__main__":
    simulated_device_ips = [f"192.168.1.{i}" for i in range(10, 20)]
    attacker = AttackSimulator(target_ips=simulated_device_ips)
    current_sim_time = time.time()

    attacker._log_to_console_and_file("\n--- Running Attack Simulations ---")
    attack_results_summary = [] # To store summary of attacks for file

    attack_results_summary.append(attacker.simulate_ddos_tcp_syn_flood(current_sim_time, duration_seconds=2, intensity=50))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results_summary.append(attacker.simulate_port_scan(current_sim_time, intensity=5))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results_summary.append(attacker.simulate_sql_injection(current_sim_time))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results_summary.append(attacker.simulate_generic_attack(current_sim_time, "DDoS_UDP_Flood", protocol="UDP", port=53, details={"packet_rate": 1000}))
    time.sleep(0.5)
    current_sim_time = time.time()
    attack_results_summary.append(attacker.simulate_generic_attack(current_sim_time, "ARP_Spoofing", protocol="ARP", port=0, details={"target_pair": ["192.168.1.10", "192.168.1.1"]}))

    attacker._log_to_console_and_file("\n--- Attack Simulation Summary ---")
    for res in attack_results_summary:
        attacker._log_to_console_and_file(json.dumps(res))

    attacker._log_to_console_and_file(f"\n--- Generated Network Log Entries ({len(NETWORK_LOG_ENTRIES)}) ---")
    # The actual log entries are now written separately after all info lines

    # Write all collected console output and then the detailed network logs to the file
    with open(OUTPUT_FILE, "w") as f:
        for line_content in attacker.output_lines_for_file:
            f.write(line_content) # These already have newlines
        # Now write the detailed NETWORK_LOG_ENTRIES, each as a JSON line
        for entry in NETWORK_LOG_ENTRIES:
            f.write(json.dumps(entry) + "\n")

    print(f"\nAttack simulation complete. Data written to {OUTPUT_FILE}")




--- Running Attack Simulations ---
INFO: Simulating DDoS TCP SYN Flood at 1746921982.6815176 for 2s with intensity 50
INFO: DDoS TCP SYN Flood simulation complete. Target: 192.168.1.12:80, Packets: 100
INFO: Simulating Port Scan at 1746921983.1820152 with intensity 5
INFO: Port Scan simulation complete. Target: 192.168.1.18, Probes: 7
INFO: Simulating SQL Injection at 1746921983.6824315
INFO: SQL Injection simulation complete. Target: 192.168.1.14:80
INFO: Simulating Generic Attack: DDoS_UDP_Flood at 1746921984.1827435
INFO: Generic Attack DDoS_UDP_Flood simulation complete. Target: 192.168.1.13:53
INFO: Simulating Generic Attack: ARP_Spoofing at 1746921984.6830585
INFO: Generic Attack ARP_Spoofing simulation complete. Target: 192.168.1.16:0

--- Attack Simulation Summary ---
{"attack_type": "DDoS_TCP_SYN_Flood", "target": "192.168.1.12:80", "packets_sent": 100, "duration": 2}
{"attack_type": "Port_Scan", "target": "192.168.1.18", "ports_probed": 7}
{"attack_type": "SQL_Injection", "t

In [7]:
# preprocess_simulation_data.py

import json
import pandas as pd
import numpy as np

# These are the 27 features the LSTM model was trained on (from user log)
# We will attempt to map our simulation data to these, filling missing ones with defaults.
FEATURES_COLUMNS = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'Attack_label'
]

def process_attack_log_entry(log_entry):
    """Converts a single attack log entry to a feature dictionary."""
    features = {col: 0 for col in FEATURES_COLUMNS} # Default all to 0
    features['Attack_label'] = 1 # This is an attack event

    # Attempt to map some fields from the attack log
    # This is a VERY simplified mapping and many features will remain 0
    # A proper mapping would require a full feature extraction engine like Zeek/TShark
    # on raw network data (pcap), which our simulation does not produce.

    if 'dst_port' in log_entry:
        features['tcp.dstport'] = log_entry['dst_port'] # Assuming tcp.dstport if port is present
        if log_entry.get('protocol') == 'UDP': # if udp, maybe udp.dstport (not in 27 features)
            pass # udp.dstport is not one of the 27 features

    if log_entry.get('protocol') == 'TCP':
        if log_entry.get('details', {}).get('flags') == 'SYN':
            features['tcp.connection.syn'] = 1
        if log_entry.get('details', {}).get('flags') == 'ACK': # Assuming this mapping
            features['tcp.flags.ack'] = 1
        # 'tcp.len' could be details.payload_size if available and relevant
        if 'payload_size' in log_entry.get('details', {}):
            features['tcp.len'] = log_entry['details']['payload_size']

    if log_entry.get('protocol') == 'HTTP': # HTTP runs over TCP
        # Example: if it's an SQL injection, we might try to set http related flags if we had them
        # For now, this is just a placeholder for more complex logic
        if log_entry.get('details', {}).get('type') == 'SQL_Injection':
            # We don't have http.content_length from this log directly
            pass

    # Add more mappings here if possible, but acknowledge limitations.
    # Most of the 27 features (e.g., arp.*, dns.*, mqtt.*, mbtcp.*, checksums, raw values)
    # are not present in the high-level simulation logs.

    # For demonstration, let's add a few more dummy values based on attack type
    attack_type = log_entry.get('details', {}).get('type')
    if attack_type == 'DDoS_TCP_SYN_Flood':
        features['tcp.len'] = 0 # SYN packets usually have 0 TCP payload len
        features['tcp.connection.syn'] = 1
    elif attack_type == 'Port_Scan_TCP':
        features['tcp.connection.syn'] = 1 # Scans often use SYN probes
    elif attack_type == 'DDoS_UDP_Flood':
        features['udp.stream'] = log_entry.get('dst_port',0) % 256 # Arbitrary mapping for demo
        features['tcp.len'] = 64 #  Example payload size for UDP flood packets

    return features

def process_sensor_log_entry(log_entry):
    """Converts a single sensor log entry to a feature dictionary."""
    features = {col: 0 for col in FEATURES_COLUMNS}
    features['Attack_label'] = 0 # Sensor data is assumed normal unless specified

    # Sensor data does not directly map to network features.
    # We could try to create proxy features if certain sensor values correlate with network states,
    # but that's complex. For now, these will be mostly zeros, indicating 'no network event'.
    # One might argue that sensor data should be used by a different model or an ensemble.

    # Example: if a PLC sensor reading is very high, it might be an mbtcp related feature.
    if log_entry.get('sensor_type') == 'plc_register_value':
        features['mbtcp.len'] = 1 # Indicate some mbtcp activity
        features['mbtcp.trans_id'] = log_entry.get('value', 0) % 1000 # Example mapping
        features['mbtcp.unit_id'] = int(log_entry.get('sensor_id', '_0').split('_')[-1]) + 1

    return features

def main(attack_log_file, sensor_log_file, output_csv_file):
    processed_data = []

    print(f"Processing attack log file: {attack_log_file}")
    try:
        with open(attack_log_file, 'r') as f:
            for line in f:
                if line.strip().startswith("INFO:") or line.strip().startswith("--- ") or not line.strip():
                    continue # Skip info/header lines
                try:
                    log_entry = json.loads(line.strip())
                    # We are interested in the detailed log entries, not the summary ones
                    if "event_type" in log_entry and log_entry["event_type"] == "attack_indicator":
                        processed_data.append(process_attack_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in attack log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Attack log file {attack_log_file} not found.")

    print(f"Processing sensor log file: {sensor_log_file}")
    try:
        with open(sensor_log_file, 'r') as f:
            for line in f:
                if line.strip().startswith("--- ") or line.strip().startswith("Initialized") or not line.strip() :
                    continue # Skip header/info lines
                try:
                    log_entry = json.loads(line.strip())
                    processed_data.append(process_sensor_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in sensor log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Sensor log file {sensor_log_file} not found.")

    if not processed_data:
        print("No data processed. Exiting.")
        return

    df = pd.DataFrame(processed_data)
    df = df[FEATURES_COLUMNS] # Ensure correct column order and selection
    df.fillna(0, inplace=True) # Fill any NaNs that might have occurred with 0

    # Ensure all feature columns are numeric (except Attack_label which is int)
    for col in df.columns:
        if col != 'Attack_label':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = df[col].astype(int)

    print(f"Processed {len(df)} records.")
    print(f"Value counts for 'Attack_label':\n{df['Attack_label'].value_counts()}")

    df.to_csv(output_csv_file, index=False)
    print(f"Processed simulation data saved to {output_csv_file}")
    print("\nIMPORTANT NOTE:")
    print("This preprocessing script provides a VERY SIMPLIFIED conversion of high-level simulation logs.")
    print("The generated features are NOT a faithful representation of the original Edge-IIoTset features.")
    print("Proper feature engineering from raw network traffic (pcap) using tools like Zeek/TShark is required")
    print("for accurate model validation against the original dataset's feature space.")
    print("This output is primarily for demonstrating the validation workflow.")

if __name__ == "__main__":
    # These filenames correspond to what was saved from user's pasted content
    # User will need to ensure these files are in the same directory as the script or provide full paths.
    attack_logs = "attack_simulation_output.jsonl"
    sensor_logs = "sensor_simulation_output.jsonl"
    output_csv = "processed_simulation_data.csv"
    main(attack_logs, sensor_logs, output_csv)



Processing attack log file: attack_simulation_output.jsonl
Processing sensor log file: sensor_simulation_output.jsonl
Processed 210 records.
Value counts for 'Attack_label':
Attack_label
1    110
0    100
Name: count, dtype: int64
Processed simulation data saved to processed_simulation_data.csv

IMPORTANT NOTE:
This preprocessing script provides a VERY SIMPLIFIED conversion of high-level simulation logs.
The generated features are NOT a faithful representation of the original Edge-IIoTset features.
Proper feature engineering from raw network traffic (pcap) using tools like Zeek/TShark is required
for accurate model validation against the original dataset's feature space.
This output is primarily for demonstrating the validation workflow.


In [8]:
# preprocess_simulation_data.py

import json
import pandas as pd
import numpy as np

# These are the 27 features the LSTM model was trained on (from user log)
# We will attempt to map our simulation data to these, filling missing ones with defaults.
FEATURES_COLUMNS = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'Attack_label'
]

def process_attack_log_entry(log_entry):
    """Converts a single attack log entry to a feature dictionary."""
    features = {col: 0 for col in FEATURES_COLUMNS} # Default all to 0
    features['Attack_label'] = 1 # This is an attack event

    # Attempt to map some fields from the attack log
    # This is a VERY simplified mapping and many features will remain 0
    # A proper mapping would require a full feature extraction engine like Zeek/TShark
    # on raw network data (pcap), which our simulation does not produce.

    if 'dst_port' in log_entry:
        features['tcp.dstport'] = log_entry['dst_port'] # Assuming tcp.dstport if port is present
        if log_entry.get('protocol') == 'UDP': # if udp, maybe udp.dstport (not in 27 features)
            pass # udp.dstport is not one of the 27 features

    if log_entry.get('protocol') == 'TCP':
        if log_entry.get('details', {}).get('flags') == 'SYN':
            features['tcp.connection.syn'] = 1
        if log_entry.get('details', {}).get('flags') == 'ACK': # Assuming this mapping
            features['tcp.flags.ack'] = 1
        # 'tcp.len' could be details.payload_size if available and relevant
        if 'payload_size' in log_entry.get('details', {}):
            features['tcp.len'] = log_entry['details']['payload_size']

    if log_entry.get('protocol') == 'HTTP': # HTTP runs over TCP
        # Example: if it's an SQL injection, we might try to set http related flags if we had them
        # For now, this is just a placeholder for more complex logic
        if log_entry.get('details', {}).get('type') == 'SQL_Injection':
            # We don't have http.content_length from this log directly
            pass

    # Add more mappings here if possible, but acknowledge limitations.
    # Most of the 27 features (e.g., arp.*, dns.*, mqtt.*, mbtcp.*, checksums, raw values)
    # are not present in the high-level simulation logs.

    # For demonstration, let's add a few more dummy values based on attack type
    attack_type = log_entry.get('details', {}).get('type')
    if attack_type == 'DDoS_TCP_SYN_Flood':
        features['tcp.len'] = 0 # SYN packets usually have 0 TCP payload len
        features['tcp.connection.syn'] = 1
    elif attack_type == 'Port_Scan_TCP':
        features['tcp.connection.syn'] = 1 # Scans often use SYN probes
    elif attack_type == 'DDoS_UDP_Flood':
        features['udp.stream'] = log_entry.get('dst_port',0) % 256 # Arbitrary mapping for demo
        features['tcp.len'] = 64 #  Example payload size for UDP flood packets

    return features

def process_sensor_log_entry(log_entry):
    """Converts a single sensor log entry to a feature dictionary."""
    features = {col: 0 for col in FEATURES_COLUMNS}
    features['Attack_label'] = 0 # Sensor data is assumed normal unless specified

    # Sensor data does not directly map to network features.
    # We could try to create proxy features if certain sensor values correlate with network states,
    # but that's complex. For now, these will be mostly zeros, indicating 'no network event'.
    # One might argue that sensor data should be used by a different model or an ensemble.

    # Example: if a PLC sensor reading is very high, it might be an mbtcp related feature.
    if log_entry.get('sensor_type') == 'plc_register_value':
        features['mbtcp.len'] = 1 # Indicate some mbtcp activity
        features['mbtcp.trans_id'] = log_entry.get('value', 0) % 1000 # Example mapping
        features['mbtcp.unit_id'] = int(log_entry.get('sensor_id', '_0').split('_')[-1]) + 1

    return features

def main(attack_log_file, sensor_log_file, output_csv_file):
    processed_data = []

    print(f"Processing attack log file: {attack_log_file}")
    try:
        with open(attack_log_file, 'r') as f:
            for line in f:
                if line.strip().startswith("INFO:") or line.strip().startswith("--- ") or not line.strip():
                    continue # Skip info/header lines
                try:
                    log_entry = json.loads(line.strip())
                    # We are interested in the detailed log entries, not the summary ones
                    if "event_type" in log_entry and log_entry["event_type"] == "attack_indicator":
                        processed_data.append(process_attack_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in attack log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Attack log file {attack_log_file} not found.")

    print(f"Processing sensor log file: {sensor_log_file}")
    try:
        with open(sensor_log_file, 'r') as f:
            for line in f:
                if line.strip().startswith("--- ") or line.strip().startswith("Initialized") or not line.strip() :
                    continue # Skip header/info lines
                try:
                    log_entry = json.loads(line.strip())
                    processed_data.append(process_sensor_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in sensor log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Sensor log file {sensor_log_file} not found.")

    if not processed_data:
        print("No data processed. Exiting.")
        return

    df = pd.DataFrame(processed_data)
    df = df[FEATURES_COLUMNS] # Ensure correct column order and selection
    df.fillna(0, inplace=True) # Fill any NaNs that might have occurred with 0

    # Ensure all feature columns are numeric (except Attack_label which is int)
    for col in df.columns:
        if col != 'Attack_label':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = df[col].astype(int)

    print(f"Processed {len(df)} records.")
    print(f"Value counts for 'Attack_label':\n{df['Attack_label'].value_counts()}")

    df.to_csv(output_csv_file, index=False)
    print(f"Processed simulation data saved to {output_csv_file}")
    print("\nIMPORTANT NOTE:")
    print("This preprocessing script provides a VERY SIMPLIFIED conversion of high-level simulation logs.")
    print("The generated features are NOT a faithful representation of the original Edge-IIoTset features.")
    print("Proper feature engineering from raw network traffic (pcap) using tools like Zeek/TShark is required")
    print("for accurate model validation against the original dataset's feature space.")
    print("This output is primarily for demonstrating the validation workflow.")

if __name__ == "__main__":
    # These filenames correspond to what was saved from user's pasted content
    # User will need to ensure these files are in the same directory as the script or provide full paths.
    attack_logs = "attack_simulation_output.jsonl"
    sensor_logs = "sensor_simulation_output.jsonl"
    output_csv = "processed_simulation_data2.csv"
    main(attack_logs, sensor_logs, output_csv)



Processing attack log file: attack_simulation_output.jsonl
Processing sensor log file: sensor_simulation_output.jsonl
Processed 210 records.
Value counts for 'Attack_label':
Attack_label
1    110
0    100
Name: count, dtype: int64
Processed simulation data saved to processed_simulation_data2.csv

IMPORTANT NOTE:
This preprocessing script provides a VERY SIMPLIFIED conversion of high-level simulation logs.
The generated features are NOT a faithful representation of the original Edge-IIoTset features.
Proper feature engineering from raw network traffic (pcap) using tools like Zeek/TShark is required
for accurate model validation against the original dataset's feature space.
This output is primarily for demonstrating the validation workflow.


In [13]:
# preprocess_simulation_data.py

import json
import pandas as pd
import numpy as np

# Updated FEATURES_COLUMNS to include Specific_Attack_Type
FEATURES_COLUMNS = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id',
    'Attack_label', 'Specific_Attack_Type' # Added new column
]

def process_attack_log_entry(log_entry):
    """Converts a single attack log entry to a feature dictionary."""
    # Initialize features with default 0, Attack_label 1, and Specific_Attack_Type from log
    specific_attack_name = log_entry.get('details', {}).get('type', 'Unknown_Attack')
    features = {col: 0 for col in FEATURES_COLUMNS if col not in ['Attack_label', 'Specific_Attack_Type']}
    features['Attack_label'] = 1
    features['Specific_Attack_Type'] = specific_attack_name

    if 'dst_port' in log_entry:
        features['tcp.dstport'] = log_entry['dst_port']

    if log_entry.get('protocol') == 'TCP':
        if log_entry.get('details', {}).get('flags') == 'SYN':
            features['tcp.connection.syn'] = 1
        if log_entry.get('details', {}).get('flags') == 'ACK':
            features['tcp.flags.ack'] = 1
        if 'payload_size' in log_entry.get('details', {}):
            features['tcp.len'] = log_entry['details']['payload_size']

    if log_entry.get('protocol') == 'HTTP':
        if log_entry.get('details', {}).get('type') == 'SQL_Injection':
            pass

    attack_type_detail = log_entry.get('details', {}).get('type') # This is the specific attack type
    if attack_type_detail == 'DDoS_TCP_SYN_Flood':
        features['tcp.len'] = 0
        features['tcp.connection.syn'] = 1
    elif attack_type_detail == 'Port_Scan_TCP':
        features['tcp.connection.syn'] = 1
    elif attack_type_detail == 'DDoS_UDP_Flood':
        features['udp.stream'] = log_entry.get('dst_port',0) % 256
        features['tcp.len'] = 64 # Example payload size
    # Ensure Specific_Attack_Type is populated even if not one of the above explicitly handled
    if features.get('Specific_Attack_Type') == 'Unknown_Attack' and attack_type_detail:
        features['Specific_Attack_Type'] = attack_type_detail

    return features

def process_sensor_log_entry(log_entry):
    """Converts a single sensor log entry to a feature dictionary."""
    features = {col: 0 for col in FEATURES_COLUMNS if col not in ['Attack_label', 'Specific_Attack_Type']}
    features['Attack_label'] = 0
    features['Specific_Attack_Type'] = 'Normal' # Sensor data is Normal

    if log_entry.get('sensor_type') == 'plc_register_value':
        features['mbtcp.len'] = 1
        features['mbtcp.trans_id'] = log_entry.get('value', 0) % 1000
        features['mbtcp.unit_id'] = int(log_entry.get('sensor_id', '_0').split('_')[-1]) + 1

    return features

def main(attack_log_file, sensor_log_file, output_csv_file):
    processed_data = []

    print(f"Processing attack log file: {attack_log_file}")
    try:
        with open(attack_log_file, 'r') as f:
            for line in f:
                # Skip non-JSON lines (summaries, info lines from attack_simulator)
                if not line.strip().startswith('{'):
                    continue
                try:
                    log_entry = json.loads(line.strip())
                    if "event_type" in log_entry and log_entry["event_type"] == "attack_indicator":
                        processed_data.append(process_attack_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in attack log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Attack log file {attack_log_file} not found.")

    print(f"Processing sensor log file: {sensor_log_file}")
    try:
        with open(sensor_log_file, 'r') as f:
            for line in f:
                if not line.strip().startswith('{'):
                    continue # Skip header/info lines
                try:
                    log_entry = json.loads(line.strip())
                    processed_data.append(process_sensor_log_entry(log_entry))
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in sensor log: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Sensor log file {sensor_log_file} not found.")

    if not processed_data:
        print("No data processed. Exiting.")
        return

    df = pd.DataFrame(processed_data)
    # Ensure all defined columns are present, fill missing with 0 or 'Unknown'
    for col in FEATURES_COLUMNS:
        if col not in df.columns:
            if col == 'Specific_Attack_Type':
                df[col] = 'Unknown'
            else:
                df[col] = 0

    df = df[FEATURES_COLUMNS] # Ensure correct column order and selection

    # Fill NaNs for numeric columns with 0, and for Specific_Attack_Type with 'Unknown'
    for col in df.columns:
        if col == 'Specific_Attack_Type':
            df[col].fillna('Unknown', inplace=True)
        elif col != 'Attack_label': # Attack_label should be int
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            df[col] = df[col].astype(int)

    print(f"Processed {len(df)} records.")
    print(f"Value counts for 'Attack_label':\n{df['Attack_label'].value_counts()}")
    print(f"Value counts for 'Specific_Attack_Type':\n{df['Specific_Attack_Type'].value_counts()}")

    df.to_csv(output_csv_file, index=False)
    print(f"Processed simulation data saved to {output_csv_file}")
    print("\nIMPORTANT NOTE: This script provides a SIMPLIFIED conversion of simulation logs.")

if __name__ == "__main__":
    attack_logs = "attack_simulation_output.jsonl"
    sensor_logs = "sensor_simulation_output.jsonl"
    output_csv = "processed_simulation_data_detailed.csv" # New output filename
    main(attack_logs, sensor_logs, output_csv)



Processing attack log file: attack_simulation_output.jsonl
Processing sensor log file: sensor_simulation_output.jsonl
Processed 210 records.
Value counts for 'Attack_label':
Attack_label
1    110
0    100
Name: count, dtype: int64
Value counts for 'Specific_Attack_Type':
Specific_Attack_Type
DDoS_TCP_SYN_Flood    100
Normal                100
Port_Scan_TCP           5
Port_Scan_UDP           2
SQL_Injection           1
DDoS_UDP_Flood          1
ARP_Spoofing            1
Name: count, dtype: int64
Processed simulation data saved to processed_simulation_data_detailed.csv

IMPORTANT NOTE: This script provides a SIMPLIFIED conversion of simulation logs.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


In [1]:
# validate_model_on_simulation.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
import os

# Define paths
PROCESSED_SIM_DATA_PATH = "/content/processed_simulation_data_detailed.csv" # Assumes it's in the same directory
# Update path to the pre-trained model here
# Instead of loading the results.txt, change to the saved model file (e.g., lstm_model.h5)
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/results/LSTM_EdgeIIoT_model.h5"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results/"
os.makedirs(OUTPUT_PATH, exist_ok=True)
RESULTS_FILE = os.path.join(OUTPUT_PATH, "simulation_validation_results.txt")

# These are the 27 features the LSTM model was trained on (from user log)
# Ensure this list matches the columns in processed_simulation_data.csv (excluding Attack_label)
FEATURE_COL_NAMES = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id'
]

print(f"Loading processed simulation data from: {PROCESSED_SIM_DATA_PATH}")
try:
    sim_df = pd.read_csv(PROCESSED_SIM_DATA_PATH)
except FileNotFoundError:
    print(f"Error: Processed simulation data file not found at {PROCESSED_SIM_DATA_PATH}")
    print("Please ensure 'preprocess_simulation_data.py' was run successfully and the CSV is in the correct location.")
    exit()

print(f"Loaded {len(sim_df)} records from simulation data.")

if 'Attack_label' not in sim_df.columns:
    print("Error: 'Attack_label' column not found in the simulation data.")
    exit()

# Separate features and labels
X_sim = sim_df[FEATURE_COL_NAMES]
y_sim = sim_df['Attack_label']

print(f"Number of features in simulation data: {X_sim.shape[1]}")
print(f"Target variable distribution in simulation data:\n{y_sim.value_counts(normalize=True)}")

# Feature Scaling
# IMPORTANT: This scaler should ideally be the *same* scaler object (fit on the original training data)
# used when training the LSTM model. If not available, fitting a new scaler on simulation data
# is incorrect for evaluation. For this script, we'll fit a new one on the sim data for demonstration,
# but highlight this is a major caveat for true performance assessment.
# User should replace this with loading their saved scaler if possible.
print("\nWARNING: Applying a new StandardScaler fit on the simulation data. For accurate validation, \n         the scaler originally fit on the training data of the model should be used.")
scaler = StandardScaler()
X_sim_scaled = scaler.fit_transform(X_sim)

# Reshape data for LSTM: (samples, timesteps, features)
# Assuming the LSTM model expects this shape, as per the training script structure
X_sim_lstm = X_sim_scaled.reshape((X_sim_scaled.shape[0], 1, X_sim_scaled.shape[1]))
print(f"Reshaped simulation data for LSTM: {X_sim_lstm.shape}")

# Load the pre-trained model
print(f"\nLoading pre-trained LSTM model from: {MODEL_PATH}")
try:
    model = tf.keras.models.load_model(MODEL_PATH) # Make sure MODEL_PATH is correct
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure the model path is correct and the model was saved properly (e.g., lstm_model.h5).")
    exit()

model.summary()

# Make predictions on the simulation data
print("\nMaking predictions on simulation data...")
y_pred_sim_probs = model.predict(X_sim_lstm)
y_pred_sim = np.argmax(y_pred_sim_probs, axis=1)

# Evaluate performance
accuracy_sim = accuracy_score(y_sim, y_pred_sim)
report_sim = classification_report(y_sim, y_pred_sim, target_names=['Normal (0)', 'Attack (1)'])

print(f"\n--- Validation Results on Simulation Data ---")
print(f"Accuracy: {accuracy_sim:.4f}")
print("Classification Report:")
print(report_sim)

# Save the results
results_summary = f"Validation Results on Simulation Data (using model: {MODEL_PATH}):\n\n"
results_summary += f"Data Source: {PROCESSED_SIM_DATA_PATH} ({len(sim_df)} records)\n"
results_summary += f"Accuracy: {accuracy_sim:.4f}\n\n"
results_summary += f"Classification Report:\n{report_sim}\n\n"
results_summary += "IMPORTANT CAVEAT: Feature scaling was performed by fitting a new StandardScaler on the simulation data. \nFor truly accurate validation, the scaler originally fit on the model's training data should have been used.\nFurthermore, the simulation data features are a simplified representation and may not fully align with the original dataset's feature space.\n"

with open(RESULTS_FILE, "w") as f:
    f.write(results_summary)

print(f"Validation results saved to: {RESULTS_FILE}")
print("Validation script finished.")

Loading processed simulation data from: /content/processed_simulation_data_detailed.csv
Loaded 210 records from simulation data.
Number of features in simulation data: 27
Target variable distribution in simulation data:
Attack_label
1    0.52381
0    0.47619
Name: proportion, dtype: float64

         the scaler originally fit on the training data of the model should be used.
Reshaped simulation data for LSTM: (210, 1, 27)

Loading pre-trained LSTM model from: /content/drive/MyDrive/Colab Notebooks/results/LSTM_EdgeIIoT_model.h5
Error loading model: [Errno 2] Unable to synchronously open file (unable to open file: name = '/content/drive/MyDrive/Colab Notebooks/results/LSTM_EdgeIIoT_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
Please ensure the model path is correct and the model was saved properly (e.g., lstm_model.h5).


NameError: name 'model' is not defined

In [8]:
# validate_model_on_simulation.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
import os

# Define paths
PROCESSED_SIM_DATA_PATH = "/content/processed_simulation_data_detailed.csv" # Assumes it's in the same directory
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/lstm_model.h5" # User needs to ensure their trained model is here or update path
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results/"
os.makedirs(OUTPUT_PATH, exist_ok=True)
RESULTS_FILE = os.path.join(OUTPUT_PATH, "simulation_validation_results.txt")

# These are the 27 features the LSTM model was trained on (from user log)
# Ensure this list matches the columns in processed_simulation_data.csv (excluding Attack_label)
FEATURE_COL_NAMES = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id'
]

print(f"Loading processed simulation data from: {PROCESSED_SIM_DATA_PATH}")
try:
    sim_df = pd.read_csv(PROCESSED_SIM_DATA_PATH)
except FileNotFoundError:
    print(f"Error: Processed simulation data file not found at {PROCESSED_SIM_DATA_PATH}")
    print("Please ensure 'preprocess_simulation_data.py' was run successfully and the CSV is in the correct location.")
    exit()

print(f"Loaded {len(sim_df)} records from simulation data.")

if 'Attack_label' not in sim_df.columns:
    print("Error: 'Attack_label' column not found in the simulation data.")
    exit()

# Separate features and labels
X_sim = sim_df[FEATURE_COL_NAMES]
y_sim = sim_df['Attack_label']

print(f"Number of features in simulation data: {X_sim.shape[1]}")
print(f"Target variable distribution in simulation data:\n{y_sim.value_counts(normalize=True)}")

# Feature Scaling
# IMPORTANT: This scaler should ideally be the *same* scaler object (fit on the original training data)
# used when training the LSTM model. If not available, fitting a new scaler on simulation data
# is incorrect for evaluation. For this script, we'll fit a new one on the sim data for demonstration,
# but highlight this is a major caveat for true performance assessment.
# User should replace this with loading their saved scaler if possible.
print("\nWARNING: Applying a new StandardScaler fit on the simulation data. For accurate validation, \n         the scaler originally fit on the training data of the model should be used.")
scaler = StandardScaler()
X_sim_scaled = scaler.fit_transform(X_sim)

# Reshape data for LSTM: (samples, timesteps, features)
# Assuming the LSTM model expects this shape, as per the training script structure
X_sim_lstm = X_sim_scaled.reshape((X_sim_scaled.shape[0], 1, X_sim_scaled.shape[1]))
print(f"Reshaped simulation data for LSTM: {X_sim_lstm.shape}")

# Load the pre-trained model
print(f"\nLoading pre-trained LSTM model from: {MODEL_PATH}")
try:
    model = tf.keras.models.load_model(MODEL_PATH)
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure the model path is correct and the model was saved properly (e.g., lstm_model.h5).")
    exit()

model.summary()

# Make predictions on the simulation data
print("\nMaking predictions on simulation data...")
y_pred_sim_probs = model.predict(X_sim_lstm)
y_pred_sim = np.argmax(y_pred_sim_probs, axis=1)

# Evaluate performance
accuracy_sim = accuracy_score(y_sim, y_pred_sim)
report_sim = classification_report(y_sim, y_pred_sim, target_names=['Normal (0)', 'Attack (1)'])

print(f"\n--- Validation Results on Simulation Data ---")
print(f"Accuracy: {accuracy_sim:.4f}")
print("Classification Report:")
print(report_sim)

# Save the results
results_summary = f"Validation Results on Simulation Data (using model: {MODEL_PATH}):\n\n"
results_summary += f"Data Source: {PROCESSED_SIM_DATA_PATH} ({len(sim_df)} records)\n"
results_summary += f"Accuracy: {accuracy_sim:.4f}\n\n"
results_summary += f"Classification Report:\n{report_sim}\n\n"
results_summary += "IMPORTANT CAVEAT: Feature scaling was performed by fitting a new StandardScaler on the simulation data. \nFor truly accurate validation, the scaler originally fit on the model's training data should have been used.\nFurthermore, the simulation data features are a simplified representation and may not fully align with the original dataset's feature space.\n"

with open(RESULTS_FILE, "w") as f:
    f.write(results_summary)

print(f"Validation results saved to: {RESULTS_FILE}")
print("Validation script finished.")






Loading processed simulation data from: /content/processed_simulation_data_detailed.csv
Loaded 210 records from simulation data.
Number of features in simulation data: 27
Target variable distribution in simulation data:
Attack_label
1    0.52381
0    0.47619
Name: proportion, dtype: float64

         the scaler originally fit on the training data of the model should be used.
Reshaped simulation data for LSTM: (210, 1, 27)

Loading pre-trained LSTM model from: /content/drive/MyDrive/Colab Notebooks/lstm_model.h5



Making predictions on simulation data...




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step

--- Validation Results on Simulation Data ---
Accuracy: 0.5000
Classification Report:
              precision    recall  f1-score   support

  Normal (0)       0.00      0.00      0.00       100
  Attack (1)       0.51      0.95      0.67       110

    accuracy                           0.50       210
   macro avg       0.26      0.48      0.33       210
weighted avg       0.27      0.50      0.35       210

Validation results saved to: /content/drive/MyDrive/Colab Notebooks/results/simulation_validation_results.txt
Validation script finished.
