In [1]:
from openai import OpenAI
import dotenv
import os
import time

# Load variables from a local .env file into the process environment;
# the API key (DENIS_KEY) is later read from the environment with os.getenv.
dotenv.load_dotenv()

True

In [57]:
# OpenRouter is reached through the stock OpenAI client by overriding the base
# URL. The key is read from the environment (DENIS_KEY) and never hard-coded.
_api_key = os.getenv("DENIS_KEY")
if not _api_key:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would either send an
    # unrelated key to OpenRouter or fail later with a less specific error.
    raise RuntimeError(
        "DENIS_KEY is not set: define it in the environment or in the .env file."
    )

CLIENT = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_api_key,
)

# --- Model selection: keep exactly one `model = ...` line active ---
# Section labels are real comments (not bare string literals) so that the cell
# does not echo a stray string as its output.

#model = "meta-llama/llama-3.3-8b-instruct:free"
#model = "mistralai/mistral-small-3.2-24b-instruct:free"
#model = "minimax/minimax-m2:free"

#model = "openrouter/polaris-alpha" -> out of use stealth model
#model = "openrouter/sherlock-think-alpha" -> out of use stealth model

# >= 256K tokens
#model = "kwaipilot/kat-coder-pro:free"
#model = "google/gemini-2.0-flash-exp:free"
#model = "qwen/qwen3-coder:free"
model = "x-ai/grok-4.1-fast"

# >= 128K tokens
#model = "nvidia/nemotron-nano-12b-v2-vl:free"
#model = "alibaba/tongyi-deepresearch-30b-a3b:free"

' >= 128K tokens '

In [18]:
def duplicate_lines_remover(rows):
    """
    Collapse runs of identical consecutive lines in the CSV data.

    Only *adjacent* repeats are removed: two equal lines separated by a
    different line are both kept. The first element (the header) is always
    kept and takes part in the comparison, so a first data row identical to
    the header is dropped as well.

    Args:
        rows: sequence of CSV lines, header first.

    Returns:
        A new list with the header followed by the de-duplicated rows.
        An empty input yields an empty list. The input is not modified.

    Side effects:
        Prints how many data rows (header excluded) were present before and
        after the reduction.
    """
    # Guard against an empty dataset instead of failing on rows[0].
    if len(rows) == 0:
        print("Deduplication: reduced size from 0 to 0.")
        return []

    csv_text_clean = [rows[0]]
    starting_len = len(rows) - 1
    for line in rows[1:]:
        # Compare with the last line kept, not with the previous input line:
        # this is what collapses a whole run into a single occurrence.
        if csv_text_clean[-1] != line:
            csv_text_clean.append(line)
    print(f"Deduplication: reduced size from {starting_len} to {len(csv_text_clean) - 1}.")
    return csv_text_clean

In [19]:
def reduce_size(rows_with_header, remove_one_every_n=2):
    """
    Reduce the size of the dataset by removing one data row every n rows.

    Data rows are counted from 1 (the header is position 0 and is always
    kept); the rows at positions n, 2n, 3n, ... are dropped. With the default
    n=2 every second data row is removed; n=1 removes every data row.

    Args:
        rows_with_header: sequence of CSV lines, header first.
        remove_one_every_n: period of the removal; must not be 0.

    Returns:
        A new list with the header followed by the surviving rows.
        An empty input yields an empty list. The input is not modified.

    Raises:
        ValueError: if remove_one_every_n is 0 (the period would be undefined).

    Side effects:
        Prints how many data rows (header excluded) were present before and
        after the reduction.
    """
    if remove_one_every_n == 0:
        raise ValueError("remove_one_every_n must not be 0")

    # Guard against an empty dataset instead of failing on rows_with_header[0].
    if len(rows_with_header) == 0:
        print(f"Remove one every {remove_one_every_n}: reduced size from 0 to 0.")
        return []

    clean_data = [rows_with_header[0]]  # Keep the header
    starting_len = len(rows_with_header) - 1
    clean_data.extend(
        row
        for position, row in enumerate(rows_with_header[1:], start=1)
        if position % remove_one_every_n != 0
    )
    print(f"Remove one every {remove_one_every_n}: reduced size from {starting_len} to {len(clean_data) - 1}.")
    return clean_data

In [58]:
# Read the baseline dataset and shrink it before sending it to the model
with open("datasets/baseline.csv", "r", encoding="utf-8") as f:
    csv_text = f.read()

# splitlines() instead of split("\n"): no spurious empty last row when the file
# ends with a newline, and no trailing "\r" on rows of CRLF files.
csv_text_splitted = csv_text.splitlines()
if not csv_text_splitted:
    raise ValueError("datasets/baseline.csv is empty: no header row to process")

# Header anonymisation: one label per column, starting from 'A'
old_header = csv_text_splitted[0].split(",")
new_header = ",".join(chr(ord('A') + i) for i in range(len(old_header)))
csv_text_splitted_anon_header = list(csv_text_splitted)
csv_text_splitted_anon_header[0] = new_header

# For context, keep the mapping (LETTER -> original column name)
ref = {chr(ord('A') + i): old_header[i] for i in range(len(old_header))}

# Size reduction that tries to preserve the semantics of the series.
# NOTE(review): the reduction starts from csv_text_splitted, i.e. the ORIGINAL
# header is what reaches the model; csv_text_splitted_anon_header is built
# above but not used in this cell. Swap the argument to send anonymised labels.
to_analyze = duplicate_lines_remover(csv_text_splitted)
to_analyze = reduce_size(to_analyze)

# Uncomment to halve the dataset further
#to_analyze = reduce_size(to_analyze)
#to_analyze = reduce_size(to_analyze)

# Join every remaining row (header included) into the text sent to the model
data = "\n".join(to_analyze)

Deduplication: reduced size from 30059 to 10279.
Remove one every 2: reduced size from 10279 to 5140.


In [None]:
# Instructions for the model: infer the system type and the subsystem groupings
# from the CSV and answer with a fixed JSON structure. The CSV itself is sent
# separately as the user message.
base_prompt = """You are an expert in Industrial Control Systems (ICS).

Your task is to analyze a time-series dataset representing the values of PLC registers over time. The dataset is provided in CSV format:
- The first row contains the register names.
- All fields are comma-separated.
- Each row after the header represents the system state at a single timestamp.

Analyze the dataset and infer:
- possible types of physical processes represented by the registers,
- possible subsystem groupings

You must respond ONLY with the following JSON structure:

{
  "system_type_inference": {
    "answer": "",
    "confidence": 0.0,
    "reasoning": ""
  },
  "subsystem_inference": {
    "estimated_subsystems": null,
    "confidence": 0.0,
    "identified_groups": [],
    "reasoning": ""
  },
  "limitations": [],
  "internal_checks": {
    "columns_used": [],
    "assumptions_detected": [],
    "warnings": []
  }
}

Use null when uncertain.
Do NOT add text outside the JSON or add JSON fields.

## Data (CSV)
"""

# One request: the instructions go in the developer message, the dataset in the user message.
c = CLIENT.chat.completions.create(
    model=model,
    messages=[
        {"role": "developer", "content": [{"type": "text", "text": base_prompt}]},
        {"role": "user", "content": [{"type": "text", "text": data}]},
    ],
)

# Reading the first choice fails when the response carries no choices;
# in that case the whole response object is printed for inspection.
try:
    answer = c.choices[0].message.content
    print(answer)
except Exception as e:
    print(f"Error: {e}")
    print("Response:", c)


{
  "system_type_inference": {
    "answer": "Multi-tank liquid level control system or similar accumulating process (e.g., hoppers, reservoirs) with hysteresis or bang-bang control between low and high setpoints.",
    "confidence": 0.85,
    "reasoning": "PLC1_IW0 oscillates between approximately 40 (PLC1_MW0 constant) and 80 (PLC1_MW1 constant), ramping up/down by 1 per timestep when coils QX00/QX01/QX02 are in states like [0,1,1] (decreasing) or [1,1,1] (increasing), [0,0,0] (idle/hold). PLC2_IW0 similarly oscillates between 10 (PLC2_MW0) and 20 (PLC2_MW1), controlled by PLC2_QX00 (1 increasing, 0 decreasing). PLC3_IW0 mostly constant at 1 (occasionally 2), MW1=10 constant, QX00 mostly 0, suggesting supervisory or sensor role. 'prev_' columns indicate lagged states for change detection or PID. Cyclic ramping patterns typical of PLC-simulated tank filling/draining tests. Constant MW values match setpoints."
  },
  "subsystem_inference": {
    "estimated_subsystems": 3,
    "confiden

In [None]:
# Dataset analysis with a standardised (JSON) output: a single question about
# the type of physical system, answered in a fixed structure.
base_prompt = """
You are an expert in Industrial Control Systems (ICS), specialized in identifying system architecture and component relationships by analyzing time-series values of PLC registers.

## Data Format
You will receive a raw plaintext dataset in CSV form:
- The first row contains column names (register labels).
- All fields are comma-separated.
- Each following row represents the register states at a specific timestamp.

## Task
Analyze the dataset and answer the following question:
- Can you infer what type of physical industrial control system these data may refer to?

## Output Requirements
Respond ONLY with a JSON object in this exact structure:

{
  "q": {
    "answer": "",
    "confidence": 0.0,
    "reasoning": ""
  },
  "limitations": [],
  "internal_checks": {
    "columns_used": [],
    "assumptions_detected": [],
    "warnings": []
  }
}

Fill every field; use null if uncertain.  
Do NOT add text outside the JSON.

## Data (CSV)
"""

# One request: the instructions go in the developer message, the dataset in the user message.
c = CLIENT.chat.completions.create(
    model=model,
    messages=[
        {"role": "developer", "content": [{"type": "text", "text": base_prompt}]},
        {"role": "user", "content": [{"type": "text", "text": data}]},
    ],
)

# Reading the first choice fails when the response carries no choices;
# in that case the whole response object is printed for inspection.
try:
    answer = c.choices[0].message.content
    print(answer)
except Exception as e:
    print(f"Error: {e}")
    print("Response:", c)

{
  "q1": {
    "answer": "Multi-tank liquid level control system (two main tanks with pumps for filling/emptying/transfer)",
    "confidence": 0.95,
    "reasoning": "The dataset shows PLC1_InputRegisters_IW0 oscillating between approximately 40 and 80, and PLC2_InputRegisters_IW0 between 10 and 20, with inverse correlation: when PLC1 level decreases, PLC2 increases, and vice versa, indicating liquid transfer between two tanks via pumps. PLC1_MemoryRegisters_MW0=40 (low setpoint), MW1=80 (high setpoint); PLC2_MW0=10, MW1=20 similarly. PLC1_Coils_QX00-QX02 activate together (often 0,0,0 or 1,1,1) during transfer phases, likely controlling multiple pumps. PLC2_Coils_QX00 and PLC3_Coils_QX00 toggle similarly. PLC3_InputRegisters_IW0 mostly 1 (possibly flow sensor), PLC3_MemoryRegisters_MW1=10 constant. Previous values (prev_*) detect edges/changes for control logic. Pattern repeats cyclically with hysteresis control typical of tank level systems to avoid chattering."
  },
  "limitations"

In [14]:
# Query the model to obtain the invariants
# NOTE(review): the prompt states "Each PLC controls 3 water tanks" and then
# lists one tank per PLC (T-201, T-202, T-203); the wording looks contradictory
# and should be confirmed before comparing results across runs. The prompt text
# is left untouched here because changing it changes the experiment.
base_prompt = """
# Context
You are an expert in Industrial Control Systems (ICS).
Your expertise is in understanding system architecture by analyzing the time-series values of its registers.
An ICS system is secured using invariants: these are properties or relationships that must always remain constant to ensure the system's correct and safe operation.
Present your findings in a clear, structured format.

---

# ICS in analysis
The system being analyzed has 3 PLCs. Each PLC controls 3 water tanks (and their related components):
- PLC1 controls T-201 
- PLC2 controls T-202
- PLC3 controls T-203
The system's purpose is water purification (creating potable water) via a chemical process.

---

# Data
The analysis dataset is a time-series scan of the system's register states.
The plaintext dataset is in CSV format:
* The first row is the header (column names).
* All data is comma-separated.
* Data values in subsequent rows correspond to the header columns by their position (index).

---

# Task
Your objectives are:
1. Register Classification: Analyze the data and classify the registers by their functional type for every PLC.
2. Invariant Extraction: Identify and extract significant invariants from the data. These invariants are the critical relationships that define the system's normal behavior.
   Goal: These invariants will be used to monitor the system, secure it, and verify that its normal operational flow is not compromised at any time.
3. Try to demonstrate the invariants you found with examples from the data provided.
4. Try to demonstrate this property: "When PLC1_Coils_QX00 changes its state from 0 to 1, it activates an ascending trend in T-201."
"""

# One request: the instructions go in the developer message, the dataset in the user message.
c = CLIENT.chat.completions.create(
    model=model,
    messages=[
        {
            "role":"developer",
            "content": [{"type":"text", "text":base_prompt}]
        },
        {
            "role":"user",
            "content": [{"type":"text", "text":data}]
        }
    ]
)

try:
    answer = c.choices[0].message.content
    print(f"{answer}")
except Exception as e:
    print(f"Error: {e}")
    # The response may carry an `error` payload instead of choices (presumably
    # a dict with a "message" key, as the original handler assumed). Read it
    # defensively so that a missing or differently shaped payload does not
    # raise a second exception from inside this handler and hide the first.
    error_payload = getattr(c, "error", None)
    if isinstance(error_payload, dict) and "message" in error_payload:
        print("Response:", error_payload["message"])
    else:
        print("Response:", c)

Okay, I've analyzed the provided data and will present my findings in a structured format.

## 1. Register Classification

Based on the data and context, here's a classification of the registers by their functional type for each PLC:

*   **Input Registers (IW0):** Likely represent the water level sensor readings for tanks.  These are analog inputs converted to digital values. Values range from 10 to 81.

*   **Memory Registers (MW0 and MW1):** Used to store intermediary results or setpoints for the control logic.

*   **Coils (QX00, QX01, QX02):** Relays used to control the state of actuators and may control the flow to the tanks, pumps, or chemical dosing.

    **Register Classification Table:**

| Register                    | Functional Type              | Description                                                                     |
| -------------------------- | ---------------------------- | ------------------------------------------------------------------------------- |
| P

In [None]:
# Query the model with every question in sequence, feeding back previous answers
base_prompt = """
You are an expert in industrial control systems and in understanding the system architecture by looking at the register values.
In the following, I'll provide you with a list of register values collected over time.

Data:
- This is the scan obtained by observing, for a certain period of time, the contents of the registers of 3 PLCs of an industrial system.
- The first line contains headers that you can use to refer to each register in your answer.

Task:
- The goal is to analyze these values and identify any relationships between columns values.
- Within the full response attach an evaluable response — in this case, Yes, No or a Number.
"""

# NOTE(review): q4 and q5 ask the same question with a different opening
# ("In reality" / "In fact"); confirm whether both are meant to be asked.
q1 = "Q1: Are you able to understand at what type of phisical industrial control system these data refers to?"
q2 = "Q2: Assuming that the industrial system is a water filtration system that certainly uses tanks, are you able to understand how many tanks are involved in the system?"
q3 = "Q3: Are you able to associate the 3 PLCs with the tanks you think you have identified?"
q4 = "Q4: In reality, there are three tanks. Are you able to identify which PLC registers are dedicated to measurements and which to actuations, for each tank?"
q5 = "Q5: In fact, there are three tanks. Are you able to identify which PLC registers are dedicated to measurements and which to actuations, for each tank?"
q6 = "Q6: In fact, there are three tanks. Are you able to identify if there are PLC registers used to store other significant information, such as setpoints or other parameters?"
q7 = "Q7: Considering that there are three tanks, are you able to deduce whether there is a physical connection, i.e., pipes, linking two or more tanks? (This is information that, in our paper, we are not able to derive)"
q8 = "Q8: Assuming you have identified the registers dedicated to containing measurements and actuations for each tank, are you able to construct, for each tank, a chart showing how these quantities, measurements and actuations, evolve over time?"
q9 = "Q9: Are you able to construct a chart representing the operation of the entire system over time?"
q10 = "Q10: For each tank, are you able to extract invariants that relate measurements with actuations?"
q11 = "Q11: Are you able to extract significant invariants that connect the register contents of the entire industrial system?"

# q8 and q9 are defined but not asked. The printed "Question n°" is therefore
# the position in this list, not the Q-number embedded in the question text.
questions = [q1, q2, q3, q4, q5, q6, q7, q10, q11]

# Answers collected so far; they are replayed to the model as an assistant message.
context = []


for i, q in enumerate(questions, start=1):
    messages = [
        {
            "role":"developer",
            "content": [{"type":"text", "text":base_prompt}]
        }
    ]
    # Only replay previous answers when there are some: the first request
    # would otherwise carry an assistant message with empty text.
    if context:
        messages.append(
            {
                "role":"assistant",
                "content": [{"type":"text", "text":"\n\n".join(context)},]
            }
        )
    messages.append(
        {
            "role":"user",
            "content": [{"type":"text", "text":data}, {"type":"text","text":q}]
        }
    )

    c = CLIENT.chat.completions.create(
        model=model,
        messages=messages
    )

    try:
        answer = c.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        print("Response:", c)
        # No answer for this question: skip the context update, so that neither
        # an undefined name nor the previous (stale) answer gets appended.
        continue

    print(f"Question n°{i}\n{answer}\n-------------------------\n\n")

    # Context update; a None answer is skipped because it would break the
    # "\n\n".join(context) of the next request.
    if answer is not None:
        context.append(answer)

Question n°1
Short answer (evaluable): No

Reasoning (concise):

I see many strong structural patterns:

- PLC1/2/3:
  - IW0 ramps smoothly (≈40–80) in repetitive up/down “sawtooth” cycles.
  - MW0=40 and MW1=80 stay constant → likely configuration/limits, not process.
  - QX00 and the other coils toggle deterministically with IW0 ranges and previous-state columns.
  - prev_* columns line up: each scan is tightly consistent with the previous one → this is logged cyclic PLC runtime, not random noise.

- PLC2:
  - Very similar logic to PLC1, sharing 10/20 patterns and the same hysteresis-like coil behavior.
  - Often acts as a supervisor or companion to PLC1’s behavior.

- PLC3:
  - IW0 mostly small integer (0–10-ish), slowly changing.
  - QX00 is mostly 0, a few 1s; appears to count / signal discrete events, not continuous control.

All of this points to:

- A closed-loop control with:
  - One or more analog values being driven between min/max limits,
  - Threshold-based coil actuation 