In [10]:
from openai import OpenAI
import dotenv
import os
import json
import pandas as pd

from time import time
from datetime import datetime

dotenv.load_dotenv("../.env")

True

In [11]:
# logging configuration
import logging

# Root logger: records at INFO level and above are processed
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Raise the threshold of the "httpx" logger to WARNING so its INFO records are dropped
logging.getLogger("httpx").setLevel(logging.WARNING)

In [12]:
class Question:
    """A closed-answer question with a known correct option.

    The constructor stores its arguments unchanged:
        id: identifier placed in front of the question text in prompts.
        text: wording of the question.
        answers_domain: indexable collection of the allowed answers (strings).
        correct_answer_index: index into ``answers_domain`` of the right answer.
        context: additional context attached to the question.
    """

    def __init__(self, id, text, answers_domain, correct_answer_index, context):
        self.id, self.text = id, text
        self.answers_domain = answers_domain
        self.correct_answer_index = correct_answer_index
        self.context = context

    def is_correct(self, response):
        """Return True when ``response`` equals the correct answer, ignoring case and surrounding whitespace."""
        given = response.strip().lower()
        expected = self.answers_domain[self.correct_answer_index]
        return given == expected.strip().lower()

    def get_prompt_text(self):
        """Return the question as ``"<id>: <text>"``."""
        return "{}: {}".format(self.id, self.text)

    def __str__(self):
        # Delegates so that an overridden get_prompt_text is honoured
        return self.get_prompt_text()

In [13]:
class Dataset:
    """Tabular dataset loaded from a CSV file and rendered as text in several formats.

    Attributes:
        df: pandas DataFrame read from the CSV file.
        filepath: path the dataset was loaded from, as a string.
        filename: last component of ``filepath``.
        system_name: name of the directory that directly contains the file
            (empty string when the path has no directory component).
    """

    # Natural-language description of each supported text format, used for prompting
    FORMAT_DESCRIPTIONS = {
        'csv': (
            "You will receive a raw plaintext dataset in CSV format.\nEach row represents a record, and each column represents an attribute of the data.\nHeader is included in the first row."
        ),

        'horizontal_csv': (
            "You will receive a raw plaintext dataset in Horizontal CSV format.\nHeader is included in the first column\nEach column represents a record, and each row represents an attribute of the data."
        ),

        'json': (
            "You will receive a raw plaintext dataset in JSON format.\nEach record is a JSON object with keys representing attributes."
        ),

        'markdown_kv': (
            "You will receive a raw plaintext dataset in Markdown Key-Value format.\nEvery record is represented as a series of key-value pairs, with each pair on a new line and records separated by a line containing three dashes '---'."
        )
    }

    def __init__(self, csv_path):
        """Load the CSV file at ``csv_path`` (a string or ``os.PathLike``).

        Raises:
            Exception: whatever ``pandas.read_csv`` raises when the file cannot
                be loaded. The error is printed and re-raised so that a
                half-initialised instance is never handed to the caller.
        """
        csv_path = os.fspath(csv_path)
        try:
            self.df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            raise

        self.filepath = csv_path
        self.filename = os.path.basename(csv_path)
        # Dataset system name = name of the directory containing the file
        self.system_name = os.path.basename(os.path.dirname(csv_path))

    # Methods to get text data in different formats

    def get_csv_data(self):
        """Return the data as CSV text with a header row and no index column."""
        return self.df.to_csv(index=False)

    def get_horizontal_csv_data(self):
        """Return the transposed data as CSV text: one line per original column, starting with the column name."""
        return self.df.transpose().to_csv(header=False, index=True)

    def get_json_data(self):
        """Return the data as a JSON array with one object per record, indented by 4 spaces."""
        return self.df.to_json(orient="records", indent=4)

    def get_md_kv_data(self):
        """Return the data as ``key: value`` lines, with a ``---`` line after every record."""
        output_lines = []
        for record in self.df.to_dict(orient='records'):
            output_lines.extend(f"{key}: {value}" for key, value in record.items())
            # Separator
            output_lines.append("---")

        return "\n".join(output_lines)

    def get_formatted_data(self, format_type):
        """Return the data rendered in ``format_type``.

        Raises:
            ValueError: if ``format_type`` is not one of the supported formats.
        """
        renderers = {
            'csv': self.get_csv_data,
            'horizontal_csv': self.get_horizontal_csv_data,
            'json': self.get_json_data,
            'markdown_kv': self.get_md_kv_data,
        }
        if format_type not in renderers:
            raise ValueError(f"Unsupported format type: {format_type}")
        return renderers[format_type]()

    # Method to get data format description for prompting
    def get_format_description(self, format_type):
        """Return the prompt description of ``format_type``, or a fallback sentence for unknown formats."""
        return self.FORMAT_DESCRIPTIONS.get(format_type, "No description available for this format.")
            
    

In [14]:
class QuestionPrompt:
    """Assembles the prompt texts for one question over one dataset."""

    def __init__(self, question, dataset):
        self.question = question
        self.dataset = dataset

    def build_prompt(self, format_type='csv'):
        """Return the instruction prompt: role, data-format and task sections,
        followed by a context section when the question has a non-empty context.
        Sections are separated (and the text terminated) by a blank line."""
        sections = [
            "You are an expert in Industrial Control Systems (ICS) and Operational Technology (OT), specialized in identifying system architecture and component relationships by analyzing time-series values of PLC registers.",
            f"# Data Format\n{self.dataset.get_format_description(format_type)}",
        ]

        context = self.question.context
        context_block = f"# Context\n{context}"
        sections.append(
            f"# Task\nAnalyze the provided data to determine the physical architecture and answer to this question:\n{self.question.get_prompt_text()}"
        )

        # The context section is only emitted when there is something to say
        if len(context) != 0:
            sections.append(context_block)

        return "\n\n".join(sections) + "\n\n"

    def build_data_prompt(self, format_type='csv'):
        """Return the dataset rendered in ``format_type`` under a ``# Data`` heading."""
        return f"# Data\n{self.dataset.get_formatted_data(format_type)}"

In [15]:
class LogErrorCode:
    """Numeric error code paired with its human-readable message."""

    MODEL_ERROR = 101
    JSON_PARSE_ERROR = 201
    MISSING_SHORT_ANSWER = 301

    def __init__(self, code):
        self.code = code
        known_messages = (
            (self.MODEL_ERROR, "Model returned an error."),
            (self.JSON_PARSE_ERROR, "Error parsing JSON response."),
            (self.MISSING_SHORT_ANSWER, "Missing short answer in response."),
        )
        # First matching code wins; anything else gets the generic message
        self.message = next(
            (text for known, text in known_messages if code == known),
            "Unknown error code.",
        )

    def __str__(self):
        return "[Error {}]: {}".format(self.code, self.message)

In [16]:
# OpenAI client pointed at the OpenRouter endpoint; the API key is read
# from the DENIS_KEY environment variable
CLIENT = OpenAI(
    api_key=os.environ.get("DENIS_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

In [17]:
# Q1-1: Classify the physical Industrial Control System (ICS) into one of the following categories
d = [
    'Energy & Power System',
    'Oil, Gas & Chemicals System',
    'Manufacturing System',
    'Water & Wastewater System',
    'Building Automation System',
    'Not Identifiable',
]

q_1_1 = Question(
    id="Q1-1",
    text=f"Classify the physical Industrial Control System (ICS) into one of the following categories: {d}",
    answers_domain=d,
    # expected answer, looked up by value (index 3 in the list above)
    correct_answer_index=d.index('Water & Wastewater System'),
    context="",
)

In [30]:
# Q2-1: Identify how many water tanks are involved during the operations of the ICS under consideration
# Allowed answers are the strings "0" .. "10"
d = list(map(str, range(11)))

q_2_1 = Question(
    id="Q2-1",
    text=f"Identify how many water tanks are involved during the operations of the ICS under consideration between the following options: {d}.",
    answers_domain=d,
    # expected answer, looked up by value (index 3)
    correct_answer_index=d.index("3"),
    context="The ICS is categorized as a Water & Wastewater System.",
)

In [29]:
# Q3-1: Identify how many PLCs are involved during the operations of the ICS under consideration
# Allowed answers are the strings "0" .. "10"
d = list(map(str, range(11)))

q_3_1 = Question(
    id="Q3-1",
    text=f"Identify how many PLCs are involved during the operations of the ICS under consideration between the following options: {d}.",
    answers_domain=d,
    # expected answer, looked up by value (index 3)
    correct_answer_index=d.index("3"),
    context="The ICS is categorized as a Water & Wastewater System.\nThe operations involve three water tanks.",
)

In [20]:
# send prompt to model
def _string_array(description):
    """Return the schema of an array of strings carrying the given description."""
    return {
        "type": "array",
        "description": description,
        "items": {"type": "string"},
    }


def _build_response_schema(answers_domain):
    """Build the JSON schema the model response must follow.

    Every object declares all of its properties as required and sets
    ``additionalProperties`` to False, and every array declares its item type:
    strict structured-output validation expects a fully specified schema.
    ``short_answer`` is restricted to the values of ``answers_domain``.
    """
    return {
        "type": "object",
        "properties": {
            "response": {
                "type": "object",
                "properties": {
                    "short_answer": {
                        "type": "string",
                        "enum": list(answers_domain),
                    },
                    "confidence": {
                        "type": "number",
                        "description": "Confidence level from 0 to 1, where 1 is highest"
                    },
                    "reasoning": {
                        "type": "string",
                        "description": "Reasoning behind the answer"
                    }
                },
                "required": ["short_answer", "confidence", "reasoning"],
                "additionalProperties": False
            },
            "limitations": _string_array("List of limitations or uncertainties in the analysis"),
            "internal_checks": {
                "type": "object",
                "properties": {
                    "columns_used": _string_array("List of columns from the dataset that were used in the analysis"),
                    "assumptions_detected": _string_array("List of assumptions made during the analysis"),
                    "warnings": _string_array("List of warnings or potential issues identified during the analysis")
                },
                "required": ["columns_used", "assumptions_detected", "warnings"],
                "additionalProperties": False
            }
        },
        "required": ["response", "limitations", "internal_checks"],
        "additionalProperties": False
    }


def send_prompt(model_name, question: "Question", prompt: "QuestionPrompt", format_type):
    '''
    Sends the constructed prompt to the specified model and retrieves the response.

    The instruction prompt is sent as the system message and the formatted
    dataset as the user message; the reply is constrained to a JSON schema
    whose ``short_answer`` must be one of ``question.answers_domain``.

    Returns:
        The raw message content (a string) on success. On any failure,
        including an empty reply, the exception instance is RETURNED rather
        than raised, so callers must check ``isinstance(result, Exception)``.
    '''
    try:
        completion = CLIENT.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system",
                    "content": [{"type": "text", "text": prompt.build_prompt(format_type)}]
                },
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt.build_data_prompt(format_type)}]
                },
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "response",
                    "strict": True,
                    "schema": _build_response_schema(question.answers_domain)
                }
            }
        )

        raw = completion.choices[0].message.content
        if not raw:
            raise Exception("No content in response")
        return raw

    except Exception as e:
        return e

In [21]:
# function which creates the evaluation log file
def create_evaluation_log_file(json_log, question, dataset, dataset_format, model):
    """Write ``json_log`` as an indented JSON file under ``../responses``.

    The file is stored at
    ``../responses/<YYYY_MM_DD>/<dataset.system_name>/<model>/<dataset_format>/<YYYYmmdd_HHMMSS>_<question.id>.json``
    where ``<model>`` is the part of ``model`` after the provider prefix
    (``provider/name:tag`` -> ``name-tag``). A model id without a ``/`` is used
    whole instead of failing. Missing directories are created.
    """
    model_parts = model.split("/")
    model_part = model_parts[1] if len(model_parts) > 1 else model_parts[0]
    sanitized_model_name = model_part.replace(":", "-")

    # One clock reading, so the date folder and the file timestamp always agree
    now = datetime.now()
    path = f"../responses/{now.strftime('%Y_%m_%d')}/{dataset.system_name}/{sanitized_model_name}/{dataset_format}/"
    filename = f"{now.strftime('%Y%m%d_%H%M%S')}_{question.id}.json"
    filepath = os.path.join(path, filename)

    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(filepath, 'w', encoding="utf-8") as f:
        json.dump(json_log, f, indent=4)

    logging.info(f"JSON log file created: {filepath}")
    
    
def create_error_log(code, i, iterations):
    """Build the error entry for a failed run and log it.

    Args:
        code: one of the ``LogErrorCode`` constants describing the failure.
        i: zero-based index of the run (logged as ``i + 1``).
        iterations: total number of runs, used in the log line.

    Returns:
        ``{"error": {"code": <code>, "msg": <message for code>}}`` where the
        message comes from ``LogErrorCode(code).message``.
    """
    # Short labels used in the log line; unknown codes get a generic label
    log_labels = {
        LogErrorCode.MODEL_ERROR: "Model Error",
        LogErrorCode.JSON_PARSE_ERROR: "JSON Parse Error",
        LogErrorCode.MISSING_SHORT_ANSWER: "Missing Short Answer",
    }
    log_msg = log_labels.get(code, "Unknown Error")

    # Record the code that was actually passed in, not a fixed one
    error_log = {"code": code, "msg": LogErrorCode(code).message}

    logging.error(f"[Run {i+1}/{iterations}] ERROR {log_msg}")

    return {"error": error_log}


def _percentage(part, whole):
    """Return ``part / whole`` as a percentage rounded to two decimals (0.0 when ``whole`` is not positive)."""
    if whole <= 0:
        return 0.0
    # Scale before rounding: round(x, 2) * 100.0 leaves artifacts such as 56.99999999999999
    return round(100.0 * part / whole, 2)


def _as_text(value):
    """Return ``value`` as stripped text; ``None`` becomes an empty string."""
    return "" if value is None else str(value).strip()


def _parse_answer(raw):
    """Extract ``(short_answer, confidence, reasoning)`` from a raw JSON reply.

    Raises TypeError/ValueError when ``raw`` is not valid JSON or is not a JSON
    object. A missing or malformed ``response`` member yields an empty
    short answer instead of an exception.
    """
    payload = json.loads(raw)
    if not isinstance(payload, dict):
        raise ValueError("response is not a JSON object")
    data = payload.get('response')
    if not isinstance(data, dict):
        data = {}
    # Models may return numbers or null where text is expected; normalise to text
    return _as_text(data.get('short_answer')), data.get('confidence', 0), _as_text(data.get('reasoning'))


def evaluate_question(question: "Question", dataset: "Dataset", dataset_format, model, iterations=10):
    '''
    Evaluate a question on one configuration of dataset, dataset format and LLM model.

    The prompt is sent ``iterations`` times. Each run is recorded either with
    its answer (short_answer, confidence, reasoning) or with an error entry
    (model error, unparsable JSON, missing short answer). Aggregated statistics
    are percentages rounded to two decimals:
        accuracy    = correct runs / all runs
        reliability = runs without error / all runs
        precision   = correct runs / runs without error
    A denominator of zero yields 0.0. The resulting log is written to a JSON
    file and also returned.
    '''
    # initialize json log structure
    json_log = {
        "experiment_id": f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
        "config": {
            "model": model,
            "dataset_format": dataset_format,
            "iterations": iterations,
            "question": question.get_prompt_text()
        },
        "dataset": {
            "filepath": dataset.filepath,
            "system_name": dataset.system_name,
            "is_anon": "anonymized" in dataset.filename
        },
        "runs": [],
        "aggregated_stats": {
            "correct_n": 0,
            "incorrect_n": 0,
            "error_n": 0,
            "accuracy": 0.0,
            "reliability": 0.0,
            "precision": 0.0,
            "evaluation_time": 0.0
        }
    }

    correct_n = 0
    error_n = 0
    runs_log = []

    logging.info(f"START EVAL | Q: {question.id} | Model: {model} | Dataset: {dataset.system_name} | Format: {dataset_format} | Iterations: {iterations}")

    start_time = time()
    for i in range(iterations):
        run_log = {"run": i+1, "short_answer": None, "confidence": None, "reasoning": None}

        # send prompt to model and get response (an Exception instance signals failure)
        response = send_prompt(model, question, QuestionPrompt(question, dataset), dataset_format)

        # classify the outcome: error_code stays None for a usable answer
        error_code = None
        if isinstance(response, Exception):
            error_code = LogErrorCode.MODEL_ERROR
        else:
            try:
                short_answer, confidence, reasoning = _parse_answer(response)
            except (TypeError, ValueError):
                # json.JSONDecodeError is a ValueError
                error_code = LogErrorCode.JSON_PARSE_ERROR
            else:
                if not short_answer:
                    error_code = LogErrorCode.MISSING_SHORT_ANSWER

        if error_code is not None:
            error_n += 1
            # add error log to run log
            run_log.update(create_error_log(error_code, i, iterations))
            runs_log.append(run_log)
            continue

        # correct answer check
        is_correct = question.is_correct(short_answer)
        if is_correct:
            correct_n += 1

        status_tag = "CORRECT" if is_correct else "INCORRECT"
        logging.info(f"[Run {i+1}/{iterations}] {status_tag} (Conf: {confidence}) | Ans: {short_answer}")

        # update log
        run_log["short_answer"] = short_answer
        run_log["confidence"] = confidence
        run_log["reasoning"] = reasoning
        runs_log.append(run_log)

    end_time = time()
    evaluation_time = round(end_time - start_time, 1)
    # runs that produced a usable answer
    valid_iterations = iterations - error_n

    # add runs to json log
    json_log["runs"] = runs_log

    # add aggregated stats to json log
    stats = json_log["aggregated_stats"]
    stats["correct_n"] = correct_n
    stats["incorrect_n"] = iterations - correct_n - error_n
    stats["error_n"] = error_n
    stats["accuracy"] = _percentage(correct_n, iterations)
    stats["reliability"] = _percentage(valid_iterations, iterations)
    stats["precision"] = _percentage(correct_n, valid_iterations)
    stats["evaluation_time"] = evaluation_time

    # create evaluation log file
    create_evaluation_log_file(json_log, question, dataset, dataset_format, model)

    logging.info(f"END EVAL | Errors: {error_n} | Time: {evaluation_time} sec.")

    return json_log

In [22]:
# Model identifiers under test; each is passed as `model` to evaluate_question
models = [
    "mistralai/devstral-2512:free",
    "xiaomi/mimo-v2-flash:free",
    "tngtech/tng-r1t-chimera:free",
]

In [23]:
# Dataset evaluated by every run below
ds = Dataset(
    csv_path="../datasets/swat/compressed_simplified-swat_plc-data-log_750-lines.csv"
)

In [24]:
formats = Dataset.FORMAT_DESCRIPTIONS.keys()

# For each model and each data format, evaluate Q1-1, Q2-1 and Q3-1 in that
# order, 10 iterations per question
for model in models:
    for dataset_format in formats:
        for question in (q_1_1, q_2_1, q_3_1):
            evaluate_question(question, ds, dataset_format, model, iterations=10)

INFO:root:START EVAL | Q: Q1-1 | Model: mistralai/devstral-2512:free | Dataset: swat | Format: csv | Iterations: 10


KeyboardInterrupt: 