# 1. Package Loading

In [1]:
# Standard Library
import json
import os
import re
import requests
import getpass
from collections import Counter

# Data Manipulation
import pandas as pd
import numpy as np

# PDFs
import pdfplumber

# LLMs
import tiktoken
import textwrap

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.graphs.graph_document import (
    GraphDocument,
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Typing & Validation
from pydantic import BaseModel, ValidationError, Field
from typing import Any, Dict, List, Optional

# 2. Raw Data Extraction

## 2.1 Incident Reports

In [2]:
# Folder that holds the incident report PDFs
pdf_directory = "./reports_ie/"

# Keep only entries whose extension is .pdf (case-insensitive)
pdf_files = [entry for entry in os.listdir(pdf_directory) if entry.lower().endswith(".pdf")]

# Print a 1-based numbered listing; count stays 0 when the folder has no PDFs
count = 0
for count, file in enumerate(pdf_files, start=1):
    print(f"{count}. {file}")

1. IE-10375 - 210827 Collision with track equipment.pdf
2. IE-10397 - 211207 Clontarf.pdf
3. IE-10404 - 230222 Broken Rail Emly.pdf
4. IE-200608 BnM Collision LC Offaly.pdf
5. IE-6218-200111 Collision RRME Rosslare.pdf
6. IE-6262-200429 LC Collision XM240.pdf
7. IE-6291-200524 LC XA068 Ashfield.pdf
8. IE-6305 - 200707_locomotive_224.pdf


In [3]:
# File name (inside pdf_directory) of the single report processed below
pdf_name = "IE-6291-200524 LC XA068 Ashfield.pdf"

In [4]:
# Regex patterns: a line that consists only of the summary heading, and lines that start a different section
SUMMARY_START_PATTERN = re.compile(r"^(report\s+summary|summary)[:\s]*$", re.IGNORECASE)
SECTION_HEADING_PATTERN = re.compile(
    r"^(table of contents|contents|RAIU investigation|description of the occurrence|analysis|conclusions|measures taken|safety recommendations|additional information|list of abbreviations|glossary|references)",
    re.IGNORECASE)

# Defining the extraction function
def extract_summary_section(pdf_path: str, header_detection_pages: int = 10) -> str:
    """
    Extract the text of the summary section from a PDF report.

    A running page header is detected first: if the same first line occurs on at
    least 70% of the sampled pages, it is dropped from every page that starts with it.
    Capture starts after a line matching SUMMARY_START_PATTERN and stops at the
    first line matching SECTION_HEADING_PATTERN. Once a non-empty summary has been
    closed, later pages are ignored so that another stand-alone "Summary" line
    (e.g. in an appendix) cannot append unrelated text.

    Args:
    - pdf_path (str): Path of the PDF file to read
    - header_detection_pages (int): Number of leading pages sampled for header detection

    Returns:
    - str: The stripped summary text, or the stripped full text (with "[Page N]"
      markers) when no summary lines were captured
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract every page exactly once; both passes below reuse this list
        page_texts = [(page.page_number, page.extract_text()) for page in pdf.pages]

    # max(0, ...) keeps a negative argument from turning into a from-the-end slice
    pages_to_check = max(0, min(header_detection_pages, len(page_texts)))

    # Step 1: Collect the first line of each sampled page that has text
    first_lines = [
        page_text.split("\n")[0].strip()
        for _, page_text in page_texts[:pages_to_check]
        if page_text
    ]

    # Step 2: Determine most common first line (if repeated)
    detected_header = None
    if first_lines:
        most_common_line, count = Counter(first_lines).most_common(1)[0]
        if count >= pages_to_check * 0.7:  # Header appears on 70%+ of sampled pages
            detected_header = most_common_line
            print(f"[INFO] Detected consistent header: '{detected_header}'")

    # Step 3: Process all pages and remove header if matched
    page_blocks = []      # per-page text, only needed for the full-text fallback
    summary_lines = []
    capturing = False
    summary_complete = False

    for page_number, page_text in page_texts:
        if not page_text:
            continue

        lines = page_text.split("\n")
        if detected_header and lines[0].strip() == detected_header:
            lines = lines[1:]  # Remove header

        page_blocks.append(f"[Page {page_number}]\n" + "\n".join(lines) + "\n\n")

        for line in lines:
            stripped = line.strip()

            if not capturing:
                if SUMMARY_START_PATTERN.match(stripped):
                    print(f"[INFO] Found summary start on Page {page_number}: '{stripped}'")
                    capturing = True  # The heading line itself is not captured
                continue

            if SECTION_HEADING_PATTERN.match(stripped):
                print(f"[INFO] Stopping capture at heading: '{stripped}'")
                capturing = False
                # An empty capture stays open to a later summary heading;
                # a non-empty one is final.
                summary_complete = bool(summary_lines)
                break

            summary_lines.append(line)

        if summary_complete:
            # The fallback text is not needed any more, so the remaining pages can be skipped
            break

    if not summary_lines:
        print(f"Warning: No summary section found in {pdf_path}. Returning full text.")
        return "".join(page_blocks).strip()

    return "\n".join(summary_lines).strip()

# Run the extraction on the selected report; the path is pdf_directory and pdf_name concatenated as-is
pdf_text = extract_summary_section(f"{pdf_directory}{pdf_name}", header_detection_pages=10)
# Preview only the first 500 characters of the extracted text
print(pdf_text[:500])

[INFO] Detected consistent header: 'Member of public trapped in the barriers of CCTV LC XA068, Ashfield, Offaly, 24th May 2020'
[INFO] Found summary start on Page 4: 'Summary'
[INFO] Stopping capture at heading: 'Contents'
At approximately 12:13 hour (hrs) on the 24th May 2020, a Level Crossing Control Operative
(LCCO), referred to as LCCO1 in this report, located at Athlone Local Control Centre (ALCC)
cleared Closed-Circuit Television (CCTV) Level Crossing (LC) XA068, located in Ashfield,
Offaly for the passage of the 11:00 hrs passenger service from Galway to Heuston (Train
A703) while a member of the public (MOP) was inside the barriers of the level crossing.
The MOP had requested assistance from LCCO1 using th


## 2.2 Accident Categories

In [5]:
# Read the three event-type category files (A, B, C) with the same encoding
event_type_frames = [
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        "./data/category-a-event-types-source.csv",
        "./data/category-b-event-types-source.csv",
        "./data/category-c-event-types-source.csv",
    )
]
cat_a_events, cat_b_events, cat_c_events = event_type_frames

# Stack them into one frame with a fresh 0..n-1 index
cat_events = pd.concat(event_type_frames, ignore_index=True)
cat_events

Unnamed: 0,Id,Code,Name,Definition,Broader
0,A1,A1,Collisions,A collision event falling within the A.1 sub-c...,A
1,A1-1,A1.1,Collision of train with a train/rail vehicle,A front to front; front to end or a side colli...,A1
2,A1-2,A1.2,Collision of train with obstacle within the cl...,A collision between a part of a train and obje...,A1
3,A1-3,A1.3,Collision of one or more rail vehicles with an...,Same as A1.1 but concerning more rail vehicles...,A1
4,A1-4,A1.4,Collision of one or more rail vehicles with ob...,Same as A1.2 but concerning one or more rail v...,A1
...,...,...,...,...,...
226,C-3-4,C.3.4,Arson,,C-3
227,C-3-5,C.3.5,Vandalism,,C-3
228,C-3-6,C.3.6,Cyber attack,,C-3
229,C-3-0,C.3.0,Other external events - Security,Any variation falling within the category ÔExt...,C-3


## 2.3 Contributing & Systemic Factors

In [6]:
# Read the contributing-factor and systemic-factor tables (same encoding for both)
contr_fact, sys_fact = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        "./data/contributing-factors-source.csv",
        "./data/systemic-factors-source.csv",
    )
)

# 3. Langchain Chunk Splitting

## 3.1 Incident Reports

In [7]:
# Define function to split the incident report into chunks
def split_report_into_chunks(text: str, chunk_size: int = 2000, chunk_overlap: int = 300) -> list[str]:
    """
    Break a report text into overlapping chunks with LangChain's recursive splitter.

    Args:
    - text (str): Text to split
    - chunk_size (int): Maximum chunk length passed to the splitter
    - chunk_overlap (int): Overlap between consecutive chunks passed to the splitter

    Returns:
    - list[str]: [] for empty input (a warning is printed), [text] when the text
      already fits into one chunk, otherwise the splitter's output
    """
    # Only texts longer than one chunk need the splitter at all
    if text and len(text) > chunk_size:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return splitter.split_text(text)

    if text:
        return [text]

    print("Warning: No text provided for splitting.")
    return []

# Split the extracted text
report_chunks = split_report_into_chunks(pdf_text)

# Print the number of chunks and, when there is one, a sample chunk.
# split_report_into_chunks returns [] for empty text, so indexing [0] unguarded would raise IndexError.
print(f"[INFO] Total chunks: {len(report_chunks)}")
if report_chunks:
    print(f"[INFO] First chunk:\n{report_chunks[0]}")

[INFO] Total chunks: 3
[INFO] First chunk:
At approximately 12:13 hour (hrs) on the 24th May 2020, a Level Crossing Control Operative
(LCCO), referred to as LCCO1 in this report, located at Athlone Local Control Centre (ALCC)
cleared Closed-Circuit Television (CCTV) Level Crossing (LC) XA068, located in Ashfield,
Offaly for the passage of the 11:00 hrs passenger service from Galway to Heuston (Train
A703) while a member of the public (MOP) was inside the barriers of the level crossing.
The MOP had requested assistance from LCCO1 using the telephone provided at the level
crossing. LCCO1 advised the MOP that they would raise the barriers and instructed the MOP
to stand beside some level crossing equipment (clear of the tracks, but within the confines of
the level crossing). LCCO1 did not raise the barriers and allowed Train A703 to pass through
LC XA068. The MOP was uninjured as a result of this incident.
The RAIU identified the following causal factors (CaF) associated with the incident

## 3.2 Accident Categories

In [8]:
# One dictionary per DataFrame row, keyed by column name
cat_events_dict = cat_events.to_dict(orient="records")

# Show the first three rows
cat_events_dict[:3]

[{'Id': 'A1',
  'Code': 'A1',
  'Name': 'Collisions',
  'Definition': 'A collision event falling within the A.1 sub-categories for which detailed information is not (yet) available.',
  'Broader': 'A'},
 {'Id': 'A1-1 ',
  'Code': 'A1.1 ',
  'Name': 'Collision of train with a train/rail vehicle',
  'Definition': 'A front to front; front to end or a side collision between a part of a train and a part of another train or rail vehicle; or with shunting rolling stock.',
  'Broader': 'A1'},
 {'Id': 'A1-2',
  'Code': 'A1.2',
  'Name': 'Collision of train with obstacle within the clearance gauge',
  'Definition': 'A collision between a part of a train and objects fixed or temporarily present on or near the track (except at level crossings if lost by a crossing vehicle or user); including collision with overhead contact lines.',
  'Broader': 'A1'}]

In [9]:
# Define function to split the event data into chunks
def split_events_into_chunks(data: list[dict], chunk_size: int = 2000, chunk_overlap: int = 300) -> list[list[str]]:
    """
    Formats each row as "AccidentType: <Name>; Definition: <Definition>" and splits
    that string into overlapping chunks when it is longer than chunk_size.

    Args:
    - data (list[dict]): Rows with at least the keys 'Name' and 'Definition'
    - chunk_size (int): Maximum chunk length passed to the splitter
    - chunk_overlap (int): Overlap between consecutive chunks passed to the splitter

    Returns:
    - list[list[str]]: One inner list of chunks per input row, in input order
    """
    def field_text(value) -> str:
        # Missing cells (None / NaN) become an empty string instead of the literal text "nan"
        return "" if pd.isna(value) else str(value)

    # Function to split a single text into smaller chunks
    def split_single_text(text: str) -> list[str]:
        if not text:
            print("[INFO] Warning: No text provided for splitting.")
            return []

        if len(text) <= chunk_size:
            return [text]  # If text is smaller than chunk size, return as a single chunk

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        return text_splitter.split_text(text)

    # Format every row, then split each formatted string on its own
    return [
        split_single_text(
            f"AccidentType: {field_text(row['Name'])}; Definition: {field_text(row['Definition'])}"
        )
        for row in data
    ]

# Build the chunk lists for every accident-category row
event_chunks = split_events_into_chunks(cat_events_dict)

# len(event_chunks) counts the outer list (one entry per input row), not the individual text chunks
print(f"[INFO] Total chunks: {len(event_chunks)}\n[INFO] First three chunks:\n{event_chunks[:3]}")

[INFO] Total chunks: 231
[INFO] First three chunks:
[['AccidentType: Collisions; Definition: A collision event falling within the A.1 sub-categories for which detailed information is not (yet) available.'], ['AccidentType: Collision of train with a train/rail vehicle; Definition: A front to front; front to end or a side collision between a part of a train and a part of another train or rail vehicle; or with shunting rolling stock.'], ['AccidentType: Collision of train with obstacle within the clearance gauge; Definition: A collision between a part of a train and objects fixed or temporarily present on or near the track (except at level crossings if lost by a crossing vehicle or user); including collision with overhead contact lines.']]


## 3.3 Contributing & Systemic Factors

In [10]:
# One dictionary per row (keyed by column name) for each factor table
contr_fact_dict = contr_fact.to_dict(orient="records")
sys_fact_dict = sys_fact.to_dict(orient="records")

In [11]:
# Define function to split the factors data into chunks
def split_factors_into_chunks(data: list[dict], chunk_size: int = 2000, chunk_overlap: int = 300) -> list[list[str]]:
    """
    Formats each row as "Factor: <Name>; Definition: <Definition>" and splits that
    string into overlapping chunks when it is longer than chunk_size.

    Args:
    - data (list[dict]): Rows with at least the keys 'Name' and 'Definition'
    - chunk_size (int): Maximum chunk length passed to the splitter
    - chunk_overlap (int): Overlap between consecutive chunks passed to the splitter

    Returns:
    - list[list[str]]: One inner list of chunks per input row, in input order
    """
    def field_text(value) -> str:
        # Missing cells (None / NaN) become an empty string instead of the literal text "nan"
        return "" if pd.isna(value) else str(value)

    # Function to split a single text into smaller chunks
    def split_single_text(text: str) -> list[str]:
        if not text:
            print("[INFO] Warning: No text provided for splitting.")
            return []

        if len(text) <= chunk_size:
            return [text]  # If text is smaller than chunk size, return as a single chunk

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        return text_splitter.split_text(text)

    # The "; Definition" marker is always emitted, even for an empty definition:
    # the factor-matching cells later in the notebook cut the factor name at that marker.
    return [
        split_single_text(
            f"Factor: {field_text(row['Name'])}; Definition: {field_text(row['Definition'])}"
        )
        for row in data
    ]

# Chunk the contributing-factor rows and the systemic-factor rows with the same function
contr_fact_chunks = split_factors_into_chunks(contr_fact_dict)
sys_fact_chunks = split_factors_into_chunks(sys_fact_dict)

# 4. Relevant Chunk Retrieval

In [12]:
# Embedding model ("all-mpnet-base-v2") shared by every FAISS vector store built in this notebook
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

## 4.1 Incident Reports

In [13]:
# Embed the report chunks and index them in a FAISS vector store
vectorstore_reports = FAISS.from_texts(report_chunks, embeddings)

In [14]:
# Entities of interest: key = entity label, value = the natural-language query
# sent to the report vector store to retrieve chunks for that entity
entity_queries = {
    "unique accident": "Description of the unique accident",
    "accident type": "Type of railway accident",
    "track section": "Where did the accident happen?",
    "date": "Date of the accident",
    "time": "Time when the accident occurred",
    "country": "Country where the accident took place",
    "regulatory body": "Which authority investigated the accident?",
    "contributing factor": "What were the contributing factors to the accident?",
    "systemic factor": "What were the systemic factors to the accident?",
}

# Entity labels only; inserted into the extraction prompt as `entities_of_interest`
entity_list = list(entity_queries.keys())

# Function for extracting most relevant chunks from vector store
def find_most_relevant_report_chunks(entities: dict[str, str], top_k: int) -> str:
    """
    Finds the most relevant text chunks for each entity of interest using FAISS similarity search.

    Reads the module-level `vectorstore_reports`. Chunks are de-duplicated in
    first-seen order, so the combined text is identical on every run; joining a
    `set` would make the chunk order depend on string hashing.

    Args:
    - entities (dict): Mapping of entity name → query string
    - top_k (int): Number of chunks to retrieve per entity

    Returns:
    - combined_text (str): Unique relevant chunks joined with newlines
    """
    # dict keys act as an insertion-ordered set
    unique_chunks: dict[str, None] = {}

    for entity, query in entities.items():
        print(f"[INFO] Searching for entity: {entity}")
        for doc in vectorstore_reports.similarity_search(query, k=top_k):
            unique_chunks.setdefault(doc.page_content, None)

    combined_text = "\n".join(unique_chunks)
    print(f"\n[INFO] Found {len(unique_chunks)} unique relevant chunks.")
    return combined_text

# Retrieve up to 3 chunks per entity query and combine the unique ones into one text
relevant_report_text = find_most_relevant_report_chunks(entity_queries, top_k=3)

# Show the combined text that is passed to the extraction prompt
print(f"\n[INFO] Most Relevant Chunks Combined:\n{relevant_report_text}")

[INFO] Searching for entity: unique accident
[INFO] Searching for entity: accident type
[INFO] Searching for entity: track section
[INFO] Searching for entity: date
[INFO] Searching for entity: time
[INFO] Searching for entity: country
[INFO] Searching for entity: regulatory body
[INFO] Searching for entity: contributing factor
[INFO] Searching for entity: systemic factor

[INFO] Found 3 unique relevant chunks.

[INFO] Most Relevant Chunks Combined:
At approximately 12:13 hour (hrs) on the 24th May 2020, a Level Crossing Control Operative
(LCCO), referred to as LCCO1 in this report, located at Athlone Local Control Centre (ALCC)
cleared Closed-Circuit Television (CCTV) Level Crossing (LC) XA068, located in Ashfield,
Offaly for the passage of the 11:00 hrs passenger service from Galway to Heuston (Train
A703) while a member of the public (MOP) was inside the barriers of the level crossing.
The MOP had requested assistance from LCCO1 using the telephone provided at the level
crossing. LC

## 4.2 Accident Categories & CoF/SF 

### 4.2.1 Common Function for Querying Accident & CoF/SF Vector Stores

In [15]:
# Function for extracting most relevant chunks from vector store (queried later when needed)
def find_most_relevant_iss_chunks(vectorstore: Any, query_input: str, top_k: int) -> str:
    """
    Retrieves the chunks most similar to the query from the given vector store.

    Args:
    - vectorstore: Object exposing `similarity_search(query, k=...)` whose results
      carry a `page_content` attribute (the FAISS stores built in this notebook)
    - query_input (str): Query text, typically taken from the model's extraction response
    - top_k (int): Number of chunks to retrieve

    Returns:
    - str: The retrieved chunk texts joined with newlines; "" when the query is
      None or blank (no search is performed in that case)
    """
    # A missing query would otherwise be stringified and searched as the literal "None"
    if query_input is None or not str(query_input).strip():
        print("[INFO] Warning: Empty query provided; skipping similarity search.")
        return ""

    query = str(query_input)

    # Perform the similarity search
    matched_docs = vectorstore.similarity_search(query, k=top_k)

    # Join the text of each retrieved document into a single string
    return "\n".join(doc.page_content for doc in matched_docs)

In [16]:
# Collapse the per-row chunk lists into one flat list of strings for FAISS
flat_event_chunks = []
for row_chunks in event_chunks:
    flat_event_chunks.extend(row_chunks)

# Embed the flat list and index it in a FAISS vector store
vectorstore_categories = FAISS.from_texts(flat_event_chunks, embeddings)

In [17]:
# Collapse the per-row chunk lists of both factor tables into flat lists for FAISS
flat_contr_fact_chunks = []
for row_chunks in contr_fact_chunks:
    flat_contr_fact_chunks.extend(row_chunks)

flat_sys_fact_chunks = []
for row_chunks in sys_fact_chunks:
    flat_sys_fact_chunks.extend(row_chunks)

# One FAISS vector store per factor table
vectorstore_contr_fact = FAISS.from_texts(flat_contr_fact_chunks, embeddings)
vectorstore_sys_fact = FAISS.from_texts(flat_sys_fact_chunks, embeddings)

# 5. Instantiating Language Models

## 5.1 Instantiating LLMs

In [18]:
# Set the API key and model type ("gpt-4o-mini" or "gemini-2.5-flash-preview-04-17")
model_type = "gemini-2.5-flash-preview-04-17"

# The provider is chosen from the model name prefix. A key is only prompted for when the
# environment variable is missing or empty, so it never has to be written into the notebook.
if model_type.startswith("gpt"):
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

    chat_model = init_chat_model(model_type, model_provider="openai")
elif model_type.startswith("gemini"):
    if not os.environ.get("GOOGLE_API_KEY"):
        os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI Studio API Key: ")

    chat_model = ChatGoogleGenerativeAI(model=model_type, google_api_key=os.environ["GOOGLE_API_KEY"])
else:
    # Fail here instead of leaving chat_model undefined for the cells below
    raise ValueError(
        f"Unsupported model_type {model_type!r}: expected a name starting with 'gpt' or 'gemini'."
    )

### 5.1.1 Token Count

In [19]:
# Token counter, defined only when a GPT model is selected
if model_type.startswith("gpt"):
    def count_tokens(text: str, model: str = model_type) -> int:
        """Return how many tokens `text` encodes to with the tiktoken encoding for `model`."""
        return len(tiktoken.encoding_for_model(model).encode(text))

### 5.1.2 Memory Definition for Chat History

In [20]:
# Conversation memory handed to the LLM chains; its content is exposed under the
# "chat_history" key, the variable name the prompts' MessagesPlaceholder uses.
# return_messages=True presumably keeps the history as message objects rather than one string — TODO confirm
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


### 5.1.3 Schema & Prompt Template Definitions

In [21]:
# Define the JSON schema example for few-shot learning.
# The example must itself be valid JSON (no trailing commas), and every `source`/`target`
# in `rels` must be the `id` of a node listed in `nodes`, because the prompt asks the model
# to follow this example closely and its answer is parsed with json.loads.
schema_example = """{
    "nodes": [
        {"id": "Dublin-Cork Accident", "type": "UniqueAccident"},
        {"id": "Level Crossing Accident involving a train", "type": "AccidentType"},
        {"id": "105 MP-108 MP", "type": "TrackSection"},
        {"id": "23/12/2021", "type": "Date"},
        {"id": "16:32", "type": "Time"},
        {"id": "Ireland", "type": "Country"},
        {"id": "Complexity", "type": "ContributingFactor"},
        {"id": "Environment", "type": "ContributingFactor"},
        {"id": "Fatigue", "type": "ContributingFactor"},
        {"id": "Leadership and commitment", "type": "SystemicFactor"},
        {"id": "Safety objectives and planning", "type": "SystemicFactor"},
        {"id": "Information and communication", "type": "SystemicFactor"},
        {"id": "European Rail Agency", "type": "RegulatoryBody"}
    ],
    "rels": [
        {"source": "Dublin-Cork Accident", "target": "Ireland", "type": "occurred_in"},
        {"source": "Dublin-Cork Accident", "target": "Level Crossing Accident involving a train", "type": "is_type"},
        {"source": "Dublin-Cork Accident", "target": "105 MP-108 MP", "type": "is_track_section"},
        {"source": "Dublin-Cork Accident", "target": "European Rail Agency", "type": "investigated_by"},
        {"source": "Dublin-Cork Accident", "target": "23/12/2021", "type": "has_date"},
        {"source": "Dublin-Cork Accident", "target": "16:32", "type": "has_time"},
        {"source": "Dublin-Cork Accident", "target": "Complexity", "type": "contributing_factor"},
        {"source": "Dublin-Cork Accident", "target": "Environment", "type": "contributing_factor"},
        {"source": "Dublin-Cork Accident", "target": "Fatigue", "type": "contributing_factor"},
        {"source": "Dublin-Cork Accident", "target": "Leadership and commitment", "type": "systemic_factor"},
        {"source": "Dublin-Cork Accident", "target": "Safety objectives and planning", "type": "systemic_factor"},
        {"source": "Dublin-Cork Accident", "target": "Information and communication", "type": "systemic_factor"}
    ]
}"""

# Define the chat prompt for the first (extraction) request.
# Message order: system instruction, prior chat history (filled from `memory`), then the
# human request containing the guidelines, the schema example and the report context.
extraction_chat_prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template("You are an expert in analyzing railway accident reports. Follow the JSON schema provided."),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(
        """
        Analyze the following railway accident report context and extract structured knowledge in JSON format.

        Return a JSON object with:
        - `nodes`: A list of entities, specifically {entities_of_interest}.
        - `rels`: A list of relationships linking entities.

        Guidelines:
        - Look at the JSON schema example response and follow it closely. 
        - Ensure that the `source` and `target` nodes in `rels` are the SAME entities from the `nodes` list, and NOT different ones. 
        - Make sure to map ALL nodes with other important entities, e.g., (node UniqueAccident has_date node Date, node UniqueAccident occurred_in node Country).
        - Do NOT map entities like (node Date is_date to node Time) or (node AccidentType is_type to node Country). This is INCORRECT.
        - The `type` field in `rels` should be a verb phrase (e.g., "occurred_in", "investigated_by").
        - The `id` field in `nodes` should be the exact text of the entity, not a description or a summary.
        - Pay attention to date and time formats (e.g., EU date format, 24-hour time).
        - The `UniqueAccident` entity should be a unique identifier for the accident.
        - The `AccidentType` entity should be the type of accident.
        - The `TrackSection` entity should be the track section where the accident occurred.
        - The `Country` entity should be the country where the accident occurred.
        - The `ContributingFactor` entity should be a factor contributing to the accident.
        - The `SystemicFactor` entity should be a systemic factor contributing to the accident.
        - The `RegulatoryBody` entity should be the regulatory body that investigated the accident.

        Schema example:
        {schema_example}

        Accident report context:
        {relevant_report_text}

        JSON:
        """
        )
    ],
    # Only "relevant_report_text" is dynamic; the others are provided as constants.
    input_variables=["relevant_report_text"],
    # Bound once here; the refinement prompt passes its constants the same way.
    partial_variables={
        "entities_of_interest": entity_list,
        "schema_example": schema_example
    }
)

# Chain for the extraction request; `memory` records the exchange so that the
# refinement request can see it as chat history.
extraction_chain = LLMChain(
    llm=chat_model,
    prompt=extraction_chat_prompt,
    memory=memory
)

  extraction_chain = LLMChain(


### 5.1.4 Query First Prompt

In [22]:
# Convert the extraction prompt to a string for token counting
extraction_prompt = extraction_chat_prompt.format(
    relevant_report_text=relevant_report_text,
    chat_history=[]
)

# Models without a token counter are sent straight away; GPT models pass a
# token-limit check and a user confirmation first.
proceed = "yes"

if model_type.startswith("gpt"):
    token_limit = 4000
    token_count = count_tokens(extraction_prompt)
    estimated_cost = token_count * 0.00000015  # Approximate cost calculation

    print(f"Estimated cost: ${estimated_cost:.5f}")
    print(f"Token count for prompt: {token_count}")

    if token_count > token_limit:
        print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
        proceed = "no"
    else:
        proceed = input("Do you want to proceed with information extraction? (yes/no): ").strip().lower()

# The request is issued for every model type once `proceed` is "yes".
# Nesting it under the non-GPT branch would skip it for a confirmed GPT run.
if proceed == "yes":
    print("Sending request to model for extraction...\n")
    extraction_result_raw = extraction_chain.invoke({"relevant_report_text": relevant_report_text})["text"]
    print("Extraction Output:\n")
    print(extraction_result_raw)
else:
    print("Extraction aborted by user.")

Sending request to model for extraction...

Extraction Output:

```json
{
    "nodes": [
        {
            "id": "MOP trapped at LC XA068 Ashfield",
            "type": "UniqueAccident"
        },
        {
            "id": "Level Crossing Incident involving trapped person",
            "type": "AccidentType"
        },
        {
            "id": "LC XA068 Ashfield",
            "type": "TrackSection"
        },
        {
            "id": "24/05/2020",
            "type": "Date"
        },
        {
            "id": "12:13",
            "type": "Time"
        },
        {
            "id": "Ireland",
            "type": "Country"
        },
        {
            "id": "Railway Accident Investigation Unit",
            "type": "RegulatoryBody"
        },
        {
            "id": "Alternative actions permitted by Mid-Section CCTV Crossing functions",
            "type": "ContributingFactor"
        }
    ],
    "rels": [
        {
            "source": "MOP trapped at LC XA068

### 5.1.5 Extract Accident Type from GPT's First Prompt

In [23]:
# Additional pre-processing to extract the JSON from the response:
# strip surrounding whitespace first so that the closing ``` fence is found even when the
# model appends blank lines, then drop an opening ``` / ```json fence and the closing one.
extraction_json = extraction_result_raw.strip()
extraction_json = re.sub(r'^```(?:json)?\s*|\s*```$', '', extraction_json, flags=re.IGNORECASE)
extraction_json = json.loads(extraction_json)

# Collect the model's AccidentType and factor predictions; the factor dicts map
# "<model_type>: <predicted factor>" to an (initially empty) standardized factor.
accident_type = None
true_contr_fact = dict()
true_sys_fact = dict()

for node in extraction_json['nodes']:
    node_type = node.get('type')
    if node_type == 'AccidentType':
        accident_type = node['id']
    elif node_type == 'ContributingFactor':
        true_contr_fact[f"{model_type}: {node['id']}"] = ''
    elif node_type == 'SystemicFactor':
        true_sys_fact[f"{model_type}: {node['id']}"] = ''

# By using the initial AccidentType predicted by the model, we can query our vector store and identify similar accident categories (which are standardized)
if accident_type is None:
    # Without a prediction there is nothing meaningful to search for
    print("[INFO] Warning: No AccidentType node found in the extraction result.")
    relevant_events_text = ""
else:
    relevant_events_text = find_most_relevant_iss_chunks(vectorstore_categories, accident_type, top_k=3)

print(f"Accident type:\n\n{accident_type}")
print(f"\nRelevant categories in vector store:\n\n{relevant_events_text}")

Accident type:

Level Crossing Incident involving trapped person

Relevant categories in vector store:

AccidentType: Level Crossing Accident; Definition:  A level crossing accident falling within the A.3 sub-categories for which detailed information is not (yet) available.
AccidentType: Level Crossing Accident involving a train and pedestrians; Definition: Any accident at level crossings involving at least one train and crossing pedestrians. Note: pedestrians also include people crossing with bicycles (or other similar small vehicles normally allowed on pedestrian/bike paths).
AccidentType: Level Crossing Accident involving a train; Definition:  A level crossing accident involving a train and falling within the A.3.1 sub-categories for which detailed information is not (yet) available.


In [24]:
# Replace each predicted contributing factor with the closest standardized factor name
for predicted_factor in list(true_contr_fact):
    # NOTE(review): the key still carries the "<model_type>: " prefix, so the model name is
    # part of the similarity query — confirm whether that is intended.
    best_match = find_most_relevant_iss_chunks(vectorstore_contr_fact, predicted_factor, top_k=1)

    # Keep only the factor name: cut at "; Definition" and drop the "Factor:" label.
    # partition() returns the whole text when the marker is absent, so a chunk
    # without a definition part no longer raises AttributeError.
    factor_name = best_match.partition("; Definition")[0]
    factor_name = re.sub(r'^Factor:\s*', '', factor_name)

    true_contr_fact[predicted_factor] = factor_name

true_contr_fact

{'gemini-2.5-flash-preview-04-17: Alternative actions permitted by Mid-Section CCTV Crossing functions': 'Awareness'}

In [25]:
# Replace each predicted systemic factor with the closest standardized factor name
for predicted_factor in list(true_sys_fact):
    # NOTE(review): the key still carries the "<model_type>: " prefix, so the model name is
    # part of the similarity query — confirm whether that is intended.
    best_match = find_most_relevant_iss_chunks(vectorstore_sys_fact, predicted_factor, top_k=1)

    # Keep only the factor name: cut at "; Definition" and drop the "Factor:" label.
    # partition() returns the whole text when the marker is absent, so a chunk
    # without a definition part no longer raises AttributeError.
    factor_name = best_match.partition("; Definition")[0]
    factor_name = re.sub(r'^Factor:\s*', '', factor_name)

    true_sys_fact[predicted_factor] = factor_name

true_sys_fact

{}

### 5.1.6 Query Second (Refinement) Prompt

In [26]:
# Define the chat prompt for the second (refinement) request.
# Message order: system instruction, prior chat history (filled from `memory`), then the
# human request listing the candidate accident categories and the factor replacements.
refinement_chat_prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template("You are an expert in refining railway accident report extractions."),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(
            """
            Now, you need to adapt three key elements of the extracted JSON: AccidentType, ContributingFactors, and SystemicFactors.

            First, for the AccidentType, choose ONLY one relevant accident category from the three options below. Copy the AccidentType word for word.
            
            AccidentType:
            {relevant_events_text}

            Second, for ContributingFactor, the key in the dictionary is your previous prediction from the JSON (GPT), and the value is the correct factor you need to replace the value in the JSON with. 
            Copy the factor exactly WORD FOR WORD into the JSON accordingly and replace the ContributingFactor ONLY (both in `nodes` and `rels`).
            
            ContributingFactor:
            {true_contr_fact}

            Third, for SystemicFactor, do the same - replace the SystemicFactor ONLY with the correct factor below.

            SystemicFactor:
            {true_sys_fact}

            Refine your previous answer and update the JSON with the correct AccidentType, ContributingFactors, and SystemicFactors. Here is your previous guess:
            {extraction_result}

            Provide only the updated JSON with the required changes, without any additional comments or text. If the factors repeat, then keep only one instance of each factor:
            """
        )
    ],
    # Only "extraction_result" is supplied per call
    input_variables=["extraction_result"],
    # Constants taken from the vector-store lookups done in the preceding cells
    partial_variables={
        "relevant_events_text": relevant_events_text,
        "true_contr_fact": true_contr_fact,
        "true_sys_fact": true_sys_fact
    }
)

# Chain for the refinement request; it uses the same `memory` object as the extraction chain
refinement_chain = LLMChain(
    llm=chat_model,
    prompt=refinement_chat_prompt,
    memory=memory
)

In [27]:
# Render the refinement prompt to a single string so its tokens can be counted.
refinement_prompt = refinement_chat_prompt.format(
    extraction_result=extraction_result_raw,
    chat_history=memory.chat_memory.messages
)

# Models that are not GPT skip the token/cost gate and proceed automatically.
proceed = "yes"

if model_type.startswith("gpt"):
    token_limit = 5000
    token_count = count_tokens(refinement_prompt)
    # NOTE(review): `estimated_cost` is not computed in this cell, so the value printed
    # here is whatever an earlier cell left behind — confirm it reflects this prompt.
    print(f"Estimated cost: ${estimated_cost:.5f}")
    print(f"Token count for prompt: {token_count}")

    if token_count > token_limit:
        print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
        proceed = "no"
    else:
        proceed = input("Do you want to proceed with information extraction? (yes/no): ").strip().lower()

# The request is sent for every model type once the gate has been passed. Previously the
# send branch was the `else` of the GPT check, so GPT runs never reached it even after "yes".
if proceed == "yes":
    print("Sending request to model for extraction...\n")
    refinement_result_raw = refinement_chain.invoke({"extraction_result": extraction_result_raw})["text"]
    print("Refinement Output:\n")
    print(refinement_result_raw)
else:
    print("Extraction aborted by user.")

Sending request to model for extraction...

Refinement Output:

```json
{
    "nodes": [
        {
            "id": "MOP trapped at LC XA068 Ashfield",
            "type": "UniqueAccident"
        },
        {
            "id": "Level Crossing Accident involving a train and pedestrians",
            "type": "AccidentType"
        },
        {
            "id": "LC XA068 Ashfield",
            "type": "TrackSection"
        },
        {
            "id": "24/05/2020",
            "type": "Date"
        },
        {
            "id": "12:13",
            "type": "Time"
        },
        {
            "id": "Ireland",
            "type": "Country"
        },
        {
            "id": "Railway Accident Investigation Unit",
            "type": "RegulatoryBody"
        },
        {
            "id": "Awareness",
            "type": "ContributingFactor"
        }
    ],
    "rels": [
        {
            "source": "MOP trapped at LC XA068 Ashfield",
            "target": "Ireland",
     

In [28]:
# Clean up the final response and parse JSON.
# Surrounding whitespace is stripped first so the anchors see the fence itself; the opening
# fence may be "```json" (any case) or a bare "```", and the closing fence is optional.
fence_pattern = r'^```(?:json)?\s*|\s*```$'
final_response_str = re.sub(fence_pattern, '', refinement_result_raw.strip(), flags=re.IGNORECASE).strip()
final_response = json.loads(final_response_str)
print("\nFinal Refined JSON Response:")
print(json.dumps(final_response, ensure_ascii=False, indent=2))


Final Refined JSON Response:
{
  "nodes": [
    {
      "id": "MOP trapped at LC XA068 Ashfield",
      "type": "UniqueAccident"
    },
    {
      "id": "Level Crossing Accident involving a train and pedestrians",
      "type": "AccidentType"
    },
    {
      "id": "LC XA068 Ashfield",
      "type": "TrackSection"
    },
    {
      "id": "24/05/2020",
      "type": "Date"
    },
    {
      "id": "12:13",
      "type": "Time"
    },
    {
      "id": "Ireland",
      "type": "Country"
    },
    {
      "id": "Railway Accident Investigation Unit",
      "type": "RegulatoryBody"
    },
    {
      "id": "Awareness",
      "type": "ContributingFactor"
    }
  ],
  "rels": [
    {
      "source": "MOP trapped at LC XA068 Ashfield",
      "target": "Ireland",
      "type": "occurred_in"
    },
    {
      "source": "MOP trapped at LC XA068 Ashfield",
      "target": "Level Crossing Accident involving a train and pedestrians",
      "type": "is_type"
    },
    {
      "source": "MOP t

### 5.1.7 Validating the JSON Structure

In [29]:
# Define Pydantic models that match your JSON output structure.
# One entry of the top-level "nodes" list: an identifier and a type label, both strings.
class NodeModel(BaseModel):
    id: str
    type: str

# One entry of the top-level "rels" list: source and target identifiers plus the
# relationship type, all strings.
class RelationshipModel(BaseModel):
    source: str
    target: str
    type: str

# The whole extraction result: a list of nodes and a list of relationships.
class KnowledgeGraphModel(BaseModel):
    nodes: List[NodeModel]
    rels: List[RelationshipModel]

# Validate the parsed refinement output (`final_response`, a Python dictionary) against
# the schema. On failure the validation error is printed and `kg` is not assigned.
try:
    kg = KnowledgeGraphModel.model_validate(final_response)
    print("Valid JSON in KG structure!")
except ValidationError as e:
    print("Validation error:", e)

Valid JSON in KG structure!


In [30]:
# Assign response to model for further analysis: pair the parsed JSON with the name of
# the model that produced it.
response_dict = {"model": model_type, "response": final_response}

# 6. Cross-model and Iteration Comparison

## 6.1 Overview of results

In [31]:
# Define CSV storage file
CSV_FILE = "pdf_processing_results.csv"

# Define function to append the JSON output of response_json function to a DataFrame
def append_pdf_json_result(pdf_name: str, response_json: dict, model_name: Optional[str] = None) -> pd.DataFrame:
    """
    Record one extraction result in the results CSV and return the full results table.

    - Results are stored one per row; a PDF processed again gets a **new row** with the
      next iteration number for that PDF.
    - If an identical JSON output is already stored for the same PDF, nothing is written.
    - The whole CSV is read and rewritten on every call (it is not opened in append mode).

    Args:
        pdf_name (str): Name of the processed PDF file.
        response_json (dict): JSON response from the knowledge extraction process.
        model_name (Optional[str]): Model label stored in the ``model_type`` column.
            Defaults to the notebook-level ``model_type`` variable when omitted.

    Returns:
        pd.DataFrame: Updated DataFrame with the new result (or the unchanged table when
        the entry was a duplicate).
    """
    result_columns = ["pdf_name", "model_type", "iteration_number", "json_output"]

    # Convert JSON response to a formatted string for easy comparison
    json_output = json.dumps(response_json, indent=2)

    # Load existing results if the CSV exists
    path = "./data/" + CSV_FILE
    if os.path.exists(path):
        df = pd.read_csv(path, dtype={"iteration_number": int})
    else:
        # Create an empty DataFrame with the correct schema
        df = pd.DataFrame(columns=result_columns)

    # Filter for the current PDF's past records
    pdf_history = df[df["pdf_name"] == pdf_name]

    # Check for duplicates: If this JSON output already exists for the same PDF, skip re-adding
    if not pdf_history.empty and json_output in pdf_history["json_output"].values:
        print(f"[INFO] No changes detected in JSON for {pdf_name}, skipping new entry.")
        return df  # Exit early if it's a duplicate

    # Determine new iteration number (plain int so the column keeps an integer dtype)
    iteration_number = int(pdf_history["iteration_number"].max()) + 1 if not pdf_history.empty else 1

    # Resolve the model label only when a row is actually written; the global is the
    # backward-compatible default for existing two-argument calls.
    model_label = model_type if model_name is None else model_name

    # Append new result
    new_entry = pd.DataFrame({
        "pdf_name": [pdf_name],
        "model_type": [model_label],
        "iteration_number": [iteration_number],
        "json_output": [json_output],
    })
    # Concatenating onto an empty frame degrades dtypes and warns in recent pandas,
    # so the first entry simply becomes the table.
    df = new_entry if df.empty else pd.concat([df, new_entry], ignore_index=True)

    # Make sure the target folder exists, then rewrite the full CSV
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)

    print(f"[INFO] Successfully added {pdf_name} - Iteration {iteration_number} to results!")
    return df

# Execute function: store the current response for `pdf_name` and keep the returned
# results table for the cells below.
results_df = append_pdf_json_result(pdf_name, response_dict["response"])

[INFO] Successfully added IE-6291-200524 LC XA068 Ashfield.pdf - Iteration 2 to results!


In [32]:
# View DF: the results table returned by append_pdf_json_result
results_df

Unnamed: 0,pdf_name,model_type,iteration_number,json_output
0,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Accident ..."
1,IE-10397 - 211207 Clontarf.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""CaF-05 In..."
2,IE-10404 - 230222 Broken Rail Emly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Dublin-Co..."
3,IE-200608 BnM Collision LC Offaly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilcolgan..."
4,IE-6218-200111 Collision RRME Rosslare.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Train A60..."
5,IE-6262-200429 LC Collision XM240.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilnageer..."
6,IE-6291-200524 LC XA068 Ashfield.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Level Cro..."
7,IE-6305 - 200707_locomotive_224.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Locomotiv..."
8,IE-6305 - 200707_locomotive_224.pdf,gemini-2.5-flash-preview-04-17,2,"{\n ""nodes"": [\n {\n ""id"": ""Locomotiv..."
9,IE-6291-200524 LC XA068 Ashfield.pdf,gemini-2.5-flash-preview-04-17,2,"{\n ""nodes"": [\n {\n ""id"": ""MOP trapp..."


## 6.2 Mapping entities to graph nodes and rels

In [33]:
# Define classes for the entities extraction
# Key/value pair; both fields are required strings.
class Property(BaseModel):
    """A single property consisting of key and value."""
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

# Extends the imported BaseNode, declaring `properties` as an optional list of
# Property objects (defaults to None).
class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extends the imported BaseRelationship in the same way: `properties` is an optional
# list of Property objects (defaults to None).
class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Container holding the required lists of Node and Relationship objects.
class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [34]:
# Defining functions for mapping extracted entities to graph nodes and relationships
def props_to_dict(props) -> dict:
    """Converts properties to a dictionary for graph storage.

    Args:
        props: Iterable of properties, or None. Each item may be a dict with "key" and
            "value" entries, or an object exposing ``key`` and ``value`` attributes
            (such as a ``Property`` model instance, which is not subscriptable).

    Returns:
        dict: Mapping of property key to property value; empty when ``props`` is falsy.
        A later item overwrites an earlier one that has the same key.
    """
    properties = {}
    for p in props or []:
        if isinstance(p, dict):
            key, value = p["key"], p["value"]
        else:
            key, value = p.key, p.value
        properties[key] = value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Build a BaseNode from an extracted node.

    The id is carried over unchanged and also stored as the single "name" property;
    any properties on the incoming node are not copied.
    """
    # NOTE(review): str.capitalize() lower-cases everything after the first character,
    # so a type such as "AccidentType" becomes "Accidenttype" — confirm this is intended.
    label = node.type.capitalize()
    return BaseNode(id=node.id, type=label, properties={"name": node.id})

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Build a BaseRelationship from an extracted relationship.

    Both endpoints are converted with map_to_base_node; the relationship type is kept
    as given and its properties (if any) are flattened into a dictionary.
    """
    edge_properties = {}
    if rel.properties:
        edge_properties = props_to_dict(rel.properties)

    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=edge_properties,
    )

# 7. Neo4j Storage

## 7.1 Instantiating Neo4j

In [None]:
# Neo4j Connection Setup
# Settings are read from environment variables so that credentials are not stored in the
# notebook. URI, user and database fall back to the local defaults; the password is
# prompted for when NEO4J_PASSWORD is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the same database the other cells use
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

In [None]:
# Clear database
def clear_neo4j_database():
    """Remove every node, along with its relationships, from the configured Neo4j database."""
    wipe_query = "MATCH (n) DETACH DELETE n"
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(wipe_query)
    print("Neo4j database cleared successfully.")

# Run the function to clear the database (this deletes all existing graph data)
clear_neo4j_database()

In [None]:
# Show the stored results so a (pdf_name, iteration_number) pair can be picked below
results_df

## 7.2 Extracting the JSON from DF row

In [None]:
def get_json_output(df, pdf_name, iteration_number):
    """
    Look up the stored 'json_output' string for one PDF and iteration and parse it.

    When several rows match, the first one is used. When none match, a notice is
    printed and an empty dict is returned.
    """
    same_pdf = df["pdf_name"] == pdf_name
    same_iteration = df["iteration_number"] == iteration_number
    matching_outputs = df.loc[same_pdf & same_iteration, "json_output"]

    if len(matching_outputs) == 0:
        print("No match found.")
        return {}

    return json.loads(matching_outputs.iloc[0])

# Choose the JSON to convert to graph: select one stored result by PDF name and
# iteration number, then pretty-print it ({} is printed when no row matches).
pdf_of_choice = "IE-6305 - 200707_locomotive_224.pdf"
json_to_convert = get_json_output(results_df, pdf_of_choice, iteration_number=1)
print(json.dumps(json_to_convert, ensure_ascii=False, indent=2))

## 7.3 Converting the JSON to graph 

In [None]:
def convert_json_to_graph(json_to_convert, source_text):
    """
    Turn an extraction result ({"nodes": [...], "rels": [...]}) into a GraphDocument.

    Every node becomes a graph node. Every relationship endpoint is given the type of
    the first listed node with the same id, or "Unknown" when no node has that id.
    `source_text` is attached as the document source. Returns None (after printing a
    notice) when `json_to_convert` is empty.
    """
    if not json_to_convert:
        print("No valid data to convert to a graph.")
        return None

    raw_nodes = json_to_convert["nodes"]

    # Convert Nodes
    graph_nodes = [
        map_to_base_node(Node(id=entry["id"], type=entry["type"]))
        for entry in raw_nodes
    ]

    # id -> type lookup; setdefault keeps the first occurrence of a repeated id
    type_by_id = {}
    for entry in raw_nodes:
        type_by_id.setdefault(entry["id"], entry["type"])

    def endpoint(node_id):
        """Build the Node for a relationship endpoint, falling back to type "Unknown"."""
        return Node(id=node_id, type=type_by_id.get(node_id, "Unknown"))

    # Convert Relationships (Ensure correct types)
    graph_rels = []
    for rel in json_to_convert["rels"]:
        source_node = endpoint(rel["source"])
        target_node = endpoint(rel["target"])
        edge = Relationship(source=source_node, target=target_node, type=rel["type"])
        graph_rels.append(map_to_base_relationship(edge))

    return GraphDocument(
        nodes=graph_nodes,
        relationships=graph_rels,
        source=Document(page_content=source_text),
    )

## 7.4 Storing the graph in Neo4j

In [None]:
def _quote_cypher_identifier(name) -> str:
    """Backtick-quote a label or relationship type so Cypher reads it as one identifier."""
    return "`" + str(name).replace("`", "``") + "`"


def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j with dynamic labels.

    Each node is merged on (label, id) and gets `name` set to its id when first created.
    Each relationship is merged between the nodes whose `id` matches its source and
    target; the relationship type is upper-cased.

    Labels and relationship types come from model output and cannot be passed as query
    parameters, so they are backtick-quoted before being placed in the query text. This
    keeps types containing spaces or punctuation valid and prevents them from altering
    the query.

    Args:
        graph_document: Object exposing `nodes` (items with `id` and `type`) and
            `relationships` (items with `source.id`, `target.id` and `type`).
    """
    # Use the configured database, consistent with clear_neo4j_database
    with driver.session(database=NEO4J_DATABASE) as session:
        # Store nodes with dynamic labels
        for node in graph_document.nodes:
            label = _quote_cypher_identifier(node.type)
            session.run(f"""
                MERGE (n:{label} {{id: $id}})
                ON CREATE SET n.name = $name
            """, id=node.id, name=node.id)

        # Store relationships
        for rel in graph_document.relationships:
            rel_type = _quote_cypher_identifier(rel.type.upper())
            session.run(f"""
                MATCH (s {{id: $source}})
                MATCH (t {{id: $target}})
                MERGE (s)-[r:{rel_type}]->(t)
            """, source=rel.source.id, target=rel.target.id)

In [None]:
def process_railway_accident_report(json_to_convert):
    """
    Build a graph document from the extraction JSON and write it to Neo4j.

    The notebook-level `relevant_report_text` is attached as the graph's source text.
    Nothing is stored when the conversion yields no graph document.
    """
    print("Converting JSON to graph format...")
    graph_document = convert_json_to_graph(json_to_convert, relevant_report_text)

    # Guard clause: conversion returned nothing, so there is nothing to store
    if not graph_document:
        return

    print("Graph structure created! Storing in Neo4j...")
    store_in_neo4j(graph_document)

In [None]:
# Store extracted entities into Neo4j.
# An empty selection (no matching row in the results table) is reported as such instead
# of being followed by a success message even though nothing was written.
db_result = None
try:
    if json_to_convert:
        db_result = process_railway_accident_report(json_to_convert)
        print("Data stored in Neo4j successfully.")
    else:
        print("No JSON selected for conversion; nothing was stored in Neo4j.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

In [None]:
# Close Neo4j connection (the `driver` object must be re-created before any further queries)
driver.close()

# 8. Comparison against ERAIL DB

## 8.1 Loading ERAIL DB

In [35]:
# Load the ERAIL DB from the Excel workbook and preview its first five rows
erail_db = pd.read_excel("./data/erail database.xlsx")
erail_db.head(5)

Unnamed: 0,Only received by email after ERAIL stopped to work,Report Type,Investigation Status,ERAIL Occurrence,Title,Reporting Body,Date of occurrence,Time of occurrence,Occurrence type,Occurrence description,...,Investigation report,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,,Final report,Closed,FI-135,"Wrong-side signalling failure, 29/06/2002, Kou...",Accident Investigation Board of Finland,2002-06-29,1900-01-01 08:33:00,Wrong-side signalling failure,Signal malfunctioning,...,,,,,,,,,,
1,,Final report,Closed,NL-444,"Train derailment, 30/04/2003, Station Apeldoor...",The Dutch Safety Board,2003-04-30,1900-01-01 00:42:00,Train derailment,Derailment of a freight train which was loaded...,...,,,,,,,,,,
2,,Final report,Closed,FI-134,"Other, 15.4.2004, Kaukomarkkinat Oy's track at...",Accident Investigation Board of Finland,2004-04-15,1900-01-01 17:31:00,Other,Three methanol carrying Russian tank wagons de...,...,,,,,,,,,,
3,,Final report,Closed,FI-45,"Train derailment, 5/11/2004, Pieksämäki Railwa...",Accident Investigation Board of Finland,2004-05-11,1900-01-01 12:57:00,Train derailment,Two wagons derailed,...,,,,,,,,,,
4,,Final report,Closed,NL-168,"Spad, 21/05/2004, Station Amsterdam (The Nethe...",The Dutch Safety Board,2004-05-21,1900-01-01 18:35:00,Spad,SPAD (an empty double decker train collided wi...,...,,,,,,,,,,


## 8.2 Pre-processing data

In [36]:
# Format the date column as DD/MM/YYYY strings.
# On a re-run the column already holds those strings, so it is parsed back with the same
# explicit format first; this keeps the cell re-runnable without swapping day and month.
date_col = erail_db["Date of occurrence"]
if not pd.api.types.is_datetime64_any_dtype(date_col):
    date_col = pd.to_datetime(date_col, format="%d/%m/%Y", errors="coerce")
erail_db["Date of occurrence"] = date_col.dt.strftime("%d/%m/%Y")

# Convert time column to datetime & adjust format (unparseable values become NaN)
erail_db["Time of occurrence"] = pd.to_datetime(erail_db["Time of occurrence"], errors='coerce')
erail_db["Time of occurrence"] = erail_db["Time of occurrence"].dt.strftime("%H:%M")

erail_db.head(5)

Unnamed: 0,Only received by email after ERAIL stopped to work,Report Type,Investigation Status,ERAIL Occurrence,Title,Reporting Body,Date of occurrence,Time of occurrence,Occurrence type,Occurrence description,...,Investigation report,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,,Final report,Closed,FI-135,"Wrong-side signalling failure, 29/06/2002, Kou...",Accident Investigation Board of Finland,29/06/2002,08:33,Wrong-side signalling failure,Signal malfunctioning,...,,,,,,,,,,
1,,Final report,Closed,NL-444,"Train derailment, 30/04/2003, Station Apeldoor...",The Dutch Safety Board,30/04/2003,00:42,Train derailment,Derailment of a freight train which was loaded...,...,,,,,,,,,,
2,,Final report,Closed,FI-134,"Other, 15.4.2004, Kaukomarkkinat Oy's track at...",Accident Investigation Board of Finland,15/04/2004,17:31,Other,Three methanol carrying Russian tank wagons de...,...,,,,,,,,,,
3,,Final report,Closed,FI-45,"Train derailment, 5/11/2004, Pieksämäki Railwa...",Accident Investigation Board of Finland,11/05/2004,12:57,Train derailment,Two wagons derailed,...,,,,,,,,,,
4,,Final report,Closed,NL-168,"Spad, 21/05/2004, Station Amsterdam (The Nethe...",The Dutch Safety Board,21/05/2004,18:35,Spad,SPAD (an empty double decker train collided wi...,...,,,,,,,,,,


## 8.3 Merging data for comparison

In [37]:
# Work on an independent copy holding only the columns needed for the comparison
comparison_df = results_df.loc[:, ["pdf_name", "model_type", "json_output"]].copy()

# Parse each stored JSON string into a dictionary
comparison_df["json_output"] = comparison_df["json_output"].map(json.loads)

# Pull the occurrence ID ("IE-" followed by digits) out of the PDF file name
comparison_df["ERAIL Occurrence"] = comparison_df["pdf_name"].str.extract(r'(IE-\d+)')

# Function to extract node data
def extract_nodes(json_data):
    """Return a Series with one "LLM_<type>" entry per node type found in `json_data`.

    The value is the node id. When several nodes share a type, the last one listed wins.
    A missing "nodes" key yields an empty Series.
    """
    id_by_type = {
        f"LLM_{node['type']}": node["id"]
        for node in json_data.get("nodes", [])
    }
    return pd.Series(id_by_type)

# Expand every parsed JSON into one LLM_<type> column per node type
nodes_df = comparison_df['json_output'].apply(extract_nodes)

# Replace the raw json_output column with the extracted node columns
comparison_df = pd.concat(
    [comparison_df.drop(columns=["json_output"]), nodes_df],
    axis=1,
)

# View comparison DataFrame
comparison_df

Unnamed: 0,pdf_name,model_type,ERAIL Occurrence,LLM_UniqueAccident,LLM_AccidentType,LLM_TrackSection,LLM_Date,LLM_Time,LLM_Country,LLM_RegulatoryBody,LLM_ContributingFactor,LLM_SystemicFactor
0,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,IE-10375,Accident 2023002,Collision with Work Crew,"Old Curragh Station, County Kildare",26/08/2021,23:00,Ireland,Rail Accident Investigation Unit,Reinforcement,Learning from accidents and incidents
1,IE-10397 - 211207 Clontarf.pdf,gpt-4o-mini,IE-10397,CaF-05 Incident,Signal passed at danger without passing a dang...,Clontarf Road Station,07/12/2021,15:59,Ireland,Iarnród Éireann Railway Undertaking,Reinforcement,Safety objectives and planning
2,IE-10404 - 230222 Broken Rail Emly.pdf,gpt-4o-mini,IE-10404,Dublin-Cork Mainline Incident,Broken Rail Incident,110 miles 355 yards,21/02/2023,07:56,Ireland,Railway Accident Investigation Unit,Performance relevant factor,Operational planning and control
3,IE-200608 BnM Collision LC Offaly.pdf,gpt-4o-mini,IE-200608,Kilcolgan Level Crossing Accident,Level Crossing Accident,Kilcolgan Level Crossing,08/06/2020,13:15,Ireland,Bord na Móna,Other contributing factors,Actions to address risks
4,IE-6218-200111 Collision RRME Rosslare.pdf,gpt-4o-mini,IE-6218,Train A602 Collision,Collision of train with obstacle within the cl...,Level Crossing,20/10/2023,10:52,Ireland,Railway Accident Investigation Unit,Instructions,"Organizational roles, responsibilities, accoun..."
5,IE-6262-200429 LC Collision XM240.pdf,gpt-4o-mini,IE-6262,Kilnageer Level Crossing Accident,Level Crossing Accident involving a train and ...,Kilnageer Level Crossing XM240,29/04/2020,13:40,Ireland,Road Safety Authority,Instructions,Other operation
6,IE-6291-200524 LC XA068 Ashfield.pdf,gpt-4o-mini,IE-6291,Level Crossing Incident at LC XA068,Level Crossing Accident involving a train and ...,LC XA068,24/05/2020,12:13,Ireland,Railway Accident Investigation Unit,Awareness,Actions to address risks
7,IE-6305 - 200707_locomotive_224.pdf,gpt-4o-mini,IE-6305,Locomotive 224 Incident,Weld failure,Cork Kent to Dublin Heuston service,06/07/2020,14:25,Ireland,Railway Accident Investigation Unit,Tools,Learning from accidents and incidents
8,IE-6305 - 200707_locomotive_224.pdf,gemini-2.5-flash-preview-04-17,IE-6305,Locomotive 224 Chassis Plate Fracture,Technical Failure of the rolling stock,near Limerick Junction,06/07/2020,,Ireland,Railway Accident Investigation Unit (RAIU),Performance relevant factor,Competence
9,IE-6291-200524 LC XA068 Ashfield.pdf,gemini-2.5-flash-preview-04-17,IE-6291,MOP trapped at LC XA068 Ashfield,Level Crossing Accident involving a train and ...,LC XA068 Ashfield,24/05/2020,12:13,Ireland,Railway Accident Investigation Unit,Awareness,


In [38]:
# Merge the comparison DataFrame with the ERAIL database on the occurrence ID.
# how='inner' keeps only IDs present in both frames, so extractions whose
# "ERAIL Occurrence" has no record in erail_db are dropped from merged_df.
merged_df = comparison_df.merge(erail_db, on="ERAIL Occurrence", how='inner')
merged_df

Unnamed: 0,pdf_name,model_type,ERAIL Occurrence,LLM_UniqueAccident,LLM_AccidentType,LLM_TrackSection,LLM_Date,LLM_Time,LLM_Country,LLM_RegulatoryBody,...,Investigation report,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,IE-10375,Accident 2023002,Collision with Work Crew,"Old Curragh Station, County Kildare",26/08/2021,23:00,Ireland,Rail Accident Investigation Unit,...,,,,,,,,,,
1,IE-10397 - 211207 Clontarf.pdf,gpt-4o-mini,IE-10397,CaF-05 Incident,Signal passed at danger without passing a dang...,Clontarf Road Station,07/12/2021,15:59,Ireland,Iarnród Éireann Railway Undertaking,...,,,,,,,,,,
2,IE-10404 - 230222 Broken Rail Emly.pdf,gpt-4o-mini,IE-10404,Dublin-Cork Mainline Incident,Broken Rail Incident,110 miles 355 yards,21/02/2023,07:56,Ireland,Railway Accident Investigation Unit,...,,,,,,,,,,
3,IE-6218-200111 Collision RRME Rosslare.pdf,gpt-4o-mini,IE-6218,Train A602 Collision,Collision of train with obstacle within the cl...,Level Crossing,20/10/2023,10:52,Ireland,Railway Accident Investigation Unit,...,,,,,,,,,,
4,IE-6262-200429 LC Collision XM240.pdf,gpt-4o-mini,IE-6262,Kilnageer Level Crossing Accident,Level Crossing Accident involving a train and ...,Kilnageer Level Crossing XM240,29/04/2020,13:40,Ireland,Road Safety Authority,...,,,,,,,,,,
5,IE-6291-200524 LC XA068 Ashfield.pdf,gpt-4o-mini,IE-6291,Level Crossing Incident at LC XA068,Level Crossing Accident involving a train and ...,LC XA068,24/05/2020,12:13,Ireland,Railway Accident Investigation Unit,...,,,,,,,,,,
6,IE-6305 - 200707_locomotive_224.pdf,gpt-4o-mini,IE-6305,Locomotive 224 Incident,Weld failure,Cork Kent to Dublin Heuston service,06/07/2020,14:25,Ireland,Railway Accident Investigation Unit,...,,,,,,,,,,
7,IE-6305 - 200707_locomotive_224.pdf,gemini-2.5-flash-preview-04-17,IE-6305,Locomotive 224 Chassis Plate Fracture,Technical Failure of the rolling stock,near Limerick Junction,06/07/2020,,Ireland,Railway Accident Investigation Unit (RAIU),...,,,,,,,,,,
8,IE-6291-200524 LC XA068 Ashfield.pdf,gemini-2.5-flash-preview-04-17,IE-6291,MOP trapped at LC XA068 Ashfield,Level Crossing Accident involving a train and ...,LC XA068 Ashfield,24/05/2020,12:13,Ireland,Railway Accident Investigation Unit,...,,,,,,,,,,


## 8.4 Comparison

In [39]:
# Label each row by whether the LLM-extracted date equals the ERAIL date (source of truth)
date_agrees = merged_df["LLM_Date"] == merged_df["Date of occurrence"]
merged_df["Date Match"] = np.where(date_agrees, "Match", "Mismatch")

date_view_columns = ["ERAIL Occurrence", "model_type", "LLM_Date", "Date of occurrence", "Date Match"]
merged_df[date_view_columns]

Unnamed: 0,ERAIL Occurrence,model_type,LLM_Date,Date of occurrence,Date Match
0,IE-10375,gpt-4o-mini,26/08/2021,27/08/2021,Mismatch
1,IE-10397,gpt-4o-mini,07/12/2021,07/12/2021,Match
2,IE-10404,gpt-4o-mini,21/02/2023,22/02/2023,Mismatch
3,IE-6218,gpt-4o-mini,20/10/2023,11/01/2020,Mismatch
4,IE-6262,gpt-4o-mini,29/04/2020,29/04/2020,Match
5,IE-6291,gpt-4o-mini,24/05/2020,24/05/2020,Match
6,IE-6305,gpt-4o-mini,06/07/2020,07/07/2020,Mismatch
7,IE-6305,gemini-2.5-flash-preview-04-17,06/07/2020,07/07/2020,Mismatch
8,IE-6291,gemini-2.5-flash-preview-04-17,24/05/2020,24/05/2020,Match


In [40]:
# Label each row by whether the LLM-extracted time equals the ERAIL time (source of truth).
# A missing value never compares equal, so rows lacking a time on either side are "Mismatch".
time_agrees = merged_df["LLM_Time"] == merged_df["Time of occurrence"]
merged_df["Time Match"] = np.where(time_agrees, "Match", "Mismatch")

time_view_columns = ["ERAIL Occurrence", "model_type", "LLM_Time", "Time of occurrence", "Time Match"]
merged_df[time_view_columns]

Unnamed: 0,ERAIL Occurrence,model_type,LLM_Time,Time of occurrence,Time Match
0,IE-10375,gpt-4o-mini,23:00,00:20,Mismatch
1,IE-10397,gpt-4o-mini,15:59,16:05,Mismatch
2,IE-10404,gpt-4o-mini,07:56,11:45,Mismatch
3,IE-6218,gpt-4o-mini,10:52,,Mismatch
4,IE-6262,gpt-4o-mini,13:40,,Mismatch
5,IE-6291,gpt-4o-mini,12:13,,Mismatch
6,IE-6305,gpt-4o-mini,14:25,,Mismatch
7,IE-6305,gemini-2.5-flash-preview-04-17,,,Mismatch
8,IE-6291,gemini-2.5-flash-preview-04-17,12:13,,Mismatch


In [41]:
# Label each row by whether the LLM-extracted country equals the ERAIL country (source of truth)
country_agrees = merged_df["LLM_Country"] == merged_df["Country"]
merged_df["Country Match"] = np.where(country_agrees, "Match", "Mismatch")

country_view_columns = ["ERAIL Occurrence", "model_type", "LLM_Country", "Country", "Country Match"]
merged_df[country_view_columns]

Unnamed: 0,ERAIL Occurrence,model_type,LLM_Country,Country,Country Match
0,IE-10375,gpt-4o-mini,Ireland,Ireland,Match
1,IE-10397,gpt-4o-mini,Ireland,Ireland,Match
2,IE-10404,gpt-4o-mini,Ireland,Ireland,Match
3,IE-6218,gpt-4o-mini,Ireland,Ireland,Match
4,IE-6262,gpt-4o-mini,Ireland,Ireland,Match
5,IE-6291,gpt-4o-mini,Ireland,Ireland,Match
6,IE-6305,gpt-4o-mini,Ireland,Ireland,Match
7,IE-6305,gemini-2.5-flash-preview-04-17,Ireland,Ireland,Match
8,IE-6291,gemini-2.5-flash-preview-04-17,Ireland,Ireland,Match
