# 1. Package Loading

In [1]:
# Standard Library
import json
import os
import re
import requests
import getpass
import tqdm

# Data Manipulation
import pandas as pd
import numpy as np

# PDFs
import pdfplumber

# LLMs
import tiktoken
import textwrap

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.graphs.graph_document import (
    GraphDocument,
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import LLMChain

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Typing & Validation
from pydantic import BaseModel, ValidationError, Field
from typing import Any, Dict, List, Optional

# 2. Raw Data Extraction

## 2.1 Incident Reports

In [2]:
# Define the directory containing PDFs
pdf_directory = "./reports_ie/"

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.lower().endswith(".pdf")]

count = 0
for file in pdf_files:
    count += 1
    print(f"{count}. {file}")

1. IE-10375 - 210827 Collision with track equipment.pdf
2. IE-10397 - 211207 Clontarf.pdf
3. IE-10404 - 230222 Broken Rail Emly.pdf
4. IE-200608 BnM Collision LC Offaly.pdf
5. IE-6218-200111 Collision RRME Rosslare.pdf
6. IE-6262-200429 LC Collision XM240.pdf
7. IE-6291-200524 LC XA068 Ashfield.pdf
8. IE-6305 - 200707_locomotive_224.pdf


In [163]:
# Specify file name to process
pdf_name = "IE-10375 - 210827 Collision with track equipment.pdf"

In [188]:
# Define regex patterns to identify the start of the summary and contents sections
SUMMARY_PATTERN = re.compile(r"^summary", re.IGNORECASE)
CONTENTS_PATTERN = re.compile(r"^contents", re.IGNORECASE)

# Define function to extract the summary section from Ireland reports
def extract_summary_section(pdf_path: str, header_lines: int = 1) -> str:
    """
    Extracts text from 'Summary' to 'Contents' in an Irish rail report PDF.
    If no summary section is found, returns the text from all pages.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    summary_text = ""
    full_text = ""
    capturing = False  

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text_lines = page_text.split("\n")
                text_without_header = text_lines[header_lines:]
                page_content = f"[Page {page.page_number}]\n" + "\n".join(text_without_header) + "\n\n"

                # Always append to full_text
                full_text += page_content

                if text_without_header:
                    first_line = text_without_header[0].strip().lower()

                    if SUMMARY_PATTERN.match(first_line):
                        capturing = True
                    
                    if CONTENTS_PATTERN.match(first_line) and capturing:
                        # Stop capturing when "Contents" is found after summary started.
                        break
                    
                    if capturing:
                        summary_text += page_content

    if not summary_text:
        print(f"Warning: No summary section found in {pdf_path}. Returning full text.")
        return full_text

    return summary_text

# Example usage:
pdf_text = extract_summary_section(f"./reports_ie/{pdf_name}", header_lines=1)
print(pdf_text[:500])

[Page 15]
Summary of the accident
22 On the evening of the 26th August 2021, at 23:00 hrs, a work detail incorporating three IÉ-
IM CCE staff, the ES, PIC, GO and eight contracted staff met for a safety briefing at a
works compound adjacent to the old Curragh Station.
23 The work scheduled by the CCE Infrastructure Department was to replace a defective 9
metre (m) section of rail. The work crew were briefed by the ES on their duties for the night
and given site safety information including that 


## 2.2 Accident Categories

In [5]:
# Load accident category events 
cat_a_events = pd.read_csv("./data/category-a-event-types-source.csv", encoding='latin-1')
cat_b_events = pd.read_csv("./data/category-b-event-types-source.csv", encoding='latin-1')
cat_c_events = pd.read_csv("./data/category-c-event-types-source.csv", encoding='latin-1')

# Merge the DataFrames into one
cat_events = pd.concat([cat_a_events, cat_b_events, cat_c_events], ignore_index=True)
cat_events

Unnamed: 0,Id,Code,Name,Definition,Broader
0,A1,A1,Collisions,A collision event falling within the A.1 sub-c...,A
1,A1-1,A1.1,Collision of train with a train/rail vehicle,A front to front; front to end or a side colli...,A1
2,A1-2,A1.2,Collision of train with obstacle within the cl...,A collision between a part of a train and obje...,A1
3,A1-3,A1.3,Collision of one or more rail vehicles with an...,Same as A1.1 but concerning more rail vehicles...,A1
4,A1-4,A1.4,Collision of one or more rail vehicles with ob...,Same as A1.2 but concerning one or more rail v...,A1
...,...,...,...,...,...
226,C-3-4,C.3.4,Arson,,C-3
227,C-3-5,C.3.5,Vandalism,,C-3
228,C-3-6,C.3.6,Cyber attack,,C-3
229,C-3-0,C.3.0,Other external events - Security,Any variation falling within the category ÔExt...,C-3


# 3. Langchain Chunk Splitting

## 3.1 Incident Reports

In [190]:
# Define function to split the incident report into chunks
def split_report_into_chunks(text: str, chunk_size: int = 2000, chunk_overlap: int = 300) -> list[str]:
    """
    Splits text into smaller overlapping chunks using LangChain's text splitter.
    """
    if not text:
        print("Warning: No text provided for splitting.")
        return []

    if len(text) <= chunk_size:
        return [text]  # If text is smaller than chunk size, return as single chunk

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)

# Split the extracted text
report_chunks = split_report_into_chunks(pdf_text)

# Print the number of chunks and a sample chunk
print(f"Total chunks: {len(report_chunks)}\nFirst chunk:\n{report_chunks[0]}")

Total chunks: 83
First chunk:
[Page 15]
Summary of the accident
22 On the evening of the 26th August 2021, at 23:00 hrs, a work detail incorporating three IÉ-
IM CCE staff, the ES, PIC, GO and eight contracted staff met for a safety briefing at a
works compound adjacent to the old Curragh Station.
23 The work scheduled by the CCE Infrastructure Department was to replace a defective 9
metre (m) section of rail. The work crew were briefed by the ES on their duties for the night
and given site safety information including that the work would be under an Absolute
Possession T3 Possession), to be referred to as T3 Possessions in this report.
24 After the briefing, the ES and GO, followed by the contractors, drove to the access point
close to the intended work site. They waited a few minutes until the ES confirmed the last
timetabled train, Train J219, passed the worksite, and then the ES stated that they were
“good-to-go”. The T3 Possession had not been granted at this stage i.e. the railwa

## 3.2 Accident Categories

In [7]:
# Convert the merged DataFrame to a list of rows in dictionary format
cat_events_dict = cat_events.to_dict("records")
cat_events_dict[:5]

[{'Id': 'A1',
  'Code': 'A1',
  'Name': 'Collisions',
  'Definition': 'A collision event falling within the A.1 sub-categories for which detailed information is not (yet) available.',
  'Broader': 'A'},
 {'Id': 'A1-1 ',
  'Code': 'A1.1 ',
  'Name': 'Collision of train with a train/rail vehicle',
  'Definition': 'A front to front; front to end or a side collision between a part of a train and a part of another train or rail vehicle; or with shunting rolling stock.',
  'Broader': 'A1'},
 {'Id': 'A1-2',
  'Code': 'A1.2',
  'Name': 'Collision of train with obstacle within the clearance gauge',
  'Definition': 'A collision between a part of a train and objects fixed or temporarily present on or near the track (except at level crossings if lost by a crossing vehicle or user); including collision with overhead contact lines.',
  'Broader': 'A1'},
 {'Id': 'A1-3 ',
  'Code': 'A1.3 ',
  'Name': 'Collision of one or more rail vehicles with another rail vehicle',
  'Definition': 'Same as A1.1 but 

In [8]:
# Define function to split the event data into chunks
def split_events_into_chunks(data: list[dict], chunk_size: int = 2000, chunk_overlap: int = 300) -> list[list[str]]:
    """
    Converts each dictionary row into a string and splits each string into smaller overlapping chunks.
    """
    # Prepare the list of formatted strings (chunks) from the dictionary rows
    chunk_list = [
        # f"Id: {row['Id']} Code: {row['Code']} Name: {row['Name']} Definition: {row['Definition']} Broader: {row['Broader']}"
        f"Accident Type: {row['Name']}"
        for row in data
    ]

    # Function to split a single text into smaller chunks
    def split_single_text(text: str) -> list[str]:
        if not text:
            print("Warning: No text provided for splitting.")
            return []

        if len(text) <= chunk_size:
            return [text]  # If text is smaller than chunk size, return as a single chunk

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap
        )
        return text_splitter.split_text(text)

    # Split each chunk string from the dictionary rows into smaller chunks
    all_chunks = [split_single_text(chunk) for chunk in chunk_list]
    
    return all_chunks

# Apply function
event_chunks = split_events_into_chunks(cat_events_dict)

# Print the number of chunks and a sample chunk
print(f"Total chunks: {len(event_chunks)}\nFirst three chunks:\n{event_chunks[:3]}")

Total chunks: 231
First three chunks:
[['Accident Type: Collisions'], ['Accident Type: Collision of train with a train/rail vehicle'], ['Accident Type: Collision of train with obstacle within the clearance gauge']]


# 4. Relevant Chunk Retrieval

In [9]:
# Define embeddings model
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

## 4.1 Incident Reports

In [191]:
# Store text chunks into FAISS vector store
vectorstore_reports = FAISS.from_texts(report_chunks, embeddings)

In [195]:
# Define entities of interest that you'd like to extract chunks for from the vector store
entities_of_interest = ["unique accident", "accident type", "track section", "date", "time", "country", "regulatory body"]

# Function for extracting most relevant chunks from vector store
def find_most_relevant_report_chunks(entities: list[str], top_k: int) -> str:
    """
    Finds the most relevant text chunks for each entity of interest
    using FAISS similarity search and removes duplicates (if same chunk retrieved).
    
    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location"])
    - top_k (int): Number of chunks to retrieve per entity
    
    Returns:
    - unique_relevant_chunks (list): Deduplicated relevant chunks
    """
    retrieved_chunks = set()  # Use a set to avoid duplicate chunks

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Report details about {entity}."
        found_chunks = vectorstore_reports.similarity_search(query, k=top_k)

        for chunk in found_chunks:
            retrieved_chunks.add(chunk.page_content)  # Add chunk if not already present

    # Convert set back to a list and join into a single string
    unique_relevant_chunks = list(retrieved_chunks)
    combined_text = "\n".join(unique_relevant_chunks)

    print(f"\nFound {len(unique_relevant_chunks)} unique relevant chunks.")
    return combined_text

# Find & combine relevant chunks
relevant_report_text = find_most_relevant_report_chunks(entities_of_interest, top_k=3)

print(f"\nMost Relevant Chunks Combined:\n{relevant_report_text}")

Searching for entity: unique accident
Searching for entity: accident type
Searching for entity: track section
Searching for entity: date
Searching for entity: time
Searching for entity: country
Searching for entity: regulatory body

Found 11 unique relevant chunks.

Most Relevant Chunks Combined:
[Page 26]
60 For TS&SS, ensuring that employees and Contractors under his/her control execute their
daily tasks in a manner that is technically correct, at the correct frequency, with the correct
care and in accordance with the CCE Documentation, such as to ensure the safe operation
of the Track and Structures (5.16.4.1). Following the instructions and technical advice of
the STSE on TS&SS and delivering the precautionary/mitigation actions per Risk in
accordance with the STSE’s programme requirements (5.16.4.3). Identifying the technical
training requirements per employee and releasing employees for training (5.16.4.7).
Permanent Way Inspector
61 Every Supervisor that has a responsibility for

## 4.2 Accident Categories

In [124]:
# Flatten the list of lists with event chunks for FAISS processing
flat_event_chunks = [chunk for sublist in event_chunks for chunk in sublist]

# Store text chunks into FAISS vector store
vectorstore_categories = FAISS.from_texts(flat_event_chunks, embeddings)

In [98]:
# Function for extracting most relevant chunks from vector store (queried later when needed)
def find_most_relevant_cat_chunks(query_input: str, top_k: int) -> str:
    """
    Retrieves the most relevant accident category chunks from the vector store based on the query.
    """
    query = f"{query_input}"
    
    # Perform the similarity search
    found_chunks = vectorstore_categories.similarity_search(query, k=top_k)
    
    # Extract the text from each document in the list
    found_chunks = [doc.page_content for doc in found_chunks]
    
    # Join the list of texts into a single string
    found_chunks = "\n".join(found_chunks)
    
    return found_chunks

# 5. Instantiating Language Models

## 5.1 Instantiating GPT

In [14]:
# Set the API key and model name
model_gpt = "gpt-4o-mini"

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

chat_model = init_chat_model(model_gpt, model_provider="openai")

### 5.1.1 Token Count

In [15]:
# Function for calculating tokens
def count_tokens(text: str, model: str = model_gpt) -> int:
    """Efficiently counts tokens in a text for a given OpenAI model."""
    encoding = tiktoken.encoding_for_model(model)
    token_integers = encoding.encode(text)
    num_tokens = len(token_integers)
    return num_tokens

### 5.1.2 Memory Definement for Chat History

In [196]:
# Define memory for storing chat history
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [200]:
print(memory)

chat_memory=InMemoryChatMessageHistory(messages=[HumanMessage(content='[Page 26]\n60 For TS&SS, ensuring that employees and Contractors under his/her control execute their\ndaily tasks in a manner that is technically correct, at the correct frequency, with the correct\ncare and in accordance with the CCE Documentation, such as to ensure the safe operation\nof the Track and Structures (5.16.4.1). Following the instructions and technical advice of\nthe STSE on TS&SS and delivering the precautionary/mitigation actions per Risk in\naccordance with the STSE’s programme requirements (5.16.4.3). Identifying the technical\ntraining requirements per employee and releasing employees for training (5.16.4.7).\nPermanent Way Inspector\n61 Every Supervisor that has a responsibility for overseeing and guiding workplace activities\nin any CCE Location is accountable for OS “during his working hours” (5.17.1.1) including:\n• Doing Safety Tours thoroughly and identifying good corrective actions and\nimp

### 5.1.3 Schema & Prompt Templates Definement

In [197]:
# Define the JSON schema example for few-shot learning
schema_example = """{
    "nodes": [
        {"id": "Dublin-Cork Accident", "type": "UniqueAccident"},
        {"id": "Level Crossing Accident involving a train", "type": "AccidentType"},
        {"id": "105 MP-108 MP", "type": "TrackSection"},
        {"id": "23/12/2021", "type": "Date"},
        {"id": "16:32", "type": "Time"},
        {"id": "Ireland", "type": "Country"},
        {"id": "European Rail Agency", "type": "RegulatoryBody"}
    ],
    "rels": [
        {"source": "Dublin-Cork Accident", "target": "Ireland", "type": "occurred_in"},
        {"source": "Dublin-Cork Accident", "target": "Collision", "type": "is_type"},
        {"source": "Dublin-Cork Accident", "target": "105 MP-108 MP", "type": "is_track_section"},
        {"source": "Dublin-Cork Accident", "target": "European Rail Agency", "type": "investigated_by"},
        {"source": "Dublin-Cork Accident", "target": "23/12/2021", "type": "has_date"},
        {"source": "Dublin-Cork Accident", "target": "16:32", "type": "has_time"}
    ]
}"""

# Define the chat prompt for GPT extraction
extraction_chat_prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template("You are an expert in analyzing railway accident reports. Follow the JSON schema provided."),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(
        """
        Analyze the following railway accident report context and extract structured knowledge in JSON format.

        Return a JSON object with:
        - `nodes`: A list of entities, specifically {entities_of_interest}.
        - `rels`: A list of relationships linking entities.

        Guidelines:
        - Look at the JSON schema example response and follow it closely. 
        - Ensure that the `source` and `target` nodes in `rels` are the SAME entities from the `nodes` list, and NOT different ones. 
        - Make sure to map ALL nodes with other important entities, e.g., (node UniqueAccident has_date node Date, node UniqueAccident occurred_at node Country).
        - Do NOT map entities like (node Date is_date to node Time) or (node AccidentType is_type to node Country). This is INCORRECT.
        - The `type` field in `rels` should be a verb phrase (e.g., "occurred_in", "investigated_by").
        - The `id` field in `nodes` should be the exact text of the entity, not a description or a summary.
        - Pay attention to date and type formats (e.g., EU date format, 24-hour time).
        - The `UniqueAccident` entity should be a unique identifier for the accident.
        - The `AccidentType` entity should be the type of accident.
        - The `TrackSection` entity should be the track section where the accident occurred.
        - The `Country` entity should be the country where the accident occurred.
        - The `RegulatoryBody` entity should be the regulatory body that investigated the accident.

        Schema example:
        {schema_example}

        Accident report context:
        {relevant_report_text}

        JSON:
        """
        )
    ],
    # Only "relevant_report_text" is dynamic; the others are provided as constants.
    input_variables=["relevant_report_text"],
    partial_variables={
        "entities_of_interest": entities_of_interest,
        "schema_example": schema_example
    }
)

# Set constant values as partial variables.
extraction_chat_prompt.partial_variables = {
    "entities_of_interest": entities_of_interest,
    "schema_example": schema_example
}

extraction_chain = LLMChain(
    llm=chat_model,
    prompt=extraction_chat_prompt,
    memory=memory
)

### 5.1.4 Query First Prompt

In [198]:
# Convert the extraction prompt to a string for token counting
extraction_prompt = extraction_chat_prompt.format(
    relevant_report_text=relevant_report_text,
    chat_history=[]
)

# Concatenate the messages into a single string
token_limit = 4000
token_count = count_tokens(extraction_prompt)
estimated_cost = token_count * 0.00000015  # Approximate cost calculation

print(f"Estimated cost: ${estimated_cost:.5f}")
print(f"Token count for prompt: {token_count}")

if token_count > token_limit:
    print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
    proceed = "no"
else:
    proceed = input("Do you want to proceed with information extraction? (yes/no): ").strip().lower()

if proceed != "yes":
    print("Extraction aborted by user.")
else:
    print("Sending request to GPT for extraction...\n")
    extraction_result_raw = extraction_chain.invoke({"relevant_report_text": relevant_report_text})["text"]
    print("Extraction Output:\n")
    print(extraction_result_raw)

Estimated cost: $0.00059
Token count for prompt: 3961


Sending request to GPT for extraction...

Extraction Output:

```json
{
    "nodes": [
        {"id": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "type": "UniqueAccident"},
        {"id": "Collision", "type": "AccidentType"},
        {"id": "Up Line", "type": "TrackSection"},
        {"id": "11/01/2020", "type": "Date"},
        {"id": "00:16", "type": "Time"},
        {"id": "Ireland", "type": "Country"},
        {"id": "Railway Accident Investigation Unit", "type": "RegulatoryBody"}
    ],
    "rels": [
        {"source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "target": "Ireland", "type": "occurred_in"},
        {"source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "target": "Collision", "type": "is_type"},
        {"source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "target": "Up Line",

### 5.1.5 Extract Accident Type from GPT's First Prompt

In [201]:
# Additional pre-processing to extract the JSON from the response
extraction_json = re.sub(r'^```json\n?|```$', '', extraction_result_raw).strip()
extraction_json = json.loads(extraction_json)

# Extract GPT's AccidentType assumption in order to query the vector store for the relevant chunks according to it
for node in extraction_json['nodes']:
        if node['type'] == 'AccidentType':
            accident_type = node['id']

# By using the initial AccidentType predicted by GPT, we can query our vector store and identify similar accident categories (which are standardized)
relevant_events_text = find_most_relevant_cat_chunks(accident_type, top_k=5)

print(f"Accident type:\n\n{accident_type}")
print(f"\nRelevant categories in vector store:\n\n{relevant_events_text}")

Accident type:

Collision

Relevant categories in vector store:

Accident Type: Collisions
Accident Type: Other collision type
Accident Type: Collision of train with a train/rail vehicle
Accident Type: Collision of one or more rail vehicles with another rail vehicle
Accident Type: Collision of one or more rail vehicles with obstacle within the clearance gauge


### 5.1.6 Query Second (Refinement) Prompt

In [202]:
# Define the chat prompt for GPT refinement
refinement_chat_prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template("You are an expert in refining railway accident report extractions."),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(
            """
            You previously inferred an AccidentType from the accident report. Now, refine your answer based on the following relevant accident category events.
            Choose the most relevant one from the list below and copy it exactly:\n\n{relevant_events_text}\n\n
            Refine your previous answer and update the JSON with the correct AccidentType. Here is your previous guess:\n{extraction_result}\n\n
            If none of them match, you maintain the previous answer. Provide only the updated JSON, without any additional comments or text:
            """
        )
    ],
    input_variables=["extraction_result"],
    partial_variables={
        "relevant_events_text": relevant_events_text,
    }
)

refinement_chain = LLMChain(
    llm=chat_model,
    prompt=refinement_chat_prompt,
    memory=memory
)

In [203]:
# Convert the extraction prompt to a string for token counting.
refinement_prompt = refinement_chat_prompt.format(
    extraction_result=extraction_result_raw,
    chat_history=memory.chat_memory.messages
)

# Concatenate the messages into a single string.
token_limit = 5000
token_count = count_tokens(refinement_prompt)
print(f"Estimated cost: ${estimated_cost:.5f}")
print(f"Token count for prompt: {token_count}")

if token_count > token_limit:
    print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
    proceed = "no"
else:
    proceed = input("Do you want to proceed with information extraction? (yes/no): ").strip().lower()

if proceed != "yes":
    print("Extraction aborted by user.")
else:
    print("Sending request to GPT for extraction...\n")
    refinement_result_raw = refinement_chain.invoke({"extraction_result": extraction_result_raw})["text"]
    print("Refinement Output:\n")
    print(refinement_result_raw)

Estimated cost: $0.00059
Token count for prompt: 4157
Sending request to GPT for extraction...

Refinement Output:

```json
{
    "nodes": [
        {"id": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "type": "UniqueAccident"},
        {"id": "Collision of one or more rail vehicles with obstacle within the clearance gauge", "type": "AccidentType"},
        {"id": "Up Line", "type": "TrackSection"},
        {"id": "11/01/2020", "type": "Date"},
        {"id": "00:16", "type": "Time"},
        {"id": "Ireland", "type": "Country"},
        {"id": "Railway Accident Investigation Unit", "type": "RegulatoryBody"}
    ],
    "rels": [
        {"source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "target": "Ireland", "type": "occurred_in"},
        {"source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment", "target": "Collision of one or more rail vehicles 

In [204]:
# Clean up the final response and parse JSON.
final_response_str = re.sub(r'^```json\n?|```$', '', refinement_result_raw).strip()
final_response = json.loads(final_response_str)
print("\nFinal Refined JSON Response:")
print(json.dumps(final_response, ensure_ascii=False, indent=2))


Final Refined JSON Response:
{
  "nodes": [
    {
      "id": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "type": "UniqueAccident"
    },
    {
      "id": "Collision of one or more rail vehicles with obstacle within the clearance gauge",
      "type": "AccidentType"
    },
    {
      "id": "Up Line",
      "type": "TrackSection"
    },
    {
      "id": "11/01/2020",
      "type": "Date"
    },
    {
      "id": "00:16",
      "type": "Time"
    },
    {
      "id": "Ireland",
      "type": "Country"
    },
    {
      "id": "Railway Accident Investigation Unit",
      "type": "RegulatoryBody"
    }
  ],
  "rels": [
    {
      "source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "target": "Ireland",
      "type": "occurred_in"
    },
    {
      "source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "target": "Collision of

### 5.1.7 Validating the JSON Structure

In [205]:
# Define Pydantic models that match your JSON output structure.
class NodeModel(BaseModel):
    id: str
    type: str

class RelationshipModel(BaseModel):
    source: str
    target: str
    type: str

class KnowledgeGraphModel(BaseModel):
    nodes: List[NodeModel]
    rels: List[RelationshipModel]

# Assume final_json is your JSON input as a Python dictionary
try:
    kg = KnowledgeGraphModel.model_validate(final_response)
    print("Valid JSON in KG structure!")
except ValidationError as e:
    print("Validation error:", e)

Valid JSON in KG structure!


In [206]:
# Assign response to model for further analysis 
response_dict = {"model": model_gpt, "response": final_response}

## 5.2 Instantiating Local Models

### 5.2.1 Building the prompt

In [None]:
# Define the function to build the prompt for local models
def build_local_prompt(text):
    """
    Constructs a structured prompt to extract entities and relationships for railway accidents.
    """
    
    schema_example = """
    {
        "nodes": [
            {"id": "Dublin-Cork Accident", "type": "UniqueAccident"},
            {"id": "Train Derailment", "type": "AccidentType"},
            {"id": "23/12/2021", "type": "Date"},
            {"id": "16:32", "type": "Time"},
            {"id": "Ireland", "type": "Country"},
            {"id": "European Rail Agency", "type": "RegulatoryBody"}
        ],
        "rels": [
            {"source": "Dublin-Cork Accident", "target": "Ireland", "type": "occurred_in"},
            {"source": "Dublin-Cork Accident", "target": "Collision", "type": "is_type"},
            {"source": "Dublin-Cork Accident", "target": "European Rail Agency", "type": "investigated_by"},
            {"source": "Dublin-Cork Accident", "target": "23/12/2021", "type": "has_date"},
            {"source": "Dublin-Cork Accident", "target": "16:32", "type": "has_time"}
        ]
    }
    """

    return f"""
    Extract structured entities and output ONLY a JSON object from this railway accident report. Do not provide a summary or comment on the incident.

    Return the JSON object with:
    - `nodes`: A list of entities, specifically {entities_of_interest}.
    - `rels`: A list of relationships linking entities.

    Schema example:
    {schema_example}

    Accident report context:
    {text}

    JSON:
    """

### 5.2.2 Defining the endpoint

In [None]:
# Define the API URL
url = "http://llama-max-ollama.ai.wu.ac.at/api/generate"

# Define prompt
prompt = build_local_prompt(relevant_report_text)

# Specify local model
model_local = "llama3.1:latest"

### 5.2.3 Querying the model

In [None]:
# Define the payload and query the local model
payload = {
    "model": f"{model_local}",  # Ensure correct model name
    "prompt": f"{prompt}",
    "stream": False  # If 'raw' is unnecessary, remove it
}

# Set headers
headers = {"Content-Type": "application/json"}

# Send POST request
response = requests.post(url, json=payload, headers=headers)

# Handle response
if response.status_code == 200:
    try:
        data = response.json()  # Parse response JSON
        if "response" in data:
            local_response = textwrap.fill(data["response"], width=80)
            print("Generated Summary:\n")
            print(local_response)
        else:
            print("No 'response' key found in the JSON.")
    except json.JSONDecodeError:
        print(f"Invalid JSON response: {response.text}")
else:
    print(f"Error {response.status_code}: {response.text}")

### 5.2.4 Extracting JSON from response

In [None]:
def extract_json(text):
    # List of regex patterns to try.
    patterns = [
        # Pattern for JSON wrapped in a markdown code block:
        r'```json\s*([\s\S]*?)\s*```',
        # Fallback pattern: JSON object starting with '{' and ending with '}'
        r'({[\s\S]*})'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            json_str = match.group(1)
            # Optionally, remove unwanted control characters.
            json_str = re.sub(r'[\x00-\x1F]+', '', json_str)
            try:
                return json.loads(json_str)
            except json.JSONDecodeError as e:
                print("JSON decode error:", e)
                continue
    return None

local_response = extract_json(local_response)
response_dict = {"model": model_local, "response": local_response}
local_response

# 6. Cross-model and Iteration Comparison

## 6.1 Overview of results

In [208]:
# Define CSV storage file
CSV_FILE = "pdf_processing_results.csv"

# Define function to append the JSON output of response_json function to a DataFrame
def append_pdf_json_result(pdf_name: str, response_json: dict) -> pd.DataFrame:
    """
    Appends the JSON output of response_json function to a DataFrame.
    
    - If the PDF has been processed before, it appends a **new row** instead of a new column.
    - Prevents duplicate JSON entries for the same iteration.
    - Ensures data is **stored in rows**, making querying and analysis easier.

    Args:
        pdf_name (str): Name of the processed PDF file.
        response_json (dict): JSON response from the knowledge extraction process.

    Returns:
        pd.DataFrame: Updated DataFrame with the new result.
    """
    # Convert JSON response to a formatted string for easy comparison
    json_output = json.dumps(response_json, indent=2)

    # Load existing results if the CSV exists
    path = "./data/" + CSV_FILE
    if os.path.exists(path):
        df = pd.read_csv(path, dtype={"iteration_number": int})
    else:
        # Create an empty DataFrame with the correct schema
        df = pd.DataFrame(columns=["pdf_name", "model_type", "iteration_number", "json_output"])

    model_type = model_gpt if response_dict.get("model") == model_gpt else model_local

    # Filter for the current PDF's past records
    pdf_history = df[df["pdf_name"] == pdf_name]

    # Check for duplicates: If this JSON output already exists for the same PDF, skip re-adding
    if not pdf_history.empty and json_output in pdf_history["json_output"].values:
        print(f"No changes detected in JSON for {pdf_name}, skipping new entry.")
        return df  # Exit early if it's a duplicate

    # Determine new iteration number
    iteration_number = pdf_history["iteration_number"].max() + 1 if not pdf_history.empty else 1

    # Append new result
    new_entry = pd.DataFrame({"pdf_name": [pdf_name], "model_type": [model_type], "iteration_number": [iteration_number], "json_output": [json_output]})
    df = pd.concat([df, new_entry], ignore_index=True)

    # Save back to CSV in **append mode** to avoid full file reads/writes
    df.to_csv(path, index=False)

    print(f"Successfully added {pdf_name} - Iteration {iteration_number} to results!")
    return df

# Execute function
results_df = append_pdf_json_result(pdf_name, response_dict["response"])

Successfully added IE-10375 - 210827 Collision with track equipment.pdf - Iteration 2 to results!


In [209]:
results_df

Unnamed: 0,pdf_name,model_type,iteration_number,json_output
0,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."
1,IE-10397 - 211207 Clontarf.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Signal Pa..."
2,IE-10404 - 230222 Broken Rail Emly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Track Cir..."
3,IE-200608 BnM Collision LC Offaly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilcolgan..."
4,IE-6218-200111 Collision RRME Rosslare.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Train A60..."
5,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,2,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."


## 6.2 Mapping entities to graph nods and rels

In [49]:
# Define classes for the entities extraction
class Property(BaseModel):
    """A single property consisting of key and value."""
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [50]:
# Defining functions for mapping extracted entities to graph nodes and relationships
def props_to_dict(props) -> dict:
    """Converts properties to a dictionary for graph storage."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[p["key"]] = p["value"]
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Maps extracted entities to graph nodes."""
    properties = {"name": node.id}
    return BaseNode(
        id=node.id,
        type=node.type.capitalize(),
        properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Maps extracted relationships to graph edges."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}

    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

# 7. Neo4j Storage

## 7.1 Instantiating Neo4j

In [53]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [54]:
# Clear database
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


In [210]:
results_df

Unnamed: 0,pdf_name,model_type,iteration_number,json_output
0,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."
1,IE-10397 - 211207 Clontarf.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Signal Pa..."
2,IE-10404 - 230222 Broken Rail Emly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Track Cir..."
3,IE-200608 BnM Collision LC Offaly.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilcolgan..."
4,IE-6218-200111 Collision RRME Rosslare.pdf,gpt-4o-mini,1,"{\n ""nodes"": [\n {\n ""id"": ""Train A60..."
5,IE-10375 - 210827 Collision with track equipme...,gpt-4o-mini,2,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."


## 7.2 Extracting the JSON from DF row

In [211]:
def get_json_output(df, pdf_name, iteration_number):
    """
    Gets the 'json_output' for the given pdf_name and iteration_number.
    Returns an empty dict if there's no match.
    """
    subset = df[
        (df["pdf_name"] == pdf_name) &
        (df["iteration_number"] == iteration_number)
    ]

    if subset.empty:
        print("No match found.")
        return {}

    json_str = subset.iloc[0]["json_output"]
    return json.loads(json_str)

# Choose the JSON to convert to graph
pdf_of_choice = pdf_name
json_to_convert = get_json_output(results_df, pdf_of_choice, iteration_number=2)
print(json.dumps(json_to_convert, ensure_ascii=False, indent=2))

{
  "nodes": [
    {
      "id": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "type": "UniqueAccident"
    },
    {
      "id": "Collision of one or more rail vehicles with obstacle within the clearance gauge",
      "type": "AccidentType"
    },
    {
      "id": "Up Line",
      "type": "TrackSection"
    },
    {
      "id": "11/01/2020",
      "type": "Date"
    },
    {
      "id": "00:16",
      "type": "Time"
    },
    {
      "id": "Ireland",
      "type": "Country"
    },
    {
      "id": "Railway Accident Investigation Unit",
      "type": "RegulatoryBody"
    }
  ],
  "rels": [
    {
      "source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "target": "Ireland",
      "type": "occurred_in"
    },
    {
      "source": "Collision between Iarnród Éireann passenger train and rail-mounted maintenance equipment",
      "target": "Collision of one or more rail vehicles wit

## 7.3 Converting the JSON to graph 

In [58]:
def convert_json_to_graph(json_to_convert, source_text):
    """
    Converts extracted JSON into a graph-compatible format with correct entity types.
    """
    def get_node_type(json_data, node_id):
        """
        Helper function to retrieve the correct node type from JSON.
        """
        for node in json_data["nodes"]:
            if node["id"] == node_id:
                return node["type"]
        return "Unknown"  # Fallback if type is missing

    if not json_to_convert:
        print("No valid data to convert to a graph.")
        return None

    # Convert Nodes
    graph_nodes = [map_to_base_node(Node(id=node["id"], type=node["type"])) for node in json_to_convert["nodes"]]

    # Convert Relationships (Ensure correct types)
    graph_rels = []
    for rel in json_to_convert["rels"]:
        source_node = Node(id=rel["source"], type=get_node_type(json_to_convert, rel["source"]))
        target_node = Node(id=rel["target"], type=get_node_type(json_to_convert, rel["target"]))
        graph_rels.append(map_to_base_relationship(Relationship(source=source_node, target=target_node, type=rel["type"])))

    return GraphDocument(nodes=graph_nodes, relationships=graph_rels, source=Document(page_content=source_text))

## 7.4 Storing the graph in Neo4j

In [59]:
def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j with dynamic labels.
    """
    with driver.session() as session:
        # Store nodes with dynamic labels
        for node in graph_document.nodes:
            session.run(f"""
                MERGE (n:{node.type} {{id: $id}})
                ON CREATE SET n.name = $name
            """, id=node.id, name=node.id)

        # Store relationships
        for rel in graph_document.relationships:
            session.run(f"""
                MATCH (s {{id: $source}})
                MATCH (t {{id: $target}})
                MERGE (s)-[r:{rel.type.upper()}]->(t)
            """, source=rel.source.id, target=rel.target.id)

In [60]:
def process_railway_accident_report(json_to_convert):
    """
    Converts the JSON to a graph format and stores it in Neo4j.
    """
    print("Converting JSON to graph format...")
    graph_document = convert_json_to_graph(json_to_convert, relevant_report_text)

    if graph_document:
        print("Graph structure created! Storing in Neo4j...")
        store_in_neo4j(graph_document)

In [212]:
# Store extracted entities into Neo4j
try:
    db_result = process_railway_accident_report(json_to_convert)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Converting JSON to graph format...
Graph structure created! Storing in Neo4j...
Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection
driver.close()

# 8. Comparison against ERAIL DB

## 8.1 Loading ERAIL DB

In [None]:
# Load the ERAIL DB
erail_db = pd.read_excel("./data/erail database.xlsx")
erail_db.head(5)

## 8.2 Pre-processing data

In [None]:
# Convert date column to datetime & adjust format
erail_db["Date of occurrence"] = erail_db["Date of occurrence"].dt.strftime("%d/%m/%Y")

# Convert time column to datetime & adjust format
erail_db["Time of occurrence"] = pd.to_datetime(erail_db["Time of occurrence"], errors='coerce')
erail_db["Time of occurrence"] = erail_db["Time of occurrence"].dt.strftime("%H:%M")

erail_db.head(5)

## 8.3 Merging data for comparison

In [None]:
# Create a copy of the results DataFrame
comparison_df = results_df[["pdf_name", "json_output"]].copy()

# Extract ID from the PDF name
comparison_df["ERAIL Occurrence"] = comparison_df["pdf_name"].str.extract(r'(IE-\d+)')

# Sample DataFrame (assuming json_output column contains dictionaries in string format)
comparison_df['json_output'] = comparison_df['json_output'].apply(json.loads)  # Convert JSON string to dictionary

# Function to extract node data
def extract_nodes(json_data):
    node_dict = {}
    for node in json_data.get("nodes", []):
        node_dict[f"gpt_{node['type']}"] = node["id"]  # Store ID based on type
    return pd.Series(node_dict)  # Convert dictionary to Series for easier DataFrame merging

# Apply the function to extract node data
nodes_df = comparison_df['json_output'].apply(extract_nodes)

# Merge extracted data into original DataFrame
comparison_df = pd.concat([comparison_df, nodes_df], axis=1)

# Drop the original json_output column if no longer needed
comparison_df.drop(columns=["json_output"], inplace=True)

# View comparison DataFrame
comparison_df

In [None]:
# Merge the comparison DataFrame with the ERAIL database
merged_df = comparison_df.merge(erail_db, on="ERAIL Occurrence", how='inner')
merged_df

## 8.4 Comparison

In [None]:
# Compare GPT extracted date with the ERAIL database (source of truth)
merged_df["Date Match"] = np.where(merged_df["gpt_Date"] == merged_df["Date of occurrence"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Date", "Date of occurrence", "Date Match"]]

In [None]:
# Compare GPT extracted time with the ERAIL database (source of truth)
merged_df["Time Match"] = np.where(merged_df["gpt_Time"] == merged_df["Time of occurrence"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Time", "Time of occurrence", "Time Match"]]

In [None]:
# Compare GPT extracted country with the ERAIL database (source of truth)
merged_df["Country Match"] = np.where(merged_df["gpt_Country"] == merged_df["Country"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Country", "Country", "Country Match"]]