# Package Loading

In [194]:
# Standard Library
import json
import os
import re

# Data Manipulation
import pandas as pd
import numpy as np

# PDFs
import pdfplumber

# LLMs
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.graphs.graph_document import (
    GraphDocument,
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain_huggingface import HuggingFaceEmbeddings
from openai import OpenAI
import tiktoken 

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Typing & Validation
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional

# Raw Data Extraction

In [2]:
# Define the directory containing PDFs
pdf_directory = "./reports_ie/"

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.lower().endswith(".pdf")]

for file in pdf_files:
    print(file)

IE-10375 - 210827 Collision with track equipment.pdf
IE-10397 - 211207 Clontarf.pdf
IE-10404 - 230222 Broken Rail Emly.pdf
IE-200608 BnM Collision LC Offaly.pdf
IE-6218-200111 Collision RRME Rosslare.pdf
IE-6262-200429 LC Collision XM240.pdf
IE-6291-200524 LC XA068 Ashfield.pdf
IE-6305 - 200707_locomotive_224.pdf


In [107]:
# Specify file name that you want to process
pdf_name = "IE-6305 - 200707_locomotive_224.pdf"

In [108]:
# Define regex patterns to identify the start of the summary and contents sections
SUMMARY_PATTERN = re.compile(r"^summary", re.IGNORECASE)
CONTENTS_PATTERN = re.compile(r"^contents", re.IGNORECASE)

# Define function to extract the summary section from Ireland reports
def extract_summary_section(pdf_path: str, header_lines: int = 1) -> str:
    """
    Extracts text from 'Summary' to 'Contents' in an Irish rail report PDF.
    If no summary section is found, returns the text from all pages.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    summary_text = ""
    full_text = ""
    capturing = False  

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text_lines = page_text.split("\n")
                text_without_header = text_lines[header_lines:]
                page_content = f"[Page {page.page_number}]\n" + "\n".join(text_without_header) + "\n\n"

                # Always append to full_text
                full_text += page_content

                if text_without_header:
                    first_line = text_without_header[0].strip().lower()

                    if SUMMARY_PATTERN.match(first_line):
                        capturing = True
                    
                    if CONTENTS_PATTERN.match(first_line) and capturing:
                        # Stop capturing when "Contents" is found after summary started.
                        break
                    
                    if capturing:
                        summary_text += page_content

    if not summary_text:
        print(f"Warning: No summary section found in {pdf_path}. Returning full text.")
        return full_text

    return summary_text

# Example usage:
pdf_text = extract_summary_section(f"./reports_ie/{pdf_name}", header_lines=1)
print(pdf_text[:500])

[Page 4]
Summary
Iarnród Éireann (IÉ) 201 Class Locomotives were manufactured by General Motors (GM) in
Canada and entered service in 1994. Locomotive 224 had its engine and generator removed
in 2010 and 2019 for maintenance. When the engine and generator was removed cracks were
identified in the Bed Plate (non-structural component) between the two Chassis Plates
(structural component) of the Locomotive. IÉ carried out weld repairs to the Bed Plate but the
weld repair did not conform to the EN15


# Langchain Chunk Splitting

In [109]:
def split_text_into_chunks(text: str, chunk_size: int = 2000, chunk_overlap: int = 300) -> list[str]:
    """
    Splits text into smaller overlapping chunks using LangChain's text splitter.
    """
    if not text:
        print("Warning: No text provided for splitting.")
        return []

    if len(text) <= chunk_size:
        return [text]  # If text is smaller than chunk size, return as single chunk

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)

# Split the extracted text
text_chunks = split_text_into_chunks(pdf_text)

# Print the number of chunks and a sample chunk
print(f"Total chunks: {len(text_chunks)}\nFirst chunk:\n{text_chunks[0]}")

Total chunks: 4
First chunk:
[Page 4]
Summary
Iarnród Éireann (IÉ) 201 Class Locomotives were manufactured by General Motors (GM) in
Canada and entered service in 1994. Locomotive 224 had its engine and generator removed
in 2010 and 2019 for maintenance. When the engine and generator was removed cracks were
identified in the Bed Plate (non-structural component) between the two Chassis Plates
(structural component) of the Locomotive. IÉ carried out weld repairs to the Bed Plate but the
weld repair did not conform to the EN15085 2007 standard series, entitled, “Railway
applications - Welding of railway vehicles and components” or any IÉ approved written
specification; and, on one occasion (in 2010 or 2019) the weld repair was unnecessarily
continued from the Bed Plate into the Chassis Plate.
On 6th July 2020, the 14:25 hrs Cork Kent to Dublin Heuston passenger service operated with
Locomotive 224 at the rear. Locomotive 224 experienced a coolant leak and electrical fault
that caused the 

# Relevant Chunk Retrieval

### Vector-based Retrieval

In [110]:
# Define embeddings 
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [111]:
# Store text chunks into FAISS vector store
vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

Stored 4 chunks in FAISS.


In [112]:
# Define entities of interest that you'd like to extract chunks for from the vector store
entities_of_interest = ["unique accident", "accident type", "date", "time", "country", "regulatory body"]

# Function for extracting most relevant chunks from vector store
def find_most_relevant_chunks(entities: list[str], top_k: int) -> str:
    """
    Finds the most relevant text chunks for each entity of interest
    using FAISS similarity search and removes duplicates (if same chunk retrieved).
    
    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location"])
    - top_k (int): Number of chunks to retrieve per entity
    
    Returns:
    - unique_relevant_chunks (list): Deduplicated relevant chunks
    """
    retrieved_chunks = set()  # Use a set to avoid duplicate chunks

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Extract details specifically about {entity}. Focus on structured information."
        found_chunks = vectorstore.similarity_search(query, k=top_k)

        for chunk in found_chunks:
            retrieved_chunks.add(chunk.page_content)  # Add chunk if not already present

    # Convert set back to a list and join into a single string
    unique_relevant_chunks = list(retrieved_chunks)
    combined_text = "\n".join(unique_relevant_chunks)

    print(f"\nFound {len(unique_relevant_chunks)} unique relevant chunks.")
    return combined_text

# Find & combine relevant chunks
relevant_text = find_most_relevant_chunks(entities_of_interest, top_k=3)

print(f"\nMost Relevant Chunks Combined:\n{relevant_text}")

Searching for entity: unique accident
Searching for entity: accident type
Searching for entity: date
Searching for entity: time
Searching for entity: country
Searching for entity: regulatory body

Found 4 unique relevant chunks.

Most Relevant Chunks Combined:
[Page 5]
The Chassis Plate of Locomotive 224 failed as a result of the following causal factor (CaF):
• CaF-01 – The flat Bed Plates were not replaced with cupped Bed Plates when Bed Plate
cracks were identified as set out in the OEM Service Advisory SA 08-007;
• CaF-02 – During a weld repair of the Bed Plate, the Bed Plate was unnecessarily welded
to the Chassis Plate.
Contributory factors (CoF) were identified as:
• CoF-01 – IÉ-RU had not adopted the EN 15085 standard series, entitled “Railway
applications – Welding of railway vehicles and components” which were first published in
2007; although it is noted that this standard series is not mandatory;
• CoF-02 – While Service Advisory (SA 08-007) was available to the welder carr

## Instantiating GPT

In [73]:
# Set the API key and model name
MODEL="gpt-4o-mini"

# Load OpenAI API Key from requirements file
with open("gpt-personal-key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()

# Instantiate OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

## Token Count

In [10]:
# Function for calculating tokens
def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """Efficiently counts tokens in a text for a given OpenAI model."""
    if model not in count_tokens.encoders:
        count_tokens.encoders[model] = tiktoken.encoding_for_model(model)
    return len(count_tokens.encoders[model].encode(text))

count_tokens.encoders = {}

## Entity Extraction

In [11]:
# Define classes for the entities extraction
class Property(BaseModel):
    """A single property consisting of key and value."""
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [12]:
def call_gpt(prompt, temperature=1):
    """
    Calls the GPT model with the structured prompt and returns the raw response.
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )
    
    response_text = completion.choices[0].message.content.strip()
    response_text = re.sub(r'^```json\n?|```$', '', response_text).strip()
    
    return response_text

In [13]:
def build_prompt(text):
    """
    Constructs a structured prompt to extract entities and relationships for railway accidents.
    """
    
    schema_example = """
    {
        "nodes": [
            {"id": "Dublin-Cork Accident", "type": "UniqueAccident"},
            {"id": "Train Derailment", "type": "AccidentType"},
            {"id": "23/12/2021", "type": "Date"},
            {"id": "16:32", "type": "Time"},
            {"id": "Ireland", "type": "Country"},
            {"id": "European Rail Agency", "type": "RegulatoryBody"}
        ],
        "rels": [
            {"source": "Dublin-Cork Accident", "target": "Ireland", "type": "occurred_in"},
            {"source": "Dublin-Cork Accident", "target": "", "type": "is_type"},
            {"source": "Dublin-Cork Accident", "target": "European Rail Agency", "type": "investigated_by"},
            {"source": "Dublin-Cork Accident", "target": "23/12/2021", "type": "has_date"},
            {"source": "Dublin-Cork Accident", "target": "16:32", "type": "has_time"}
        ]
    }
    """

    return f"""
    Analyze the following railway accident report context and extract structured knowledge.

    Return a JSON object with:
    - `nodes`: A list of entities, specifically {entities_of_interest}.
    - `rels`: A list of relationships linking entities.

    Look at the JSON schema example response and follow it closely. Pay attention to date and type formats (e.g., EU date format, 24-hour time).
    Ensure that the `source` and `target` nodes in `rels` are the same entities from the `nodes` list, and not different ones. 
    Make sure to map all nodes with other important entities, e.g., (node UniqueAccident has_date node Date, node UniqueAccident occurred_at node Country).
    DO NOT map entities like (node Date is_date to node Time) or (node AccidentType is_type to node Country). This is incorrect.
    The `type` field in `rels` should be a verb phrase (e.g., "occurred_in", "investigated_by").
    And the `id` field in `nodes` should be the exact text of the entity, not a description or a summary.

    Schema example:
    {schema_example}

    Accident report context:
    {text}

    JSON:
    """

In [14]:
def extract_knowledge_graph(text: str) -> Dict[str, Any]:
    """
    Extracts entities & relationships from a railway accident report using GPT.
    - First, counts tokens and allows user decision.
    - If within limit, runs GPT and handles errors.
    """

    # Build prompt
    prompt = build_prompt(text)

    # Call GPT
    response_text = call_gpt(prompt)

    try:
        extracted_graph = json.loads(response_text)  # Ensure valid JSON
        return extracted_graph  # Successfully parsed knowledge graph
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", str(e))
        print("Storing raw response for review...")

        # Save the faulty response for debugging
        with open("failed_graph_extractions.json", "a") as file:
            json.dump({"input_text": text[:1000], "raw_output": response_text}, file, indent=4)
            file.write("\n")

        return {}  # Return empty dictionary in case of failure

In [113]:
# Define token limit for function execution
token_limit = 4096

# Build the prompt and count tokens
prompt = build_prompt(relevant_text)
token_count = count_tokens(prompt)
estimated_cost = token_count * 0.00000015  # Approximate OpenAI pricing

# Check token limit
if token_count > token_limit:
    print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
else:
    print(f"Token count for prompt: {token_count}")

Token count for prompt: 1715


In [114]:
# Confirm Execution
proceed = input("Do you want to proceed with information extraction? (yes/no): ").strip().lower()
if proceed != "yes":
    print("Extraction aborted by user.")
else:
    print("Sending request to GPT...")
    response_json = extract_knowledge_graph(pdf_text)

response_json

Sending request to GPT...


{'nodes': [{'id': 'Cork Kent to Dublin Heuston Incident',
   'type': 'UniqueAccident'},
  {'id': 'Train Mechanical Failure', 'type': 'AccidentType'},
  {'id': '06/07/2020', 'type': 'Date'},
  {'id': '14:25', 'type': 'Time'},
  {'id': 'Ireland', 'type': 'Country'},
  {'id': 'Railway Accident Investigation Unit', 'type': 'RegulatoryBody'}],
 'rels': [{'source': 'Cork Kent to Dublin Heuston Incident',
   'target': 'Ireland',
   'type': 'occurred_in'},
  {'source': 'Cork Kent to Dublin Heuston Incident',
   'target': 'Train Mechanical Failure',
   'type': 'is_type'},
  {'source': 'Cork Kent to Dublin Heuston Incident',
   'target': 'Railway Accident Investigation Unit',
   'type': 'investigated_by'},
  {'source': 'Cork Kent to Dublin Heuston Incident',
   'target': '06/07/2020',
   'type': 'has_date'},
  {'source': 'Cork Kent to Dublin Heuston Incident',
   'target': '14:25',
   'type': 'has_time'}]}

In [115]:
# Define CSV storage file
CSV_FILE = "pdf_processing_results.csv"

def append_pdf_json_result(pdf_name: str, response_json: dict) -> pd.DataFrame:
    """
    Appends the JSON output of response_json function to a DataFrame.
    
    - If the PDF has been processed before, it appends a **new row** instead of a new column.
    - Prevents duplicate JSON entries for the same iteration.
    - Ensures data is **stored in rows**, making querying and analysis easier.

    Args:
        pdf_name (str): Name of the processed PDF file.
        response_json (dict): JSON response from the knowledge extraction process.

    Returns:
        pd.DataFrame: Updated DataFrame with the new result.
    """
    # Convert JSON response to a formatted string for easy comparison
    json_output = json.dumps(response_json, indent=2)

    # Load existing results if the CSV exists
    if os.path.exists(CSV_FILE):
        df = pd.read_csv(CSV_FILE, dtype={"iteration_number": int})
    else:
        # Create an empty DataFrame with the correct schema
        df = pd.DataFrame(columns=["pdf_name", "iteration_number", "json_output"])

    # Filter for the current PDF's past records
    pdf_history = df[df["pdf_name"] == pdf_name]

    # Check for duplicates: If this JSON output already exists for the same PDF, skip re-adding
    if not pdf_history.empty and json_output in pdf_history["json_output"].values:
        print(f"No changes detected in JSON for {pdf_name}, skipping new entry.")
        return df  # Exit early if it's a duplicate

    # Determine new iteration number
    iteration_number = pdf_history["iteration_number"].max() + 1 if not pdf_history.empty else 1

    # Append new result
    new_entry = pd.DataFrame({"pdf_name": [pdf_name], "iteration_number": [iteration_number], "json_output": [json_output]})
    df = pd.concat([df, new_entry], ignore_index=True)

    # Save back to CSV in **append mode** to avoid full file reads/writes
    df.to_csv(CSV_FILE, index=False)

    print(f"Successfully added {pdf_name} - Iteration {iteration_number} to results!")
    return df

# Example execution
results_df = append_pdf_json_result(pdf_name, response_json)

Successfully added IE-6305 - 200707_locomotive_224.pdf - Iteration 1 to results!


In [116]:
results_df

Unnamed: 0,pdf_name,iteration_number,json_output
0,IE-10375 - 210827 Collision with track equipme...,1,"{\n ""nodes"": [\n {\n ""id"": ""Train J28..."
1,IE-10397 - 211207 Clontarf.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Train E24..."
2,IE-10404 - 230222 Broken Rail Emly.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Broken Ra..."
3,IE-200608 BnM Collision LC Offaly.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""BnM Flat ..."
4,IE-6218-200111 Collision RRME Rosslare.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."
5,IE-6262-200429 LC Collision XM240.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilnageer..."
6,IE-6291-200524 LC XA068 Ashfield.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Athlone L..."
7,IE-6305 - 200707_locomotive_224.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Cork Kent..."


In [19]:
# Defining functions for mapping extracted entities to graph nodes and relationships
def props_to_dict(props) -> dict:
    """Converts properties to a dictionary for graph storage."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[p["key"]] = p["value"]
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Maps extracted entities to graph nodes."""
    properties = {"name": node.id}
    return BaseNode(
        id=node.id,
        type=node.type.capitalize(),
        properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Maps extracted relationships to graph edges."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}

    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

# Neo4j Storage

In [20]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [21]:
# Clear database
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


In [117]:
results_df

Unnamed: 0,pdf_name,iteration_number,json_output
0,IE-10375 - 210827 Collision with track equipme...,1,"{\n ""nodes"": [\n {\n ""id"": ""Train J28..."
1,IE-10397 - 211207 Clontarf.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Train E24..."
2,IE-10404 - 230222 Broken Rail Emly.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Broken Ra..."
3,IE-200608 BnM Collision LC Offaly.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""BnM Flat ..."
4,IE-6218-200111 Collision RRME Rosslare.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Collision..."
5,IE-6262-200429 LC Collision XM240.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Kilnageer..."
6,IE-6291-200524 LC XA068 Ashfield.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Athlone L..."
7,IE-6305 - 200707_locomotive_224.pdf,1,"{\n ""nodes"": [\n {\n ""id"": ""Cork Kent..."


In [118]:
def get_json_output(df, pdf_name, iteration_number):
    """
    Gets the 'json_output' for the given pdf_name and iteration_number.
    Returns an empty dict if there's no match.
    """
    subset = df[
        (df["pdf_name"] == pdf_name) &
        (df["iteration_number"] == iteration_number)
    ]

    if subset.empty:
        print("No match found.")
        return {}

    json_str = subset.iloc[0]["json_output"]
    return json.loads(json_str)

# Example usage:
pdf_of_choice = "IE-6305 - 200707_locomotive_224.pdf"
json_to_convert = get_json_output(results_df, pdf_of_choice, 1)
print(json.dumps(json_to_convert, indent=2))

{
  "nodes": [
    {
      "id": "Cork Kent to Dublin Heuston Incident",
      "type": "UniqueAccident"
    },
    {
      "id": "Train Mechanical Failure",
      "type": "AccidentType"
    },
    {
      "id": "06/07/2020",
      "type": "Date"
    },
    {
      "id": "14:25",
      "type": "Time"
    },
    {
      "id": "Ireland",
      "type": "Country"
    },
    {
      "id": "Railway Accident Investigation Unit",
      "type": "RegulatoryBody"
    }
  ],
  "rels": [
    {
      "source": "Cork Kent to Dublin Heuston Incident",
      "target": "Ireland",
      "type": "occurred_in"
    },
    {
      "source": "Cork Kent to Dublin Heuston Incident",
      "target": "Train Mechanical Failure",
      "type": "is_type"
    },
    {
      "source": "Cork Kent to Dublin Heuston Incident",
      "target": "Railway Accident Investigation Unit",
      "type": "investigated_by"
    },
    {
      "source": "Cork Kent to Dublin Heuston Incident",
      "target": "06/07/2020",
      "type"

In [24]:
def convert_json_to_graph(json_to_convert, source_text):
    """
    Converts extracted JSON into a graph-compatible format with correct entity types.
    """

    def get_node_type(json_data, node_id):
        """
        Helper function to retrieve the correct node type from JSON.
        """
        for node in json_data["nodes"]:
            if node["id"] == node_id:
                return node["type"]
        return "Unknown"  # Fallback if type is missing

    if not json_to_convert:
        print("No valid data to convert to a graph.")
        return None

    # Convert Nodes
    graph_nodes = [map_to_base_node(Node(id=node["id"], type=node["type"])) for node in json_to_convert["nodes"]]

    # Convert Relationships (Ensure correct types)
    graph_rels = []
    for rel in json_to_convert["rels"]:
        source_node = Node(id=rel["source"], type=get_node_type(json_to_convert, rel["source"]))
        target_node = Node(id=rel["target"], type=get_node_type(json_to_convert, rel["target"]))
        graph_rels.append(map_to_base_relationship(Relationship(source=source_node, target=target_node, type=rel["type"])))

    return GraphDocument(nodes=graph_nodes, relationships=graph_rels, source=Document(page_content=source_text))

In [25]:
def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j with dynamic labels.
    """
    with driver.session() as session:
        # Store nodes with dynamic labels
        for node in graph_document.nodes:
            session.run(f"""
                MERGE (n:{node.type} {{id: $id}})
                ON CREATE SET n.name = $name
            """, id=node.id, name=node.id)

        # Store relationships
        for rel in graph_document.relationships:
            session.run(f"""
                MATCH (s {{id: $source}})
                MATCH (t {{id: $target}})
                MERGE (s)-[r:{rel.type.upper()}]->(t)
            """, source=rel.source.id, target=rel.target.id)

In [26]:
def process_railway_accident_report(json_to_convert):
    """
    Converts the JSON to a graph format and stores it in Neo4j.
    """
    print("Converting JSON to graph format...")
    graph_document = convert_json_to_graph(json_to_convert, relevant_text)

    if graph_document:
        print("Graph structure created! Storing in Neo4j...")
        store_in_neo4j(graph_document)

In [119]:
# Store extracted entities into Neo4j
try:
    db_result = process_railway_accident_report(json_to_convert)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Converting JSON to graph format...
Graph structure created! Storing in Neo4j...
Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection
driver.close()

# Comparison against ERAIL DB

In [163]:
erail_db = pd.read_excel("erail database.xlsx")
erail_db.head(5)

Unnamed: 0,Only received by email after ERAIL stopped to work,Report Type,Investigation Status,ERAIL Occurrence,Title,Reporting Body,Date of occurrence,Time of occurrence,Occurrence type,Occurrence description,...,Investigation report,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,,Final report,Closed,FI-135,"Wrong-side signalling failure, 29/06/2002, Kou...",Accident Investigation Board of Finland,2002-06-29,1900-01-01 08:33:00,Wrong-side signalling failure,Signal malfunctioning,...,,,,,,,,,,
1,,Final report,Closed,NL-444,"Train derailment, 30/04/2003, Station Apeldoor...",The Dutch Safety Board,2003-04-30,1900-01-01 00:42:00,Train derailment,Derailment of a freight train which was loaded...,...,,,,,,,,,,
2,,Final report,Closed,FI-134,"Other, 15.4.2004, Kaukomarkkinat Oy's track at...",Accident Investigation Board of Finland,2004-04-15,1900-01-01 17:31:00,Other,Three methanol carrying Russian tank wagons de...,...,,,,,,,,,,
3,,Final report,Closed,FI-45,"Train derailment, 5/11/2004, Pieksämäki Railwa...",Accident Investigation Board of Finland,2004-05-11,1900-01-01 12:57:00,Train derailment,Two wagons derailed,...,,,,,,,,,,
4,,Final report,Closed,NL-168,"Spad, 21/05/2004, Station Amsterdam (The Nethe...",The Dutch Safety Board,2004-05-21,1900-01-01 18:35:00,Spad,SPAD (an empty double decker train collided wi...,...,,,,,,,,,,


In [None]:
# Convert date column to datetime & adjust format
erail_db["Date of occurrence"] = erail_db["Date of occurrence"].dt.strftime("%d/%m/%Y")

# Convert time column to datetime & adjust format
erail_db["Time of occurrence"] = pd.to_datetime(erail_db["Time of occurrence"], errors='coerce')
erail_db["Time of occurrence"] = erail_db["Time of occurrence"].dt.strftime("%H:%M")

erail_db.head(5)

Unnamed: 0,Only received by email after ERAIL stopped to work,Report Type,Investigation Status,ERAIL Occurrence,Title,Reporting Body,Date of occurrence,Time of occurrence,Occurrence type,Occurrence description,...,Investigation report,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,,Final report,Closed,FI-135,"Wrong-side signalling failure, 29/06/2002, Kou...",Accident Investigation Board of Finland,29/06/2002,08:33,Wrong-side signalling failure,Signal malfunctioning,...,,,,,,,,,,
1,,Final report,Closed,NL-444,"Train derailment, 30/04/2003, Station Apeldoor...",The Dutch Safety Board,30/04/2003,00:42,Train derailment,Derailment of a freight train which was loaded...,...,,,,,,,,,,
2,,Final report,Closed,FI-134,"Other, 15.4.2004, Kaukomarkkinat Oy's track at...",Accident Investigation Board of Finland,15/04/2004,17:31,Other,Three methanol carrying Russian tank wagons de...,...,,,,,,,,,,
3,,Final report,Closed,FI-45,"Train derailment, 5/11/2004, Pieksämäki Railwa...",Accident Investigation Board of Finland,11/05/2004,12:57,Train derailment,Two wagons derailed,...,,,,,,,,,,
4,,Final report,Closed,NL-168,"Spad, 21/05/2004, Station Amsterdam (The Nethe...",The Dutch Safety Board,21/05/2004,18:35,Spad,SPAD (an empty double decker train collided wi...,...,,,,,,,,,,


In [None]:
# Create a copy of the results DataFrame
comparison_df = results_df[["pdf_name", "json_output"]].copy()

# Extract ID from the PDF name
comparison_df["ERAIL Occurrence"] = comparison_df["pdf_name"].str.extract(r'(IE-\d+)')

# Sample DataFrame (assuming json_output column contains dictionaries in string format)
comparison_df['json_output'] = comparison_df['json_output'].apply(json.loads)  # Convert JSON string to dictionary

# Function to extract node data
def extract_nodes(json_data):
    node_dict = {}
    for node in json_data.get("nodes", []):
        node_dict[f"gpt_{node['type']}"] = node["id"]  # Store ID based on type
    return pd.Series(node_dict)  # Convert dictionary to Series for easier DataFrame merging

# Apply the function to extract node data
nodes_df = comparison_df['json_output'].apply(extract_nodes)

# Merge extracted data into original DataFrame
comparison_df = pd.concat([comparison_df, nodes_df], axis=1)

# Drop the original json_output column if no longer needed
comparison_df.drop(columns=["json_output"], inplace=True)

# View comparison DataFrame
comparison_df

In [None]:
# Merge the comparison DataFrame with the ERAIL database
merged_df = comparison_df.merge(erail_db, on="ERAIL Occurrence", how='inner')
merged_df

In [None]:
# Compare GPT extracted date with the ERAIL database (source of truth)
merged_df["Date Match"] = np.where(merged_df["gpt_Date"] == merged_df["Date of occurrence"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Date", "Date of occurrence", "Date Match"]]

Unnamed: 0,ERAIL Occurrence,gpt_Date,Date of occurrence,Date Match
0,IE-10375,27/08/2021,27/08/2021,Match
1,IE-10397,07/12/2021,07/12/2021,Match
2,IE-10404,22/02/2023,22/02/2023,Match
3,IE-6218,11/01/2020,11/01/2020,Match
4,IE-6262,29/04/2020,29/04/2020,Match
5,IE-6291,24/05/2020,24/05/2020,Match
6,IE-6305,06/07/2020,07/07/2020,Mismatch


In [None]:
# Compare GPT extracted time with the ERAIL database (source of truth)
merged_df["Time Match"] = np.where(merged_df["gpt_Time"] == merged_df["Time of occurrence"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Time", "Time of occurrence", "Time Match"]]

Unnamed: 0,ERAIL Occurrence,gpt_Time,Time of occurrence,Time Match
0,IE-10375,00:18,00:20,Mismatch
1,IE-10397,15:59,16:05,Mismatch
2,IE-10404,07:56,11:45,Mismatch
3,IE-6218,10:52,,Mismatch
4,IE-6262,13:40,,Mismatch
5,IE-6291,12:13,,Mismatch
6,IE-6305,14:25,,Mismatch


In [None]:
# Compare GPT extracted country with the ERAIL database (source of truth)
merged_df["Country Match"] = np.where(merged_df["gpt_Country"] == merged_df["Country"], "Match", "Mismatch")
merged_df[["ERAIL Occurrence", "gpt_Country", "Country", "Country Match"]]

Unnamed: 0,ERAIL Occurrence,gpt_Country,Country,Country Match
0,IE-10375,Ireland,Ireland,Match
1,IE-10397,Ireland,Ireland,Match
2,IE-10404,Ireland,Ireland,Match
3,IE-6218,Ireland,Ireland,Match
4,IE-6262,Ireland,Ireland,Match
5,IE-6291,Ireland,Ireland,Match
6,IE-6305,Ireland,Ireland,Match
