# Package Loading

In [None]:
pip install requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [None]:
# General 
import os

# PDFs
import pdfplumber
import json
import regex as reg

# LLMs
from openai import OpenAI
import tiktoken
import faiss

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from pydantic import Field, BaseModel
from typing import List, Dict, Any, Optional

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Raw Data Extraction

### Whole Extraction

In [118]:
# Extract the text
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Test PDF extraction
pdf_text = extract_text_from_pdf("raiu_example_collision.pdf")
print(pdf_text[:500])  # Print first 500 characters

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7


### Pre-processed Extraction

In [119]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF while allowing for pre-processing.
    """
    text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    
    return text  # Returns a list where each item is a page's text

# Extract pages as a list
pdf_pages = extract_text_from_pdf("raiu_example_collision.pdf")

# Print the first few pages to inspect where the TOC might be
for i, page in enumerate(pdf_pages[:5]):  # Check first 5 pages
    print(f"Page {i+1}:\n{page[:500]}\n{'-'*40}")


Page 1:
Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
----------------------------------------
Page 2:
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form o
----------------------------------------
Page 3:
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Preface
The RAIU is an independent investigation unit within the Department of Transport which

In [120]:
def extract_text_omit_toc(pdf_path, toc_start=7, toc_end=9):
    """
    Extracts text from a PDF while skipping the Table of Contents.
    """
    text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            if toc_start <= i+1 <= toc_end:  # Skip TOC pages
                continue
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    
    return text

# Extract text without TOC
filtered_pdf_pages = extract_text_omit_toc("raiu_example_collision.pdf")

# Join pages into a single text document
cleaned_text = "\n".join(filtered_pdf_pages)
print(cleaned_text[:10000])  # Preview the cleaned text

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
be given.
Report structure
The report str

In [121]:
def clean_text(text):
    """
    Cleans text by removing headers, footers, and empty lines.
    """
    lines = text.split("\n")
    cleaned_lines = []

    for line in lines:
        line = line.strip()

        if re.match(r'^(Page \d+|Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023|Railway Accident Investigation Unit)$', line):
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

# Apply cleaning
final_cleaned_text = clean_text(cleaned_text)
print(final_cleaned_text[:1000])  # Preview cleaned text

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
be given.
Report structure
The report structure is written as close as possible to the structure set out in the “Commission
Imp

# Langchain Chunk Splitting

In [122]:
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Splits text into smaller overlapping chunks using LangChain's text splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Split the extracted text
text_chunks = split_text_into_chunks(final_cleaned_text)

# Print the number of chunks and a sample chunk
print(f"Total Chunks: {len(text_chunks)}")

Total Chunks: 88


# Relevant Chunk Retrieval

### Keyword-based Retrieval

In [None]:
# def find_relevant_chunks(chunks, keyword):
#     """
#     Returns chunks that contain a specific keyword.
#     """
#     relevant_chunks = [chunk for chunk in chunks if keyword.lower() in chunk.lower()]
#     return relevant_chunks

# # Example: Find chunks mentioning "location"
# location_chunks = find_relevant_chunks(text_chunks, "location")

# print(f"Found {len(location_chunks)} relevant chunks.")
# print("Sample Chunk:\n", location_chunks[0] if location_chunks else "No relevant chunks found.")

### Vector-based Retrieval

In [123]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [124]:
# Convert text chunks into FAISS vector store
vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

Stored 88 chunks in FAISS.


In [125]:
def find_most_relevant_chunks(entities, top_k=1):
    """
    Finds the most relevant text chunks for each entity of interest
    using FAISS similarity search and removes duplicates.
    
    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location", "regulatory_body"])
    - top_k (int): Number of chunks to retrieve per entity
    
    Returns:
    - unique_relevant_chunks (list): Deduplicated relevant chunks
    """
    retrieved_chunks = set()  # Use a set to avoid duplicate chunks

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Information about {entity}."
        found_chunks = vectorstore.similarity_search(query, k=top_k)

        for chunk in found_chunks:
            retrieved_chunks.add(chunk.page_content)  # Add chunk if not already present

    # Convert set back to a list and join into a single string
    unique_relevant_chunks = list(retrieved_chunks)
    combined_text = "\n".join(unique_relevant_chunks)

    print(f"Found {len(unique_relevant_chunks)} unique relevant chunks.")
    return combined_text

# Define entities of interest
entities_of_interest = ["date", "location", "regulatory_body"]

# Find & combine relevant chunks
relevant_text = find_most_relevant_chunks(entities_of_interest, top_k=1)

print(f"Most Relevant Chunks Combined:\n{relevant_text}")

Searching for entity: date
Searching for entity: location
Searching for entity: regulatory_body
Found 3 unique relevant chunks.
Most Relevant Chunks Combined:
board to remain in situ, and the horn must still be sounded. When no longer required a
whistle board will be removed”.
Railway Accident Investigation Unit 77
Safety Campaigns
266 IÉ-IM continue to promote level crossing safety, when suitable events to promote,
arise, examples are as follows:
• June 2024 – Nine level crossings were attended in recognition of International Level
Crossing Awareness Day (ILCAD). IÉ-IM coordinated this initiative with An Garda
Síochána Roads Policing Units and the RSA. All level crossing users were stopped and
engaged on the day, distributing safety information leaflets and answering any queries.
ILCAD is an annual event in which IÉ-IM will be supporting each year;
• June 2024 – IÉ-IM updated all of their UWLC booklets and refreshed the website to
highlight these updates;
• June 2024 – IÉ-IM targeted 

## Instantiating GPT

In [126]:
## Set the API key and model name
MODEL="gpt-4o-mini"

# Load OpenAI API Key from requirements file
with open("gpt-personal-key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()

client = OpenAI(api_key=OPENAI_API_KEY)

## Token Count

In [127]:
def count_tokens(text, model="gpt-4o"):
    """
    Counts the number of tokens in a given text for a specified OpenAI model.
    """
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    return len(tokens)

## Entity Extraction

In [128]:
class Property(BaseModel):
    """A single property consisting of key and value."""
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [129]:
def call_gpt(prompt, temperature=1):
    """
    Calls the GPT model with the structured prompt and returns the raw response.
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only. Very important: the `source` and `target` nodes in `rels` must be the same entities from the `nodes` list, and not different ones."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )
    
    response_text = completion.choices[0].message.content.strip()
    
    # Remove markdown JSON formatting if present
    response_text = re.sub(r'^```json\n?|```$', '', response_text).strip()
    
    return response_text

In [140]:
def build_prompt(text):
    """
    Constructs a structured prompt to extract entities and relationships for railway accidents.
    """
    return f"""
    Analyze the following railway accident report context and extract structured knowledge.

    Return a JSON object with:
    - `nodes`: A list of entities, specifically {entities_of_interest}.
    - `rels`: A list of relationships linking entities.

    Example JSON response:
    {{
        "nodes": [
            {{"id": "Train Derailment", "type": "AccidentType"}},
            {{"id": "23-12-2021", "type": "Date"}},
            {{"id": "Dublin, Ireland", "type": "Location"}},
            {{"id": "European Rail Agency", "type": "RegulatoryBody"}}
            
        ],
        "rels": [
            {{"source": "Train Derailment", "target": "Dublin, Ireland", "type": "occurred_at"}},
            {{"source": "Train Derailment", "target": "European Rail Agency", "type": "investigated_by"}}
        ]
    }}

    Text:
    {text}

    JSON:
    """

In [141]:
def extract_knowledge_graph(text, token_limit=4096):
    """
    Extracts structured knowledge graph (entities & relationships) from a railway accident report using GPT.
    - First, counts tokens and allows user decision.
    - If within limit, runs GPT and handles errors.
    """

    # Step 1: Build Structured Graph Extraction Prompt
    prompt = build_prompt(text)
    
    # Step 2: Count Tokens
    token_count = count_tokens(prompt)
    estimated_cost = token_count * 0.00000015  # Approximate OpenAI pricing

    print(f"Token Count for Prompt: {token_count} (Limit: {token_limit})")
    print(f"Estimated Cost for {MODEL}: ${estimated_cost:.7f}")

    # Step 3: Check Token Limit
    if token_count > token_limit:
        print("Token count is too high! Please reduce the chunk size or refine the prompt.")
        return None  # Stop execution here

    # Step 4: Confirm Execution
    proceed = input("Do you want to proceed with knowledge graph extraction? (yes/no): ").strip().lower()
    if proceed != "yes":
        print("Extraction aborted by user.")
        return None  # Stop execution

    print("Sending request to GPT...")

    # Step 5: Call GPT for Extraction
    response_text = call_gpt(prompt)

    # Step 6: Process Response & Handle JSON Errors
    try:
        extracted_graph = json.loads(response_text)  # Ensure valid JSON
        return extracted_graph  # Successfully parsed knowledge graph
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", str(e))
        print("Storing raw response for review...")

        # Save the faulty response for debugging
        with open("failed_graph_extractions.json", "a") as file:
            json.dump({"input_text": text[:1000], "raw_output": response_text}, file, indent=4)
            file.write("\n")

        return {}  # Return empty dictionary in case of failure

In [142]:
response_json = extract_knowledge_graph(relevant_text)
response_json

Token Count for Prompt: 1442 (Limit: 4096)
Estimated Cost for gpt-4o-mini: $0.0002163
Sending request to GPT...


{'nodes': [{'id': '9th of September 2023', 'type': 'Date'},
  {'id': 'Level Crossing XM190', 'type': 'Location'},
  {'id': 'Railway Accident Investigation Unit', 'type': 'RegulatoryBody'}],
 'rels': [{'source': '9th of September 2023',
   'target': 'Level Crossing XM190',
   'type': 'occurred_at'},
  {'source': '9th of September 2023',
   'target': 'Railway Accident Investigation Unit',
   'type': 'investigated_by'}]}

In [143]:
def props_to_dict(props) -> dict:
    """Converts properties to a dictionary for graph storage."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[p["key"]] = p["value"]
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Maps extracted entities to graph nodes."""
    properties = {"name": node.id}
    return BaseNode(
        id=node.id,
        type=node.type.capitalize(),
        properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Maps extracted relationships to graph edges."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}

    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

# Neo4j Storage

In [134]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [144]:
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


In [145]:
def convert_json_to_graph(response_json, source_text):
    """
    Converts extracted JSON into a graph-compatible format with nodes and relationships.
    """
    if not response_json:
        print("⚠ No valid data to convert to a graph.")
        return None

    # Convert Nodes
    graph_nodes = [map_to_base_node(Node(id=node["id"], type=node["type"])) for node in response_json["nodes"]]

    # Convert Relationships
    graph_rels = []
    for rel in response_json["rels"]:
        source_node = Node(id=rel["source"], type="Unknown")  # Temporary, type should be resolved
        target_node = Node(id=rel["target"], type="Unknown")  # Temporary
        graph_rels.append(map_to_base_relationship(Relationship(source=source_node, target=target_node, type=rel["type"])))

    # Create the structured GraphDocument with a source field
    return GraphDocument(nodes=graph_nodes, relationships=graph_rels, source=Document(page_content=source_text))


In [146]:
def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j.
    """
    with driver.session() as session:
        # Store nodes
        for node in graph_document.nodes:
            session.run("""
                MERGE (n:Entity {id: $id, type: $type})
                SET n.name = $name
            """, id=node.id, type=node.type, name=node.id)

        # Store relationships
        for rel in graph_document.relationships:
            session.run("""
                MATCH (s:Entity {id: $source})
                MATCH (t:Entity {id: $target})
                MERGE (s)-[:RELATIONSHIP {type: $type}]->(t)
            """, source=rel.source.id, target=rel.target.id, type=rel.type)

In [147]:
def process_railway_accident_report(text):
    
    print("🔹 Converting JSON to graph format...")
    graph_document = convert_json_to_graph(response_json, relevant_text)

    if graph_document:
        print("✅ Graph structure created! Storing in Neo4j...")
        store_in_neo4j(graph_document)

In [148]:
# Store extracted entities into Neo4j
try:
    db_result = process_railway_accident_report(response_json)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

🔹 Converting JSON to graph format...
✅ Graph structure created! Storing in Neo4j...
Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection
driver.close()