# Package Loading

In [71]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl (894 kB)
   ---------------------------------------- 0.0/894.9 kB ? eta -:--:--
   --------------------------------------- 894.9/894.9 kB 39.5 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [72]:
# General 
import os

# PDFs
import pdfplumber
import json
import regex as re

# LLMs
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import tiktoken

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Instantiating GPT & Neo4j

In [2]:
## Set the API key and model name
MODEL="gpt-4o-mini"

# Load OpenAI API Key from requirements file
with open("gpt-personal-key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()

client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)

In [75]:
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


# Raw Data Extraction

### Whole Extraction

In [None]:
# Extract the text
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Test PDF extraction
pdf_text = extract_text_from_pdf("raiu_example.pdf")
print(pdf_text[:500])  # Print first 500 characters

### Pre-processed Extraction

In [47]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF while allowing for pre-processing.
    """
    text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    
    return text  # Returns a list where each item is a page's text

# Extract pages as a list
pdf_pages = extract_text_from_pdf("raiu_example.pdf")

# Print the first few pages to inspect where the TOC might be
for i, page in enumerate(pdf_pages[:5]):  # Check first 5 pages
    print(f"Page {i+1}:\n{page[:500]}\n{'-'*40}")


Page 1:
Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
----------------------------------------
Page 2:
Broken Rail near Emly, County Tipperary, 22nd February 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this rep
----------------------------------------
Page 3:
Broken Rail near Emly, County Tipperary, 22nd February 2023
Preface
The RAIU is an independent investigation unit within the Department of Transport which
conducts investigations into accidents and inc

In [53]:
def extract_text_omit_toc(pdf_path, toc_start=4, toc_end=5):
    """
    Extracts text from a PDF while skipping the Table of Contents.
    """
    text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            if toc_start <= i+1 <= toc_end:  # Skip TOC pages
                continue
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    
    return text

# Extract text without TOC
filtered_pdf_pages = extract_text_omit_toc("raiu_example.pdf")

# Join pages into a single text document
cleaned_text = "\n".join(filtered_pdf_pages)
print(cleaned_text[:10000])  # Preview the cleaned text


Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
Broken Rail near Emly, County Tipperary, 22nd February 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
be given.
Report structure
The report structure is written as closely as possible to 

In [52]:
def clean_text(text):
    """
    Cleans text by removing headers, footers, and empty lines.
    """
    lines = text.split("\n")
    cleaned_lines = []

    for line in lines:
        line = line.strip()

        # Remove page numbers (e.g., "Page 12" or "Railway Accident Investigation Unit  |  2023 Report")
        if re.match(r'^(Page \d+|Broken Rail near Emly, County Tipperary, 22nd February 2023|Railway Accident Investigation Unit)$', line):
            continue

        # Remove overly short lines (likely just noise)
        if len(line) < 10:
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

# Apply cleaning
final_cleaned_text = clean_text(cleaned_text)
print(final_cleaned_text[:1000])  # Preview cleaned text

Railway Accident
Investigation Unit
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
Report structure
The report structure is written as closely as possible to the structure set out in the “Commission
Implementation Regulation (EU) 2020/5

# Langchain Chunk Splitting

In [57]:
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Splits text into smaller overlapping chunks using LangChain's text splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Split the extracted text
text_chunks = split_text_into_chunks(final_cleaned_text)

# Print the number of chunks and a sample chunk
print(f"Total Chunks: {len(text_chunks)}")
print(f"Sample Chunk: {text_chunks[0]}")

Total Chunks: 50
Sample Chunk: Railway Accident
Investigation Unit
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
Report structure
The report structure is written as closely as possible to the structure set out in the “Commission
Implem

# Relevant Chunk Retrieval

### Keyword-based Retrieval

In [58]:
def find_relevant_chunks(chunks, keyword):
    """
    Returns chunks that contain a specific keyword.
    """
    relevant_chunks = [chunk for chunk in chunks if keyword.lower() in chunk.lower()]
    return relevant_chunks

# Example: Find chunks mentioning "location"
location_chunks = find_relevant_chunks(text_chunks, "location")

print(f"Found {len(location_chunks)} relevant chunks.")
print("Sample Chunk:\n", location_chunks[0] if location_chunks else "No relevant chunks found.")

Found 16 relevant chunks.
Sample Chunk:
 conditions may have led to a serious accident.
The RAIU may also carry out trend investigations where the occurrence is part of a group of
related occurrences that may or may not have warranted an investigation as individual
occurrences, but the apparent trend warrants investigation.
The RAIU investigation shall analyse the established facts and findings (i.e. performance of
operators, rolling stock and/or technical installations) which caused the occurrence. The
analyses shall then lead to the identification of the safety critical factors that caused or
otherwise contributed to the occurrence, including facts identified as precursors. An accident
or incident may be caused by causal, contributing and systemic factors which are equally
important and should be consider during the RAIU investigation. From this, the RAIU may
make safety recommendations in order to prevent accidents and incidents in the future and
improve railway safety.
It is not th

### Vector-based Retrieval

In [23]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [59]:
# Convert text chunks into FAISS vector database
vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

Stored 50 chunks in FAISS.


In [68]:
# Example: Find the most relevant chunk for a question
query_vector = "Train wagon"

In [69]:
def find_most_relevant_chunks(query, top_k=3):
    """
    Finds the most relevant text chunks using FAISS similarity search.
    """
    retrieved_chunks = vectorstore.similarity_search(query, k=top_k)
    return [chunk.page_content for chunk in retrieved_chunks]

top_chunks = find_most_relevant_chunks(query_vector, top_k=3)

relevant_text = "\n".join(top_chunks)

print(f"Most Relevant Chunks Combined:\n {relevant_text}")

Most Relevant Chunks Combined:
 February 2023 when the first Cork to Dublin passenger service resumed on the Up line.
Railway Accident Investigation Unit 12
Rolling Stock
52 Twenty-four passenger trains travelled over the welded section of rail, on the Up line, from
the 21st February until the time of the weld breaking on the 22nd February 2023; these
trains served Cork to Dublin and Tralee to Dublin. The trains involved were MkIV and
22000 InterCity Rail car (ICR).
53 The MkIV push/pull trains consist of a 201 Class locomotive, a catering vehicle, one 1st
class carriage, five standard class carriages and a generator car (see Figure 2). The
maximum service speed of the train formation is 100 mph (160 km/h).
Figure 2 – MkIV 201 Class Locomotive
54 The ICR 22000 series DMU (Rotem) consists of three car and four car sets (see Figure 3).
The maximum service speed of the train formation is 100 mph (160 km/h).
Figure 3 – 22000 InterCity Rail Cars
Railway Accident Investigation Unit 13
Signal

### Hybrid Search

In [70]:
def keyword_filter(chunks, keyword):
    """
    Filters chunks that contain a specific keyword.
    """
    return [chunk for chunk in chunks if keyword.lower() in chunk.lower()]

# First, get the top vector-based matches
top_chunks = find_most_relevant_chunks(query_vector, top_k=5)

# Then, filter out only those mentioning "location" or similar words
filtered_chunks = keyword_filter(top_chunks, query_vector)

# If no good keyword matches, just use the first vector match
final_chunk = filtered_chunks[0] if filtered_chunks else top_chunks[0]

print(f"Filtered Most Relevant Chunk:\n {final_chunk}")

Filtered Most Relevant Chunk:
 February 2023 when the first Cork to Dublin passenger service resumed on the Up line.
Railway Accident Investigation Unit 12
Rolling Stock
52 Twenty-four passenger trains travelled over the welded section of rail, on the Up line, from
the 21st February until the time of the weld breaking on the 22nd February 2023; these
trains served Cork to Dublin and Tralee to Dublin. The trains involved were MkIV and
22000 InterCity Rail car (ICR).
53 The MkIV push/pull trains consist of a 201 Class locomotive, a catering vehicle, one 1st
class carriage, five standard class carriages and a generator car (see Figure 2). The
maximum service speed of the train formation is 100 mph (160 km/h).
Figure 2 – MkIV 201 Class Locomotive
54 The ICR 22000 series DMU (Rotem) consists of three car and four car sets (see Figure 3).
The maximum service speed of the train formation is 100 mph (160 km/h).
Figure 3 – 22000 InterCity Rail Cars
Railway Accident Investigation Unit 13
Signall

## Token Count

In [74]:
def count_tokens(text, model="gpt-4o"):
    """
    Counts the number of tokens in a given text for a specified OpenAI model.
    """
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    return len(tokens)

In [91]:
def build_prompt(text):
    """
    Constructs the entity extraction prompt.
    """
    return f"""
    Extract key entities from the following accident report.

    Provide the output in valid JSON format with categories:
    - date
    - location
    - regulatory_body
    Ensure that the response is only valid JSON and contains no other text or formatting. Here's an example for you to follow:
    {{
        "date": "2023-02-22",
        "location": "Emly, County Tipperary",
        "regulatory_body": "Railway Accident Investigation Unit"
    }}
    If you cannot extract anything, please provide an empty JSON object.

    Here is the text to analyze: {text}
    """

In [92]:
def call_gpt(prompt, temperature=0.3):
    """
    Calls the GPT model with the structured prompt and returns the raw response.
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )
    
    response_text = completion.choices[0].message.content.strip()
    
    # Remove markdown JSON formatting if present
    response_text = re.sub(r'^```json\n?|```$', '', response_text).strip()
    
    return response_text

In [None]:
def extract_entities(text, token_limit=1000):
    """
    Extracts key entities from an accident report using GPT.
    - First, counts tokens and allows user decision.
    - If within limit, runs GPT and handles errors.
    """
    prompt = build_prompt(text)
    token_count = count_tokens(prompt)

    print(f"🔹 Token Count for Prompt: {token_count} (Limit: {token_limit})")

    # Allow user to decide if they want to proceed
    if token_count > token_limit:
        print("Token count is too high! Please reduce the chunk size or refine the prompt.")
        return None  # Stops execution here

    # Confirm before making the API call
    proceed = input("Do you want to proceed with extraction? (yes/no): ").strip().lower()
    if proceed != "yes":
        print("Extraction aborted by user.")
        return None  # Stops execution

    print("Sending request to GPT...")

    response_text = call_gpt(prompt)

    try:
        return json.loads(response_text)  # Ensure valid JSON
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", str(e))
        print("Storing raw response for review...")

        # Save the faulty response for debugging
        with open("failed_gpt_responses.json", "a") as file:
            json.dump({"input_text": text[:1000], "raw_output": response_text}, file, indent=4)
            file.write("\n")

        return {}  # Return empty dictionary in case of failure

In [94]:
entities = extract_entities(final_chunk)

print("Extracted Entities:", entities)

🔹 Token Count for Prompt: 584 (Limit: 1000)
⏳ Sending request to GPT...
Extracted Entities: {'date': '2023-02-22', 'location': 'Cork to Dublin', 'regulatory_body': 'Railway Accident Investigation Unit'}


# Entity Extraction

In [59]:
def extract_entities(text):
    """Extract key entities using GPT-4o-mini."""
    prompt = f"""
    Extract key entities from the following accident report:
    {text}
    
    Provide the output in valid JSON format with categories:
    - date
    - location
    - regulatory_body
    Ensure that the response is only valid JSON and contains no other text or formatting.
    """
    
    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are an expert in analyzing rail accident reports. Return output in JSON format only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        
        response_text = completion.choices[0].message.content.strip()
        
        # Remove markdown JSON formatting if present
        response_text = re.sub(r'^```json\n?|```$', '', response_text).strip()
        
        return json.loads(response_text)  # Ensure valid JSON
    
    except json.JSONDecodeError as e:
        print("Error parsing JSON from OpenAI response:", str(e))
        print("Raw response:", response_text)
        return {}  # Return empty dictionary in case of failure

In [60]:
# Test GPT-4o-mini entity extraction
entity_json = extract_entities(pdf_text)
print("Extracted Entities:", json.dumps(entity_json, indent=4))

Extracted Entities: {
    "date": "22nd February 2023",
    "location": "Emly, County Tipperary, Ireland",
    "regulatory_body": "Railway Accident Investigation Unit (RAIU)"
}


In [74]:
print(json.dumps(entity_json))

{"date": "22nd February 2023", "location": "Emly, County Tipperary, Ireland", "regulatory_body": "Railway Accident Investigation Unit (RAIU)"}


In [70]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j."""
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return
    
    with driver.session(database=NEO4J_DATABASE) as session:
        for category, item in json_data.items():  # Iterate over key-value pairs
            if isinstance(item, list):  # If it's a list, iterate over items
                for value in item:
                    session.run("""
                        MERGE (n:Entity {name: $name, category: $category})
                    """, name=value, category=category)
            else:  # If it's a single string, store it directly
                session.run("""
                    MERGE (n:Entity {name: $name, category: $category})
                """, name=item, category=category)


In [76]:
# Store in Neo4j with Relationships
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j and create relationships."""
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return
    
    with driver.session(database=NEO4J_DATABASE) as session:
        # Create nodes
        session.run("""
            MERGE (d:Date {name: $date})
        """, date=json_data.get("date", "Unknown"))
        
        session.run("""
            MERGE (l:Location {name: $location})
        """, location=json_data.get("location", "Unknown"))
        
        session.run("""
            MERGE (r:RegulatoryBody {name: $regulatory_body})
        """, regulatory_body=json_data.get("regulatory_body", "Unknown"))
        
        # Create relationships
        session.run("""
            MATCH (d:Date {name: $date}), (l:Location {name: $location})
            MERGE (d)-[:OCCURRED_AT]->(l)
        """, date=json_data.get("date", "Unknown"), location=json_data.get("location", "Unknown"))
        
        session.run("""
            MATCH (l:Location {name: $location}), (r:RegulatoryBody {name: $regulatory_body})
            MERGE (l)-[:REGULATED_BY]->(r)
        """, location=json_data.get("location", "Unknown"), regulatory_body=json_data.get("regulatory_body", "Unknown"))

In [77]:
# Store extracted entities into Neo4j
try:
    db_result = store_in_neo4j(entity_json)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection
driver.close()