# Package Loading

In [None]:
# pip install tiktoken

In [1]:
# General 
import os

# PDFs
import pdfplumber
import json
import regex as re

# LLMs
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import tiktoken

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Instantiating GPT & Neo4j

In [2]:
## Model name and OpenAI client setup
MODEL = "gpt-4o-mini"

# Read the OpenAI API key from a local text file
with open("gpt-personal-key.txt", "r") as key_file:
    OPENAI_API_KEY = key_file.read().strip()

# Client used by the chat-completion call further down
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Neo4j Connection Setup
# Each setting can be overridden with an environment variable of the same
# name; the literals are the defaults used when the variable is not set.
from neo4j.exceptions import ServiceUnavailable

NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the same database the later cells use
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))
except ServiceUnavailable as e:
    # Server not running / wrong URI: report it instead of dumping a traceback
    print("Could not reach Neo4j. Check that the server is running and the URI is correct:", str(e))

In [None]:
def clear_neo4j_database():
    """Wipe the graph: remove every node together with its relationships."""
    wipe_query = "MATCH (n) DETACH DELETE n"
    with driver.session(database=NEO4J_DATABASE) as db_session:
        db_session.run(wipe_query)
    print("Neo4j database cleared successfully.")

# Empty the database before loading new data
clear_neo4j_database()

# Raw Data Extraction

### Whole Extraction

In [3]:
# Extract the text
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: The text of each page followed by a newline, in page order.
            A page that yields no text contributes only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back empty/None for a page without text;
        # fall back to "" so the concatenation cannot raise TypeError.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # Join once instead of growing a string with += inside the loop
    return "".join(page_texts)

# Test PDF extraction
pdf_text = extract_text_from_pdf("raiu_example_collision.pdf")
print(pdf_text[:500])  # Print first 500 characters

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7


### Pre-processed Extraction

In [8]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF page by page so the pages can be pre-processed individually.

    Pages whose extracted text is empty are left out of the result.
    """
    with pdfplumber.open(pdf_path) as pdf:
        extracted = (page.extract_text() for page in pdf.pages)
        pages = [page_text for page_text in extracted if page_text]

    return pages  # One list entry per page that produced text

# Extract pages as a list
pdf_pages = extract_text_from_pdf("raiu_example_collision.pdf")

# Show the start of the first 5 pages to see where the TOC might be
for i, page in enumerate(pdf_pages[:5]):
    print(f"Page {i+1}:\n{page[:500]}\n{'-'*40}")


Page 1:
Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
----------------------------------------
Page 2:
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form o
----------------------------------------
Page 3:
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Preface
The RAIU is an independent investigation unit within the Department of Transport which

In [10]:
def extract_text_omit_toc(pdf_path, toc_start=7, toc_end=9):
    """
    Collect the text of each PDF page, leaving out the Table of Contents.

    Pages numbered toc_start..toc_end (1-based, inclusive) are skipped, as are
    pages whose extracted text is empty. Returns a list of page texts.
    """
    kept_pages = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            is_toc_page = toc_start <= page_number <= toc_end
            if not is_toc_page:
                page_text = page.extract_text()
                if page_text:
                    kept_pages.append(page_text)

    return kept_pages

# Extract text without TOC
filtered_pdf_pages = extract_text_omit_toc("raiu_example_collision.pdf")

# Merge the remaining pages into one newline-separated document
cleaned_text = "\n".join(filtered_pdf_pages)
print(cleaned_text[:10000])  # Preview the cleaned text

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
be given.
Report structure
The report str

In [11]:
def clean_text(text):
    """
    Cleans text by removing headers, footers, and empty lines.

    Every line is stripped of surrounding whitespace. A line is dropped when
    it is empty after stripping, or when it consists solely of one of this
    report's boilerplate strings: a "Page <n>" marker, the running report
    title, or "Railway Accident Investigation Unit".

    Args:
        text (str): Raw text whose lines are separated by newline characters.

    Returns:
        str: The remaining stripped lines joined with newline characters.
    """
    # Compiled once per call rather than looked up for every line. The ^...$
    # anchors mean a body line that merely mentions one of these phrases is kept.
    boilerplate = re.compile(
        r'^(Page \d+|Collision between a car and a train at Level Crossing XM190, Mayo, 9th September 2023|Railway Accident Investigation Unit)$'
    )

    cleaned_lines = []
    for line in text.split("\n"):
        line = line.strip()

        # Drop blank lines (as the docstring promises) and header/footer lines
        if not line or boilerplate.match(line):
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

# Apply cleaning to the TOC-free text
final_cleaned_text = clean_text(cleaned_text)
print(final_cleaned_text[:1000])  # Preview the first 1000 characters of the cleaned text

Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between a car and a train at
Level Crossing XM190, Mayo, 9th September 2023
RAIU Investigation Report No: 2024-R003
Published: 12/12/2024
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of Regulation 9 (7) of European Union
(EU) (Railway Safety) (Reporting and Investigation of Serious Accidents, Accidents and
Incidents) Regulations 2020 (S.I. 430 of 2020). No person may produce, reproduce or transmit
in any form or by any means this report or any part thereof without the express permission of
the RAIU. This report may be freely used for educational purposes.
Where the report has been altered following its original publication, details on the changes will
be given.
Report structure
The report structure is written as close as possible to the structure set out in the “Commission
Imp

# Langchain Chunk Splitting

In [33]:
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Break a long text into overlapping pieces with LangChain's
    RecursiveCharacterTextSplitter.

    chunk_size and chunk_overlap are forwarded unchanged to the splitter;
    the value returned is whatever the splitter's split_text() produces.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Split the cleaned text into overlapping chunks (default chunk size and overlap)
text_chunks = split_text_into_chunks(final_cleaned_text)

# Print the number of chunks
print(f"Total Chunks: {len(text_chunks)}")

Total Chunks: 88


# Relevant Chunk Retrieval

### Keyword-based Retrieval

In [34]:
# def find_relevant_chunks(chunks, keyword):
#     """
#     Returns chunks that contain a specific keyword.
#     """
#     relevant_chunks = [chunk for chunk in chunks if keyword.lower() in chunk.lower()]
#     return relevant_chunks

# # Example: Find chunks mentioning "location"
# location_chunks = find_relevant_chunks(text_chunks, "location")

# print(f"Found {len(location_chunks)} relevant chunks.")
# print("Sample Chunk:\n", location_chunks[0] if location_chunks else "No relevant chunks found.")

### Vector-based Retrieval

In [14]:
# Embedding model handed to FAISS below to embed the text chunks
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build a FAISS vector store from the text chunks, embedding them with `embeddings`
vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

Stored 88 chunks in FAISS.


In [None]:
def find_most_relevant_chunks(entities, top_k=1, store=None):
    """
    Finds the most relevant text chunks for each entity of interest
    using similarity search and removes duplicates.

    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location", "regulatory_body"])
    - top_k (int): Number of chunks to retrieve per entity
    - store: Object exposing similarity_search(query, k=...) whose results carry
      a page_content attribute. Defaults to the notebook-level `vectorstore`.

    Returns:
    - combined_text (str): The deduplicated chunks joined with newlines, in the
      order in which they were first retrieved ("" when nothing is found)
    """
    if store is None:
        store = vectorstore  # Notebook-level FAISS index

    # Dict keys deduplicate while keeping first-seen order; a set would make the
    # order of the combined text (and therefore the prompt) vary between runs.
    retrieved_chunks = {}

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Information about {entity}."
        for chunk in store.similarity_search(query, k=top_k):
            retrieved_chunks.setdefault(chunk.page_content, None)

    unique_relevant_chunks = list(retrieved_chunks)
    print(f"Found {len(unique_relevant_chunks)} unique relevant chunks.")
    return "\n".join(unique_relevant_chunks)

# Define entities of interest (the same names are listed in the extraction prompt)
entities_of_interest = ["date", "location", "regulatory_body"]

# Retrieve the best-matching chunk per entity and combine them into one string
relevant_text = find_most_relevant_chunks(entities_of_interest, top_k=1)

# print(f"Most Relevant Chunks Combined:\n{relevant_text}")

## Token Count

In [19]:
def count_tokens(text, model="gpt-4o"):
    """
    Return how many tokens `text` occupies under the tiktoken encoding
    associated with the given OpenAI model.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))

## Entity Extraction

In [39]:
# Example of the object the model is asked to return, one key per entity
schema_example = {
    "date": "2023-02-22",
    "location": "Emly, County Tipperary",
    "regulatory_body": "Railway Accident Investigation Unit",
}

print(f"Your entities of interest are: \n{entities_of_interest}\n")
print(f"Your schema example is:\n{schema_example}")

Your entities of interest are: 
['date', 'location', 'regulatory_body']

Your schema example is:
{'date': '2023-02-22', 'location': 'Emly, County Tipperary', 'regulatory_body': 'Railway Accident Investigation Unit'}


In [40]:
def build_prompt(text):
    """
    Constructs the entity extraction prompt.

    The prompt lists the categories held in the notebook-level
    `entities_of_interest` and shows the notebook-level `schema_example`
    as the expected output shape.

    Args:
        text (str): Report text placed at the end of the prompt.

    Returns:
        str: The complete prompt.
    """
    # Serialise the example with json.dumps: interpolating the dict directly
    # prints Python's repr (single-quoted keys/values), which is not the valid
    # JSON the prompt asks the model to imitate.
    example_json = json.dumps(schema_example, ensure_ascii=False)
    return f"""
    Extract key entities from the following accident report.

    Provide the output in valid JSON format with categories {entities_of_interest}. Ensure that the response is only valid JSON and 
    contains no other text or formatting. Here's an example for you to follow:
    {example_json}
    If you cannot extract anything, please provide an empty JSON object.

    Here is the text to analyze: {text}
    """

In [41]:
def call_gpt(prompt, temperature=0.3):
    """
    Calls the GPT model with the structured prompt and returns the raw response.

    Args:
        prompt (str): User message sent to the model.
        temperature (float): Sampling temperature forwarded to the API.

    Returns:
        str: The reply text with surrounding whitespace removed and an
            enclosing Markdown code fence (with or without a "json" tag)
            stripped; "" when the reply carries no text content.
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    # content may be None (the API declares it nullable); treat that as an
    # empty reply instead of crashing on None.strip()
    response_text = (completion.choices[0].message.content or "").strip()

    # Remove markdown code fences if present. The opening fence is matched with
    # or without the "json" tag, so a bare ``` fence no longer survives.
    response_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', response_text, flags=re.IGNORECASE).strip()

    return response_text

In [42]:
def extract_entities(text, token_limit=4096):
    """
    Extracts key entities from an accident report using GPT.

    Steps:
    - Builds the prompt and prints its token count and estimated prompt cost.
    - Returns None without calling the API when the count exceeds token_limit.
    - Asks for confirmation on stdin; any answer other than "yes" returns None.
    - Sends the prompt and parses the reply as JSON.

    Args:
        text (str): Report text to analyse.
        token_limit (int): Maximum number of prompt tokens allowed.

    Returns:
        The parsed JSON value on success; {} when the reply is not valid JSON
        (the raw reply is appended to failed_gpt_responses.json); None when
        no request was sent.
    """
    prompt = build_prompt(text)
    token_count = count_tokens(prompt)

    # Per-token prompt price in USD used for the estimate
    # NOTE(review): hard-coded rate; confirm it matches current pricing for MODEL
    price_per_token = 0.00000015

    print(f"Token Count for Prompt: {token_count} (Limit: {token_limit})")
    # Fixed precision avoids float noise such as 0.00020219999999999998
    print(f"Prompt Costs for model {MODEL}: ${token_count * price_per_token:.6f}")

    # Refuse prompts that exceed the limit
    if token_count > token_limit:
        print("Token count is too high! Please reduce the chunk size or refine the prompt.")
        return None  # Stops execution here

    # Confirm before making the API call
    proceed = input("Do you want to proceed with extraction? (yes/no): ").strip().lower()
    if proceed != "yes":
        print("Extraction aborted by user.")
        return None  # Stops execution

    print("Sending request to GPT...")

    response_text = call_gpt(prompt)

    try:
        return json.loads(response_text)  # Ensure valid JSON
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", str(e))
        print("Storing raw response for review...")

        # Save the faulty response for debugging: one compact JSON object per
        # line, so the file stays machine-readable however many are appended.
        record = {"input_text": text[:1000], "raw_output": response_text}
        with open("failed_gpt_responses.json", "a", encoding="utf-8") as log_file:
            log_file.write(json.dumps(record, ensure_ascii=False))
            log_file.write("\n")

        return {}  # Return empty dictionary in case of failure

In [44]:
# Run the extraction on the retrieved chunks (asks for confirmation before calling the API)
entities = extract_entities(relevant_text)

print("Extracted Entities:", entities)

Token Count for Prompt: 1348 (Limit: 4096)
Prompt Costs for model gpt-4o-mini: $0.00020219999999999998
Sending request to GPT...
Extracted Entities: {'date': '2023-09-09', 'location': 'level crossing XM190', 'regulatory_body': 'Railway Accident Investigation Unit'}


In [24]:
# Show the extracted entities serialised as a JSON string
print(json.dumps(entities))

{"date": "2023-09-09", "location": "level crossing XM190", "regulatory_body": "Railway Accident Investigation Unit"}


# Neo4j Storage

In [None]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j as generic Entity nodes.

    Each value is merged as an `Entity` node whose `name` is the value and
    whose `category` is the key it was stored under. A list value yields one
    node per element. None values are skipped.

    NOTE: a later cell defines another `store_in_neo4j`; once that cell has
    run, it replaces this definition.

    Args:
        json_data (dict): Mapping of category name to a value or list of values.

    Returns:
        None. Prints a notice and does nothing when json_data is empty.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    # Single copy of the query, shared by scalar and list values
    merge_entity = "MERGE (n:Entity {name: $name, category: $category})"

    with driver.session(database=NEO4J_DATABASE) as session:
        for category, item in json_data.items():  # Iterate over key-value pairs
            # Treat a single value as a one-element list so both shapes share one path
            values = item if isinstance(item, list) else [item]
            for value in values:
                if value is None:
                    # MERGE cannot use a null property value; skip instead of failing
                    continue
                session.run(merge_entity, name=value, category=category)


In [None]:
# Store in Neo4j with Relationships
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j and create relationships.

    Merges one Date, one Location and one RegulatoryBody node and links them
    as (Date)-[:OCCURRED_AT]->(Location)-[:REGULATED_BY]->(RegulatoryBody).

    Args:
        json_data (dict): Extracted entities; the keys "date", "location" and
            "regulatory_body" are read. A key that is missing, null or empty
            is stored as "Unknown".

    Returns:
        None. Prints a notice and does nothing when json_data is empty.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    # Resolve each value once. `or` also covers an explicit null/empty value,
    # which .get()'s default would let through and MERGE would then reject.
    params = {
        key: json_data.get(key) or "Unknown"
        for key in ("date", "location", "regulatory_body")
    }

    with driver.session(database=NEO4J_DATABASE) as session:
        # One statement instead of five separate ones: nodes and relationships
        # are written together, so a failure cannot leave a half-built graph.
        session.run("""
            MERGE (d:Date {name: $date})
            MERGE (l:Location {name: $location})
            MERGE (r:RegulatoryBody {name: $regulatory_body})
            MERGE (d)-[:OCCURRED_AT]->(l)
            MERGE (l)-[:REGULATED_BY]->(r)
        """, params)

In [None]:
# Store extracted entities into Neo4j
try:
    # `entities` is the result of extract_entities(); the previous name
    # `entity_json` was never defined, so this cell always hit a NameError
    # that the broad except below reported as a storage failure.
    db_result = store_in_neo4j(entities)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

In [None]:
# Close the Neo4j driver created in the connection-setup cell
driver.close()