In [34]:
pip install regex

Note: you may need to restart the kernel to use updated packages.


In [41]:
# Load necessary libraries
import pdfplumber
from openai import OpenAI
import json
import os
import regex as re
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

In [42]:
## Set the API key and model name
MODEL = "gpt-4o-mini"

# Load the OpenAI API key from the local key file. The file keeps precedence so
# existing setups resolve exactly the same key as before; the OPENAI_API_KEY
# environment variable is only a fallback when the file is missing or empty.
try:
    with open("gpt-personal-key.txt", "r", encoding="utf-8") as file:
        OPENAI_API_KEY = file.read().strip()
except FileNotFoundError:
    OPENAI_API_KEY = ""

if not OPENAI_API_KEY:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Fail here with a clear message rather than at the first API request.
if not OPENAI_API_KEY:
    raise ValueError(
        "No OpenAI API key found: create gpt-personal-key.txt or set OPENAI_API_KEY."
    )

client = OpenAI(api_key=OPENAI_API_KEY)

In [43]:
# Neo4j Connection Setup
# URI, user and database stay fixed on the local instance on purpose: later
# cells delete every node, so the target must not change implicitly.
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
# The password may be supplied through the environment so it does not have to
# be committed with the notebook; the previous value remains the default.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the same database the other cells use,
    # and consume the result so the round trip has completed before
    # success is reported.
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1").consume()
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [75]:
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database.

    Runs ``MATCH (n) DETACH DELETE n`` in a session opened on
    ``NEO4J_DATABASE`` through the module-level ``driver``. The operation is
    destructive: everything stored in that database is removed.

    Returns:
        None. A confirmation line is printed after the statement has completed.
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        # consume() waits until the server has finished the statement, so a
        # failed delete raises here instead of going unnoticed before the
        # success message is printed.
        session.run("MATCH (n) DETACH DELETE n").consume()
    print("Neo4j database cleared successfully.")

# Run the function to clear the database.
# WARNING: every execution of this cell deletes all nodes and relationships in
# NEO4J_DATABASE, so data stored by earlier runs is lost.
clear_neo4j_database()

Neo4j database cleared successfully.


In [45]:
# Extract the text
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of every page in page order, each page followed by a
        newline. A page whose ``extract_text()`` gives no text contributes
        only its newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (depending on the pdfplumber version); treat that as empty text
        # instead of failing on `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)

# Test PDF extraction: read the sample report from the working directory.
# pdf_text holds the full document text and is the input to the entity extraction cell.
pdf_text = extract_text_from_pdf("raiu_example.pdf")
print("Extracted Text Sample:", pdf_text[:500])  # Show only the first 500 characters to keep the output short

Extracted Text Sample: Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
Broken Rail near Emly, County Tipperary, 22nd February 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (R


In [59]:
def _strip_code_fence(raw):
    """Return ``raw`` without a surrounding Markdown code fence, if there is one."""
    # Handles both a bare ``` fence and a language-tagged one such as ```json.
    return re.sub(r'^```[A-Za-z]*\s*|\s*```$', '', raw.strip()).strip()


def extract_entities(text):
    """Extract key entities using GPT-4o-mini.

    Sends ``text`` to the chat completions endpoint of the module-level
    ``client`` (model ``MODEL``) and parses the reply as JSON.

    Args:
        text: Report text that is embedded verbatim in the prompt.

    Returns:
        dict: The parsed JSON object (requested keys: ``date``, ``location``,
        ``regulatory_body``). An empty dict is returned, after printing a
        diagnostic, when the reply has no text, is not valid JSON, or is valid
        JSON that is not an object.

    Raises:
        Exceptions raised by the OpenAI client call itself are not caught here.
    """
    prompt = f"""
    Extract key entities from the following accident report:
    {text}
    
    Provide the output in valid JSON format with categories:
    - date
    - location
    - regulatory_body
    Ensure that the response is only valid JSON and contains no other text or formatting.
    """

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing rail accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )

    response_text = completion.choices[0].message.content
    if response_text is None:
        # Without this guard the .strip() call would fail with AttributeError.
        print("OpenAI response contained no text content.")
        return {}

    # Remove markdown JSON formatting if present
    response_text = _strip_code_fence(response_text)

    try:
        entities = json.loads(response_text)  # Ensure valid JSON
    except json.JSONDecodeError as e:
        print("Error parsing JSON from OpenAI response:", str(e))
        print("Raw response:", response_text)
        return {}  # Return empty dictionary in case of failure

    if not isinstance(entities, dict):
        # The storage cells call dict methods on the result, so a JSON array
        # or scalar is treated like any other unusable reply.
        print("Expected a JSON object from OpenAI but got:", type(entities).__name__)
        return {}

    return entities

In [60]:
# Test GPT-4o-mini entity extraction.
# The complete pdf_text is passed in, and extract_entities embeds it in the prompt as-is.
# entity_json is the parsed result (an empty dict when the reply could not be parsed).
entity_json = extract_entities(pdf_text)
print("Extracted Entities:", json.dumps(entity_json, indent=4))

Extracted Entities: {
    "date": "22nd February 2023",
    "location": "Emly, County Tipperary, Ireland",
    "regulatory_body": "Railway Accident Investigation Unit (RAIU)"
}


In [74]:
# Compact single-line dump of the same entity_json shown indented above.
print(json.dumps(entity_json))

{"date": "22nd February 2023", "location": "Emly, County Tipperary, Ireland", "regulatory_body": "Railway Accident Investigation Unit (RAIU)"}


In [70]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Write every extracted value to Neo4j as an ``Entity`` node.

    Each key of ``json_data`` is used as the node's ``category``; the value
    (or each element, when the value is a list) becomes the node's ``name``.
    Nodes are written with ``MERGE``. Falsy input only prints a notice.

    NOTE(review): a later cell defines ``store_in_neo4j`` again, so after that
    cell has run this version is no longer the one being called.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    # The two query texts differ only in surrounding whitespace; both are
    # rebuilt here exactly as they were written for the list and scalar cases.
    merge_clause = "MERGE (n:Entity {name: $name, category: $category})"
    list_query = "\n" + " " * 24 + merge_clause + "\n" + " " * 20
    scalar_query = "\n" + " " * 20 + merge_clause + "\n" + " " * 16

    with driver.session(database=NEO4J_DATABASE) as db_session:
        for label, payload in json_data.items():
            # Treat a scalar as a one-element sequence so one loop serves both shapes.
            if isinstance(payload, list):
                cypher, names = list_query, payload
            else:
                cypher, names = scalar_query, [payload]
            for entry in names:
                db_session.run(cypher, name=entry, category=label)


In [76]:
# Store in Neo4j with Relationships
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j and create relationships.

    Creates (or reuses, via ``MERGE``) one ``Date``, one ``Location`` and one
    ``RegulatoryBody`` node and links them as
    ``(Date)-[:OCCURRED_AT]->(Location)-[:REGULATED_BY]->(RegulatoryBody)``.

    Args:
        json_data: Mapping that may contain the keys ``date``, ``location``
            and ``regulatory_body``. A key that is missing, null or empty is
            stored under the name ``"Unknown"``. Falsy input only prints a
            notice and writes nothing.

    Returns:
        None.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    # `or "Unknown"` also covers keys that are present with a null/empty value;
    # a plain .get() default would pass null through, and MERGE rejects null
    # property values.
    params = {
        key: json_data.get(key) or "Unknown"
        for key in ("date", "location", "regulatory_body")
    }

    with driver.session(database=NEO4J_DATABASE) as session:
        # One statement instead of five: nodes and relationships are written
        # together, so a failure cannot leave a partially built graph behind.
        # consume() makes sure the write has completed before returning.
        session.run("""
            MERGE (d:Date {name: $date})
            MERGE (l:Location {name: $location})
            MERGE (r:RegulatoryBody {name: $regulatory_body})
            MERGE (d)-[:OCCURRED_AT]->(l)
            MERGE (l)-[:REGULATED_BY]->(r)
        """, **params).consume()

In [77]:
# Store extracted entities into Neo4j
try:
    # store_in_neo4j returns nothing, so db_result is always None; the name is
    # kept in case other cells refer to it.
    db_result = store_in_neo4j(entity_json)
    # store_in_neo4j skips empty input and prints its own notice, so success is
    # only reported when there was something to write.
    if entity_json:
        print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection.
# NOTE(review): presumably the connection setup cell has to be re-run before any
# cell that uses `driver` is executed again — confirm against the driver docs.
driver.close()