In [None]:
python -m spacy download en_core_web_lg

In [2]:
# Load necessary libraries
import pdfplumber
import spacy
import json
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

In [3]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [4]:
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


In [13]:
# Load spaCy model (use an appropriate model or fine-tune for domain-specific data)
nlp = spacy.load("en_core_web_lg")

In [14]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Test PDF extraction
pdf_text = extract_text_from_pdf("../reports_ie/IE-6218-200111 Collision RRME Rosslare.pdf")

In [15]:
# Entity Extraction
def extract_entities(text):
    """Extract key entities using NLP."""
    doc = nlp(text)
    entities = {"ACCIDENT_DATE": [], "ACCIDENT_TIME": [], "ACCIDENT_LOCATION": [], "ACCIDENT_CAUSE": []}
    
    for ent in doc.ents:
        if ent.label_ in ["DATE"]:
            entities["ACCIDENT_DATE"].append(ent.text)
        elif ent.label_ in ["TIME"]:
            entities["ACCIDENT_TIME"].append(ent.text)
        elif ent.label_ in ["LOC"]:
            entities["ACCIDENT_LOCATION"].append(ent.text)
        elif ent.label_ in ["CAUSE"]:
            entities["ACCIDENT_CAUSE"].append(ent.text)
    
    return json.dumps(entities, indent=4)

In [16]:
# Test NLP entity extraction
entity_json = extract_entities(pdf_text)
print("Extracted Entities:", entity_json)

Extracted Entities: {
    "ACCIDENT_DATE": [
        "11th January 2020",
        "16th December 2020",
        "11th January 2020",
        "2005",
        "24 April 2020",
        "11th January 2020",
        "2020",
        "11th January 2020",
        "11th January 2020",
        "11th January 2020",
        "11th January 2020",
        "202004-01",
        "202004-02",
        "202004-03",
        "202004-04",
        "11th January 2020",
        "11th January 2020",
        "11th January 2020\nCausal",
        "11th January 2020",
        "2005",
        "2020",
        "January monthly",
        "the day",
        "11th January 2020",
        "11th January 2020",
        "11th January 2020",
        "11th January 2020",
        "the day",
        "the day",
        "22343",
        "11th January 2020",
        "1169/2010",
        "24th March\n2018",
        "four years",
        "23rd March 2018",
        "five\nyears",
        "twenty-four years",
        "the day",
        "t

In [10]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j."""
    data = json.loads(json_data)
    with driver.session(database=NEO4J_DATABASE) as session:
        for category, items in data.items():
            for item in set(items):
                session.run("""
                    MERGE (n:Entity {name: $name, category: $category})
                """, name=item, category=category)

In [11]:
# Store extracted entities into Neo4j
try:
    db_result = store_in_neo4j(entity_json)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Data stored in Neo4j successfully.


In [12]:
# Close Neo4j connection
driver.close()