In [10]:
pip install spacy[transformers]

Collecting spacy_transformers<1.4.0,>=1.1.2 (from spacy[transformers])
  Downloading spacy_transformers-1.3.8-cp312-cp312-win_amd64.whl.metadata (7.2 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.8.0 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading spacy_alignments-0.9.1-cp312-cp312-win_amd64.whl.metadata (2.7 kB)
Collecting filelock (from torch>=1.8.0->spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch>=1.8.0->spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (fr

In [11]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------------- ------- 10.5/12.8 MB 59.4 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 53.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load necessary libraries
import pdfplumber
import spacy
import json
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

In [3]:
# Neo4j Connection Setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection
    with driver.session() as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [4]:
def clear_neo4j_database():
    """Delete all nodes and relationships in the Neo4j database."""
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
clear_neo4j_database()

Neo4j database cleared successfully.


In [5]:
# Load spaCy model (use an appropriate model or fine-tune for domain-specific data)
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Test PDF extraction
pdf_text = extract_text_from_pdf("raiu_example.pdf")
print("Extracted Text Sample:", pdf_text[:500])  # Print first 500 characters

Extracted Text Sample: Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Broken Rail near Emly,
County Tipperary, 22nd February 2023
RAIU Investigation Report No: 2024-R002
Published: 22nd March 2024
Broken Rail near Emly, County Tipperary, 22nd February 2023
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed report remains with the RAIU by virtue of in Regulation 9 (7) of European Union
(EU) (Railway Safety) (R


In [7]:
# Entity Extraction
def extract_entities(text):
    """Extract key entities using NLP."""
    doc = nlp(text)
    entities = {"ACCIDENT_DETAILS": [], "RAILWAY_OPERATORS": [], "TECHNICAL_DETAILS": [], "REGULATORY_BODIES": []}
    
    for ent in doc.ents:
        if ent.label_ in ["DATE", "LOC", "CAUSE"]:
            entities["ACCIDENT_DETAILS"].append(ent.text)
        elif ent.label_ in ["ORG"]:
            entities["RAILWAY_OPERATORS"].append(ent.text)
        elif ent.label_ in ["PRODUCT", "FACILITY"]:
            entities["TECHNICAL_DETAILS"].append(ent.text)
        elif ent.label_ in ["LAW", "GPE"]:
            entities["REGULATORY_BODIES"].append(ent.text)
    
    return json.dumps(entities, indent=4)

In [8]:
# Test NLP entity extraction
entity_json = extract_entities(pdf_text)
print("Extracted Entities:", entity_json)

Extracted Entities: {
    "ACCIDENT_DETAILS": [
        "February 2023",
        "2024-R002",
        "22nd March 2024",
        "February 2023",
        "2020",
        "24 April 2020",
        "February 2023",
        "2020",
        "2020",
        "February 2023",
        "23rd February 2023",
        "February 2023",
        "ten years",
        "February 2023",
        "Tuesday 21st February 2023",
        "Tuesday the 21st February 2023",
        "Tuesday",
        "Wednesday",
        "February 2023",
        "Wednesday 22nd February 2023",
        "Thursday the 23rd February\n2023",
        "February 2023",
        "February 2023",
        "2024002-01",
        "February 2023",
        "February 2023",
        "Wednesday 22nd February 2023",
        "the day",
        "February 2023",
        "16th April 2019",
        "date 1st",
        "August 2019",
        "12th August\n2010",
        "7th February 2011",
        "February 2002",
        "2016",
        "2024002-02",
    

In [14]:
print(entity_json)

{
    "ACCIDENT_DETAILS": [
        "February 2023",
        "2024-R002",
        "22nd March 2024",
        "February 2023",
        "2020",
        "24 April 2020",
        "February 2023",
        "2020",
        "2020",
        "February 2023",
        "23rd February 2023",
        "February 2023",
        "ten years",
        "February 2023",
        "Tuesday 21st February 2023",
        "Tuesday the 21st February 2023",
        "Tuesday",
        "Wednesday",
        "February 2023",
        "Wednesday 22nd February 2023",
        "Thursday the 23rd February\n2023",
        "February 2023",
        "February 2023",
        "2024002-01",
        "February 2023",
        "February 2023",
        "Wednesday 22nd February 2023",
        "the day",
        "February 2023",
        "16th April 2019",
        "date 1st",
        "August 2019",
        "12th August\n2010",
        "7th February 2011",
        "February 2002",
        "2016",
        "2024002-02",
        "February 2023",

In [9]:
type(entity_json)

str

In [10]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j."""
    data = json.loads(json_data)
    with driver.session(database=NEO4J_DATABASE) as session:
        for category, items in data.items():
            for item in set(items):
                session.run("""
                    MERGE (n:Entity {name: $name, category: $category})
                """, name=item, category=category)

In [11]:
# Store extracted entities into Neo4j
try:
    db_result = store_in_neo4j(entity_json)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

Data stored in Neo4j successfully.


In [12]:
# Close Neo4j connection
driver.close()