# Package Loading

In [177]:
pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.2.3 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [178]:
# General 
import os

# Data manipulation
import pandas as pd

# PDFs
import pdfplumber
import json
import regex as re

# LLMs
from openai import OpenAI
import tiktoken
import faiss

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from pydantic import Field, BaseModel
from typing import List, Dict, Any, Optional

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Raw Data Extraction

In [218]:
# Specify file name (name only: the extraction call below prefixes it with ./reports_ie/)
pdf_name = "IE-6262-200429 LC Collision XM240.pdf"

In [None]:
# def extract_text_from_pdf(pdf_path, header_lines=1):
#     """
#     Extracts text from a PDF, including removing headers from each page.
#     """
#     text = []
    
#     with pdfplumber.open(pdf_path) as pdf:
#         for i, page in enumerate(pdf.pages):
#             page_text = page.extract_text()
#             if page_text:
#                 text_lines = page_text.split("\n")
#                 text_without_header = "\n".join(text_lines[header_lines:])
#                 text.append(text_without_header)
    
#     return "\n".join(text) # Returns a single string with all pages' text

# # Apply extraction function
# pdf_text = extract_text_from_pdf(f"./reports_ie/{pdf_name}", header_lines=2)

In [None]:
# def extract_text_from_pdf(pdf_path, header_lines=1):
#     """
#     Extracts text from a PDF while allowing for pre-processing,
#     including removing headers from each page and skipping introduction pages.
#     """
#     text = []

#     skip_patterns = {
#         "roman_numerals": r"\b[i|ii|iii|iv|v|vi|vii|viii|ix|x]+\b",  # Detect Roman numerals (intro pages)
#         "table_of_contents": r"\b(contents)\b",  # Detects "Table of Contents" pages
#     }

#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             page_text = page.extract_text()
#             if page_text:
#                 text_lines = page_text.split("\n")
#                 text_without_header = "\n".join(text_lines[header_lines:])
                
#                 # Extract possible footer text (bottom 5 lines)
#                 footer_text = " ".join(text_lines[-5:]).strip().lower()
                
#                 # Check if the page contains any unwanted patterns
#                 if any(re.search(pattern, footer_text) or re.search(pattern, page_text.lower()) 
#                        for pattern in skip_patterns.values()):
#                     continue  # Skip unwanted pages
                
#                 # Append only valid report pages
#                 text.append(f"[Page {page.page_number}]\n{text_without_header}")
    
#     return "\n".join(text)  # Returns a single string containing the text of every kept page

# pdf_text = extract_text_from_pdf(f"./reports_ie/{pdf_name}", header_lines=2)
# print(pdf_text)

In [219]:
def extract_summary_section(pdf_path, header_lines=1):
    """
    Collects page text from the page headed 'Summary' up to the page headed 'Contents'.

    Args:
    - pdf_path: Path of the PDF, opened with pdfplumber.
    - header_lines (int): Number of leading lines dropped from every page before
      the page is inspected.

    Returns:
    - str: Every captured page rendered as "[Page N]", its remaining lines and a
      blank line. Empty when no page starts with 'Summary'. Scanning ends at the
      first page headed 'Contents', whether or not capturing has started.
    """
    captured_pages = []
    in_summary = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_text = page.extract_text()
            if not raw_text:
                continue  # nothing extractable on this page

            body_lines = raw_text.split("\n")[header_lines:]
            if not body_lines:
                continue  # the page held header lines only

            # The first line after the header decides what kind of page this is
            heading = body_lines[0].strip().lower()

            if re.match(r"^summary", heading):
                in_summary = True

            if re.match(r"^contents", heading):
                break

            if in_summary:
                captured_pages.append(
                    f"[Page {page.page_number}]\n" + "\n".join(body_lines) + "\n\n"
                )

    return "".join(captured_pages)

# Extract the summary pages of the selected report (one header line dropped per page) and show the text
pdf_text = extract_summary_section(f"./reports_ie/{pdf_name}", header_lines=1)
print(pdf_text)

[Page 4]
Summary
At approximately 13:40 hour (hrs) on the 29th April 2020, the 13:10 hrs passenger service from
Westport to Dublin (Train A809) was approaching Kilnageer Level Crossing (LC) XM240,
located approximately six kilometres (km) from Castlebar, County Mayo. At the same time a
car approached LC XM240 with the gates open (left open by a previous user) and began
travelling through LC XM240. When the driver of Train A809 (Driver A809) saw the car, he
made a full-service brake application; however, the train could not stop in time and struck the
car. Causal factors associated with this accident are:
• The Car Driver failed to stop to look for trains on approach to LC XM240 as required by
the Road Safety Authority’s (RSA) Rules of the Road, in part, as a result of the level
crossing gates being open;
approaching train.
A contributing factor to the accident was:
• There is a high level of misuse and abuse at LC XM240, where the level crossing gates
are continuously left open, despit

# Langchain Chunk Splitting

In [220]:
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Breaks `text` into overlapping pieces with LangChain's RecursiveCharacterTextSplitter.

    Args:
    - text (str): Text to split.
    - chunk_size (int): Passed to the splitter as `chunk_size`.
    - chunk_overlap (int): Passed to the splitter as `chunk_overlap`.

    Returns:
    - Whatever the splitter's `split_text` returns (the chunk strings).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Split the extracted text
text_chunks = split_text_into_chunks(pdf_text)

# Print the number of chunks and a sample chunk. The sample is guarded because an
# empty extraction (no 'Summary' page found) may leave no chunk to index into.
print(f"Total chunks: {len(text_chunks)}")
if text_chunks:
    print(f"First chunk:\n{text_chunks[0]}")
else:
    print("No chunks produced - check that the summary extraction returned text.")

Total chunks: 2
First chunk:
[Page 4]
Summary
At approximately 13:40 hour (hrs) on the 29th April 2020, the 13:10 hrs passenger service from
Westport to Dublin (Train A809) was approaching Kilnageer Level Crossing (LC) XM240,
located approximately six kilometres (km) from Castlebar, County Mayo. At the same time a
car approached LC XM240 with the gates open (left open by a previous user) and began
travelling through LC XM240. When the driver of Train A809 (Driver A809) saw the car, he
made a full-service brake application; however, the train could not stop in time and struck the
car. Causal factors associated with this accident are:
• The Car Driver failed to stop to look for trains on approach to LC XM240 as required by
the Road Safety Authority’s (RSA) Rules of the Road, in part, as a result of the level
crossing gates being open;
approaching train.
A contributing factor to the accident was:
• There is a high level of misuse and abuse at LC XM240, where the level crossing gates
are c

# Relevant Chunk Retrieval

### Vector-based Retrieval

In [221]:
# Define embeddings: the model handed to FAISS below to embed the text chunks
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [222]:
# Store text chunks into FAISS vector store.
# `vectorstore` is module-level state: find_most_relevant_chunks queries it directly.
vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

Stored 2 chunks in FAISS.


In [228]:
# Define entities of interest that you'd like to extract chunks for from the vector store.
# build_prompt also interpolates this list into the extraction prompt.
entities_of_interest = ["accident type", "date", "time", "country"]

# Function for extracting most relevant chunks from vector store
def find_most_relevant_chunks(entities, top_k):
    """
    Finds the most relevant text chunks for each entity of interest
    using FAISS similarity search and removes duplicates (if same chunk retrieved).

    Queries the module-level `vectorstore` once per entity.

    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location"])
    - top_k (int): Number of chunks to retrieve per entity

    Returns:
    - combined_text (str): Deduplicated chunks joined with newlines, in the order
      they were first retrieved (empty string when nothing was retrieved)
    """
    # A dict keeps insertion order, so duplicates are dropped while the combined
    # text stays identical from run to run (iteration order of a set of strings is not)
    retrieved_chunks = {}

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Information about {entity}."
        found_chunks = vectorstore.similarity_search(query, k=top_k)

        for chunk in found_chunks:
            retrieved_chunks.setdefault(chunk.page_content, None)  # first occurrence wins

    # Keys are the unique chunk texts; join them into a single string
    unique_relevant_chunks = list(retrieved_chunks)
    combined_text = "\n".join(unique_relevant_chunks)

    print(f"Found {len(unique_relevant_chunks)} unique relevant chunks.")
    return combined_text

# Find & combine relevant chunks (up to 3 hits requested per entity; result is one newline-joined string)
relevant_text = find_most_relevant_chunks(entities_of_interest, top_k=3)

print(f"\nMost Relevant Chunks Combined:\n{relevant_text}")

Searching for entity: accident type
Searching for entity: date
Searching for entity: time
Searching for entity: country
Found 2 unique relevant chunks.

Most Relevant Chunks Combined:
• 202101-01 – The RSA should update the “Rules of the Road” to include guidance on the
DSS;
• 202101-02 – Iarnród Éireann Infrastructure Manager (IÉ-IM) should update the ‘The SAFE
use of Unattended Railway Level Crossings’ booklet to include guidance on the DSS;
• 202101-03 – Iarnród Éireann Railway Undertaking (IÉ-RU) should put systems in place to
ensure ICR train horns meet the current standards for sound pressure levels;
• 202101-04 – The Commission for Railway Regulation (CRR) should review and update
Section 5, Level Crossings, of their Guidelines for the Design of Railway Infrastructure and
Rolling Stock, to ensure that guidance/reference on the DSS is included.
Railway Accident Investigation Unit iii
[Page 4]
Summary
At approximately 13:40 hour (hrs) on the 29th April 2020, the 13:10 hrs passenge

## Instantiating GPT

In [None]:
## Set the API key and model name
# MODEL is read by call_gpt and by the cost print-out in extract_knowledge_graph
MODEL="gpt-4o-mini"

# Load the OpenAI API key from a local text file in the working directory
# (the file holds a secret: keep it out of version control)
with open("gpt-personal-key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()

# Instantiate OpenAI client (module-level; call_gpt uses it)
client = OpenAI(api_key=OPENAI_API_KEY)

## Token Count

In [None]:
# Token counting helper
def count_tokens(text, model="gpt-4o"):
    """
    Returns how many tokens `text` encodes to with the tiktoken encoding chosen for `model`.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))

## Entity Extraction

In [None]:
class Property(BaseModel):
    """A single property consisting of key and value."""
    # Both fields are declared without a default (`...`) and typed as strings
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    # Re-declares `properties` as an optional list of Property items (default None).
    # NOTE(review): map_to_base_node reads only `id` and `type`, so values placed
    # here are not carried over to the node it builds.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    # Re-declares `properties` as an optional list of Property items (default None);
    # map_to_base_relationship passes a non-empty list through props_to_dict.
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    # Container pairing the node list with the relationship list; both are required.
    # NOTE(review): not instantiated in the cells visible here - extract_knowledge_graph
    # returns the parsed dict directly. Confirm whether validation through this model is intended.
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [None]:
def call_gpt(prompt, temperature=1):
    """
    Calls the GPT model with the structured prompt and returns the raw response.

    Uses the module-level `client` and `MODEL`.

    Args:
    - prompt (str): User message sent after the fixed system instruction.
    - temperature (float): Sampling temperature forwarded to the API.

    Returns:
    - str: The reply text with a surrounding Markdown code fence removed, or an
      empty string when the reply carries no text content.
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    # The message content is optional in the API response; normalise None to ""
    # so the string handling below cannot raise AttributeError
    response_text = (completion.choices[0].message.content or "").strip()

    # Remove an opening fence with or without the "json" tag, and the closing fence.
    # The previous pattern left a bare ``` opener in place, which then broke json.loads.
    response_text = re.sub(
        r'^```(?:json)?\s*|\s*```$', '', response_text, flags=re.IGNORECASE
    ).strip()

    return response_text

In [229]:
def build_prompt(text):
    """
    Constructs a structured prompt to extract entities and relationships for railway accidents.

    The module-level `entities_of_interest` list is interpolated into the
    instructions, followed by a worked JSON example and the report text.

    Args:
    - text (str): Report text to analyse; inserted verbatim under "Text:".

    Returns:
    - str: The complete prompt, ending with a "JSON:" cue for the model.
    """
    # The example below must itself be valid JSON (every node entry comma-separated),
    # since the model is told to follow it closely
    return f"""
    Analyze the following railway accident report context and extract structured knowledge.

    Return a JSON object with:
    - `nodes`: A list of entities, specifically {entities_of_interest}.
    - `rels`: A list of relationships linking entities.

    Look at this example JSON response and follow the schema closely. Pay attention to date and type formats (e.g., EU date format, 24-hour time).
    Ensure that the `source` and `target` nodes in `rels` are the same entities from the `nodes` list, and not different ones. And think about
    the relationships between the entities, i.e., (node AccidentType occurred_at node Country, or node AccidentType has_date Date).
    {{
        "nodes": [
            {{"id": "Train Derailment", "type": "AccidentType"}},
            {{"id": "23/12/2021", "type": "Date"}},
            {{"id": "16:32", "type": "Time"}},
            {{"id": "Ireland", "type": "Country"}},
            {{"id": "European Rail Agency", "type": "RegulatoryBody"}}
            
        ],
        "rels": [
            {{"source": "Train Derailment", "target": "Ireland", "type": "occurred_at"}},
            {{"source": "Train Derailment", "target": "European Rail Agency", "type": "investigated_by"}}
        ]
    }}

    Text:
    {text}

    JSON:
    """

In [230]:
def extract_knowledge_graph(text, token_limit=4096):
    """
    Runs the GPT-based entity and relationship extraction for one report text.

    The prompt is built and its token count and estimated cost are printed. The
    call is skipped when the count exceeds `token_limit` or when the user does
    not answer "yes" at the confirmation prompt.

    Returns:
    - dict parsed from the model's reply on success
    - None when the token limit is exceeded or the user declines
    - {} when the reply is not valid JSON (the raw reply is appended to
      failed_graph_extractions.json for review)
    """
    gpt_prompt = build_prompt(text)

    # Report size and approximate price before spending anything
    n_tokens = count_tokens(gpt_prompt)
    cost_usd = n_tokens * 0.00000015  # Approximate OpenAI pricing
    print(f"Token Count for Prompt: {n_tokens} (Limit: {token_limit})")
    print(f"Estimated Cost for {MODEL}: ${cost_usd:.7f}")

    if n_tokens > token_limit:
        print("Token count is too high! Please reduce the chunk size or refine the prompt.")
        return None

    # Explicit opt-in: anything other than "yes" aborts
    answer = input("Do you want to proceed with knowledge graph extraction? (yes/no): ")
    if answer.strip().lower() != "yes":
        print("Extraction aborted by user.")
        return None

    print("Sending request to GPT...")
    raw_reply = call_gpt(gpt_prompt)

    try:
        return json.loads(raw_reply)
    except json.JSONDecodeError as parse_error:
        print("Error parsing JSON:", str(parse_error))
        print("Storing raw response for review...")

        # Keep the unparseable reply (and the start of the input) for debugging
        failure_record = {"input_text": text[:1000], "raw_output": raw_reply}
        with open("failed_graph_extractions.json", "a") as log_file:
            json.dump(failure_record, log_file, indent=4)
            log_file.write("\n")

        return {}

In [231]:
# NOTE(review): the full summary text (pdf_text) is sent here rather than the
# FAISS-selected relevant_text built above - confirm which one is intended.
response_json = extract_knowledge_graph(pdf_text)
response_json

Token Count for Prompt: 885 (Limit: 4096)
Estimated Cost for gpt-4o-mini: $0.0001328
Sending request to GPT...


{'nodes': [{'id': 'Train Collision', 'type': 'AccidentType'},
  {'id': '29/04/2020', 'type': 'Date'},
  {'id': '13:40', 'type': 'Time'},
  {'id': 'Ireland', 'type': 'Country'}],
 'rels': [{'source': 'Train Collision',
   'target': 'Ireland',
   'type': 'occurred_at'},
  {'source': 'Train Collision', 'target': '29/04/2020', 'type': 'has_date'},
  {'source': 'Train Collision', 'target': '13:40', 'type': 'has_time'}]}

In [237]:
# File to store DataFrame
CSV_FILE = "pdf_processing_results.csv"

def append_pdf_json_result(pdf_name, response_json):
    """
    Records one extraction result in the results table stored at CSV_FILE.

    The table has one row per PDF and one "Iteration_N" column per stored run.
    A PDF that is already present gets the result written into its first unused
    iteration slot; a new PDF gets a new row starting at Iteration_1. A result
    identical to one already stored for that PDF is not stored again.

    Args:
    - pdf_name (str): Value matched against the "pdf_name" column.
    - response_json: JSON-serialisable extraction result.

    Returns:
    - pandas.DataFrame: The table after the update (unchanged, and not re-saved,
      when the result was a duplicate).
    """
    # Load existing CSV if available, otherwise create a new DataFrame
    if os.path.exists(CSV_FILE):
        df = pd.read_csv(CSV_FILE)
    else:
        df = pd.DataFrame(columns=["pdf_name"])

    # Convert JSON response to a string for storage
    json_output = json.dumps(response_json, indent=2)

    pdf_mask = df["pdf_name"] == pdf_name

    if pdf_mask.any():
        first_row = df.loc[pdf_mask].iloc[0]
        iteration_columns = [col for col in df.columns if col.startswith("Iteration_")]

        # Only compare against iterations this PDF actually has (other cells are NaN)
        stored_outputs = first_row[iteration_columns].dropna()
        if stored_outputs.eq(json_output).any():
            print("No changes in JSON across all iterations, skipping new entry.")
            return df  # Exit without adding a duplicate entry

        # Number iterations per PDF: take the first slot that is missing or empty
        # for this row, instead of counting columns that belong to other PDFs
        iteration_count = 1
        while (
            f"Iteration_{iteration_count}" in df.columns
            and pd.notna(first_row[f"Iteration_{iteration_count}"])
        ):
            iteration_count += 1

        df.loc[pdf_mask, f"Iteration_{iteration_count}"] = json_output
    else:
        # New PDF: no row to inspect, so nothing is indexed on the empty selection
        new_row = pd.DataFrame({"pdf_name": [pdf_name], "Iteration_1": [json_output]})
        df = pd.concat([df, new_row], ignore_index=True)

    # Save DataFrame back to CSV
    df.to_csv(CSV_FILE, index=False)

    return df

In [241]:
# Record this run's JSON for the current PDF (skipped when identical to a stored iteration) and show the table
results_df = append_pdf_json_result(pdf_name, response_json)
results_df

No changes in JSON across all iterations, skipping new entry.


Unnamed: 0,pdf_name,Iteration_1,Iteration_2
0,IE-6218-200111 Collision RRME Rosslare.pdf,"{\n ""nodes"": [\n {\n ""id"": ""11/01/202...",
1,IE-6262-200429 LC Collision XM240.pdf,"{\n ""nodes"": [\n {\n ""id"": ""29/04/202...","{\n ""nodes"": [\n {\n ""id"": ""Train Col..."


In [None]:
# Specify the target PDF name and iteration number
pdf_query = "IE-6262-200429 LC Collision XM240.pdf"
iteration_number = 1 

# Construct the column name dynamically
iteration_column = f"Iteration_{iteration_number}"

# Look up the stored value; stays None when the PDF or the iteration column is absent
extracted_json = None
if pdf_query in results_df["pdf_name"].values and iteration_column in results_df.columns:
    extracted_json = results_df.loc[results_df["pdf_name"] == pdf_query, iteration_column].iloc[0]

# A PDF with fewer iterations than the table has columns holds NaN in the unused
# cells; json.loads would raise TypeError on that, so only parse real strings
if isinstance(extracted_json, str):
    print(json.dumps(json.loads(extracted_json), indent=4))  # Print or return the JSON string
else:
    print(f"No data found for {pdf_query} in {iteration_column}")

{
    "nodes": [
        {
            "id": "29/04/2020",
            "type": "Date"
        },
        {
            "id": "13:40",
            "type": "Time"
        },
        {
            "id": "Ireland",
            "type": "Country"
        }
    ],
    "rels": [
        {
            "source": "29/04/2020",
            "target": "13:40",
            "type": "occurred_at"
        },
        {
            "source": "29/04/2020",
            "target": "Ireland",
            "type": "occurred_in"
        },
        {
            "source": "13:40",
            "target": "Ireland",
            "type": "occurred_in"
        }
    ]
}


In [None]:
def props_to_dict(props) -> dict:
    """
    Converts properties to a dictionary for graph storage.

    Args:
    - props: Iterable of key/value properties, or None. Each item may be a dict
      with "key" and "value" entries or an object exposing `.key` and `.value`
      (such as the Property model).

    Returns:
    - dict: Mapping of each key to its value; empty for None or an empty iterable.
      A repeated key keeps the last value seen.
    """
    properties = {}
    for p in props or []:
        if isinstance(p, dict):
            key, value = p["key"], p["value"]
        else:
            # Model instances are not subscriptable, so read the attributes instead
            key, value = p.key, p.value
        properties[key] = value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """
    Builds the plain graph node for an extracted entity.

    The id is reused as the `name` property and the type goes through
    `str.capitalize()`, which upper-cases the first character and lower-cases
    the rest (e.g. "AccidentType" is stored as "Accidenttype").
    """
    return BaseNode(
        id=node.id,
        type=node.type.capitalize(),
        properties={"name": node.id},
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """
    Builds the plain graph edge for an extracted relationship.

    Both endpoints are converted with map_to_base_node; the relationship's
    property list becomes a dict (empty when there are no properties).
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=props_to_dict(rel.properties) if rel.properties else {},
    )

# Neo4j Storage

In [None]:
# Neo4j Connection Setup
# Settings are taken from environment variables when present, so real credentials
# need not live in the notebook; the fallbacks are the previous local defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the same database the later cells use
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    print("Authentication failed. Check your credentials:", str(e))

In [None]:
def clear_neo4j_database():
    """Remove every node, and the relationships attached to it, from NEO4J_DATABASE."""
    wipe_query = "MATCH (n) DETACH DELETE n"
    with driver.session(database=NEO4J_DATABASE) as db_session:
        db_session.run(wipe_query)
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
# WARNING: this call runs whenever the cell is executed and deletes all nodes and relationships in NEO4J_DATABASE
clear_neo4j_database()

In [None]:
def convert_json_to_graph(response_json, source_text):
    """
    Converts extracted JSON into a graph-compatible format with nodes and relationships.

    Args:
    - response_json (dict): Parsed extraction. "nodes" items carry "id" and "type";
      "rels" items carry "source", "target" (both node ids) and "type". Either
      list may be missing and is then treated as empty.
    - source_text (str): Text stored as the source Document of the graph.

    Returns:
    - GraphDocument with the mapped nodes and relationships, or None when
      response_json is empty or None.
    """
    if not response_json:
        print("No valid data to convert to a graph.")
        return None

    extracted_nodes = response_json.get("nodes", [])
    extracted_rels = response_json.get("rels", [])

    # Convert Nodes
    graph_nodes = [map_to_base_node(Node(id=node["id"], type=node["type"])) for node in extracted_nodes]

    # Relationships name their endpoints by id only, so recover each endpoint's
    # type from the node list; ids that were never declared stay "Unknown"
    type_by_id = {node["id"]: node["type"] for node in extracted_nodes}

    # Convert Relationships
    graph_rels = []
    for rel in extracted_rels:
        source_node = Node(id=rel["source"], type=type_by_id.get(rel["source"], "Unknown"))
        target_node = Node(id=rel["target"], type=type_by_id.get(rel["target"], "Unknown"))
        graph_rels.append(map_to_base_relationship(Relationship(source=source_node, target=target_node, type=rel["type"])))

    # Create the structured GraphDocument with a source field
    return GraphDocument(nodes=graph_nodes, relationships=graph_rels, source=Document(page_content=source_text))


In [None]:
def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j.

    Uses the module-level `driver` and writes to NEO4J_DATABASE, the same
    database that clear_neo4j_database empties.

    Args:
    - graph_document: Object exposing `nodes` (each with `id` and `type`) and
      `relationships` (each with `source.id`, `target.id` and `type`).
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        # Store nodes: MERGE keeps re-runs from duplicating an (id, type) pair
        for node in graph_document.nodes:
            session.run("""
                MERGE (n:Entity {id: $id, type: $type})
                SET n.name = $name
            """, id=node.id, type=node.type, name=node.id)

        # Store relationships: endpoints are matched by id only, so an edge is
        # created only when both ids already exist as Entity nodes
        for rel in graph_document.relationships:
            session.run("""
                MATCH (s:Entity {id: $source})
                MATCH (t:Entity {id: $target})
                MERGE (s)-[:RELATIONSHIP {type: $type}]->(t)
            """, source=rel.source.id, target=rel.target.id, type=rel.type)

In [None]:
def process_railway_accident_report(text, source_text=None):
    """
    Converts an extracted knowledge graph into graph objects and stores it in Neo4j.

    Args:
    - text (dict): The parsed extraction JSON (nodes/rels). The parameter keeps
      its original name; the notebook passes `response_json` here.
    - source_text (str, optional): Text attached to the graph as its source.
      Defaults to the module-level `relevant_text`.

    Returns:
    - The graph document that was stored, or None when there was nothing to convert.
    """
    if source_text is None:
        source_text = relevant_text

    print("Converting JSON to graph format...")
    # Work on the argument, not on the notebook-global response_json, so the
    # function stores what it was given even if that global has changed since
    graph_document = convert_json_to_graph(text, source_text)

    if graph_document:
        print("Graph structure created! Storing in Neo4j...")
        store_in_neo4j(graph_document)

    return graph_document

In [None]:
# Store extracted entities into Neo4j
# NOTE(review): the success message is printed whenever no exception is raised,
# which includes the case where the conversion step had nothing to store.
try:
    db_result = process_railway_accident_report(response_json)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

In [None]:
# Close Neo4j connection (run last; cells using `driver` afterwards presumably need the connection cell re-run)
driver.close()