# Package Loading

In [None]:
pip install pandas

In [None]:
# Standard Library
import json
import os
import re

# Data Manipulation
import pandas as pd

# PDFs
import pdfplumber

# LLMs
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.graphs.graph_document import (
    GraphDocument,
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain_huggingface import HuggingFaceEmbeddings
from openai import OpenAI
import tiktoken  # If used

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError

# Typing & Validation
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional

# Raw Data Extraction

In [None]:
# NOTE(review): `results_df` is only assigned further down, by the
# `append_pdf_json_result(...)` example call. On a fresh top-to-bottom run this
# line raises NameError; presumably it is left over from out-of-order execution.
results_df

In [None]:
# File name of the report to process. It is appended to "./reports_ie/" when the
# summary is extracted and is also the row key in the results CSV.
pdf_name = "IE-10397 - 211207 Clontarf.pdf"

In [None]:
SUMMARY_PATTERN = re.compile(r"^summary", re.IGNORECASE)
CONTENTS_PATTERN = re.compile(r"^contents", re.IGNORECASE)

# Pull the summary section out of an Ireland incident report
def extract_summary_section(pdf_path, header_lines=1):
    """
    Collects the text of every page from the one whose first line (after the header)
    starts with 'Summary' up to, but not including, the page whose first line starts
    with 'Contents'.

    Args:
    - pdf_path (str): Path of the PDF to open with pdfplumber
    - header_lines (int): Number of leading lines dropped from every page (the repeated header)

    Returns:
    - str: Captured pages, each prefixed with "[Page N]" and followed by a blank line;
      an empty string when no summary page was found
    """
    captured_pages = []
    in_summary = False  # Becomes True once the 'Summary' page has been seen

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_text = page.extract_text()
            if not raw_text:
                continue  # Page without extractable text

            body_lines = raw_text.split("\n")[header_lines:]
            if not body_lines:
                continue  # Nothing left once the header is removed

            heading = body_lines[0].strip().lower()

            if SUMMARY_PATTERN.match(heading):
                in_summary = True

            # A 'Contents' page ends the scan, whether or not capturing has started
            if CONTENTS_PATTERN.match(heading):
                break

            if in_summary:
                captured_pages.append(
                    f"[Page {page.page_number}]\n" + "\n".join(body_lines) + "\n\n"
                )

    return "".join(captured_pages)

# Run the extraction on the selected report (read from ./reports_ie/), dropping
# one header line per page, and print the result for inspection.
pdf_text = extract_summary_section(f"./reports_ie/{pdf_name}", header_lines=1)
print(pdf_text)

# Langchain Chunk Splitting

In [None]:
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Breaks `text` into overlapping pieces with LangChain's RecursiveCharacterTextSplitter.

    Args:
    - text (str): Text to split
    - chunk_size (int): Passed to the splitter as `chunk_size`
    - chunk_overlap (int): Passed to the splitter as `chunk_overlap`

    Returns:
    - The result of `split_text(text)` (the chunks)
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)

# Split the extracted text
text_chunks = split_text_into_chunks(pdf_text)

# Print the number of chunks and a sample chunk.
# The sample is guarded: when no summary text was extracted the chunk list is
# empty, and indexing [0] would raise IndexError.
print(f"Total chunks: {len(text_chunks)}")
if text_chunks:
    print(f"First chunk:\n{text_chunks[0]}")
else:
    print("No chunks were produced - check that the summary section was extracted.")

# Relevant Chunk Retrieval

### Vector-based Retrieval

In [None]:
# Embedding model (loaded by name through HuggingFaceEmbeddings) used to
# vectorise the text chunks when they are stored in the FAISS vector store.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Store text chunks into FAISS vector store
# The LangChain wrapper class `FAISS` is a different object from the low-level
# `faiss` module imported at the top of the notebook, so it is imported here;
# without this import the line below raises NameError.
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_texts(text_chunks, embeddings)

print(f"Stored {len(text_chunks)} chunks in FAISS.")

In [None]:
# Entities to extract. Used twice: as similarity-search queries against the
# vector store, and interpolated into the GPT prompt by build_prompt().
entities_of_interest = ["accident type", "date", "time", "country"]

# Function for extracting most relevant chunks from vector store
def find_most_relevant_chunks(entities, top_k, store=None):
    """
    Finds the most relevant text chunks for each entity of interest using
    similarity search, removes duplicates (if the same chunk is retrieved more
    than once) and joins the result into one string.

    Chunks are kept in first-retrieved order, so the combined text (and any
    prompt built from it) is the same on every run. A plain set would order
    the chunks by string hash, which changes between interpreter sessions.

    Args:
    - entities (list): List of entity names to query (e.g., ["date", "location"])
    - top_k (int): Number of chunks to retrieve per entity
    - store: Object exposing `similarity_search(query, k=...)` whose results have
      a `page_content` attribute. Defaults to the notebook-level `vectorstore`.

    Returns:
    - combined_text (str): Deduplicated relevant chunks joined with newlines
      (empty string when nothing was retrieved)
    """
    if store is None:
        store = vectorstore  # Fall back to the FAISS store built earlier in the notebook

    # A dict preserves insertion order, giving ordered de-duplication
    ordered_chunks = {}

    for entity in entities:
        print(f"Searching for entity: {entity}")
        query = f"Information about {entity}."

        for chunk in store.similarity_search(query, k=top_k):
            ordered_chunks.setdefault(chunk.page_content, None)

    unique_relevant_chunks = list(ordered_chunks)
    combined_text = "\n".join(unique_relevant_chunks)

    print(f"Found {len(unique_relevant_chunks)} unique relevant chunks.")
    return combined_text

# Find & combine relevant chunks.
# top_k=1 requests one chunk per entity, so the combined text holds at most
# len(entities_of_interest) distinct chunks.
relevant_text = find_most_relevant_chunks(entities_of_interest, top_k=1)

print(f"\nMost Relevant Chunks Combined:\n{relevant_text}")

## Instantiating GPT

In [None]:
# Set the model name used for every chat completion request
MODEL="gpt-4o-mini"

# Load the OpenAI API key from a local text file (gpt-personal-key.txt, resolved
# against the current working directory); surrounding whitespace is stripped
with open("gpt-personal-key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()

# Instantiate OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

## Token Count

In [None]:
# Function for calculating tokens
def count_tokens(text, model="gpt-4o"):
    """
    Counts the number of tokens in a given text for a specified OpenAI model.

    Args:
    - text (str): Text to tokenize
    - model (str): Model name used to look up the matching tiktoken encoding

    Returns:
    - int: Number of tokens produced by the encoding
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for
        # (e.g. newly released models). Fall back to the encoding used by the
        # gpt-4o family so the token check still yields an estimate.
        print(f"No tiktoken encoding registered for model '{model}'; falling back to o200k_base.")
        encoder = tiktoken.get_encoding("o200k_base")

    return len(encoder.encode(text))

## Entity Extraction

In [None]:
# Define classes for the entities extraction
# Property is the key/value pair that Node and Relationship use for their
# optional `properties` lists. Both fields are required strings (`...` means
# "no default").
class Property(BaseModel):
    """A single property consisting of key and value."""
    key: str = Field(..., description="Property key")
    value: str = Field(..., description="Property value")

# Node type used while building the graph from the GPT output. Subclasses the
# imported graph-document Node (aliased BaseNode) and declares `properties` as
# an optional list of Property items that defaults to None.
class Node(BaseNode):
    """Represents an entity in the railway accident knowledge graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Relationship type used while building the graph from the GPT output.
# Subclasses the imported graph-document Relationship (aliased BaseRelationship)
# and declares `properties` as an optional list of Property items (default None).
class Relationship(BaseRelationship):
    """Represents a relationship between two entities in the graph."""
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Container pairing the node list with the relationship list; both are required.
# The field names `nodes` and `rels` match the keys requested in build_prompt().
# NOTE(review): not referenced elsewhere in this section — presumably kept as the
# schema the GPT JSON is expected to follow; confirm before removing.
class KnowledgeGraph(BaseModel):
    """A knowledge graph storing railway accident data."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [None]:
def call_gpt(prompt, temperature=1):
    """
    Calls the GPT model with the structured prompt and returns the raw response.

    Args:
    - prompt (str): User message sent to the model
    - temperature (float): Sampling temperature forwarded to the API

    Returns:
    - response_text (str): Model output with surrounding whitespace and any
      Markdown code fence removed; empty string when the model returned no text
    """
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are an expert in analyzing railway accident reports. Return output in JSON format only."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    # `content` may be None when the model returns no text; normalise it so the
    # string handling below cannot raise AttributeError
    response_text = (completion.choices[0].message.content or "").strip()

    # Strip a leading ``` or ```json fence and a trailing ``` fence. The previous
    # pattern only recognised the ```json form at the start, leaving a plain
    # ``` opener in place and breaking the later json.loads call.
    response_text = re.sub(
        r'^```(?:json)?\s*|\s*```$', '', response_text, flags=re.IGNORECASE
    ).strip()

    return response_text

In [None]:
def build_prompt(text, entities=None):
    """
    Constructs a structured prompt to extract entities and relationships for railway accidents.

    Args:
    - text (str): Report context inserted at the end of the prompt
    - entities (list): Entity names to ask for. Defaults to the notebook-level
      `entities_of_interest`.

    Returns:
    - str: The full prompt, including an example JSON response for the model to imitate
    """
    if entities is None:
        entities = entities_of_interest

    # The example below must itself be valid JSON (note the comma after every
    # node except the last); otherwise the model is shown a malformed schema.
    return f"""
    Analyze the following railway accident report context and extract structured knowledge.

    Return a JSON object with:
    - `nodes`: A list of entities, specifically {entities}.
    - `rels`: A list of relationships linking entities.

    Look at this example JSON response and follow the schema closely. Pay attention to date and type formats (e.g., EU date format, 24-hour time).
    Ensure that the `source` and `target` nodes in `rels` are the same entities from the `nodes` list, and not different ones. 
    Think about the relationships between the entities, i.e., (node AccidentType occurred_at node Country, or node AccidentType has_date Date).
    Make sure to map all nodes with other important entities, e.g., (node UniqueAccident has_date Date, node UniqueAccident occurred_at Country).
    DO NOT map entities like (node Date is_date to node Time) or (node AccidentType is_type to node Country). This is incorrect.

    {{
        "nodes": [
            {{"id": "Dublin-Cork Accident", "type": "UniqueAccident"}},
            {{"id": "Train Derailment", "type": "AccidentType"}},
            {{"id": "23/12/2021", "type": "Date"}},
            {{"id": "16:32", "type": "Time"}},
            {{"id": "Ireland", "type": "Country"}},
            {{"id": "European Rail Agency", "type": "RegulatoryBody"}}

        ],
        "rels": [
            {{"source": "Dublin-Cork Accident", "target": "Ireland", "type": "occurred_in"}},
            {{"source": "Train Derailment", "target": "Dublin-Cork Accident", "type": "occurred_at"}},
            {{"source": "Dublin-Cork Accident", "target": "European Rail Agency", "type": "investigated_by"}},
            {{"source": "23/12/2021", "target": "Dublin-Cork Accident", "type": "has_date"}},
            {{"source": "16:32", "target": "Dublin-Cork Accident", "type": "has_time"}}

        ]
    }}

    Accident report context:
    {text}

    JSON:
    """

In [None]:
def extract_knowledge_graph(text):
    """
    Extracts entities & relationships from a railway accident report using GPT.
    - Builds the prompt from `text` and sends it to GPT.
    - Returns the parsed JSON on success.
    - On invalid JSON, appends the raw output (plus the first 1000 characters of
      the input) to failed_graph_extractions.json and returns an empty dict.
    """
    raw_output = call_gpt(build_prompt(text))

    try:
        return json.loads(raw_output)  # Valid JSON: hand back the parsed graph
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", str(e))
        print("Storing raw response for review...")

    # Only reached after a parse failure: keep the faulty response for debugging
    failure_record = {"input_text": text[:1000], "raw_output": raw_output}
    with open("failed_graph_extractions.json", "a") as file:
        json.dump(failure_record, file, indent=4)
        file.write("\n")

    return {}

In [None]:
# Define token limit for function execution
token_limit = 4096

# Build the prompt and count tokens
prompt = build_prompt(relevant_text)
token_count = count_tokens(prompt)
# Approximate OpenAI pricing per prompt token
# (presumably USD 0.15 per 1M input tokens — confirm against current pricing)
estimated_cost = token_count * 0.00000015

# Check token limit
if token_count > token_limit:
    print(f"Token count is too high: {token_count}\nPlease reduce the chunk size or refine the prompt.")
else:
    print(f"Token count for prompt: {token_count}")
    # Show the estimate so it can inform the confirmation step; previously it
    # was computed but never displayed
    print(f"Estimated input cost: ${estimated_cost:.6f}")

In [None]:
# Confirm Execution
proceed = input("Do you want to proceed with knowledge graph extraction? (yes/no): ").strip().lower()
if proceed != "yes":
    print("Extraction aborted by user.")
else:
    print("Sending request to GPT...")
    # Send the retrieved chunks (`relevant_text`). This is the text the token
    # check was computed on and the text later recorded as the graph source.
    # Passing the full `pdf_text` would bypass both the retrieval step and the
    # token limit the user has just confirmed.
    response_json = extract_knowledge_graph(relevant_text)

# NOTE: `response_json` is only (re)assigned when the user answers "yes"
response_json

In [None]:
# File to store DataFrame
CSV_FILE = "pdf_processing_results.csv"

def append_pdf_json_result(pdf_name, response_json):
    """
    Appends the JSON output of an extraction run to the results CSV.

    - A PDF that is not in the CSV yet gets a new row with `Iteration_1`.
    - A PDF that is already present gets its next free `Iteration_<n>` column,
      where n is one more than the highest iteration filled *for that PDF*.
      (Counting every iteration column in the frame would skip numbers whenever
      another PDF had been processed more often.)
    - If the same JSON is already stored in any iteration of that PDF, nothing
      is written.

    Args:
    - pdf_name (str): Row key identifying the processed PDF
    - response_json: JSON-serialisable extraction result

    Returns:
    - df (pd.DataFrame): The results table after the update (or the unchanged
      table when the entry was skipped as a duplicate)
    """
    prefix = "Iteration_"

    # Load existing CSV if available, otherwise create a new DataFrame
    if os.path.exists(CSV_FILE):
        df = pd.read_csv(CSV_FILE)
    else:
        df = pd.DataFrame(columns=["pdf_name"])

    # Convert JSON response to a string for storage
    json_output = json.dumps(response_json, indent=2)

    row_mask = df["pdf_name"] == pdf_name

    if not row_mask.any():
        # New PDF file, start at iteration 1
        iteration_count = 1
        new_row = pd.DataFrame({"pdf_name": [pdf_name], f"{prefix}{iteration_count}": [json_output]})
        df = pd.concat([df, new_row], ignore_index=True)
    else:
        # As before, only the first row for this PDF is inspected
        first_row = df[row_mask].iloc[0]
        used_indices = []

        for col in df.columns:
            if not col.startswith(prefix) or pd.isna(first_row[col]):
                continue  # Not an iteration column, or empty for this PDF

            if first_row[col] == json_output:
                print(f"No changes in JSON across all iterations for {pdf_name}, skipping new entry.")
                return df  # Exit without adding a duplicate entry

            suffix = col[len(prefix):]
            if suffix.isdigit():
                used_indices.append(int(suffix))

        # Next free slot for this PDF; max() rather than a count so that gaps
        # left by earlier runs can never cause an overwrite
        iteration_count = max(used_indices, default=0) + 1
        df.loc[row_mask, f"{prefix}{iteration_count}"] = json_output

    # Save DataFrame back to CSV
    df.to_csv(CSV_FILE, index=False)

    print(f"Successfully added {pdf_name} - Iteration {iteration_count} to results!")
    return df

# Example execution: persist the latest GPT output for the current PDF and keep
# the returned DataFrame for inspection in the following cells.
results_df = append_pdf_json_result(pdf_name, response_json)

In [None]:
results_df

In [None]:
# # Specify the target PDF name and iteration number
# pdf_query = "IE-6262-200429 LC Collision XM240.pdf"
# iteration_number = 1 

# # Construct the column name dynamically
# iteration_column = f"Iteration_{iteration_number}"

# # Extract the JSON string if the PDF exists and the iteration column is present
# if pdf_query in results_df["pdf_name"].values and iteration_column in results_df.columns:
#     extracted_json = results_df.loc[results_df["pdf_name"] == pdf_query, iteration_column].iloc[0]
#     print(json.dumps(json.loads(extracted_json), indent=4))  # Print or return the JSON string
# else:
#     print(f"No data found for {pdf_query} in {iteration_column}")

In [None]:
def props_to_dict(props) -> dict:
    """
    Converts properties to a dictionary for graph storage.

    Accepts both `Property` model instances (read via `.key` / `.value`) and
    plain dicts with "key" / "value" entries. The `properties` fields on Node
    and Relationship are declared as lists of `Property`, and pydantic models
    are not subscriptable, so indexing them with ["key"] raises TypeError.

    Args:
    - props: Iterable of properties, or None / empty

    Returns:
    - dict: Mapping of property key to property value (empty when `props` is falsy)
    """
    properties = {}
    for p in props or []:
        if isinstance(p, dict):
            properties[p["key"]] = p["value"]
        else:
            properties[p.key] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """
    Builds the graph-document node for an extracted entity.

    The type is passed through `str.capitalize()` (first character upper-cased,
    the rest lower-cased) and the only property set is `name`, equal to the node id.
    """
    label = node.type.capitalize()
    return BaseNode(id=node.id, type=label, properties={"name": node.id})

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """
    Builds the graph-document edge for an extracted relationship.

    Both endpoints are converted with map_to_base_node; relationship properties
    are converted with props_to_dict when present, otherwise an empty dict is used.
    """
    edge_properties = {}
    if rel.properties:
        edge_properties = props_to_dict(rel.properties)

    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=edge_properties,
    )

# Neo4j Storage

In [None]:
# Neo4j Connection Setup
# NOTE(review): credentials are hard-coded here; presumably local-development
# defaults — avoid committing real credentials in this cell.
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"
NEO4J_DATABASE = "neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the database the other cells operate on
    # (clear_neo4j_database opens its session with the same `database=` argument)
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    # Only authentication failures are handled; other connection errors propagate
    print("Authentication failed. Check your credentials:", str(e))

In [None]:
# Clear database
def clear_neo4j_database():
    """Remove every node, together with its relationships, from NEO4J_DATABASE."""
    wipe_query = "MATCH (n) DETACH DELETE n"
    with driver.session(database=NEO4J_DATABASE) as db_session:
        db_session.run(wipe_query)
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
# NOTE: destructive — every node and relationship in NEO4J_DATABASE is deleted
# each time this cell runs.
clear_neo4j_database()

In [None]:
results_df

In [None]:
# Define the JSON that you'd like to convert to a graph
# NOTE(review): `[5]` selects the row labelled 5 in the "Iteration_1" column
# (with the default integer index, the sixth row). This raises KeyError when
# results_df has no such row — confirm which PDF/row is intended.
json_to_convert = json.loads(results_df["Iteration_1"][5])
print(f"The JSON you chose:\n{json_to_convert}")

# NOTE(review): the remark originally left here read "Index first row of a
# dataframe, first column", which does not match the `[5]` lookup above.

In [None]:
def convert_json_to_graph(json_to_convert, source_text):
    """
    Converts extracted JSON into a graph-compatible format with correct entity types.

    Args:
    - json_to_convert (dict): Parsed GPT output with a "nodes" list (items with
      "id" and "type") and a "rels" list (items with "source", "target", "type").
      A missing "nodes" or "rels" key is treated as an empty list.
    - source_text (str): Text recorded as the source document of the graph

    Returns:
    - GraphDocument built from the nodes and relationships, or None when
      `json_to_convert` is empty
    """
    if not json_to_convert:
        print("No valid data to convert to a graph.")
        return None

    raw_nodes = json_to_convert.get("nodes", [])
    raw_rels = json_to_convert.get("rels", [])

    # id -> type lookup built once instead of scanning the node list for every
    # relationship endpoint. setdefault keeps the first occurrence of an id,
    # matching the behaviour of the previous linear search.
    type_by_id = {}
    for node in raw_nodes:
        type_by_id.setdefault(node["id"], node["type"])

    # Convert Nodes
    graph_nodes = [map_to_base_node(Node(id=node["id"], type=node["type"])) for node in raw_nodes]

    # Convert Relationships (endpoints not listed under "nodes" get type "Unknown")
    graph_rels = []
    for rel in raw_rels:
        source_node = Node(id=rel["source"], type=type_by_id.get(rel["source"], "Unknown"))
        target_node = Node(id=rel["target"], type=type_by_id.get(rel["target"], "Unknown"))
        graph_rels.append(map_to_base_relationship(Relationship(source=source_node, target=target_node, type=rel["type"])))

    return GraphDocument(nodes=graph_nodes, relationships=graph_rels, source=Document(page_content=source_text))


In [None]:
def store_in_neo4j(graph_document):
    """
    Stores extracted knowledge graph into Neo4j with dynamic labels.

    Nodes are merged on (label, id) and get `name` set on creation. Every
    relationship is merged as a `RELATIONSHIP` edge whose `type` property holds
    the extracted relationship type.

    Args:
    - graph_document: Object exposing `nodes` (with `id`, `type`) and
      `relationships` (with `source.id`, `target.id`, `type`)
    """
    # Use the same database as clear_neo4j_database()
    with driver.session(database=NEO4J_DATABASE) as session:
        # Store nodes with dynamic labels
        for node in graph_document.nodes:
            # Labels cannot be passed as query parameters, and the label text
            # originates from model output. Quote it with backticks (doubling
            # any embedded backtick) so spaces or special characters can
            # neither break nor alter the Cypher statement.
            label = str(node.type).replace("`", "``")
            session.run(f"""
                MERGE (n:`{label}` {{id: $id}})
                ON CREATE SET n.name = $name
            """, id=node.id, name=node.id)

        # Store relationships (endpoints are matched by id only, across all labels)
        for rel in graph_document.relationships:
            session.run("""
                MATCH (s {id: $source})
                MATCH (t {id: $target})
                MERGE (s)-[:RELATIONSHIP {type: $type}]->(t)
            """, source=rel.source.id, target=rel.target.id, type=rel.type)


In [None]:
def process_railway_accident_report(json_to_convert):
    """
    Converts the extracted JSON into a graph document and stores it in Neo4j.

    The notebook-level `relevant_text` is recorded as the source text of the graph.

    Args:
    - json_to_convert (dict): Parsed GPT output (see convert_json_to_graph)

    Returns:
    - bool: True when a graph document was built and passed to store_in_neo4j,
      False when there was nothing to convert (nothing is written in that case)
    """
    print("Converting JSON to graph format...")
    graph_document = convert_json_to_graph(json_to_convert, relevant_text)

    if not graph_document:
        return False

    print("Graph structure created! Storing in Neo4j...")
    store_in_neo4j(graph_document)
    return True
In [None]:
# Store extracted entities into Neo4j
try:
    db_result = process_railway_accident_report(json_to_convert)
    print("Data stored in Neo4j successfully.")
except Exception as e:
    print("Failed to store data in Neo4j:", str(e))

In [None]:
# Close Neo4j connection
driver.close()