In [1]:
pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   ------------------------ --------------- 10.0/16.5 MB 56.7 MB/s eta 0:00:01
   ------------------------------------ --- 14.9/16.5 MB 42.8 MB/s eta 0:00:01
   ---------------------------------------  16.5/16.5 MB 29.8 MB/s eta 0:00:01
   ---------------------------------------- 16.5/16.5 MB 24.8 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.3
Note: you may need to restart the kernel to use updated packages.


In [19]:
# Load necessary libraries
import pdfplumber
from openai import OpenAI
import json
import os
import regex as re
from neo4j import GraphDatabase
from neo4j.exceptions import AuthError
import pymupdf
import base64

In [25]:
## Set the API key and model name
MODEL = "gpt-4o-mini"

# Load the OpenAI API key from a local key file (keep this file out of version control).
# If the file is absent, fall back to the OPENAI_API_KEY environment variable.
OPENAI_KEY_FILE = "gpt-personal-key.txt"

if os.path.exists(OPENAI_KEY_FILE):
    # utf-8-sig also strips a BOM that some editors prepend to text files
    with open(OPENAI_KEY_FILE, "r", encoding="utf-8-sig") as key_file:
        OPENAI_API_KEY = key_file.read().strip()
else:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Fail here with a clear message instead of at the first API request
if not OPENAI_API_KEY:
    raise ValueError(
        f"No OpenAI API key found: create '{OPENAI_KEY_FILE}' or set the OPENAI_API_KEY environment variable."
    )

client = OpenAI(api_key=OPENAI_API_KEY)

In [43]:
# Neo4j Connection Setup
# The defaults target a local development instance; set the NEO4J_* environment
# variables to override them so real credentials never have to be written into the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

try:
    # Test the connection against the same database the later cells use
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run("RETURN 1")
    print("Connected to Neo4j successfully.")
except AuthError as e:
    # Only authentication problems are reported here; other errors propagate
    print("Authentication failed. Check your credentials:", str(e))

Connected to Neo4j successfully.


In [75]:
def clear_neo4j_database():
    """Wipe the configured Neo4j database: remove every node along with its relationships."""
    wipe_query = "MATCH (n) DETACH DELETE n"
    with driver.session(database=NEO4J_DATABASE) as db_session:
        # DETACH DELETE drops each node's relationships together with the node
        db_session.run(wipe_query)
    print("Neo4j database cleared successfully.")

# Run the function to clear the database
# WARNING: destructive - this deletes every node and relationship in NEO4J_DATABASE.
clear_neo4j_database()

Neo4j database cleared successfully.


In [None]:
pdf_path = "raiu_example.pdf"

# Characters that must not appear in a file name: whitespace (captions can span
# several lines), path separators, the characters Windows forbids, and control
# characters. Each one is replaced by "_", so names built from simple captions
# (spaces and "/" only) are unchanged.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\s/\\:*?"<>|\x00-\x1f]')
# Upper bound on the caption part of a file name; a matching text block can be
# a whole paragraph, which would otherwise exceed file-system name limits.
MAX_CAPTION_CHARS = 150

images_data = []

# The context manager closes the document even if extraction fails part-way
with pymupdf.open(pdf_path) as doc:
    for page_index, page in enumerate(doc):
        image_list = page.get_images(full=True)
        if not image_list:
            continue

        # Only extract text for pages that actually contain images
        text_blocks = page.get_text("blocks")

        # We look for any text on this page that mentions "Figure" or "Fig."
        # Then we associate the first such text with all images on the page (very rough approach:
        # a body paragraph that merely references a figure can be picked instead of the real caption)
        figure_captions = [
            block[4].strip() for block in text_blocks
            if "figure" in block[4].lower() or "fig." in block[4].lower()
        ]
        # If there's at least one "Figure" mention, pick the first; otherwise fall back to the page number
        page_caption = figure_captions[0] if figure_captions else f"Page{page_index+1}"
        safe_caption = _UNSAFE_FILENAME_CHARS.sub("_", page_caption)[:MAX_CAPTION_CHARS]

        for img_index, img_info in enumerate(image_list):
            xref = img_info[0]

            # Try to extract the image as a Pixmap
            try:
                pix = pymupdf.Pixmap(doc, xref)
            except (ValueError, RuntimeError):
                # Not a valid raster image
                continue

            # PNG cannot store CMYK data: convert such pixmaps to RGB before saving
            if pix.n - pix.alpha >= 4:
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

            # Save the image
            output_filename = f"Page{page_index+1}_Img{img_index+1}_{safe_caption}.png"
            pix.save(output_filename)

            images_data.append({
                "page": page_index + 1,
                "image_index": img_index + 1,
                "filename": output_filename,
                "caption": page_caption
            })

            # Drop the reference so the pixmap's memory can be released promptly
            pix = None


In [17]:
# Inspect the records collected during extraction: page, image index, saved filename and caption
images_data

[{'page': 1,
  'image_index': 1,
  'filename': 'Page1_Img1_Page1.png',
  'caption': 'Page1'},
 {'page': 1,
  'image_index': 2,
  'filename': 'Page1_Img2_Page1.png',
  'caption': 'Page1'},
 {'page': 14,
  'image_index': 1,
  'filename': 'Page14_Img1_40_The_location_of_the_broken_rail_(see_Figure_1)_was_on_the_Up_mainline_from_Cork_to_Heuston.png',
  'caption': '40 The location of the broken rail (see Figure 1) was on the Up mainline from Cork to Heuston'},
 {'page': 14,
  'image_index': 2,
  'filename': 'Page14_Img2_40_The_location_of_the_broken_rail_(see_Figure_1)_was_on_the_Up_mainline_from_Cork_to_Heuston.png',
  'caption': '40 The location of the broken rail (see Figure 1) was on the Up mainline from Cork to Heuston'},
 {'page': 14,
  'image_index': 3,
  'filename': 'Page14_Img3_40_The_location_of_the_broken_rail_(see_Figure_1)_was_on_the_Up_mainline_from_Cork_to_Heuston.png',
  'caption': '40 The location of the broken rail (see Figure 1) was on the Up mainline from Cork to Heuston

In [44]:
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its content as Base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    # b64encode yields bytes; decode so the result can be embedded in a string
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Example usage:
image_path = "Page14_Img1_40_The_location_of_the_broken_rail_(see_Figure_1)_was_on_the_Up_mainline_from_Cork_to_Heuston.png"
base64_image = encode_image(image_path)

# Caption recorded for this file during extraction (None if the file is not listed in images_data)
caption = next((item["caption"] for item in images_data if item["filename"] == image_path), None)

prompt_text = (
    "What is in this image? List some important details and names you see in JSON format, "
    "mainly: location, streets, cities, objects."
)
# Only mention the caption when one was found, so the prompt never contains the literal "None"
if caption:
    prompt_text += f" Consider also the image's caption: {caption}"

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt_text,
            },
            {
                "type": "image_url",
                # The extracted images are saved as PNG files, so declare the matching MIME type
                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
            },
        ],
    }
]

response = client.chat.completions.create(
    model=MODEL,
    messages=messages
)

# Print the assistant's reply
print(response.choices[0].message.content)

```json
{
  "location": "Ireland",
  "streets": [
    "R515",
    "R516",
    "R662",
    "R664",
    "R663",
    "R613"
  ],
  "cities": [
    "Cork",
    "Heuston",
    "Limerick Junction",
    "Tipperary",
    "Emly",
    "Ballinalard",
    "Kilross"
  ],
  "objects": [
    "mainline",
    "rail"
  ]
}
```


In [70]:
# Store in Neo4j
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j as generic ``Entity`` nodes.

    Each top-level key of ``json_data`` becomes the ``category`` of the nodes
    created from its value. A list value produces one node per element; any
    other value produces a single node. ``None`` values are skipped, because
    Cypher ``MERGE`` rejects null property values.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, it replaces this definition.

    Args:
        json_data: Mapping of category name to a value or a list of values.

    Returns:
        None. Prints a message and returns early when ``json_data`` is empty.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    merge_query = "MERGE (n:Entity {name: $name, category: $category})"

    with driver.session(database=NEO4J_DATABASE) as session:
        for category, item in json_data.items():
            # Treat a single value as a one-element list so both cases share one code path
            values = item if isinstance(item, list) else [item]
            for value in values:
                if value is None:
                    continue
                session.run(merge_query, name=value, category=category)


In [76]:
# Store in Neo4j with Relationships
def store_in_neo4j(json_data):
    """Store extracted data in Neo4j and create relationships.

    Creates (or reuses) one ``Date``, one ``Location`` and one
    ``RegulatoryBody`` node and links them as
    ``(Date)-[:OCCURRED_AT]->(Location)-[:REGULATED_BY]->(RegulatoryBody)``.
    A key that is missing, ``None`` or empty is stored as ``"Unknown"``,
    because Cypher ``MERGE`` rejects null property values.

    This definition replaces the same-named function defined in the previous cell.

    Args:
        json_data: Mapping that may contain the keys ``"date"``,
            ``"location"`` and ``"regulatory_body"``.

    Returns:
        None. Prints a message and returns early when ``json_data`` is empty.
    """
    if not json_data:
        print("No valid entities to store in Neo4j.")
        return

    # Resolve each value once; `or` also covers keys that are present but None/empty
    params = {
        key: json_data.get(key) or "Unknown"
        for key in ("date", "location", "regulatory_body")
    }

    # One statement creates the nodes and the relationships together, so a
    # failure cannot leave nodes behind without their relationships.
    query = """
        MERGE (d:Date {name: $date})
        MERGE (l:Location {name: $location})
        MERGE (r:RegulatoryBody {name: $regulatory_body})
        MERGE (d)-[:OCCURRED_AT]->(l)
        MERGE (l)-[:REGULATED_BY]->(r)
    """

    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(query, **params)

In [77]:
# Persist the extracted entities to Neo4j and report the outcome
# NOTE(review): `entity_json` is not defined in any cell visible here - confirm which cell provides it.
try:
    db_result = store_in_neo4j(entity_json)
except Exception as err:
    print("Failed to store data in Neo4j:", str(err))
else:
    # Reached only when the store call raised nothing
    print("Data stored in Neo4j successfully.")

Data stored in Neo4j successfully.


In [None]:
# Close Neo4j connection
# Run this last: the cells above open sessions on the module-level `driver`.
driver.close()