In [1]:
# Install required packages (added PyPDF2 for PDF text extraction)
import sys
!{sys.executable} -m pip install pydantic neo4j sentence-transformers python-dotenv litellm PyPDF2 nltk

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------- ----- 1.3/1.5 MB 6.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 5.0 MB/s eta 0:00:00
Installing collected packages: PyPDF2, nltk
Successfully installed PyPDF2-3.0.1 nltk-3.9.2



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Imports
import os
import json
import hashlib
import time
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field, ValidationError
from tqdm import tqdm

import litellm
from neo4j import GraphDatabase, Driver
from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer
import nltk
import PyPDF2
# Sentence-tokenizer data for chunk_text_semantic (nltk.sent_tokenize).
# NLTK >= 3.8.2 (including the 3.9.2 installed above) loads the 'punkt_tab' resource,
# while older releases load 'punkt'; fetch both so either version works.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Embedding helper (unchanged)
_EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
# Loaded models keyed by name, so the weights are read from disk once per kernel
# session instead of once per call (get_embedding runs for every chunk and every fact).
_EMBEDDING_MODEL_CACHE: Dict[str, Any] = {}


def get_embedding(text: str) -> List[float]:
    """Encode `text` with the all-mpnet-base-v2 sentence-transformer.

    The model is instantiated lazily on first use and reused afterwards.

    Args:
        text: The string to embed.

    Returns:
        The embedding converted to a plain Python list via `.tolist()`.
    """
    model = _EMBEDDING_MODEL_CACHE.get(_EMBEDDING_MODEL_NAME)
    if model is None:
        model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
        _EMBEDDING_MODEL_CACHE[_EMBEDDING_MODEL_NAME] = model
    return model.encode(text).tolist()

In [3]:
# Configuration (unchanged)
# Read GROQ_* / NEO4J_* settings from a local .env file into the process environment.
load_dotenv()

litellm.api_key = os.getenv("GROQ_API_KEY")
# Verbose mode logs litellm's request parameters for every completion call;
# set to False to keep the ingestion output readable.
litellm.set_verbose = True

litellm.model_list = [
    {
        "model_name": "groq/llama-3.3-70b-versatile",
        "litellm_params": {
            # Fixed: was "groq/llama-3.3-versatile" (missing "70b"), which did not match
            # model_name above or the model id used in get_enrichment_from_chunk.
            "model": "groq/llama-3.3-70b-versatile",
            "api_key": os.getenv("GROQ_API_KEY")
        }
    }
]

# Neo4j connection settings; only the database name has a default.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

In [4]:
# Pydantic models (added source_type for PDF)
# One extracted fact together with the key terms it mentions.
# The Field descriptions are runtime strings (presumably surfaced in the JSON schema
# that the enrichment prompt embeds -- confirm), so edit them with care.
class AtomicFact(BaseModel):
    # Terms stored as EntityNode names and linked to the fact during ingestion.
    key_elements: List[str] = Field(..., description="Key nouns, verbs, or entities (e.g., 'Tesla', 'Q4 2023', '$1.2B')")
    # Fact sentence; ingestion uses it as the FactNode merge key and embeds it.
    atomic_fact: str = Field(..., description="A single, indivisible fact as a concise sentence.")

# Container for the facts extracted from one chunk. get_enrichment_from_chunk puts
# this model's JSON schema into its prompt and parses the LLM's JSON reply into it.
class ChunkEnrichment(BaseModel):
    atomic_facts: List[AtomicFact]

# Properties written to the :Document node for one ingested file.
class DocumentNode(BaseModel):
    # Base file name; used as the Document merge key and as the chunk_id prefix.
    fileName: str
    source_type: str = Field(default="unstructured", description="'unstructured', 'pdf', 'markdown', etc.")
    # Hash of the extracted text; ingest_from_file compares it to skip unchanged files.
    content_hash: str  # To avoid re-ingesting
    chunking_strategy: str = Field(default="fixed", description="Chunking strategy used: 'fixed', 'semantic', 'header_aware', 'pdf_page'")

In [5]:
# NEW: Text extraction functions
def extract_text_from_txt(file_path: str) -> str:
    """Return the entire contents of a UTF-8 encoded text file as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def extract_text_from_pdf(file_path: str) -> str:
    """Read a PDF with PyPDF2 and return its text with per-page markers.

    Each page that yields text is preceded by a ``--- Page N ---`` marker line
    (N is the 1-based page number); pages with no extractable text are omitted.
    The combined string is stripped of leading/trailing whitespace.
    """
    sections = []
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for number, pdf_page in enumerate(pdf_reader.pages, start=1):
            extracted = pdf_page.extract_text()
            if not extracted:
                continue
            sections.append(f"\n\n--- Page {number} ---\n\n" + extracted)
    return "".join(sections).strip()

def extract_text_from_md(file_path: str) -> str:
    """Return the raw contents of a Markdown file.

    No Markdown-specific processing happens yet; the file is read exactly
    like a plain-text file.
    """
    markdown_source = extract_text_from_txt(file_path)
    return markdown_source

def extract_text(file_path: str) -> str:
    """Pick an extractor from the (case-insensitive) file extension and run it.

    ``.pdf`` goes to the PDF extractor (announced with a printed message),
    ``.md`` to the Markdown extractor, and ``.txt`` or no extension to the
    plain-text extractor.

    Raises:
        ValueError: For any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == '.pdf':
        print(f"Extracting text from PDF: {file_path}")
        return extract_text_from_pdf(file_path)
    if ext == '.md':
        return extract_text_from_md(file_path)
    if ext in ('.txt', ''):
        return extract_text_from_txt(file_path)
    raise ValueError(f"Unsupported file type: {ext}")

In [6]:
# UPDATED: Chunking with PDF page awareness
def chunk_text_fixed(text: str, separator: str = "\n\n", max_length: int = 1000) -> List[Dict[str, Any]]:
    """Split `text` on `separator`, hard-wrapping any piece longer than `max_length`.

    Oversized pieces are cut into consecutive `max_length`-character slices.
    Every piece is whitespace-stripped and pieces that end up empty are dropped.

    Returns:
        A list of ``{'text': str, 'metadata': {}}`` dicts, each with its own
        metadata dict.
    """
    collected = []
    for segment in text.split(separator):
        if len(segment) > max_length:
            pieces = [segment[start:start + max_length] for start in range(0, len(segment), max_length)]
        else:
            pieces = [segment]
        for piece in pieces:
            stripped = piece.strip()
            if stripped:
                collected.append({'text': stripped, 'metadata': {}})
    return collected

def chunk_text_semantic(text: str, similarity_threshold: float = 0.7) -> List[Dict[str, Any]]:
    """Group consecutive sentences into chunks by embedding similarity.

    Sentences come from ``nltk.sent_tokenize``. Each new sentence is compared
    with the running mean embedding of the current chunk: if the similarity is
    above `similarity_threshold` it joins the chunk, otherwise the chunk is
    closed and a new one starts with that sentence.

    Returns:
        A list of ``{'text', 'metadata'}`` dicts. A closed chunk records the
        similarity value that ended it under ``'semantic_avg_sim'``; the final
        chunk records the threshold itself. If no sentences are found, a single
        chunk holding the original text is returned with ``'semantic_score': 1.0``.
    """
    sentence_list = nltk.sent_tokenize(text)
    if not sentence_list:
        return [{'text': text, 'metadata': {'semantic_score': 1.0}}]

    encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    vectors = encoder.encode(sentence_list)

    grouped = []
    members = [sentence_list[0]]
    centroid = vectors[0]

    for position in range(1, len(sentence_list)):
        candidate = vectors[position]
        score = encoder.similarity(centroid, candidate).item()
        if not score > similarity_threshold:
            # Similarity too low: close the current chunk and start a new one.
            grouped.append({'text': ' '.join(members), 'metadata': {'semantic_avg_sim': score}})
            members = [sentence_list[position]]
            centroid = candidate
            continue
        members.append(sentence_list[position])
        # Incremental mean of the member embeddings.
        centroid = (len(members) - 1) / len(members) * centroid + 1 / len(members) * candidate

    grouped.append({'text': ' '.join(members), 'metadata': {'semantic_avg_sim': similarity_threshold}})
    return grouped

def chunk_text_header_aware(text: str, max_length: int = 1000) -> List[Dict[str, Any]]:
    """Split Markdown-style text into chunks grouped under their ``## `` headers.

    Lines starting with ``## `` open a new section; all other lines (including
    other header levels) belong to the current section. Text before the first
    header is filed under the header "No Header". Section bodies are stripped,
    sections with no content are dropped, and bodies longer than `max_length`
    are cut into consecutive `max_length`-character slices.

    Args:
        text: The document text.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of ``{'text': str, 'metadata': {'header': str}}`` dicts. Every
        chunk gets its own metadata dict, so mutating one does not affect
        sibling chunks of the same section.

    Raises:
        ValueError: If `max_length` is not positive.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    # Pass 1: collect (header, lines) pairs in document order.
    sections = []
    current_header = "No Header"
    current_lines = []
    for line in text.split('\n'):
        if line.startswith('## '):
            sections.append((current_header, current_lines))
            current_header = line[3:].strip()
            current_lines = []
        else:
            current_lines.append(line)
    sections.append((current_header, current_lines))

    # Pass 2: strip, drop blanks, and slice oversized bodies.
    final_chunks = []
    for header, lines in sections:
        body = '\n'.join(lines).strip()
        if not body:
            # Blank-only sections would otherwise be embedded and sent to the LLM.
            continue
        for start in range(0, len(body), max_length):
            piece = body[start:start + max_length].strip()
            if piece:
                final_chunks.append({'text': piece, 'metadata': {'header': header}})
    return final_chunks

def chunk_text_pdf_page_aware(text: str, max_length: int = 1000) -> List[Dict[str, Any]]:
    """
    PDF-specific: split on ``--- Page N ---`` marker lines, then apply fixed
    chunking to each page and tag every chunk with its page number.

    Only markers that occupy a whole line are recognised, so page content that
    merely contains the words "--- Page " in the middle of a line is not split.
    Text found before the first marker (or the whole input when it has no
    markers at all) is still chunked and tagged with page "unknown" instead of
    being dropped.

    Args:
        text: Text containing ``--- Page N ---`` marker lines.
        max_length: Maximum chunk length passed on to chunk_text_fixed.

    Returns:
        The chunks from chunk_text_fixed, in page order, each with
        ``metadata['page']`` set to the page number (int) or "unknown".
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # With one capturing group, re.split yields:
    #   [preamble, page_number, page_body, page_number, page_body, ...]
    parts = re.split(r'^--- Page (\d+) ---[ \t]*$', text, flags=re.MULTILINE)

    pages = []
    if parts[0].strip():
        pages.append(("unknown", parts[0]))
    for index in range(1, len(parts), 2):
        pages.append((int(parts[index]), parts[index + 1]))

    chunks = []
    for page_num, page_text in pages:
        for pc in chunk_text_fixed(page_text, max_length=max_length):
            pc['metadata']['page'] = page_num
            chunks.append(pc)
    return chunks

def chunk_text(text: str, strategy: str = "fixed", max_length: int = 1000, similarity_threshold: float = 0.7, source_type: str = "unstructured") -> List[Dict[str, Any]]:
    """
    Route `text` to the chunker named by `strategy`.

    "semantic", "header_aware" and "pdf_page" select their dedicated chunkers;
    every other value falls back to fixed-size chunking. `source_type` is
    accepted but not consulted here.
    """
    if strategy == "pdf_page":
        return chunk_text_pdf_page_aware(text, max_length)
    if strategy == "header_aware":
        return chunk_text_header_aware(text, max_length)
    if strategy == "semantic":
        return chunk_text_semantic(text, similarity_threshold)
    # Anything else (including "fixed") uses the fixed-size chunker.
    return chunk_text_fixed(text, max_length=max_length)

In [7]:
def get_enrichment_from_chunk(chunk_text: str, max_retries: int = 3) -> Optional["ChunkEnrichment"]:
    """Ask the LLM to extract key elements and atomic facts from one chunk.

    The prompt embeds the ChunkEnrichment JSON schema and requests a JSON
    object, which is parsed and validated into a ChunkEnrichment.

    Failed attempts are retried up to `max_retries` times. The pause between
    attempts depends on the failure: exponential (1s, 2s, 4s, ...) after a
    timeout, 5s after a rate-limit error, 1s after any other error (including
    invalid JSON or a schema mismatch). No pause is taken after the final
    attempt.

    Args:
        chunk_text: The chunk text to analyse.
        max_retries: Maximum number of LLM calls to make.

    Returns:
        The parsed ChunkEnrichment, or None when every attempt failed.
    """
    prompt = f"""
    You are an intelligent assistant. Meticulously extract top 5 structured information
    from the following financial text.

    1. Key Elements: Essential nouns, entities, or numbers pivotal to the text.
    2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences.

    Format Instructions: {ChunkEnrichment.model_json_schema()} following the JSON schema.

    Text:
    {chunk_text}
    """

    for attempt in range(max_retries):
        try:
            response = litellm.completion(
                model="groq/llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                timeout=30.0,  # seconds
                max_tokens=512
            )
            json_response = response.choices[0].message.content
            parsed_data = json.loads(json_response)
            return ChunkEnrichment(**parsed_data)

        except litellm.Timeout as e:
            print(f"  ! LLM timeout (attempt {attempt + 1}/{max_retries}): {e}")
            delay = 2 ** attempt  # exponential backoff

        except litellm.RateLimitError as e:
            print(f"  ! Rate limited (attempt {attempt + 1}/{max_retries}): {e}")
            delay = 5

        except Exception as e:
            print(f"  ! LLM error (attempt {attempt + 1}/{max_retries}): {e}")
            delay = 1

        # Give up right after the last attempt instead of sleeping first.
        if attempt == max_retries - 1:
            break
        time.sleep(delay)

    return None

In [8]:
# ingest_document (unchanged except metadata handling)
def ingest_document(driver: Driver, doc: DocumentNode, chunks: List[Dict[str, Any]]):
    """Write a document, its chunks, and their LLM-extracted facts to Neo4j.

    Graph shape produced:
        (:Document)-[:HAS_SECTION]->(:SectionChunk)-[:HAS_FACT]->(:FactNode)-[:HAS_ENTITY]->(:EntityNode)
        (:SectionChunk)-[:NEXT]->(:SectionChunk)   # consecutive chunks, in order

    Chunk ids are ``"<fileName>_chunk_<index>"``. Chunk text, embedding and
    metadata are written on every run, so re-ingesting a changed file refreshes
    existing chunk nodes instead of leaving their old content in place.

    NOTE(review): chunks (and fact links) left over from an earlier, longer
    version of the same file are not removed here -- confirm whether a cleanup
    step is wanted.

    Args:
        driver: An open Neo4j driver.
        doc: Document-level properties to store.
        chunks: ``{'text': str, 'metadata': dict}`` dicts in document order.
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        # Same properties on create and on match, so a plain SET is enough.
        session.run(
            """
            MERGE (d:Document {fileName: $fileName})
            SET d.content_hash      = $hash,
                d.source_type       = $type,
                d.chunking_strategy = $strategy
            """,
            fileName=doc.fileName,
            hash=doc.content_hash,
            type=doc.source_type,
            strategy=doc.chunking_strategy
        )

        print(f"Ingesting document: {doc.fileName} ({len(chunks)} chunks, strategy: {doc.chunking_strategy})")
        chunk_ids = []

        for i, chunk_data in enumerate(tqdm(chunks, desc="Processing chunks")):
            chunk_body = chunk_data['text']
            # Metadata is stored as a JSON string; missing or empty metadata becomes '{}'.
            metadata = json.dumps(chunk_data.get('metadata') or {})
            chunk_id = f"{doc.fileName}_chunk_{i}"
            chunk_ids.append(chunk_id)
            chunk_embedding = get_embedding(chunk_body)

            # SET (not ON CREATE SET): an existing chunk id must pick up the new content.
            session.run(
                """
                MATCH (d:Document {fileName: $fileName})
                MERGE (s:SectionChunk {chunk_id: $chunk_id})
                SET s.text = $text, s.embedding = $embedding, s.metadata = $metadata
                MERGE (d)-[:HAS_SECTION]->(s)
                """,
                fileName=doc.fileName, chunk_id=chunk_id,
                text=chunk_body, embedding=chunk_embedding, metadata=metadata
            )

            enrichment = get_enrichment_from_chunk(chunk_body)

            if enrichment and enrichment.atomic_facts:
                for fact in enrichment.atomic_facts:
                    fact_embedding = get_embedding(fact.atomic_fact)

                    # Facts are merged on their text, so identical facts from
                    # different chunks share one FactNode.
                    result = session.run(
                        """
                        MATCH (s:SectionChunk {chunk_id: $chunk_id})
                        MERGE (f:FactNode {fact: $fact_text})
                        ON CREATE SET f.embedding = $embedding
                        MERGE (s)-[:HAS_FACT]->(f)
                        RETURN elementId(f) AS fact_id
                        """,
                        chunk_id=chunk_id,
                        fact_text=fact.atomic_fact,
                        embedding=fact_embedding
                    ).single()
                    if result is None:
                        # No row came back (the chunk was not matched); nothing to link.
                        print(f"  ! Could not store a fact for chunk {i}; skipping its entities.")
                        continue
                    fact_id = result["fact_id"]

                    if fact.key_elements:
                        session.run(
                            """
                            MATCH (f:FactNode) WHERE elementId(f) = $fact_id
                            WITH f
                            UNWIND $elements AS elem_name
                            MERGE (e:EntityNode {name: elem_name})
                            MERGE (f)-[:HAS_ENTITY]->(e)
                            """,
                            fact_id=fact_id, elements=fact.key_elements
                        )
                print(f"  + Enriched chunk {i} with {len(enrichment.atomic_facts)} facts.")
            else:
                print(f"  ! Fallback: Skipped enrichment for chunk {i}.")

        # Link consecutive chunks so readers can walk the document in order.
        for prev_id, next_id in zip(chunk_ids, chunk_ids[1:]):
            session.run(
                """
                MATCH (prev:SectionChunk {chunk_id: $prev_id}),
                      (next:SectionChunk {chunk_id: $next_id})
                MERGE (prev)-[:NEXT]->(next)
                """,
                prev_id=prev_id, next_id=next_id
            )
        print(f"  + Added NEXT relations between {len(chunk_ids) - 1} chunk pairs.")

In [9]:
# UPDATED: ingest_from_file with PDF support and strategy detection
def ingest_from_file(driver: Driver, file_path: str, chunk_size: Optional[int] = None, force_strategy: Optional[str] = None):
    """
    Extract, chunk and ingest a single PDF, TXT or MD file.

    The source type and default chunking strategy follow from the extension
    (.pdf -> 'pdf'/'pdf_page', .md -> 'markdown'/'header_aware', anything else
    -> 'unstructured'/'fixed'); `force_strategy` overrides the strategy only.
    Files with no extractable text are skipped, as are files whose name,
    content hash and strategy already match a stored Document.

    Raises:
        FileNotFoundError: If `file_path` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    file_name = os.path.basename(file_path)
    _, raw_ext = os.path.splitext(file_path)
    ext = raw_ext.lower()

    # Pull the text out first; nothing else matters if the file is empty.
    content = extract_text(file_path)
    if not content.strip():
        print(f"Warning: No text extracted from {file_path}. Skipping.")
        return

    content_hash = hashlib.md5(content.encode()).hexdigest()

    # Extension -> (source_type, default strategy); unknown extensions are unstructured.
    defaults_by_ext = {
        '.pdf': ('pdf', 'pdf_page'),
        '.md': ('markdown', 'header_aware'),
    }
    source_type, default_strategy = defaults_by_ext.get(ext, ('unstructured', 'fixed'))
    strategy = force_strategy or default_strategy

    # Skip work when this exact content was already ingested with this strategy.
    with driver.session(database=NEO4J_DATABASE) as session:
        already_ingested = session.run(
            "MATCH (d:Document {fileName: $name, content_hash: $hash, chunking_strategy: $strategy}) RETURN d",
            name=file_name, hash=content_hash, strategy=strategy
        ).single()
    if already_ingested:
        print(f"Document {file_name} already ingested (hash & strategy match). Skipping.")
        return

    max_len = chunk_size if chunk_size else 1000
    document = DocumentNode(
        fileName=file_name,
        source_type=source_type,
        content_hash=content_hash,
        chunking_strategy=strategy
    )
    pieces = chunk_text(content, strategy=strategy, max_length=max_len, source_type=source_type)

    ingest_document(driver, document, pieces)
    print(f"Successfully ingested {file_name} as {source_type} using {strategy} chunking.")

In [10]:
# Index creation: vector indexes on FactNode/SectionChunk embeddings plus a full-text index (chunk metadata is not indexed)
def create_indexes(driver: Driver):
    """Create the search indexes if they do not exist yet.

    Two 768-dimension cosine vector indexes (FactNode.embedding and
    SectionChunk.embedding) and one full-text index over FactNode.fact and
    SectionChunk.text are created, in that order.
    """
    index_statements = (
        # Vector index over fact embeddings.
        """
            CREATE VECTOR INDEX fact_embeddings IF NOT EXISTS
            FOR (f:FactNode) ON (f.embedding)
            OPTIONS { indexConfig: { `vector.dimensions`: 768, `vector.similarity_function`: 'cosine' }}
            """,
        # Vector index over chunk embeddings.
        """
            CREATE VECTOR INDEX section_embeddings IF NOT EXISTS
            FOR (s:SectionChunk) ON (s.embedding)
            OPTIONS { indexConfig: { `vector.dimensions`: 768, `vector.similarity_function`: 'cosine' }}
            """,
        # Keyword search across fact sentences and chunk text.
        """
            CREATE FULLTEXT INDEX text_index IF NOT EXISTS
            FOR (n:FactNode|SectionChunk) ON EACH [n.fact, n.text]
            """,
    )
    with driver.session(database=NEO4J_DATABASE) as session:
        for statement in index_statements:
            session.run(statement)
        print("Indexes created successfully.")

In [11]:
# main() supports any file type
def main(file_path: Optional[str] = None, chunk_size: Optional[int] = None, force_strategy: Optional[str] = None):
    """Connect to Neo4j, make sure the indexes exist, and ingest one document.

    With `file_path`, that file is ingested through ingest_from_file. Without
    it, a built-in one-sentence sample is ingested as "sample.txt" using fixed
    chunking.

    Errors are reported with a printed message rather than raised, and the
    driver is always closed on the way out. The Neo4j password is never
    printed; only whether it is set.

    Args:
        file_path: Path of the file to ingest, or None for the built-in sample.
        chunk_size: Maximum chunk length forwarded to ingest_from_file.
        force_strategy: Chunking strategy override forwarded to ingest_from_file.
    """
    driver = None
    try:
        print("Connecting to Neo4j...")
        print("Using database:", NEO4J_DATABASE)
        print("NEO4J_URI:", NEO4J_URI)
        print("NEO4J_USER:", NEO4J_USER)
        # Never echo the secret itself: notebook outputs get saved and shared.
        print("NEO4J_PASSWORD:", "<set>" if NEO4J_PASSWORD else "<not set>")

        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        driver.verify_connectivity()
        print("Neo4j connection successful.")

        create_indexes(driver)

        if file_path:
            ingest_from_file(driver, file_path, chunk_size, force_strategy)
        else:
            doc_content = "Tesla, Inc. (TSLA) reported its fourth-quarter 2024 earnings on January 25, 2025."
            chunks = chunk_text(doc_content, strategy='fixed')
            doc = DocumentNode(
                fileName="sample.txt",
                content_hash=hashlib.md5(doc_content.encode()).hexdigest(),
                chunking_strategy='fixed'
            )
            ingest_document(driver, doc, chunks)

        print("Ingestion complete.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # driver stays None when creating it failed, so there is nothing to close.
        if driver is not None:
            driver.close()
            print("Neo4j connection closed.")

In [12]:
# Ingest a PDF given by relative path; for .pdf files the pipeline defaults to the 'pdf_page' strategy.
main(file_path="Basics_of_finmkts.pdf")

Connecting to Neo4j...
Using database: graphreader3
NEO4J_URI: neo4j://127.0.0.1:7687
NEO4J_USER: neo4j
NEO4J_PASSWORD: ********
Neo4j connection successful.
Indexes created successfully.
Extracting text from PDF: Basics_of_finmkts.pdf
Ingesting document: Basics_of_finmkts.pdf (36 chunks, strategy: pdf_page)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:   3%|▎         | 1/36 [00:24<14:30, 24.88s/it]

  + Enriched chunk 0 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:   6%|▌         | 2/36 [00:47<13:19, 23.51s/it]

  + Enriched chunk 1 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:   8%|▊         | 3/36 [01:10<12:49, 23.32s/it]

  + Enriched chunk 2 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  11%|█         | 4/36 [01:33<12:27, 23.36s/it]

  + Enriched chunk 3 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  14%|█▍        | 5/36 [01:56<11:56, 23.13s/it]

  + Enriched chunk 4 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  17%|█▋        | 6/36 [02:19<11:30, 23.02s/it]

  + Enriched chunk 5 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  19%|█▉        | 7/36 [02:42<11:09, 23.09s/it]

  + Enriched chunk 6 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  22%|██▏       | 8/36 [03:07<11:00, 23.59s/it]

  + Enriched chunk 7 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  25%|██▌       | 9/36 [03:30<10:33, 23.46s/it]

  + Enriched chunk 8 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  28%|██▊       | 10/36 [03:53<10:05, 23.27s/it]

  + Enriched chunk 9 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  31%|███       | 11/36 [04:17<09:45, 23.42s/it]

  + Enriched chunk 10 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  33%|███▎      | 12/36 [04:40<09:22, 23.45s/it]

  + Enriched chunk 11 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  36%|███▌      | 13/36 [05:03<08:53, 23.19s/it]

  + Enriched chunk 12 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  39%|███▉      | 14/36 [05:26<08:31, 23.25s/it]

  + Enriched chunk 13 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  42%|████▏     | 15/36 [05:49<08:06, 23.16s/it]

  + Enriched chunk 14 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  44%|████▍     | 16/36 [06:13<07:49, 23.47s/it]

  + Enriched chunk 15 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  47%|████▋     | 17/36 [06:21<05:55, 18.73s/it]

  + Enriched chunk 16 with 1 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  50%|█████     | 18/36 [06:47<06:15, 20.85s/it]

  + Enriched chunk 17 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  53%|█████▎    | 19/36 [07:11<06:13, 21.95s/it]

  + Enriched chunk 18 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  56%|█████▌    | 20/36 [07:34<05:56, 22.27s/it]

  + Enriched chunk 19 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  58%|█████▊    | 21/36 [07:58<05:38, 22.57s/it]

  + Enriched chunk 20 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  61%|██████    | 22/36 [08:21<05:20, 22.91s/it]

  + Enriched chunk 21 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  64%|██████▍   | 23/36 [08:45<04:59, 23.01s/it]

  + Enriched chunk 22 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  67%|██████▋   | 24/36 [09:08<04:38, 23.20s/it]

  + Enriched chunk 23 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  69%|██████▉   | 25/36 [09:31<04:12, 22.99s/it]

  + Enriched chunk 24 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  72%|███████▏  | 26/36 [09:53<03:48, 22.87s/it]

  + Enriched chunk 25 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  75%|███████▌  | 27/36 [10:17<03:27, 23.07s/it]

  + Enriched chunk 26 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  78%|███████▊  | 28/36 [10:41<03:07, 23.42s/it]

  + Enriched chunk 27 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  81%|████████  | 29/36 [11:05<02:45, 23.70s/it]

  + Enriched chunk 28 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  83%|████████▎ | 30/36 [11:29<02:21, 23.52s/it]

  + Enriched chunk 29 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  86%|████████▌ | 31/36 [11:52<01:56, 23.38s/it]

  + Enriched chunk 30 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  89%|████████▉ | 32/36 [12:15<01:33, 23.42s/it]

  + Enriched chunk 31 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  92%|█████████▏| 33/36 [12:38<01:10, 23.42s/it]

  + Enriched chunk 32 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  94%|█████████▍| 34/36 [13:03<00:47, 23.71s/it]

  + Enriched chunk 33 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:  97%|█████████▋| 35/36 [13:26<00:23, 23.65s/it]

  + Enriched chunk 34 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks: 100%|██████████| 36/36 [13:51<00:00, 23.08s/it]

  + Enriched chunk 35 with 5 facts.





  + Added NEXT relations between 35 chunk pairs.
Successfully ingested Basics_of_finmkts.pdf as pdf using pdf_page chunking.
Ingestion complete.
Neo4j connection closed.


In [13]:
##################################################################################################

In [14]:
# Run without a file_path: main() ingests its built-in one-sentence sample as "sample.txt".
main()

Connecting to Neo4j...
Using database: graphreader3
NEO4J_URI: neo4j://127.0.0.1:7687
NEO4J_USER: neo4j
NEO4J_PASSWORD: ********
Neo4j connection successful.
Indexes created successfully.
Ingesting document: sample.txt (1 chunks, strategy: fixed)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'max_tokens': 512, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


Processing chunks:   0%|          | 0/1 [00:08<?, ?it/s]


Neo4j connection closed.


KeyboardInterrupt: 

Connecting to Neo4j...
Using database: graphreader3
NEO4J_URI: neo4j://127.0.0.1:7687
NEO4J_USER: neo4j
NEO4J_PASSWORD: ********
Neo4j connection successful.
Indexes created successfully.
Extracting text from PDF: Basics_of_finmkts.pdf
Ingesting document: Basics_of_finmkts.pdf (36 chunks, strategy: pdf_page)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 0 with 3 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 1 with 8 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 2 with 6 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 3 with 13 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 4 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 5 with 7 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 6 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}


: 

In [None]:
# Ingest the credit-risk principles PDF by overriding main()'s default input
# through file_path. The path is relative (presumably to the kernel's working
# directory, so the PDF is expected next to the notebook; confirm in main()).
main(file_path="Principles_for_the_Management_of_Credit_Risk (3).pdf")

Connecting to Neo4j...
Using database: graphreader3
NEO4J_URI: neo4j://127.0.0.1:7687
NEO4J_USER: neo4j
NEO4J_PASSWORD: [REDACTED]
Neo4j connection successful.
Indexes created successfully.
Extracting text from PDF: Principles_for_the_Management_of_Credit_Risk (3).pdf
Ingesting document: Principles_for_the_Management_of_Credit_Risk (3).pdf (96 chunks, strategy: pdf_page)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 0 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
Neo4j connection closed.


KeyboardInterrupt: 

In [None]:
# Ingest the enterprise-AI report PDF through the same pipeline; only the input
# path differs from the other ingestion cells.
main(file_path="Spotlight on Real-World Enterprise AI.pdf")


Neo4j connection successful.
Indexes created successfully.
Extracting text from PDF: Spotlight on Real-World Enterprise AI.pdf
Ingesting document: Spotlight on Real-World Enterprise AI.pdf (62 chunks, strategy: pdf_page)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 0 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 1 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 2 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 3 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 4 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 5 with 13 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 6 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 7 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 8 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 9 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 10 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 11 with 6 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 12 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
An error occurred: The paging file is too small for this operation to complete. (os error 1455)
Neo4j connection closed.


In [None]:
# INGEST PDF EXAMPLE
# Pass only the PDF path and let main() decide how to chunk it.
main(file_path="2412.14093v2.pdf")

# Force semantic on PDF
# Example (kept commented out): force_strategy="semantic" presumably overrides
# the chunking strategy main() would otherwise pick; verify against main().
# main(file_path="path/to/report.pdf", force_strategy="semantic")

Neo4j connection successful.
Indexes created successfully.
Extracting text from PDF: 2412.14093v2.pdf




Ingesting document: 2412.14093v2.pdf (563 chunks, strategy: pdf_page)




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 0 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 1 with 7 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 2 with 8 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 3 with 7 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
  + Enriched chunk 4 with 4 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
Neo4j connection closed.


KeyboardInterrupt: 

In [None]:
# Test PDF extraction (mock)
# Smoke-check extract_text_from_pdf (defined outside this cell) on its own,
# without running the full ingestion pipeline. "sample.pdf" is a placeholder
# and must be replaced with the path of a PDF that actually exists.
sample_pdf_text = extract_text_from_pdf("sample.pdf")  # Replace with real path
# Show only the leading 500 items of the result (characters, assuming the helper
# returns a str) so the cell output stays small.
print(sample_pdf_text[:500])