In [2]:
pip install pydantic neo4j sentence-transformers python-dotenv

Collecting pydantic
  Downloading pydantic-2.12.3-py3-none-any.whl.metadata (87 kB)
Collecting neo4j
  Downloading neo4j-6.0.2-py3-none-any.whl.metadata (5.2 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting annotated-types>=0.6.0 (from pydantic)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.41.4 (from pydantic)
  Downloading pydantic_core-2.41.4-cp312-cp312-win_amd64.whl.metadata (7.4 kB)
Collecting typing-extensions>=4.14.1 (from pydantic)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting typing-inspection>=0.4.2 (from pydantic)
  Downloading typing_inspection-0.4.2-py3-none-any.whl.metadata (2.6 kB)
Collecting pytz (from neo4j)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting transformers<5.0.0,>=4.41.0 (f


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install litellm

Collecting litellm
  Downloading litellm-1.79.1-py3-none-any.whl.metadata (30 kB)
Collecting aiohttp>=3.10 (from litellm)
  Downloading aiohttp-3.13.2-cp312-cp312-win_amd64.whl.metadata (8.4 kB)
Collecting click (from litellm)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting fastuuid>=0.13.0 (from litellm)
  Downloading fastuuid-0.14.0-cp312-cp312-win_amd64.whl.metadata (1.1 kB)
Collecting httpx>=0.23.0 (from litellm)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting importlib-metadata>=6.8.0 (from litellm)
  Using cached importlib_metadata-8.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting jsonschema<5.0.0,>=4.22.0 (from litellm)
  Downloading jsonschema-4.25.1-py3-none-any.whl.metadata (7.6 kB)
Collecting openai>=1.99.5 (from litellm)
  Downloading openai-2.7.1-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken>=0.7.0 (from litellm)
  Downloading tiktoken-0.12.0-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Collecting aiohappyeyeballs


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import json
import hashlib
from typing import List, Optional
from pydantic import BaseModel, Field, ValidationError

import litellm
from neo4j import GraphDatabase, Driver
from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Helper for Embeddings ---
# Embeddings come from a sentence-transformers model. Constructing the model is
# expensive, so it is built once per kernel and reused for every call instead of
# being re-created for each chunk and each fact.
_EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
_embedding_models = {}  # model name -> loaded SentenceTransformer instance


def _get_embedding_model(model_name: str = _EMBEDDING_MODEL_NAME):
    """Return the cached model for `model_name`, loading it on first use."""
    model = _embedding_models.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _embedding_models[model_name] = model
    return model


def get_embedding(text: str) -> List[float]:
    """
    Encode `text` with the shared embedding model and return it as a plain list.

    Args:
        text: The string to embed.

    Returns:
        The embedding as a list of floats (converted via `.tolist()` so it can be
        passed directly as a Neo4j query parameter).

    Note:
        The vector length MUST match `vector.dimensions` of the Neo4j vector
        indexes (768 in this notebook). Changing the model means re-creating
        those indexes with the new dimension.
    """
    return _get_embedding_model().encode(text).tolist()

In [4]:
# === 1. Load Config & Set Up Clients ===
load_dotenv()

# Surface missing settings here instead of as an opaque failure deep inside the
# Groq call or the Neo4j driver. NEO4J_DATABASE is optional (defaults to "neo4j").
_REQUIRED_ENV_VARS = ("GROQ_API_KEY", "NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    print(f"Warning: missing environment variables: {', '.join(_missing_env_vars)}")

# --- litellm Configuration ---
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
litellm.api_key = GROQ_API_KEY
# Debug logging stays off by default: it floods every cell output, and litellm's
# own docs warn that debug logs can expose API keys. For a debugging session,
# call litellm._turn_on_debug() temporarily and clear the output afterwards.
litellm.set_verbose = False

# Only the Groq deployment is configured; no Ollama (or other) fallback entry
# exists in this list yet.
# NOTE(review): this is a Router-style list assigned to litellm.model_list, and no
# Router is created in this notebook - confirm litellm.completion() consults it.
litellm.model_list = [
    {
        "model_name": "groq/llama-3.3-70b-versatile",
        "litellm_params": {
            "model": "groq/llama-3.3-70b-versatile",
            "api_key": GROQ_API_KEY
        }
    }
]

# --- Neo4j Connection ---
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

In [5]:
# Show the configured models with credentials masked, so the saved cell output
# never contains the API key.
[
    {
        **entry,
        "litellm_params": {
            key: ("***REDACTED***" if key == "api_key" else value)
            for key, value in entry["litellm_params"].items()
        },
    }
    for entry in litellm.model_list
]

[{'model_name': 'groq/llama-3.3-70b-versatile',
  'litellm_params': {'model': 'groq/llama-3.3-70b-versatile',
   'api_key': '***REDACTED***'}}]

In [6]:
# === 2. Pydantic Models for Ingestion ===
# Based on our discussion and your original ingestor.ipynb

class AtomicFact(BaseModel):
    # One extracted fact plus the terms it hinges on.
    # Documented with comments rather than a class docstring so the generated
    # JSON schema (which is embedded in the LLM prompt) stays exactly as it was.

    # Terms the fact is about; each one becomes an EntityNode during ingestion.
    key_elements: List[str] = Field(
        ...,
        description="Key nouns, verbs, or entities (e.g., 'Tesla', 'Q4 2023', '$1.2B')",
    )

    # The fact itself; stored as the `fact` property of a FactNode.
    atomic_fact: str = Field(
        ...,
        description="A single, indivisible fact as a concise sentence.",
    )

class ChunkEnrichment(BaseModel):
    # Top-level shape the LLM response is validated against: the list of facts
    # extracted from one chunk. Its JSON schema is embedded in the extraction prompt.
    atomic_facts: List[AtomicFact]

# Model for the generic graph structure: metadata of one ingested document,
# written to Neo4j as a (:Document) node.
class DocumentNode(BaseModel):
    # Also used as the MERGE key of the Document node and as the chunk-id prefix.
    fileName: str
    source_type: str = "unstructured"
    # Fingerprint of the document contents, intended to avoid re-ingesting.
    # NOTE(review): ingest_document only stores it ON CREATE and never compares it.
    content_hash: str

In [None]:
# === 3. Core Ingestion Functions ===

def get_enrichment_from_chunk(chunk_text: str) -> Optional[ChunkEnrichment]:
    """
    Ask the Groq LLM (via litellm) to extract atomic facts from one text chunk.

    Args:
        chunk_text: Raw text of the chunk to analyse.

    Returns:
        A validated ChunkEnrichment, or None when the chunk is blank or when the
        LLM call, JSON parsing, or schema validation fails. Failures are printed
        rather than raised so the caller can fall back to chunk-only ingestion.
    """
    # Nothing to extract from a blank chunk; skip the API call entirely.
    if not chunk_text or not chunk_text.strip():
        return None

    # Serialize the schema as real JSON. Interpolating the dict directly would
    # put a Python repr (single quotes, True/False) into the prompt.
    schema_json = json.dumps(ChunkEnrichment.model_json_schema())

    prompt = f"""
    You are an intelligent assistant. Meticulously extract structured information
    from the following financial text.
    
    1. Key Elements: Essential nouns, entities, or numbers pivotal to the text.
    2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences.
    
    Format Instructions: {schema_json} following the JSON schema.
    
    Text:
    {chunk_text}
    """
    try:
        response = litellm.completion(
            model="groq/llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        json_response = response.choices[0].message.content
        if not json_response:
            raise ValueError("LLM returned an empty response.")

        # model_validate reports a non-object payload as a ValidationError
        # instead of a TypeError from `**` unpacking.
        return ChunkEnrichment.model_validate(json.loads(json_response))

    except Exception as e:
        # Deliberately broad (ValidationError is already an Exception subclass):
        # any failure here is best-effort and makes the caller skip enrichment.
        print(f"Error during LLM enrichment or validation: {e}")
        return None

def _store_chunk(session, file_name: str, chunk_id: str, chunk_text: str) -> None:
    """Write one SectionChunk node (text + embedding) and link it to its Document."""
    chunk_embedding = get_embedding(chunk_text)
    session.run(
        """
        MATCH (d:Document {fileName: $fileName})
        MERGE (s:SectionChunk {chunk_id: $chunk_id})
        ON CREATE SET s.text = $text, s.embedding = $embedding
        MERGE (d)-[:HAS_SECTION]->(s)
        """,
        fileName=file_name, chunk_id=chunk_id,
        text=chunk_text, embedding=chunk_embedding
    )


def _store_fact(session, chunk_id: str, fact) -> None:
    """Write one FactNode under a SectionChunk and link its EntityNodes.

    `fact` is an AtomicFact; `session` is an open Neo4j session.
    """
    fact_embedding = get_embedding(fact.atomic_fact)

    record = session.run(
        """
        MATCH (s:SectionChunk {chunk_id: $chunk_id})
        MERGE (f:FactNode {fact: $fact_text})
        ON CREATE SET f.embedding = $embedding
        MERGE (s)-[:HAS_FACT]->(f)
        RETURN elementId(f) AS fact_id
        """,
        chunk_id=chunk_id,
        fact_text=fact.atomic_fact,
        embedding=fact_embedding
    ).single()

    # No row means the MATCH found no chunk; say so instead of failing with
    # "'NoneType' object is not subscriptable".
    if record is None:
        raise RuntimeError(f"SectionChunk {chunk_id!r} not found while storing a fact.")
    fact_id = record["fact_id"]

    if fact.key_elements:
        session.run(
            """
            MATCH (f:FactNode) WHERE elementId(f) = $fact_id
            WITH f
            UNWIND $elements AS elem_name
            MERGE (e:EntityNode {name: elem_name})
            MERGE (f)-[:HAS_ENTITY]->(e)
            """,
            fact_id=fact_id, elements=fact.key_elements
        )


def ingest_document(driver: Driver, doc: DocumentNode, chunks: List[str]):
    """
    Ingest a document, its chunks, and LLM-extracted facts into Neo4j.

    Graph written:
        (Document)-[:HAS_SECTION]->(SectionChunk)-[:HAS_FACT]->(FactNode)-[:HAS_ENTITY]->(EntityNode)

    Args:
        driver: Connected Neo4j driver; a session is opened on NEO4J_DATABASE.
        doc: Metadata of the document being ingested.
        chunks: The document text, already split into chunks. Blank chunks are
            skipped; the remaining chunks keep their original index in chunk_id.

    Enrichment is best-effort: if fact extraction or fact storage fails for a
    chunk, the failure is printed and ingestion continues. The
    (Document)-[:HAS_SECTION]->(SectionChunk) link is written before enrichment
    is attempted, so it stays available for plain vector retrieval.
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        # 1. Create/Merge Document Node (hash and type are only set on first creation)
        session.run(
            "MERGE (d:Document {fileName: $fileName}) "
            "ON CREATE SET d.content_hash = $hash, d.source_type = $type",
            fileName=doc.fileName, hash=doc.content_hash, type=doc.source_type
        )

        print(f"Ingesting document: {doc.fileName}")

        for i, chunk_text in enumerate(chunks):
            # Blank chunks carry no information: skip the embedding and LLM call.
            if not chunk_text or not chunk_text.strip():
                print(f"  - Skipped empty chunk {i}.")
                continue

            chunk_id = f"{doc.fileName}_chunk_{i}"

            # 2. Create SectionChunk Node & Link to Document
            _store_chunk(session, doc.fileName, chunk_id, chunk_text)

            # 3. LLM enrichment with graceful fallback
            try:
                enrichment = get_enrichment_from_chunk(chunk_text)

                if not enrichment or not enrichment.atomic_facts:
                    # Soft failure: report it directly instead of raising an
                    # exception just to reach the handler below.
                    print(f"  ! Fallback: Skipped enrichment for chunk {i}: LLM returned no valid atomic_facts.")
                    continue

                # 4. Build the rich graph for this chunk
                for fact in enrichment.atomic_facts:
                    _store_fact(session, chunk_id, fact)

                print(f"  + Enriched chunk {i} with {len(enrichment.atomic_facts)} facts.")

            except Exception as e:
                # Fallback: log and move on to the next chunk; the chunk node
                # itself has already been stored.
                print(f"  ! Fallback: Skipped enrichment for chunk {i}: {e}")

In [8]:
def create_indexes(driver: Driver, dimensions: int = 768, similarity: str = "cosine"):
    """
    Create the Neo4j vector and full-text indexes used for retrieval.

    Indexes created (each with IF NOT EXISTS, so re-running is harmless):
        - fact_embeddings:    vector index on FactNode.embedding
        - section_embeddings: vector index on SectionChunk.embedding (fallback entry point)
        - text_index:         full-text index over FactNode.fact and SectionChunk.text

    Args:
        driver: Connected Neo4j driver; a session is opened on NEO4J_DATABASE.
        dimensions: Length of the stored embedding vectors. Must match what
            get_embedding() returns.
        similarity: Vector similarity function, 'cosine' or 'euclidean'.

    Raises:
        ValueError: If `dimensions` is not a positive integer or `similarity`
            is not one of the supported functions.

    Note:
        Because of IF NOT EXISTS, calling this with different settings does not
        alter an index that already exists; drop the index first to change them.
    """
    # Both values are interpolated into the Cypher text, so validate them strictly.
    if isinstance(dimensions, bool) or not isinstance(dimensions, int) or dimensions <= 0:
        raise ValueError(f"dimensions must be a positive integer, got {dimensions!r}")
    if similarity not in ("cosine", "euclidean"):
        raise ValueError(f"similarity must be 'cosine' or 'euclidean', got {similarity!r}")

    # (index name, node label) for every embedding index.
    vector_indexes = [
        ("fact_embeddings", "FactNode"),          # GraphRAG entry point (facts)
        ("section_embeddings", "SectionChunk"),   # GraphRAG entry point (chunks, fallback)
    ]

    with driver.session(database=NEO4J_DATABASE) as session:
        for index_name, label in vector_indexes:
            session.run(
                f"""
                CREATE VECTOR INDEX {index_name} IF NOT EXISTS
                FOR (n:{label}) ON (n.embedding)
                OPTIONS {{ indexConfig: {{
                  `vector.dimensions`: {dimensions},
                  `vector.similarity_function`: '{similarity}'
                }}}}
                """
            )

        # Full-text index for keyword search fallback
        session.run(
            """
            CREATE FULLTEXT INDEX text_index IF NOT EXISTS
            FOR (n:FactNode|SectionChunk) ON EACH [n.fact, n.text]
            """
        )
        print("Indexes created successfully.")

In [9]:
# === 4. Main Execution Function ===
def main():
    """
    Run the end-to-end demo: connect to Neo4j, create the indexes, and ingest a
    small sample earnings document.

    Errors are printed rather than raised, and the driver is always closed if it
    was created.
    """
    import re  # local import: only the demo's chunking needs it

    # --- Example Financial Document ---
    doc_content = """
    Tesla, Inc. (TSLA) reported its fourth-quarter 2024 earnings on January 25, 2025.
    The company announced total revenue of $25.17 billion, missing analyst expectations.
    Automotive revenue was $21.56 billion. The Cybertruck production ramp-up
    is proceeding, with 1,000 units built in a single week.
    
    For the full year 2024, Tesla delivered 1.81 million vehicles.
    Net income (GAAP) for Q4 2024 was $2.48 billion.
    The company warned of a notably lower volume growth rate in 2025.
    """
    # Simple paragraph chunking. Split on blank lines even when they contain
    # whitespace (a plain split("\n\n") misses an indented "blank" line), and
    # drop empty pieces.
    chunks = [part.strip() for part in re.split(r"\n\s*\n", doc_content) if part.strip()]

    doc = DocumentNode(
        fileName="tesla_q4_2024_earnings.txt",
        # MD5 serves only as a content fingerprint here, not as a security measure.
        content_hash=hashlib.md5(doc_content.encode()).hexdigest()
    )

    driver = None
    try:
        print("Connecting to Neo4j...")
        print("Using database:", NEO4J_DATABASE)
        print("NEO4J_URI:", NEO4J_URI)
        print("NEO4J_USER:", NEO4J_USER)
        # Never print the password itself: cell output is saved with the notebook.
        print("NEO4J_PASSWORD:", "***REDACTED***" if NEO4J_PASSWORD else "<not set>")
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        driver.verify_connectivity()
        print("Neo4j connection successful.")

        # 1. Create indexes first
        create_indexes(driver)

        # 2. Run ingestion
        ingest_document(driver, doc, chunks)

        print("Ingestion complete.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # `driver` stays None when GraphDatabase.driver() itself failed.
        if driver is not None:
            driver.close()
            print("Neo4j connection closed.")

In [10]:
# Run the end-to-end demo: connect to Neo4j, create the indexes, and ingest the
# sample document. Errors are printed by main() rather than raised.
main()

Connecting to Neo4j...
Using database: graphreader2
NEO4J_URI: neo4j://127.0.0.1:7687
NEO4J_USER: neo4j
NEO4J_PASSWORD: ***REDACTED***
Neo4j connection successful.
Indexes created successfully.
Ingesting document: tesla_q4_2024_earnings.txt




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}




  + Enriched chunk 0 with 5 facts.




SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}




  + Enriched chunk 1 with 3 facts.
Ingestion complete.
Neo4j connection closed.


In [11]:
# Smoke-test the enrichment step on a single sentence, without touching Neo4j.
sample_chunk = "Tesla, Inc. (TSLA) reported its fourth-quarter 2024 earnings on January 25, 2025."
result = get_enrichment_from_chunk(sample_chunk)
print(result)  # Prints the validated ChunkEnrichment, or None if the LLM call or validation failed



SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'fake_stream': True, 'response_format': {'type': 'json_object'}, 'extra_body': {}}
atomic_facts=[AtomicFact(key_elements=['Tesla', 'Q4 2024', 'January 25, 2025'], atomic_fact='Tesla reported its Q4 2024 earnings on January 25, 2025.'), AtomicFact(key_elements=['TSLA', 'Tesla, Inc.', 'fourth-quarter 2024'], atomic_fact='Tesla, Inc. is represented by the stock symbol TSLA.'), AtomicFact(key_elements=['January 25, 2025', 'Q4 2024 earnings'], atomic_fact='The Q4 2024 earnings of Tesla were reported on January 25, 2025.')]
