In [1]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever `pip` happens to be first on PATH.
%pip install google-generativeai
%pip install neo4j




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\claud\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\claud\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# %pip (not !pip) guarantees the upgrade lands in the running kernel's environment.
%pip install --upgrade google-generativeai typing-extensions




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\claud\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Vectorization & Indexing

### **Main Objective**
To transform the static text of legal opinions into high-dimensional vector embeddings using Google Gemini's models, and persist them into the Neo4j Graph to enable semantic search (Vector Search).

### **Technical Logic**
This script is the bridge between a standard Graph Database and a **Graph RAG** system. It enables the system to understand the "meaning" of legal texts, not just keywords:

* **Vector Index:** Semantic search relies on a Neo4j Vector Index (`text_embeddings`) defined with 768 dimensions and the **Cosine Similarity** metric, which allows fast nearest-neighbor queries later in the frontend. Note that the code cell below only writes the embeddings; it does not create this index, so the index must be configured separately.
* **Incremental & Resumable Execution:** The script queries specifically for nodes where `embedding IS NULL`. This ensures **idempotency**: if the script is stopped or crashes, it can be restarted without reprocessing existing vectors, saving time and API costs.
* **Rate Limiting & Error Handling:** It embeds one case per API call and pauses 0.5 seconds between calls to respect Google's quotas. If an embedding call fails, the error is logged and the case is skipped; because its `embedding` is still NULL, it is picked up again on the next run.
* **Context Window Management:** Before sending data to the API, it truncates legal texts to 9000 characters. This prevents errors caused by exceeding the token limit of the `text-embedding-004` model while retaining the core legal reasoning of the opinion.

In [1]:
import os
import time
import google.generativeai as genai
from neo4j import GraphDatabase
from tqdm import tqdm

# --- CONFIGURATION ---
# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable of the same name; the defaults target a local server.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")

def load_keys():
    """Read the Neo4j password and the Gemini API key from local text files.

    The password is read from ``neo4j_pass.txt`` and the API key from
    ``key.txt``, both resolved against the current working directory.
    Leading and trailing whitespace (including the final newline) is stripped.

    Returns:
        tuple[str, str]: ``(password, api_key)``.

    Raises:
        SystemExit: with exit code 1 if either file cannot be opened or read.
    """
    try:
        with open("neo4j_pass.txt", "r", encoding="utf-8") as f:
            pwd = f.read().strip()
        with open("key.txt", "r", encoding="utf-8") as f:
            key = f.read().strip()
    except OSError as err:
        # Only I/O failures mean "credentials missing"; anything else should
        # surface as a normal traceback instead of being mislabelled.
        print(f"Errore: file credenziali mancanti. ({err})")
        # Raise SystemExit directly: exit() is an interactive helper that is
        # not meant for use in programs, and it would report success (code 0).
        raise SystemExit(1) from err
    return pwd, key

# Load the credentials, then wire up the two external clients.
# Order matters: the API key has to be configured before any embedding call,
# and the password has to be available before the driver is built.
PWD, API_KEY = load_keys()
genai.configure(api_key=API_KEY)
# Model name passed to genai.embed_content() by get_embedding().
EMBEDDING_MODEL = 'text-embedding-004'

# Shared Neo4j driver used by main(); main() closes it when it finishes.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, PWD))

def get_embedding(text, max_chars=9000):
    """Return the embedding vector for ``text``, or ``None`` on failure.

    Args:
        text: The document text to embed. ``None``, non-string, empty and
            whitespace-only values are rejected without calling the API.
        max_chars: Maximum number of leading characters sent to the model.
            Longer texts are truncated. Defaults to 9000.

    Returns:
        The value stored under the ``'embedding'`` key of the
        ``genai.embed_content`` response, or ``None`` if the input was unusable
        or the API call raised.
    """
    # Guard first: an empty request would only burn quota and come back as an error.
    if not isinstance(text, str) or not text.strip():
        print("Error embedding: empty or missing text")
        return None

    try:
        # Truncate as a safety margin against the model's input limit.
        return genai.embed_content(
            model=EMBEDDING_MODEL,
            content=text[:max_chars],
            task_type="RETRIEVAL_DOCUMENT"
        )['embedding']
    except Exception as e:
        # Deliberately best-effort: log and let the caller skip this document
        # so a single failure does not abort the whole run.
        print(f"Error embedding: {e}")
        return None

def main():
    """Embed every :CASE node that has text but no embedding yet.

    Steps:
      1. Fetch ``(elementId, text)`` for each ``(:CASE)-[:HAS_TEXT]->(:TEXT)``
         pair whose CASE node has no ``embedding`` property.
      2. For each row, call ``get_embedding`` and, if a vector comes back,
         store it in ``c.embedding`` on the CASE node.

    Rows whose embedding fails are skipped and counted; they still match the
    fetch query on the next run. The module-level ``driver`` is always closed
    on exit, including the "nothing to do" path and on errors.
    """
    print("--- AVVIO VETTORIZZAZIONE DIRETTA SU NODI :CASE ---")

    # 1. Find the cases that have TEXT but no EMBEDDING on the CASE node.
    # The check is on c.embedding (the CASE node), not on the TEXT node.
    query_fetch = """
    MATCH (c:CASE)-[:HAS_TEXT]->(t:TEXT)
    WHERE c.embedding IS NULL
    RETURN elementId(c) as id, t.text as text
    """

    # Writes the vector directly onto the CASE node. Built once, outside the loop.
    query_update = """
    MATCH (c:CASE)
    WHERE elementId(c) = $id
    SET c.embedding = $vector
    """

    try:
        with driver.session() as session:
            result = session.run(query_fetch).data()

        print(f"Trovati {len(result)} casi nuovi da vettorizzare.")

        if not result:
            print("Tutto aggiornato!")
            return

        # 2. Process: one session for all writes instead of one per record.
        skipped = 0
        with driver.session() as session:
            for record in tqdm(result, desc="Processing"):
                vector = get_embedding(record['text'])

                if vector:
                    # consume() forces the write to complete before moving on.
                    session.run(
                        query_update, id=record['id'], vector=vector
                    ).consume()
                else:
                    skipped += 1

                # Pause after every API call, including failed ones, so a
                # rate-limit error is not followed by an immediate retry storm.
                time.sleep(0.5)

        if skipped:
            print(f"Casi saltati (embedding non riuscito): {skipped}")
        print("\n--- FATTO: Embeddings salvati su nodi :CASE ---")
    finally:
        # Release the connection pool on every exit path.
        driver.close()

# Entry point: run the vectorization when this code is executed directly.
# When run as a notebook cell this guard is true, so main() executes.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


--- AVVIO VETTORIZZAZIONE DIRETTA SU NODI :CASE ---
Trovati 648 casi nuovi da vettorizzare.


Processing: 100%|██████████| 648/648 [11:27<00:00,  1.06s/it]


--- FATTO: Embeddings salvati su nodi :CASE ---



