### About

This notebook uses the following frameworks:
* LangChain

* LangGraph

### 1. Installing dependencies

In [None]:
# !pip install -qU scikit-learn transformers rank_bm25 langchain_community langchain-qdrant langchain_huggingface

In [None]:
# !pip install pyppeteer

### 2. Local models

Loading the embedding model "sentence-camembert-large" from Hugging Face:

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model id of the sentence-embedding model loaded for this notebook.
EMBEDDING_MODEL_NAME = "dangvantuan/sentence-camembert-large"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
### LLM
from langchain_ollama import ChatOllama

# Name of the Ollama model used by every chat model in this notebook.
local_llm = "mistral:7b-instruct"

# Settings shared by both chat models; temperature 0 is used for both.
_ollama_kwargs = {"model": local_llm, "temperature": 0}

# Free-text chat model.
llm = ChatOllama(**_ollama_kwargs)
# Same model with format="json"; the graders parse its output with json.loads.
llm_json_mode = ChatOllama(**_ollama_kwargs, format="json")

### 3. Vector store

In [None]:
from qdrant_client import QdrantClient

# Switched from Qdrant 'production mode' to 'embedded mode': the client is
# pointed at a local storage directory instead of a server URL.
QDRANT_PATH = "../qdrant_data"
qdrant = QdrantClient(path=QDRANT_PATH)

In [None]:
# import json
# import uuid
# import hashlib
# from qdrant_client import QdrantClient
# from qdrant_client.models import PointStruct, VectorParams, Distance

# # Helper function to convert string IDs to valid UUID-compatible IDs
# def string_to_uuid(string_id):
#     """Convert a string to a deterministic UUID by hashing it"""
#     # Create MD5 hash of the string
#     hash_object = hashlib.md5(string_id.encode())
#     # Convert to hex
#     hex_dig = hash_object.hexdigest()
#     # Create a UUID from the hex string
#     return uuid.UUID(hex_dig)

In [None]:
# # 4. Load your JSON file
# try:
#     with open('output_chunks.json', 'r', encoding='utf-8') as f:
#         documents = json.load(f)
#     print(f"✓ Loaded {len(documents)} documents from output_chunks.json")
# except FileNotFoundError:
#     print("Error: output_chunks.json file not found!")
#     sys.exit(1)
# except json.JSONDecodeError:
#     print("Error: Invalid JSON format in output_chunks.json!")
#     sys.exit(1)

# # 5. Create a collection
# collection_name = "Auditron_legal_chunks"
# print(f"Creating collection '{collection_name}'...")
# qdrant.recreate_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE),
# )

# # 6. Process the documents
# print("Processing documents...")
# total_chunks = sum(len(doc.get("chunks", [])) for doc in documents)
# batch_size = 50
# processed_count = 0
# error_count = 0
# batch_points = []
# id_mapping = {}  # To store mapping between original IDs and UUID IDs

# for doc_idx, doc in enumerate(documents):
#     print(f"Processing document {doc_idx+1}/{len(documents)}")
#     chunks = doc.get("chunks", [])
    
#     for chunk_idx, chunk in enumerate(chunks):
#         try:
#             # Extract text and ID
#             text = chunk.get("text", "")
#             original_id = chunk.get("chunk_id", f"unknown_{doc_idx}_{chunk_idx}")
            
#             if not text.strip():  # Skip empty chunks
#                 print(f"Warning: Empty text in chunk {original_id}")
#                 continue
            
#             # Clean and truncate text if necessary to avoid index errors
#             # Some models have maximum input length limitations
#             max_text_length = 512  # Adjust based on your model's limitations
#             text = text.strip()[:max_text_length]
            
#             # Convert string ID to UUID
#             point_id = string_to_uuid(original_id)
            
#             # Save mapping
#             id_mapping[str(point_id)] = original_id
            
#             # Generate embedding with error handling
#             try:
#                 embedding = model.encode(text, show_progress_bar=False)
#             except Exception as embed_error:
#                 print(f"Embedding error for chunk {original_id}: {str(embed_error)}")
#                 # Try with a shorter text if it might be a length issue
#                 if len(text) > 200:
#                     try:
#                         shorter_text = text[:200]
#                         print(f"Retrying with shorter text for {original_id}")
#                         embedding = model.encode(shorter_text, show_progress_bar=False)
#                     except Exception as retry_error:
#                         print(f"Still failed with shorter text: {str(retry_error)}")
#                         error_count += 1
#                         continue
#                 else:
#                     error_count += 1
#                     continue
            
#             # Create point with UUID
#             point = PointStruct(
#                 id=str(point_id),
#                 vector=embedding.tolist(),
#                 payload={
#                     "text": text,
#                     "original_id": original_id,  # Keep original ID in payload
#                     "structures": chunk.get("structures", []),
#                     "document_path": chunk.get("document_path", []),
#                     "metadata": chunk.get("metadata", {})
#                 }
#             )
            
#             # Add to batch
#             batch_points.append(point)
#             processed_count += 1
            
#             # If batch is full, upload to Qdrant
#             if len(batch_points) >= batch_size:
#                 qdrant.upsert(
#                     collection_name=collection_name,
#                     points=batch_points,
#                 )
#                 print(f"Uploaded batch: {processed_count}/{total_chunks} chunks ({error_count} errors so far)")
#                 batch_points = []
                
#         except Exception as e:
#             print(f"Error processing chunk {original_id}: {str(e)}")
#             error_count += 1

# # Upload any remaining points
# if batch_points:
#     qdrant.upsert(
#         collection_name=collection_name,
#         points=batch_points,
#     )
#     print(f"Uploaded final batch: {processed_count}/{total_chunks} chunks")

# # Save ID mapping for reference (optional)
# try:
#     with open('id_mapping.json', 'w', encoding='utf-8') as f:
#         json.dump(id_mapping, f, indent=2)
#     print("✓ Saved ID mapping to id_mapping.json")
# except Exception as e:
#     print(f"Warning: Could not save ID mapping: {str(e)}")

# print(f"✅ Successfully processed {processed_count}/{total_chunks} chunks into Qdrant collection '{collection_name}'!")
# print(f"Total errors encountered: {error_count}")

### 4. Retrieving

**4.1 Semantic search**

In [None]:
# from langchain_qdrant import QdrantVectorStore
# from qdrant_client import QdrantClient
# from qdrant_client.http.models import SearchParams
# from langchain_core.embeddings import Embeddings

# # 3. Create the vector store with LangChain
# vector_store = QdrantVectorStore(
#     client=qdrant,
#     collection_name="Auditron_legal_chunks",  # Your collection name
#     content_payload_key="text", 
#     embedding=embeddings,
# )
# #    search_params=SearchParams(hnsw_ef=128)  # Your search params
# # 4. Create the retriever from the vector store
# retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# # 5. Use the retriever
# query = "Quel est le nouveau taux unifié de retenue à la source applicable aux loyers, rémunérations non commerciales, honoraires et commissions en Tunisie depuis l'adoption de la loi N° 2020-46 du 23 décembre 2020?"
# documents = retriever.invoke(query)

# # The retrieved documents will include content and metadata
# for doc in documents:
#     print(doc.page_content)  # Access the content
#     print(doc.metadata)      # Access the metadata

### Document re-ranking

In [None]:
# import json
# from langchain_core.messages import HumanMessage, SystemMessage

In [None]:
# import json
# from langchain_core.messages import HumanMessage, SystemMessage

# ### Retrieval Grader

# # Doc grader instructions
# doc_grader_instructions = """Vous êtes un modèle chargé d’évaluer la pertinence d’un document récupéré par rapport à une question utilisateur.

# Si le document contient des mots-clés ou un sens sémantique lié à la question, considérez-le comme pertinent."""

# # Grader prompt
# doc_grader_prompt = """Voici le document récupéré : \n\n {document} \n\n Voici la question de l'utilisateur : \n\n {question}.

# Évaluez soigneusement et objectivement si le document contient au moins une information pertinente en lien avec la question.

# Retournez un JSON avec une clé unique, binary_score, qui sera 'oui' ou 'non' pour indiquer si le document contient au moins une information pertinente pour la question."""
# # Test
# question = "Quel est le nouveau taux unifié de retenue à la source applicable aux loyers, rémunérations non commerciales, honoraires et commissions en Tunisie depuis l'adoption de la loi N° 2020-46 du 23 décembre 2020?"

# docs = retriever.invoke(question)
# doc_txt = docs[0].page_content

# doc_grader_prompt_formatted = doc_grader_prompt.format(
#     document=doc_txt, question=question
# )
# result = llm_json_mode.invoke(
#     [SystemMessage(content=doc_grader_instructions)]
#     + [HumanMessage(content=doc_grader_prompt_formatted)]
# )
# json.loads(result.content)

In [None]:
# doc_txt

In [None]:
# docs = retriever.invoke(question)
# doc_txt = docs[2].page_content

# doc_grader_prompt_formatted = doc_grader_prompt.format(
#     document=doc_txt, question=question
# )
# result = llm_json_mode.invoke(
#     [SystemMessage(content=doc_grader_instructions)]
#     + [HumanMessage(content=doc_grader_prompt_formatted)]
# )
# json.loads(result.content)

In [None]:
# doc_txt

**4.2 Implementing hybrid search with a score threshold**

In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_core.embeddings import Embeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing import List, Any
from pydantic import Field
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Updated Qdrant retriever with explicit embeddings
class QdrantScoreRetriever(BaseRetriever):
    """Semantic retriever that queries Qdrant directly and keeps the similarity score.

    Each hit is returned as a ``Document`` whose ``page_content`` is the ``text``
    payload field and whose metadata holds the remaining payload fields plus
    ``score`` (the score reported by Qdrant for that hit).

    Attributes:
        vectorstore: Wrapper that provides the Qdrant client and collection name.
        embeddings: Model used to embed the query string.
        k: Maximum number of hits requested from Qdrant.
        score_threshold: Minimum score passed to Qdrant with the search request.
    """

    vectorstore: QdrantVectorStore = Field(...)
    embeddings: Embeddings = Field(...)  # Explicit embeddings field
    k: int = Field(default=5)
    score_threshold: float = Field(default=0.4)

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Embed ``query``, search the collection and convert the hits to Documents.

        Args:
            query: Natural-language query to embed and search for.

        Returns:
            Documents in the order returned by Qdrant, each with ``metadata["score"]``.
        """
        query_embedding = self.embeddings.embed_query(query)
        client = self.vectorstore.client

        # Arguments shared by the current and the legacy search entry points.
        search_kwargs = dict(
            collection_name=self.vectorstore.collection_name,
            limit=self.k,
            score_threshold=self.score_threshold,
            with_payload=True,
        )

        # `client.search` triggers a warning in recent qdrant-client releases;
        # prefer `query_points` when the installed client provides it and keep
        # `search` only as a fallback for older clients.
        if hasattr(client, "query_points"):
            hits = client.query_points(query=query_embedding, **search_kwargs).points
        else:
            hits = client.search(query_vector=query_embedding, **search_kwargs)

        documents = []
        for hit in hits:
            # A point may carry no payload at all; treat that as an empty one
            # instead of failing on `None.get`.
            payload = hit.payload or {}
            metadata = {key: value for key, value in payload.items() if key != "text"}
            metadata["score"] = hit.score  # Store similarity score
            documents.append(
                Document(page_content=payload.get("text") or "", metadata=metadata)
            )

        return documents

# 2. Initialize components
# Reuse the client and embedding model created by the earlier cells when they
# exist, and create them only if missing so this cell still runs on its own.
# NOTE(review): embedded-mode Qdrant appears to lock its storage folder, so
# opening a second client on the same path in one kernel can fail; reloading
# the embedding model is also slow.
if "qdrant" not in globals():
    qdrant = QdrantClient(path="../qdrant_data")
if "embeddings" not in globals():
    embeddings = HuggingFaceEmbeddings(model_name="dangvantuan/sentence-camembert-large")

COLLECTION_NAME = "Auditron_legal_chunks"
SCROLL_PAGE_SIZE = 1_000

# Create vector store
vectorstore = QdrantVectorStore(
    client=qdrant,
    collection_name=COLLECTION_NAME,
    content_payload_key="text",
    embedding=embeddings,
)

# 3. Create Qdrant retriever with explicit embeddings
qdrant_retriever = QdrantScoreRetriever(
    vectorstore=vectorstore,
    embeddings=embeddings,  # Pass embeddings explicitly
    k=5,
    score_threshold=0.4
)

# Load BM25 documents.
# scroll() returns one page of points plus the offset of the next page, so keep
# paging until the offset is None; a single call would silently truncate the
# BM25 corpus for collections larger than one page.
all_documents = []
next_offset = None
while True:
    points, next_offset = qdrant.scroll(
        collection_name=COLLECTION_NAME,
        limit=SCROLL_PAGE_SIZE,
        offset=next_offset,
        with_payload=True
    )
    for point in points:
        payload = point.payload or {}
        all_documents.append(
            Document(
                page_content=payload.get("text") or "",
                metadata={k: v for k, v in payload.items() if k != "text"}
            )
        )
    if next_offset is None:
        break

# `with_score` is not a BM25Retriever field, so it is no longer passed; BM25
# results carry no "score" entry in their metadata.
bm25_retriever = BM25Retriever.from_documents(
    all_documents,
    k=5
)

# 4. Create ensemble retriever
# The Qdrant retriever is listed first on purpose: when both retrievers return
# the same chunk, the ensemble keeps the first copy it sees, and only the
# Qdrant copy carries the "score" metadata that retrieve_with_threshold reads.
# The weights are reordered with the retrievers, so ranking weights are unchanged.
ensemble_retriever = EnsembleRetriever(
    retrievers=[qdrant_retriever, bm25_retriever],
    weights=[0.6, 0.4]
)

# 5. Search function with threshold
def retrieve_with_threshold(query: str, threshold: float = 0.4) -> List[Document]:
    """Run the ensemble retriever and keep documents whose score reaches ``threshold``.

    The score compared here is ``metadata["score"]``, which only the Qdrant
    retriever sets (its similarity score). Documents without that entry, such
    as BM25-only hits, are treated as score 0 and are therefore dropped for any
    positive threshold. It is not a combined BM25 + semantic score.

    Args:
        query: Query passed to ``ensemble_retriever``.
        threshold: Minimum ``metadata["score"]`` a document needs to be kept.

    Returns:
        The retained documents, in the order produced by the ensemble retriever.
        Each retained document is also printed with its score.
    """
    docs = ensemble_retriever.invoke(query)

    def _score(doc: Document) -> float:
        # Missing or None scores count as 0 so comparison and formatting never fail.
        return doc.metadata.get("score") or 0

    # Filter documents by score
    filtered_docs = [doc for doc in docs if _score(doc) >= threshold]

    # Print results. Use the same accessor as the filter: with threshold <= 0 a
    # document without a score passes the filter, and indexing metadata["score"]
    # directly would raise KeyError.
    for i, doc in enumerate(filtered_docs, 1):
        print(f"Document {i} [Score: {_score(doc):.3f}]")
        print(doc.page_content + "\n")

    return filtered_docs

# 6. Execute the search
# Sample question used to exercise the hybrid retrieval with the default threshold.
_sample_question = "Quel est le nouveau taux unifié de retenue à la source applicable aux loyers, rémunérations non commerciales, honoraires et commissions en Tunisie depuis l'adoption de la loi N° 2020-46 du 23 décembre 2020?"
results = retrieve_with_threshold(_sample_question)




No sentence-transformers model found with name dangvantuan/sentence-camembert-large. Creating a new one with mean pooling.


Document 1 [Score: 0.630]
Pour la retenue à la source Suite à l'augmentation du taux de l'impôt sur les sociétés de 15% à 20%, les taux de retenue à la source dus au titre des opérations de cession par les personnes morales non résidentes non établies en Tunisie de biens immobiliers situés en Tunisie ou de droits y afférents, ou de droits sociaux dans les sociétés civiles immobilières, ainsi que de titres ou droits y afférents ou de leur rétrocession, réalisés en Tunisie, ont été adaptés par leur augmentation de: 10% à 15% pour le p

Document 2 [Score: 0.622]
10%(1) au titre des honoraires, commissions, courtages, loyers et rémunérations des activités non commerciales qu’elle qu’en soit l’appellation payés par l’Etat, les collectivités locales, les personnes morales ainsi que les personnes physiques soumises à l’impôt sur le revenu selon le régime réel et les personnes visées au paragraphe II de l’article 22 du présent code. (Modifié Art 69-1 LF 2004-90 du 31/12/2004, Art.45-1 LF 2012-

  results = self.vectorstore.client.search(


### 5. Web search

In [None]:
## Tavily web-search
import os
import getpass


def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")


# Ask for the Tavily API key only when it is not already set in the environment.
_set_env("TAVILY_API_KEY")
# NOTE(review): presumably read by the Hugging Face tokenizers library to allow
# parallel tokenization; confirm it is set early enough to take effect.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
from langchain_community.tools.tavily_search import TavilySearchResults

# Define allowed domains for Tunisian governmental financial institutions.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously merged the last two domains into
# the single bogus entry "portail.finances.gov.tnswiver.io".
tunisian_gov_domains = [
    "finances.gov.tn",          # Ministry of Finance
    "douane.gov.tn",            # Tunisian Customs
    "impots.finances.gov.tn",   # Tax authority
    "cga.gov.tn",               # General Committee of Insurance
    "cmf.tn",                   # Financial Market Council
    "portail.finances.gov.tn",  # Finance Ministry Portal
    "swiver.io",                # Additional allowed source
]

# Create the search tool with domain filtering.
# The result cap of TavilySearchResults is `max_results`; `k` is not one of its
# documented fields, so the intended limit of 3 is passed as `max_results`.
web_search_tool = TavilySearchResults(
    max_results=3,
    include_domains=tunisian_gov_domains
)

In [None]:
query = "retenue à la source appliqué sur les honoraires en 2021?"
# Test: smoke-test the web search tool only.
# This cell runs before `rag_prompt`, `format_docs` and the message classes
# exist, so it no longer calls them (that raised NameError). Answer generation
# from retrieved context is exercised in the "Generation" section below.
docs = web_search_tool.invoke(query)
if isinstance(docs, str):
    # NOTE(review): the tool appears to return a plain string when the search
    # fails; show it instead of indexing into it.
    print(f"Web search failed: {docs}")
else:
    # Each hit is a dict, so the text lives under the "content" key rather than
    # a `page_content` attribute.
    docs_txt = "\n\n".join(hit["content"] for hit in docs)
    print(docs_txt)

NameError: name 'format_docs' is not defined

### 6. Generation

In [None]:
### Generate

# Prompt (French) for answer generation. It has two placeholders, {context} and
# {question}, filled with str.format, and instructs the model to answer only
# from the context, in at most three sentences.
rag_prompt = """Vous êtes un assistant pour des tâches de question-réponse.

Voici le contexte à utiliser pour répondre à la question :

{context}

Réfléchissez soigneusement au contexte ci-dessus.

Maintenant, examinez la question de l'utilisateur :

{question}

Fournissez une réponse à cette question en utilisant uniquement le contexte ci-dessus.

Utilisez un maximum de trois phrases et gardez la réponse concise.

Réponse :"""

# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Test
# HumanMessage is imported here because this cell runs before the later cell
# that imports the message classes; without it a fresh kernel raises NameError.
from langchain_core.messages import HumanMessage

query = '''TEJ'''
docs = ensemble_retriever.invoke(query)
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=query)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)


Hallucination grader:

In [None]:
### Hallucination Grader

# Hallucination grader instructions (French system prompt).
# Frames the model as a teacher grading a STUDENT ANSWER against FACTS:
# 'oui' means the answer is grounded in the facts and adds nothing outside
# them, 'non' means it fails at least one of those criteria.
hallucination_grader_instructions = """

Vous êtes un enseignant en train de corriger un quiz.

Vous recevrez des FAITS et une RÉPONSE D'ÉLÈVE.

Voici les critères de notation à suivre :

(1) Assurez-vous que la RÉPONSE DE L'ÉLÈVE est bien fondée sur les FAITS.

(2) Assurez-vous que la RÉPONSE DE L'ÉLÈVE ne contient pas d'informations « hallucinées » qui sortent du cadre des FAITS.

Note :

Une note de oui signifie que la réponse de l'élève respecte tous les critères. C’est la note la plus élevée (meilleure).

Une note de non signifie que la réponse de l'élève ne respecte pas tous les critères. C’est la note la plus basse que vous pouvez attribuer.

Expliquez votre raisonnement étape par étape afin de garantir la justesse de votre raisonnement et de votre conclusion.

Évitez d’énoncer directement la bonne réponse dès le départ.
"""


# Grader prompt (French user message). Placeholders {documents} and {generation}
# are filled with str.format; the model is asked for a JSON object with the
# keys `binary_score` ('oui' or 'non') and `explanation`.
hallucination_grader_prompt = """FAITS : \n\n {documents} \n\n RÉPONSE DE L'ÉLÈVE : {generation}.

Retournez un JSON avec deux clés :  
- binary_score : une valeur 'oui' ou 'non' indiquant si la RÉPONSE DE L'ÉLÈVE est bien fondée sur les FAITS.  
- explanation : une explication justifiant la note attribuée.
"""

# Test using documents and generation from above
# json and the message classes are imported here because this cell runs before
# the later cell that imports them; without this a fresh kernel raises NameError.
import json
from langchain_core.messages import HumanMessage, SystemMessage

hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    documents=docs_txt, generation=generation.content
)
result = llm_json_mode.invoke(
    [SystemMessage(content=hallucination_grader_instructions)]
    + [HumanMessage(content=hallucination_grader_prompt_formatted)]
)
json.loads(result.content)

Answer grader

In [None]:
### Answer Grader

# Answer grader instructions (French system prompt).
# Frames the model as a teacher grading whether a STUDENT ANSWER helps answer
# the QUESTION: 'oui' is the passing grade (extra information is tolerated),
# 'non' the failing one.
answer_grader_instructions = """Vous êtes un enseignant en train de corriger un quiz.

Vous recevrez une QUESTION et une RÉPONSE D'ÉLÈVE.

Voici les critères de notation à suivre :

(1) La RÉPONSE DE L'ÉLÈVE aide à répondre à la QUESTION.

Note :

Une note de oui signifie que la réponse de l'élève respecte tous les critères. C’est la note la plus élevée (meilleure).

L’élève peut recevoir une note de oui même si la réponse contient des informations supplémentaires qui ne sont pas explicitement demandées dans la question.

Une note de non signifie que la réponse de l'élève ne respecte pas tous les critères. C’est la note la plus basse que vous pouvez attribuer.

Expliquez votre raisonnement étape par étape afin de garantir la justesse de votre raisonnement et de votre conclusion.

Évitez d’énoncer directement la bonne réponse dès le départ.
"""

# Grader prompt (French user message). Placeholders {question} and {generation}
# are filled with str.format; the model is asked for a JSON object with the
# keys `binary_score` ('oui' or 'non') and `explanation`.
answer_grader_prompt = """QUESTION : \n\n {question} \n\n RÉPONSE DE L'ÉLÈVE : {generation}.

Retournez un JSON avec deux clés :  
- binary_score : une valeur 'oui' ou 'non' indiquant si la RÉPONSE DE L'ÉLÈVE respecte les critères.  
- explanation : une explication justifiant la note attribuée.
"""

# Test
# json and the message classes are imported here because this cell runs before
# the later cell that imports them.
import json
from langchain_core.messages import HumanMessage, SystemMessage

answer = generation.content

# Test using question and generation from above.
# The question that produced `generation` is stored in `query`; no live cell
# assigns a variable named `question`, so referencing it raised NameError.
answer_grader_prompt_formatted = answer_grader_prompt.format(
    question=query, generation=answer
)
result = llm_json_mode.invoke(
    [SystemMessage(content=answer_grader_instructions)]
    + [HumanMessage(content=answer_grader_prompt_formatted)]
)
json.loads(result.content)

### 7. ChatAgent with LangGraph

In [None]:
import json
from langchain_core.messages import HumanMessage, SystemMessage

In [None]:
import operator
from typing_extensions import TypedDict
from typing import List, Annotated, Optional, Dict

class ConversationState(TypedDict):
    """
    Graph state for the Withholding Tax RAG Pipeline containing information
    propagated through each node in the workflow.

    Node functions read keys from this mapping and return a dict of the keys
    they update. NOTE(review): some nodes in this notebook also return keys
    that are not declared here (e.g. "processing_metadata",
    "response_metadata", "documents_used", "web_search_used",
    "insufficient_data"); confirm whether those should be part of the schema.
    """
    # User input
    original_query: str  # Original user question about withholding taxes
    
    # Query re-writing step
    rewritten_query: str  # Rewritten query produced by query_rewriting and used for retrieval
    
    # Retrieval step
    # NOTE(review): retrieval_with_filtering stores Document objects here, not
    # plain strings; the List[str] annotation looks stale.
    retrieved_documents: List[str]  # Documents from hybrid search (BM25 + semantic)
    relevancy_scores: Optional[List[float]]  # metadata["score"] of each retrieved document (0.0 when absent)
    
    # LLM Document Filtering step - Binary relevance (relevant/irrelevant)
    # NOTE(review): also populated with Document objects by retrieval_with_filtering.
    filtered_documents: List[str]  # Only documents classified as relevant by LLM
    document_relevance: Dict[str, bool]  # Mapping of document id to relevance (True=relevant, False=irrelevant)
    filter_reasoning: Optional[Dict[str, str]]  # Raw grader response text per document id
    
    # Response generation step
    generated_response: str  # LLM generated response from filtered documents
    
    # Response validation step
    # NOTE(review): the answer grader prompt asks for 'oui'/'non'; confirm which
    # spelling the validation node actually stores here.
    validation_result: str  # "Yes" or "No" - whether response answers the query
    validation_reason: Optional[str]  # Explanation for the validation decision
    
    # Fallback & retry mechanism
    # operator.add is attached as the reducer annotation; presumably the graph
    # adds each returned value to the current count rather than overwriting it.
    retry_count: Annotated[int, operator.add]  # Track number of retry attempts
    max_retries: int  # Maximum number of retry attempts (no default is set here; the caller must supply it)
    web_search_results: Optional[List[str]]  # Results from web search fallback
    
    # Final response
    final_response: str  # Final validated response to be returned to the user

**Defining the nodes**

In [None]:
from langchain.schema import Document
from langgraph.graph import END

In [None]:
### Nodes
def query_rewriting(state):
    """
    Process and rewrite the initial user query for optimal retrieval.

    Args:
        state (dict): The current graph state with original_query

    Returns:
        state (dict): Update containing rewritten_query. Falls back to the
        original query when the model returns no usable text, so downstream
        nodes never receive an empty query.
    """
    print("---QUERY REWRITING---")
    original_query = state["original_query"]

    # Adjacent string literals are joined verbatim, so every sentence must end
    # with its own separating space; the last two sentences were previously
    # glued together as "clair.répond".
    system_prompt = (
        "Tu es un expert en reformulation de prompts. "
        "Ton travail est de prendre un prompt complexe, de le simplifier, de le rendre clair, direct et facile à comprendre. "
        "Le résultat doit être en français parfait. Ne change pas le sens du prompt, seulement rends-le plus simple et clair. "
        "répond juste par la question."
    )

    # Rewrite query for optimal retrieval
    rewritten = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=original_query)
    ])

    # Strip surrounding whitespace/newlines the model may add around the question.
    content = rewritten.content
    rewritten_query = content.strip() if isinstance(content, str) else ""

    return {"rewritten_query": rewritten_query or original_query}

In [None]:
from typing import List, Dict, Tuple, Any, Optional
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage

# Re-create the shared chat model (the Ollama model named by `local_llm`, not Bloom).
# The import at the top of this cell rebinds `ChatOllama` to the
# `langchain_community` class, and the cell output flags this line with a
# warning. Re-import the `langchain_ollama` class used when `llm` was first
# created, so `llm` and `llm_json_mode` stay on the same implementation.
from langchain_ollama import ChatOllama

llm = ChatOllama(model=local_llm, temperature=0)

import json
from typing import List, Dict, Tuple, Any
from langchain.schema import SystemMessage, HumanMessage

def filter_documents_by_relevance(query: str, documents: List[Any]) -> Tuple[List[Any], Dict[str, bool], Dict[str, str]]:
    """
    Filter documents based on their relevance to the query using a LLM in JSON mode.

    The grading model is the module-level ``llm_json_mode``; it is not a parameter.

    Args:
        query (str): The query to evaluate document relevance against
        documents (List[Any]): Document objects exposing ``page_content`` and ``metadata``

    Returns:
        Tuple containing:
        - filtered_documents (List[Any]): Only the relevant documents
        - document_relevance (Dict[str, bool]): Mapping of document IDs to relevance decision
        - filter_reasoning (Dict[str, str]): Raw grader response per document ID, with an
          error note appended when the response could not be interpreted
    """
    import hashlib

    filtered_documents = []
    document_relevance = {}
    filter_reasoning = {}

    # Scores accepted as "relevant". The prompt asks for 'yes'/'no'; 'oui' and
    # 'true' are tolerated because the graded documents and questions are French
    # and JSON mode may emit a boolean.
    positive_scores = {"yes", "oui", "true"}

    # Doc grader instructions
    doc_grader_instructions = """You are a grader assessing relevance of a retrieved document to a user question.

If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

    # Grader prompt template
    doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

This carefully and objectively assess whether the document contains at least some information that is relevant to the question.

Return JSON with single key, binary_score, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question."""

    for doc in documents:
        doc_content = doc.page_content

        # Prefer an explicit id from the metadata. Otherwise derive one from the
        # content with a real digest: the built-in hash() of a str is randomised
        # per process, so it cannot identify a document across runs.
        doc_id = doc.metadata.get("id") or doc.metadata.get("original_id")
        if doc_id is None:
            doc_id = hashlib.sha256(doc_content.encode("utf-8")).hexdigest()
        doc_id = str(doc_id)

        # Format the grader prompt with the document content and query
        doc_grader_prompt_formatted = doc_grader_prompt.format(
            document=doc_content,
            question=query
        )

        # Get LLM response using the system and human message approach
        response = llm_json_mode.invoke(
            [SystemMessage(content=doc_grader_instructions)] +
            [HumanMessage(content=doc_grader_prompt_formatted)]
        )

        # Keep the raw response text as the recorded reasoning
        response_text = response.content if hasattr(response, 'content') else str(response)
        if not isinstance(response_text, str):
            response_text = str(response_text)
        filter_reasoning[doc_id] = response_text

        try:
            parsed_response = json.loads(response_text)
        except json.JSONDecodeError:
            # Response is not valid JSON: treat the document as irrelevant
            document_relevance[doc_id] = False
            filter_reasoning[doc_id] += " (Error: Response not in valid JSON format)"
            continue

        if not isinstance(parsed_response, dict):
            # Valid JSON but not an object (e.g. a list or bare string): there is
            # no binary_score to read, so grade as irrelevant instead of crashing.
            document_relevance[doc_id] = False
            filter_reasoning[doc_id] += " (Error: Response is not a JSON object)"
            continue

        # str() guards against a non-string score such as a JSON boolean.
        score = str(parsed_response.get("binary_score", "")).strip().lower()
        is_relevant = score in positive_scores

        # Store relevance result
        document_relevance[doc_id] = is_relevant

        # Add to filtered documents if relevant
        if is_relevant:
            filtered_documents.append(doc)

    return filtered_documents, document_relevance, filter_reasoning

def retrieval_with_filtering(state):
    """
    Perform hybrid search (BM25 + semantic) with a score threshold, then filter
    the results with the JSON-mode LLM grader for binary relevance.

    Args:
        state (dict): The current graph state with rewritten_query

    Returns:
        state (dict): Update with retrieved_documents, relevancy_scores,
        filtered_documents, document_relevance, filter_reasoning and
        processing_metadata
    """
    from datetime import datetime

    print("---RETRIEVAL WITH LLM FILTERING---")
    print(f"Processing query: {state['rewritten_query']}")
    query = state["rewritten_query"]

    # Hybrid search using BM25 + semantic
    documents = retrieve_with_threshold(query)
    print(f"Retrieved {len(documents)} documents")

    # Scores stored by the retriever in each document's metadata (0.0 when absent)
    relevancy_scores = [doc.metadata.get("score", 0.0) for doc in documents]

    # Filter documents by relevance using the LLM grader
    filtered_docs, doc_relevance, reasoning = filter_documents_by_relevance(query, documents)
    print(f"Filtered to {len(filtered_docs)} relevant documents")

    # Normalise the relevance map so every key is a string
    document_relevance = {str(doc_id): relevance for doc_id, relevance in doc_relevance.items()}

    # Add metadata about filtering. The timestamp is taken at run time; it was
    # previously a hard-coded literal that never matched the actual run.
    processing_metadata = {
        "processed_by": "fouratmansouri",
        "processing_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total_documents": len(documents),
        "relevant_documents": len(filtered_docs)
    }

    return {
        "retrieved_documents": documents,
        "relevancy_scores": relevancy_scores,
        "filtered_documents": filtered_docs,
        "document_relevance": document_relevance,
        "filter_reasoning": reasoning,
        "processing_metadata": processing_metadata
    }

  llm = ChatOllama(model=local_llm, temperature=0)


In [None]:
# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [None]:
def response_generation(state):
    """
    Generate an answer from the LLM-filtered documents, with fallbacks.

    Document selection, in order:
    1. ``filtered_documents`` from the state when non-empty.
    2. Otherwise the retrieved documents whose score is at least 0.7.
    3. If fewer than two such documents exist, web search results are added.
       If the web search raises or nothing is found, an "insufficient data"
       response is returned without calling the LLM.

    Args:
        state (dict): The current graph state with rewritten_query, retrieved_documents,
                     and filtered_documents from LLM relevance filtering

    Returns:
        state (dict): Update with generated_response and response_metadata, plus
        either documents_used / web_search_used (success) or insufficient_data
    """
    from datetime import datetime

    print("---RESPONSE GENERATION---")
    # Timestamp taken at run time; it was previously a hard-coded literal.
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    user_login = "fouratmansouri"
    print(f"Process initiated by {user_login} at {current_time}")

    query = state["rewritten_query"]
    retrieved_docs = state.get("retrieved_documents") or []
    web_search_performed = False

    # First check for filtered documents
    if state.get("filtered_documents"):
        documents = state["filtered_documents"]
        print(f"Using {len(documents)} filtered documents for response generation")

    # Instead of using all retrieved documents, get only those above threshold
    else:
        print("No filtered documents available, extracting high-quality documents")
        # Define threshold for document quality
        threshold_value = 0.7  # Adjust based on your scoring system

        def _doc_score(doc):
            # The retriever stores the similarity under "score"; the previous
            # lookup of "relevance_score" never matched, so no document could
            # ever qualify. The old key is still honoured as a fallback.
            metadata = doc.metadata
            return metadata.get("score", metadata.get("relevance_score", 0)) or 0

        # Filter only high-quality documents
        high_quality_docs = [doc for doc in retrieved_docs if _doc_score(doc) >= threshold_value]

        documents = high_quality_docs
        print(f"Using {len(documents)} high-quality documents for response generation")

        # If no high-quality documents found or documents are insufficient, try web search
        if len(documents) < 2:  # Adjust minimum document threshold as needed
            print("Insufficient high-quality documents, attempting web search")

            # Perform web search to get additional information
            try:
                web_docs = web_search_tool.invoke({"query": query})
                # Tag web hits so their origin can be recognised afterwards.
                web_results = [
                    Document(
                        page_content=d["content"],
                        metadata={"source": "web_search", "url": d.get("url", "")},
                    )
                    for d in web_docs
                ]
                print(f"Retrieved {len(web_results)} documents from web search")

                # Combine existing high-quality documents with web results
                documents = high_quality_docs + web_results
                print(f"Using combined {len(documents)} documents for response generation")

                # Still check if we have enough data
                if not documents:
                    raise ValueError("No documents found from web search either")

                web_search_performed = bool(web_results)

            except Exception as e:
                print(f"Web search failed: {str(e)}")
                # If web search fails or returns no results, return informative response
                insufficient_data_response = f"Je n'ai pas trouvé d'informations suffisamment pertinentes concernant '{query}', même après recherche sur le web. Pourriez-vous reformuler votre question ou fournir plus de détails?"

                response_metadata = {
                    "total_documents": len(retrieved_docs),
                    "relevant_documents": 0,
                    "generated_by": user_login,
                    "generation_timestamp": current_time,
                    "status": "insufficient_data",
                    "web_search_attempted": True,
                    "web_search_successful": False
                }

                return {
                    "generated_response": insufficient_data_response,
                    "response_metadata": response_metadata,
                    "insufficient_data": True
                }

    # Format documents for context
    docs_txt = format_docs(documents)

    rag_prompt = """Vous êtes un assistant pour des tâches de question-réponse.

Voici le contexte à utiliser pour répondre à la question :

{context}

Réfléchissez soigneusement au contexte ci-dessus.

Maintenant, examinez la question de l'utilisateur :

{question}

Fournissez une réponse à cette question en utilisant uniquement le contexte ci-dessus.

Utilisez un maximum de trois phrases et gardez la réponse concise.

Réponse :"""
    # Generate response using RAG
    rag_prompt_formatted = rag_prompt.format(
        context=docs_txt,
        question=query
    )

    generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
    print(generation.content)

    # Determine source of documents used. Web documents are recognised by the
    # local flag or by the "source" metadata tag; the previous check read a
    # `source` attribute that was never set, so the flag stayed False even when
    # web results were used.
    web_search_used = (
        "web_search_results" in state
        or web_search_performed
        or any((getattr(doc, "metadata", None) or {}).get("source") == "web_search" for doc in documents)
    )

    # Add metadata about document filtering to the response
    response_metadata = {
        "total_documents": len(retrieved_docs),
        "relevant_documents": len(documents),
        "generated_by": user_login,
        "generation_timestamp": current_time,
        "status": "success",
        "web_search_used": web_search_used
    }

    return {
        "generated_response": generation.content,
        "response_metadata": response_metadata,
        "documents_used": documents,  # Track which documents were actually used
        "web_search_used": web_search_used  # Flag if web search was used
    }

In [None]:
def response_validation(state):
    """
    Ask the JSON-mode LLM judge whether the generated response answers the
    rewritten query.

    The judge is prompted to return a JSON object with the keys
    ``binary_score`` ('oui' / 'non') and ``explanation``. The raw verdict is
    normalised (whitespace, surrounding quotes, case) so that answers such as
    "Oui" are not mistaken for a failure by the exact ``== "oui"`` comparison
    done when routing. Any unusable judge output (invalid JSON, non-object
    JSON, missing score) is reported as "non" instead of raising.

    Args:
        state (dict): The current graph state. Reads ``rewritten_query`` and
            ``generated_response``.

    Returns:
        dict: State update containing
            - ``validation_result``: exactly "oui" or "non"
            - ``validation_reason``: the judge's explanation (or a diagnostic
              message when the judge output could not be used)
            - ``final_response``: the generated response, only present when
              the verdict is "oui"
    """
    print("---RESPONSE VALIDATION---")
    rewritten_query = state["rewritten_query"]
    generated_response = state["generated_response"]
    
    # Answer grader instructions
    answer_grader_instructions = """Vous êtes un enseignant en train de corriger un quiz.
Vous recevrez une QUESTION et une RÉPONSE D'ÉLÈVE.
Voici les critères de notation à suivre :
(1) La RÉPONSE DE L'ÉLÈVE aide à répondre à la QUESTION.
Note :
Une note de oui signifie que la réponse de l'élève respecte tous les critères. C'est la note la plus élevée (meilleure).
L'élève peut recevoir une note de oui même si la réponse contient des informations supplémentaires qui ne sont pas explicitement demandées dans la question.
Une note de non signifie que la réponse de l'élève ne respecte pas tous les critères. C'est la note la plus basse que vous pouvez attribuer.
Expliquez votre raisonnement étape par étape afin de garantir la justesse de votre raisonnement et de votre conclusion.
Évitez d'énoncer directement la bonne réponse dès le départ.
"""
    # Grader prompt
    answer_grader_prompt = """QUESTION : \n\n {rewritten_query} \n\n RÉPONSE DE L'ÉLÈVE : {generated_response}.
Retournez un JSON avec deux clés :  
- binary_score : une valeur 'oui' ou 'non' indiquant si la RÉPONSE DE L'ÉLÈVE respecte les critères.  
- explanation : une explication justifiant la note attribuée.
"""
    # Format the prompt correctly using the state variables
    answer_grader_prompt_formatted = answer_grader_prompt.format(
        rewritten_query=rewritten_query, 
        generated_response=generated_response
    )
    
    # Call the LLM
    result = llm_json_mode.invoke(
        [SystemMessage(content=answer_grader_instructions)]
        + [HumanMessage(content=answer_grader_prompt_formatted)]
    )
    
    # Parse the JSON response defensively: a malformed judge answer must not
    # crash the whole graph, it should simply count as a failed validation.
    try:
        parsed = json.loads(result.content)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        parsed = None
    
    if not isinstance(parsed, dict):
        return {
            "validation_result": "non",
            "validation_reason": "LLM judge did not return a valid JSON object."
        }
    
    # Normalise the verdict so "Oui", " oui " or "'oui'" are all accepted.
    raw_score = parsed.get("binary_score", "")
    normalised_score = str(raw_score).strip().strip("'\"").lower()
    verdict = "oui" if normalised_score == "oui" else "non"
    
    explanation = parsed.get("explanation")
    if explanation is None:
        explanation = "No explanation provided by the LLM judge."
    
    update = {
        "validation_result": verdict,  # "oui" or "non"
        "validation_reason": explanation
    }
    if verdict == "oui":
        # Publish the validated answer from the node itself.
        # NOTE(review): assumes `final_response` is a declared key of the graph
        # state (fallback_retry already returns it) - confirm against
        # ConversationState.
        update["final_response"] = generated_response
    return update

In [None]:
def fallback_retry(state):
    """
    Fall back to web search for additional content when validation fails.

    While the retry budget is not exhausted, runs a web search for the
    rewritten query and combines the hits with the documents that previously
    passed the relevance threshold. Once the budget is exhausted, returns a
    final apology message instead of searching again.

    Args:
        state (dict): The current graph state. Reads ``rewritten_query`` and,
            optionally, ``retry_count`` (default 0), ``max_retries``
            (default 3) and ``above_threshold_docs`` (default empty).

    Returns:
        dict: State update. When retries are exhausted: ``retry_count`` and
        ``final_response``. Otherwise: ``retrieved_documents`` (threshold docs
        + web hits), ``web_search_results``, ``above_threshold_docs`` and the
        incremented ``retry_count``.
    """
    print("---FALLBACK & RETRY---")
    rewritten_query = state["rewritten_query"]
    # `or 0` also covers a key that is present but set to None.
    retry_count = state.get("retry_count") or 0
    # The retry budget defaults to the 3 attempts this node has always
    # documented, so a state without `max_retries` no longer raises KeyError.
    max_retries = state.get("max_retries")
    if max_retries is None:
        max_retries = 3
    
    # Check if we've reached max retries
    if retry_count >= max_retries:
        # If max retries reached, prepare final response acknowledging limitations
        final_response = f"Après plusieurs tentatives, je n'arrive pas à trouver une réponse complète concernant '{rewritten_query}'. Veuillez envisager de reformuler votre question ou de consulter un professionnel de la fiscalité pour des conseils spécifiques concernant la retenue à la source."
        return {
            "retry_count": retry_count + 1,
            "final_response": final_response
        }
    
    # Perform web search to get additional information
    web_docs = web_search_tool.invoke({"query": rewritten_query})
    # NOTE(review): the search tool is expected to return a list of dicts with
    # a "content" key; some tool versions appear to return a plain string when
    # the request fails - confirm. Anything that is not a usable hit is
    # skipped rather than crashing on d["content"].
    if not isinstance(web_docs, list):
        print(f"Web search returned no usable results: {web_docs!r}")
        web_docs = []
    web_results = [
        Document(page_content=hit["content"])
        for hit in web_docs
        if isinstance(hit, dict) and hit.get("content")
    ]
    
    # Only documents that already passed the relevance threshold are kept;
    # copying into a new list avoids mutating the list stored in the state.
    above_threshold_docs = list(state.get("above_threshold_docs") or [])
    
    # Combine web results with documents above threshold
    combined_docs = above_threshold_docs + web_results
    
    return {
        "retrieved_documents": combined_docs,
        "web_search_results": web_results,
        "above_threshold_docs": above_threshold_docs,  # Keep track of above-threshold docs
        "retry_count": retry_count + 1
    }

In [None]:
### Edge Functions
def does_response_answer_query(state):
    """
    Routing function: decide whether the generated response was validated.

    The verdict stored under ``validation_result`` is compared
    case-insensitively and with surrounding whitespace removed, so "Oui" or
    " oui " are treated as a success. A missing verdict or reason counts as a
    failed validation instead of raising KeyError.

    Args:
        state (dict): The current graph state. Reads ``validation_result``,
            ``validation_reason`` and ``generated_response``.

    Returns:
        str: "oui" if response answers query, "non" if not
    """
    print("---DECISION POINT: DOES RESPONSE ANSWER QUERY?---")
    
    verdict = str(state.get("validation_result", "")).strip().lower()
    
    if verdict == "oui":
        print("---DECISION: RESPONSE VALIDATED SUCCESSFULLY---")
        # Set final response to return to user.
        # NOTE(review): this mutates the state object handed to a routing
        # function; LangGraph is not expected to persist writes made from an
        # edge - confirm, and prefer emitting `final_response` from a node.
        state["final_response"] = state.get("generated_response")
        return "oui"
    
    reason = state.get("validation_reason", "no reason provided")
    print(f"---DECISION: VALIDATION FAILED - {reason}---")
    return "non"

**Full graph**

In [None]:
# # Run this once before rendering
# import pyppeteer.chromium_downloader
# pyppeteer.chromium_downloader.download_chromium()

In [None]:
from langgraph.graph import StateGraph, END
from IPython.display import Image, display
import logging

def configure_logging(log_file="conversation_system.log", level=logging.DEBUG):
    """
    Configure console logging and attach a file handler to this module's logger.

    Safe to call repeatedly (e.g. when a notebook cell is re-run): the file
    handler is attached only once per log file, so messages are not duplicated
    and no extra file handles are opened on every call.

    Args:
        log_file (str): Path of the log file to append to.
        level (int): Logging level applied to the root configuration and to
            the file handler.

    Returns:
        logging.Logger: The logger named after this module.
    """
    import os  # local import: only needed to normalise the log file path

    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    # basicConfig is a no-op when the root logger already has handlers.
    logging.basicConfig(level=level, format=log_format)
    logger = logging.getLogger(__name__)

    # FileHandler stores the absolute path in `baseFilename`; compare against
    # it to detect a handler left over from a previous call.
    target_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target_path
        for handler in logger.handlers
    )
    if not already_attached:
        # Add file handler to save logs to file
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(level)
        file_handler.setFormatter(logging.Formatter(log_format))
        logger.addHandler(file_handler)
    return logger

def _route_after_fallback(state):
    """Route after fallback_retry: stop once it has produced a final response."""
    # fallback_retry only sets `final_response` when the retry budget is
    # exhausted; in that case there is nothing left to retry.
    if state.get("final_response"):
        return "end"
    return "retry"


def build_graph():
    """
    Assemble and compile the withholding-tax RAG pipeline.

    Flow: query_rewriting -> retrieval_with_filtering -> response_generation
    -> response_validation. A validated response ends the run; otherwise
    fallback_retry runs and either loops back to retrieval or, once it has
    produced a ``final_response`` (retry budget exhausted), ends the run.
    Without that exit, the retry loop could only stop on a validated response.

    Returns:
        The compiled graph returned by ``StateGraph.compile()``.
    """
    logger = configure_logging()
    logger.info("Building Withholding Tax RAG Pipeline")
    
    workflow = StateGraph(ConversationState)
    
    # Define the nodes according to the schema
    workflow.add_node("query_rewriting", query_rewriting)                    # 2. Query Re-writing
    workflow.add_node("retrieval_with_filtering", retrieval_with_filtering)  # 3. Combined Retrieval & Filtering 
    workflow.add_node("response_generation", response_generation)            # 4. Response Generation
    workflow.add_node("response_validation", response_validation)            # 5. Response Validation
    workflow.add_node("fallback_retry", fallback_retry)                      # 6. Fallback & Retry
    
    # Build graph
    # Start with user query (1) then goes to query rewriting (2)
    workflow.set_entry_point("query_rewriting")
    
    # Linear flow from query rewriting to retrieval with filtering
    workflow.add_edge("query_rewriting", "retrieval_with_filtering")
    
    # Retrieval with filtering to response generation
    workflow.add_edge("retrieval_with_filtering", "response_generation")
    
    # Response generation to validation
    workflow.add_edge("response_generation", "response_validation")
    
    # Conditional edge from validation
    workflow.add_conditional_edges(
        "response_validation",
        does_response_answer_query,
        {
            "oui": END,  # Return validated response to user
            "non": "fallback_retry"  # Go to fallback mechanism
        },
    )
    
    # Fallback either retries the retrieval+filtering step or, when the retry
    # budget is exhausted, terminates with the final response it prepared.
    workflow.add_conditional_edges(
        "fallback_retry",
        _route_after_fallback,
        {
            "retry": "retrieval_with_filtering",
            "end": END
        },
    )
    
    # Compile
    logger.info("Compiling Withholding Tax RAG Pipeline with LLM Document Filtering")
    logger.info(f"Pipeline built by {{'user': 'fouratmansouri', 'timestamp': '2025-05-09 01:02:53'}}")
    graph = workflow.compile()

    return graph

In [None]:
if __name__ == "__main__":
    graph = build_graph()
    # Initialize the state with a user query
    initial_state = {"original_query": "c'est quoi la plateforme TEJ ?"}
    
    # Execute the graph.
    # Each streamed event is a dict keyed by the name of the node that just
    # ran, mapping to that node's state update; there is no "__step__" key,
    # so the node name itself is used as the step label.
    for event in graph.stream(initial_state):
        for node_name, update in event.items():
            print(f"Step: {node_name}")
            # The generation node publishes its answer as "generated_response".
            if node_name == "response_generation" and isinstance(update, dict):
                print(f"Response: {update.get('generated_response')}")

2025-05-09 13:36:52,965 - __main__ - INFO - Building Withholding Tax RAG Pipeline
2025-05-09 13:36:52,967 - __main__ - INFO - Compiling Withholding Tax RAG Pipeline with LLM Document Filtering
2025-05-09 13:36:52,968 - __main__ - INFO - Pipeline built by {'user': 'fouratmansouri', 'timestamp': '2025-05-09 01:02:53'}
2025-05-09 13:36:52,979 - urllib3.connectionpool - DEBUG - Starting new HTTP connection (1): localhost:11434


---QUERY REWRITING---


2025-05-09 13:37:00,174 - urllib3.connectionpool - DEBUG - http://localhost:11434 "POST /api/chat HTTP/1.1" 200 None
  results = self.vectorstore.client.search(
2025-05-09 13:37:02,745 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api.tavily.com:443


---RETRIEVAL WITH LLM FILTERING---
Processing query:  Qu'est-ce que la plateforme TEJ ?
Retrieved 0 documents
Filtered to 0 relevant documents
---RESPONSE GENERATION---
Process initiated by fouratmansouri at 2025-05-09 12:30:02
No filtered documents available, extracting high-quality documents
Using 0 high-quality documents for response generation
Insufficient high-quality documents, attempting web search


2025-05-09 13:37:04,840 - urllib3.connectionpool - DEBUG - https://api.tavily.com:443 "POST /search HTTP/1.1" 200 3093
2025-05-09 13:37:04,847 - urllib3.connectionpool - DEBUG - Starting new HTTP connection (1): localhost:11434


Retrieved 5 documents from web search
Using combined 5 documents for response generation


2025-05-09 13:37:21,764 - urllib3.connectionpool - DEBUG - http://localhost:11434 "POST /api/chat HTTP/1.1" 200 None
2025-05-09 13:37:31,843 - httpcore.connection - DEBUG - close.started
2025-05-09 13:37:31,845 - httpcore.connection - DEBUG - close.complete
2025-05-09 13:37:31,845 - httpcore.connection - DEBUG - connect_tcp.started host='127.0.0.1' port=11434 local_address=None timeout=None socket_options=None
2025-05-09 13:37:31,847 - httpcore.connection - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001D458DD8290>
2025-05-09 13:37:31,847 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-05-09 13:37:31,848 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-05-09 13:37:31,849 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-05-09 13:37:31,849 - httpcore.http11 - DEBUG - send_request_body.complete
2025-05-09 13:37:31,850 - httpcore.http11 - DEBUG - re

 La plateforme TEJ est une plateforme utilisée pour consulter et effectuer des services depuis votre ordinateur, notamment en matière de télé-liquidation et télé-amende.
---RESPONSE VALIDATION---


2025-05-09 13:37:38,375 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/x-ndjson'), (b'Date', b'Fri, 09 May 2025 12:37:38 GMT'), (b'Transfer-Encoding', b'chunked')])
2025-05-09 13:37:38,375 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-05-09 13:37:38,376 - httpcore.http11 - DEBUG - receive_response_body.started request=<Request [b'POST']>
2025-05-09 13:38:01,981 - httpcore.http11 - DEBUG - receive_response_body.complete
2025-05-09 13:38:01,981 - httpcore.http11 - DEBUG - response_closed.started
2025-05-09 13:38:01,981 - httpcore.http11 - DEBUG - response_closed.complete


---DECISION POINT: DOES RESPONSE ANSWER QUERY?---
---DECISION: RESPONSE VALIDATED SUCCESSFULLY---
