In [1]:
print("Hello")

Hello


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import pandas as pd

# QA
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "data/qa_dataset.csv"
df.to_csv(csv_path, index=False)

In [4]:
from langsmith import Client
client = Client()
dataset = client.create_dataset(dataset_name="RAG Report QA Dataset", description="Input & expected output pairs from RAG report")


client.create_examples(dataset_id=dataset.id, inputs=[{"question": q} for q in inputs], outputs=[{"answer": a} for a in outputs])



{'example_ids': ['8a7ff678-83f3-470c-9cd5-e58e2dde943e',
  '57bb0360-1d48-425c-a00e-53966a7d4e1d',
  'f7f9b516-f9ee-4241-a9b0-1c4391ec04d3'],
 'count': 3}

In [5]:
import sys
import os
from pathlib import Path

# Add project root to path
sys.path.append("/Users/debdoot/Desktop/LLMOPS_RAG")

from multi_doc_chat.src.ingestion import DocumentIngestionPipeline
from multi_doc_chat.src.retriever import DocumentRetriever
import uuid


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5,
    session_id: str = None
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
        session_id: Optional session ID for vector store isolation
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Generate or use provided session ID
        if session_id is None:
            session_id = str(uuid.uuid4())[:12]
        
        # Define vector store path
        vector_store_base = os.path.join(os.path.dirname(data_path), "vector_store")
        session_vector_store = os.path.join(vector_store_base, f"session_{session_id}")
        
        # Check if vector store already exists for this session
        if not os.path.exists(session_vector_store):
            # Build vector store using DocumentIngestionPipeline
            ingestor = DocumentIngestionPipeline(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                vector_store_path=vector_store_base,
                session_id=session_id
            )
            
            # Process the document
            ingestor.process_documents(
                file_paths=[data_path],
                metadata={"source": "AI_Engineering_Report", "type": "evaluation"}
            )
        
        # Create retriever
        retriever = DocumentRetriever(
            vector_store_path=vector_store_base,
            session_id=session_id,
            top_k=k,
            score_threshold=0.3
        )
        
        # Query documents using standard RAG
        answer, docs = retriever.query_documents(
            query=question,
            prompt_type="standard",
            chat_history=[]
        )
        
        return {"answer": answer}
        
    except Exception as e:
        return {"answer": f"Error: {str(e)}"}



In [6]:
# Test the function with a sample question
test_input = {"question": "For customer-facing applications, which company's models dominate the top rankings?"}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

2025-10-10 17:03:07,926 - multi_doc_chat.src.ingestion - INFO - Initialized document ingestion pipeline with chunk_size=1000, chunk_overlap=200, session_id=9c91b37c-ab2
2025-10-10 17:03:07,926 - multi_doc_chat.src.ingestion - INFO - Starting document ingestion for 1 files with session_id=9c91b37c-ab2
2025-10-10 17:03:07,927 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:03:07,927 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:03:07,927 - multi_doc_chat.src.ingestion - INFO - Loaded 1 documents successfully
2025-10-10 17:03:07,927 - multi_doc_chat.src.ingestion - INFO - Split documents into 1 chunks
2025-10-10 17:03:07,928 - multi_doc_chat.src.ingestion - INFO - Average chunk length: 721.00 characters
2025-10-10 17:03:13,298 - httpx - INFO - HTTP Request: POST https://dept-podcast-openai.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: For customer-facing applications, OpenAI's models dominate the top rankings, with 3 out of the top 5 and half of the top 10 most popular models being from OpenAI (Document 1).


In [8]:
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, 1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(f"Q{i}: {q}")
    print(f"A{i}: {result['answer']}\n")
    print("-" * 80 + "\n")

2025-10-10 17:05:52,857 - multi_doc_chat.src.ingestion - INFO - Initialized document ingestion pipeline with chunk_size=1000, chunk_overlap=200, session_id=45f37fcf-296
2025-10-10 17:05:52,857 - multi_doc_chat.src.ingestion - INFO - Starting document ingestion for 1 files with session_id=45f37fcf-296
2025-10-10 17:05:52,858 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:05:52,858 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:05:52,858 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:05:52,860 - multi_doc_chat.src.ingestion - INFO - Loaded 1 documents successfully
2025-10-10 17:05:52,861 - multi_doc_chat.src.ingestion - INFO - Split documents into 1 chunks
2025-10-10 17:05:52,861 - multi_doc_chat.src.ingestion - INFO - Average chunk length: 721.00 characters


Testing all questions from the dataset:



2025-10-10 17:05:54,793 - httpx - INFO - HTTP Request: POST https://dept-podcast-openai.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
2025-10-10 17:05:54,796 - multi_doc_chat.src.ingestion - INFO - Created vector store with 1 embedded chunks
2025-10-10 17:05:54,799 - multi_doc_chat.src.ingestion - INFO - Saved vector store to /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_45f37fcf-296
2025-10-10 17:05:54,842 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_45f37fcf-296
2025-10-10 17:05:54,842 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_45f37fcf-296
2025-10-10 17:05:54,869 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:05:54,869 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:05:54,870 - retriever.py - INFO - Initialized 

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: For customer-facing applications, OpenAI's models dominate the top rankings, with 3 out of the top 5 and half of the top 10 most popular models being from OpenAI (Document 1).

--------------------------------------------------------------------------------



2025-10-10 17:05:58,724 - httpx - INFO - HTTP Request: POST https://dept-podcast-openai.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
2025-10-10 17:05:58,728 - multi_doc_chat.src.ingestion - INFO - Created vector store with 1 embedded chunks
2025-10-10 17:05:58,730 - multi_doc_chat.src.ingestion - INFO - Saved vector store to /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_fe2f0147-2b3
2025-10-10 17:05:58,777 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_fe2f0147-2b3
2025-10-10 17:05:58,777 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_fe2f0147-2b3
2025-10-10 17:05:58,805 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:05:58,805 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:05:58,806 - retriever.py - INFO - Initialized 

Q2: What percentage of respondents are using RAG in some form?
A2: 70% of respondents are using RAG (Retrieval-Augmented Generation) in some form (Document 1).

--------------------------------------------------------------------------------



2025-10-10 17:06:02,938 - httpx - INFO - HTTP Request: POST https://dept-podcast-openai.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"
2025-10-10 17:06:02,941 - multi_doc_chat.src.ingestion - INFO - Created vector store with 1 embedded chunks
2025-10-10 17:06:02,943 - multi_doc_chat.src.ingestion - INFO - Saved vector store to /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_6d1c5e64-89e
2025-10-10 17:06:02,985 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_6d1c5e64-89e
2025-10-10 17:06:02,985 - retriever.py - INFO - Loaded vector store from /Users/debdoot/Desktop/LLMOPS_RAG/data/vector_store/session_6d1c5e64-89e
2025-10-10 17:06:03,013 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:06:03,013 - retriever.py - INFO - Initialized LLM with model gpt-4o-mini
2025-10-10 17:06:03,013 - retriever.py - INFO - Initialized 

Q3: How often are most respondents updating their models?
A3: Most respondents are updating their models at least monthly, with more than 50% indicating this frequency of updates (Document 1).

--------------------------------------------------------------------------------



In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langchain_openai import AzureChatOpenAI
import uuid
import os


evaluation_session_id = str(uuid.uuid4())[:12]
print(f"Using evaluation session ID: {evaluation_session_id}")

# Configure Azure OpenAI for the evaluator
azure_llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("OPENAI_ENDPOINT"),
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version=os.getenv("API_VERSION"),
    deployment_name=os.getenv("OPENAI_CHAT_DEPLOYMENT_NAME"),
    temperature=0.0
)


qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        config={"llm": azure_llm} 
    )
]


dataset_name = "RAG Report QA Dataset"  


def answer_with_session(inputs: dict) -> dict:
    """Wrapper to use consistent session_id across all evaluations."""
    return answer_ai_report_question(
        inputs=inputs,
        session_id=evaluation_session_id,
        chunk_size=1000,
        chunk_overlap=200,
        k=5
    )


experiment_results = evaluate(
    answer_with_session,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="rag-faiss-ai-report",
    # Experiment metadata
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
        "session_id": evaluation_session_id,
        "score_threshold": 0.3,
        "evaluator_model": os.getenv("OPENAI_CHAT_DEPLOYMENT_NAME")
    },
)

print(f"\nEvaluation complete! Results: {experiment_results}")


Using evaluation session ID: a4319021-65d
View the evaluation results for experiment: 'rag-faiss-ai-report-ad7370e2' at:
https://smith.langchain.com/o/f4193c4f-4476-4810-9690-8c19b2be5138/datasets/16dd0f1e-f329-46ba-8457-27711255827b/compare?selectedSessions=62e9c221-9a0b-4036-8acd-4cc26ae1bf11


View the evaluation results for experiment: 'rag-faiss-ai-report-ad7370e2' at:
https://smith.langchain.com/o/f4193c4f-4476-4810-9690-8c19b2be5138/datasets/16dd0f1e-f329-46ba-8457-27711255827b/compare?selectedSessions=62e9c221-9a0b-4036-8acd-4cc26ae1bf11




0it [00:00, ?it/s]2025-10-10 17:16:43,379 - multi_doc_chat.src.ingestion - INFO - Initialized document ingestion pipeline with chunk_size=1000, chunk_overlap=200, session_id=a4319021-65d
2025-10-10 17:16:43,379 - multi_doc_chat.src.ingestion - INFO - Starting document ingestion for 1 files with session_id=a4319021-65d
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents from /Users/debdoot/Desktop/LLMOPS_RAG/data/ai_report.txt
2025-10-10 17:16:43,381 - loaders.py - INFO - Loaded 1 documents 


Evaluation complete! Results: <ExperimentResults rag-faiss-ai-report-ad7370e2>



