In [1]:
# LangGraph + Evol Instruct Synthetic QA Generation Notebook

# ✅ 0. Setup
import os
from uuid import uuid4
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key:")
os.environ["LANGCHAIN_API_KEY"] = getpass("LangChain API Key:")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"AI Makerspace Session07 Advanced Build"

In [5]:
import ragas
import ragas.testset
import ragas.testset.evolutions

ModuleNotFoundError: No module named 'ragas.testset.evolutions'

In [2]:
# ✅ 1. Load LangChain Documents
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyMuPDFLoader)
docs = loader.load()

# ✅ 2. Chunk Documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
runtime_docs = splitter.split_documents(docs[:10])  # Subset for cost

# ✅ 3. Embedding + Vector Store
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Qdrant.from_documents(
    documents=runtime_docs,
    embedding=embedding_model,
    location=":memory:",
    collection_name="synthetic_rag"
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [3]:
#✅ 4. LangGraph Agent Tools with Enhanced Tracing
from langchain_core.tools import tool
from langsmith import traceable
import random

@tool
@traceable(name="Evol_Instruct_Evolution")
def evolve_question(base_question: str) -> dict:
    """Evolves a base question using Evol Instruct strategy."""
    types = ["Simple", "Multi-Context", "Reasoning"]
    evo_type = random.choice(types)
    evolved = f"[{evo_type}] {base_question}"
    
    # Log more details for tracing
    print(f"Evolution: {evo_type} | Original: {base_question[:50]}...")
    
    return {
        "evolved_question": evolved,
        "evolution_type": evo_type
    }

@tool
@traceable(name="Vector_Search_Retrieval")
def retrieve_context(question: str) -> list:
    """Retrieves top-k relevant context from vectorstore."""
    docs = retriever.invoke(question)
    contexts = [doc.page_content for doc in docs]
    
    # Log retrieval stats
    print(f"Retrieved {len(contexts)} chunks for: {question[:50]}...")
    
    return contexts

from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4.1-mini")

@tool
@traceable(name="RAG_Answer_Generation")
def generate_answer(question: str, context: list) -> str:
    """Answers question using retrieved context."""
    context_str = "\n".join(context)
    prompt = f"""Answer the following question using ONLY the context below.\n
Context:\n{context_str}\n
Question: {question}\n
If the answer is not in the context, say 'I don't know'."""
    
    answer = llm.invoke(prompt).content
    
    # Log answer stats
    print(f"Generated answer ({len(answer)} chars) for: {question[:50]}...")
    
    return answer

In [4]:
# ✅ 5. LangGraph State & Graph Definition with Enhanced LangSmith Tracing
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Optional
from langsmith import traceable

class QAState(TypedDict):
    id: str
    base_question: str
    evolved_question: Optional[str]
    evolution_type: Optional[str]
    context: Optional[List[str]]
    answer: Optional[str]

# Create node functions with better tracing
@traceable(name="Question_Evolution")
def evolve_node(state: QAState) -> QAState:
    """Node function for evolving questions"""
    result = evolve_question.invoke({"base_question": state["base_question"]})
    state["evolved_question"] = result["evolved_question"]
    state["evolution_type"] = result["evolution_type"]
    return state

@traceable(name="Context_Retrieval")
def retrieve_node(state: QAState) -> QAState:
    """Node function for retrieving context"""
    question = state.get("evolved_question", state["base_question"])
    context = retrieve_context.invoke({"question": question})
    state["context"] = context
    return state

@traceable(name="Answer_Generation")
def answer_node(state: QAState) -> QAState:
    """Node function for generating answers"""
    question = state.get("evolved_question", state["base_question"])
    context = state.get("context", [])
    answer = generate_answer.invoke({"question": question, "context": context})
    state["answer"] = answer
    return state

builder = StateGraph(QAState)
builder.add_node("evolve", evolve_node)
builder.add_node("retrieve", retrieve_node)
builder.add_node("answer", answer_node)

builder.set_entry_point("evolve")
builder.add_edge("evolve", "retrieve")
builder.add_edge("retrieve", "answer")
builder.add_edge("answer", END)

graph = builder.compile()

# Enhanced wrapper function with better tracing
@traceable(
    name="Synthetic_QA_Pipeline",
    metadata={"pipeline_version": "v1.0", "evol_instruct": True}
)
def process_qa_question(base_question: str, question_id: str = None) -> dict:
    """
    Process a single QA question through the Evol-Instruct pipeline.
    
    Args:
        base_question: The original seed question
        question_id: Unique identifier for tracking
    
    Returns:
        Complete QA result with evolved question, context, and answer
    """
    # Use meaningful ID if provided
    if question_id is None:
        question_id = f"qa_{hash(base_question) % 10000}"
    
    # Process through graph
    result = graph.invoke({
        "id": question_id,
        "base_question": base_question
    })
    
    # Return structured output for better tracing
    return {
        "question_id": result["id"],
        "original_question": result["base_question"],
        "evolved_question": result["evolved_question"],
        "evolution_type": result["evolution_type"],
        "context_chunks": len(result.get("context", [])),
        "final_answer": result["answer"],
        "full_result": result
    }

In [1]:

# ✅ 6. AUTOMATIC Question Generation using RAGAS TestsetGenerator + LangGraph Processing

# Import RAGAS components for automatic question generation
from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Initialize RAGAS TestsetGenerator (this will auto-generate questions from documents)
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o")
ragas_embeddings = OpenAIEmbeddings()

ragas_generator = TestsetGenerator.from_langchain(
    generator_llm, 
    critic_llm, 
    ragas_embeddings
)

# Define evolution distribution for RAGAS (this replaces hardcoded questions)
evolution_distribution = {
    simple: 0.4,        # 40% Simple evolution
    multi_context: 0.4, # 40% Multi-Context evolution  
    reasoning: 0.2      # 20% Reasoning evolution
}

print("🤖 Generating synthetic questions automatically from documents using RAGAS...")

# Generate synthetic testset from documents (NO hardcoded questions!)
ragas_testset = ragas_generator.generate_with_langchain_docs(
    runtime_docs,  # Our loaded LangChain documents
    testset_size=5,  # Number of questions to generate
    distributions=evolution_distribution
)

# Convert RAGAS testset to our format and process through LangGraph
print("🔄 Processing RAGAS-generated questions through LangGraph...")

results = []
processed_results = []

# Extract questions from RAGAS testset and process through our LangGraph
ragas_df = ragas_testset.to_pandas()

for i, row in ragas_df.iterrows():
    question_id = f"RAGAS_Q{i+1:02d}"
    base_question = row['question']  # RAGAS auto-generated question
    
    # Process through our LangGraph (keeping the evol-instruct evolution)
    processed_result = process_qa_question(
        base_question=base_question,
        question_id=question_id
    )
    
    # Store results with RAGAS metadata
    processed_result['ragas_ground_truth'] = row.get('ground_truth', 'N/A')
    processed_result['ragas_contexts'] = row.get('contexts', [])
    
    processed_results.append(processed_result)
    results.append(processed_result["full_result"])

print(f"✅ Successfully processed {len(results)} auto-generated questions!")
print("📊 Questions came from RAGAS document analysis, NOT hardcoded seeds!")

ModuleNotFoundError: No module named 'ragas.testset.evolutions'

In [None]:
# ✅ 7. Original Structure Output (for comparison)

# Evolved Questions
evolved_questions = [
    {
        "id": r["id"],
        "evolved_question": r["evolved_question"],
        "evolution_type": r["evolution_type"]
    } for r in results
]

# Answers
answers = [
    {
        "id": r["id"],
        "answer": r["answer"]
    } for r in results
]

# Contexts (adding this to meet the explicit requirement)
contexts = [
    {
        "id": r["id"],
        "context": r["context"]
    } for r in results
]

import pandas as pd
from IPython.display import display

print("📋 ORIGINAL OUTPUT FORMAT (for backward compatibility):")

print("\nEvolved Questions:")
evolved_df = pd.DataFrame(evolved_questions)
display(evolved_df)

print("\nAnswers:")
answers_df = pd.DataFrame(answers)
display(answers_df)

print("\nContexts:")
contexts_df = pd.DataFrame(contexts)
display(contexts_df)

print("\n🔍 BEFORE vs AFTER RAGAS INTEGRATION:")
print("BEFORE: Hardcoded seed questions (manual, limited, not document-driven)")
print("AFTER:  RAGAS auto-generated questions from document analysis")
print("\nBEFORE: Single evolution strategy (random selection)")
print("AFTER:  RAGAS evolution distribution + LangGraph Evol-Instruct")
print("\nBEFORE: Question IDs like 'FAFSA_Q01', 'FAFSA_Q02'")
print("AFTER:  Question IDs like 'RAGAS_Q01', 'RAGAS_Q02'")
print("\nBEFORE: Manual seed questions required careful crafting")
print("AFTER:  Fully automated, scalable, document-driven generation")


📋 ORIGINAL OUTPUT FORMAT (for backward compatibility):

Evolved Questions:


Unnamed: 0,id,evolved_question,evolution_type
0,FAFSA_Q01,[Simple] How do I correctly fill out the FASFA...,Simple
1,FAFSA_Q02,[Simple] How do signatures and approval requir...,Simple
2,FAFSA_Q03,[Multi-Context] How does the ISIR relate to th...,Multi-Context
3,FAFSA_Q04,[Multi-Context] What are the eligibility requi...,Multi-Context
4,FAFSA_Q05,[Simple] What are the consequences of not fili...,Simple



Answers:


Unnamed: 0,id,answer
0,FAFSA_Q01,I don't know.
1,FAFSA_Q02,Signatures and approval requirements for FAFSA...
2,FAFSA_Q03,The ISIR (Institutional Student Information Re...
3,FAFSA_Q04,I don't know.
4,FAFSA_Q05,I don't know.



Contexts:


Unnamed: 0,id,context
0,FAFSA_Q01,"[any other person. Therefore, you will not be ..."
1,FAFSA_Q02,"[consent and approval for the access, disclosu..."
2,FAFSA_Q03,[2. The disclosure of their FTI by the IRS to ...
3,FAFSA_Q04,[student aid.\nhttps://studentaid.gov/fsa-id/c...
4,FAFSA_Q05,"[has defaulted on a federal student loan, the ..."



🔍 BEFORE vs AFTER LANGSMITH COMPARISON:
BEFORE: Generic 'LangGraph' traces with 'q1', 'temp_id' inputs
AFTER:  Descriptive 'Synthetic_QA_Pipeline' with 'FAFSA_Q01' inputs

BEFORE: Raw evolution type outputs like '[Reasoning]', '[Simple]'
AFTER:  Full question context with evolution metadata

BEFORE: Duplicate runs (2x API calls per question)
AFTER:  Single optimized run per question


In [None]:
# ✅ 8. Enhanced Results Display - RAGAS + LangGraph Integration
import pandas as pd
from IPython.display import display, HTML

print("🚀 RAGAS + LangGraph Synthetic Data Generation Complete!")
print(f"Project: {os.environ.get('LANGCHAIN_PROJECT', 'Not Set')}")
print(f"Tracing: {os.environ.get('LANGCHAIN_TRACING_V2', 'Not Set')}")
print("\n" + "="*80)
print("🎯 AUTOMATED QUESTION GENERATION:")
print("• Questions auto-generated from documents using RAGAS TestsetGenerator")
print("• NO hardcoded seed questions - fully document-driven")
print("• RAGAS evolution types: Simple, Multi-Context, Reasoning") 
print("• LangGraph processes each question through Evol-Instruct pipeline")
print("="*80)

# Enhanced results with better structure
enhanced_results = []
for i, result in enumerate(processed_results):
    enhanced_results.append({
        "ID": result["question_id"],
        "RAGAS Question": result["original_question"][:60] + "..." if len(result["original_question"]) > 60 else result["original_question"],
        "LangGraph Evolution": result["evolution_type"],
        "Final Evolved Question": result["evolved_question"][:80] + "..." if len(result["evolved_question"]) > 80 else result["evolved_question"],
        "Context Chunks": result["context_chunks"],
        "Answer Preview": result["final_answer"][:100] + "..." if len(result["final_answer"]) > 100 else result["final_answer"],
        "Has Answer": "✅ Yes" if result["final_answer"] != "I don't know." else "❌ No Context"
    })

enhanced_df = pd.DataFrame(enhanced_results)
print("📊 ENHANCED SYNTHETIC QA RESULTS:")
display(enhanced_df)

print("\n🎯 PIPELINE ARCHITECTURE:")
print("1. RAGAS TestsetGenerator → Auto-generates questions from documents")
print("2. LangGraph Agent → Processes each question through Evol-Instruct")
print("3. Vector Retrieval → Finds relevant context for evolved questions")
print("4. Answer Generation → Synthesizes final answers from context")

print("\n🔍 LANGSMITH TRACE NAMES:")
print("• Synthetic_QA_Pipeline (main pipeline)")
print("• Question_Evolution (Evol-Instruct step)")
print("• Vector_Search_Retrieval (RAG retrieval)")
print("• RAG_Answer_Generation (answer synthesis)")
print("• Evol_Instruct_Evolution (question evolution)")

print("\n📈 KEY IMPROVEMENTS OVER HARDCODED APPROACH:")
print("• ✅ Document-driven question generation (no manual seeds)")
print("• ✅ RAGAS evolution strategies (Simple, Multi-Context, Reasoning)")
print("• ✅ LangGraph processing with enhanced tracing")
print("• ✅ Question IDs like 'RAGAS_Q01', 'RAGAS_Q02' for easy tracking")
print("• ✅ Dual evolution: RAGAS + LangGraph Evol-Instruct")
print("• ✅ Complete integration of RAGAS synthetic data generation")


🚀 LangSmith Tracing Enhanced!
Project: AI Makerspace Session07 Advanced Build
Tracing: true

📊 ENHANCED SYNTHETIC QA RESULTS:


Unnamed: 0,ID,Original Question,Evolution Type,Evolved Question,Context Chunks,Answer Preview,Has Answer
0,FAFSA_Q01,How do I correctly fill out the FASFA form and...,Simple,[Simple] How do I correctly fill out the FASFA...,5,I don't know.,❌ No Context
1,FAFSA_Q02,How do signatures and approval requirements fo...,Simple,[Simple] How do signatures and approval requir...,5,Signatures and approval requirements for FAFSA...,✅ Yes
2,FAFSA_Q03,How does the ISIR relate to the FTI disclosure...,Multi-Context,[Multi-Context] How does the ISIR relate to th...,5,The ISIR (Institutional Student Information Re...,✅ Yes
3,FAFSA_Q04,What are the eligibility requirements for the ...,Multi-Context,[Multi-Context] What are the eligibility requi...,5,I don't know.,❌ No Context
4,FAFSA_Q05,What are the consequences of not filing the FA...,Simple,[Simple] What are the consequences of not fili...,5,I don't know.,❌ No Context



🎯 LANGSMITH TRACE NAMES YOU'LL NOW SEE:
• Synthetic_QA_Pipeline (main pipeline)
• Question_Evolution (Evol-Instruct step)
• Vector_Search_Retrieval (RAG retrieval)
• RAG_Answer_Generation (answer synthesis)
• Evol_Instruct_Evolution (question evolution)

📈 IMPROVED TRACING FEATURES:
• Meaningful trace names instead of 'LangGraph'
• Descriptive inputs/outputs showing actual questions and evolution types
• Question IDs like 'FAFSA_Q01', 'FAFSA_Q02' for easy tracking
• No duplicate runs - single execution per question
• Metadata tags for filtering and organization
• Console logs showing processing stats
