In [1]:
# IMPORTANT: Patch OpenAI for tracing BEFORE any deepeval imports
# NOTE(review): the patch helper is itself imported from a deepeval submodule, so
# "before any deepeval imports" presumably refers to the evaluation-related
# deepeval imports made in later cells - TODO confirm.
import openai
from deepeval.openai.patch import patch_openai
patch_openai(openai)  # hand the imported `openai` module to deepeval's patch helper

# Best-effort fix for Unicode encoding issues on Windows.
# Nothing in this block is allowed to abort the notebook: every failure is
# reported as a warning and execution continues.
import os
if os.name == 'nt':
    # PYTHONIOENCODING is read when an interpreter starts, so setting it here
    # does not re-encode the streams of the already-running kernel; it is
    # inherited by child processes started from this kernel.
    os.environ['PYTHONIOENCODING'] = 'utf-8'

    # Try to set a UTF-8 locale
    import locale
    try:
        # Locale names differ between platforms, so try several spellings and
        # keep the first one that is accepted.
        for loc in ['en_US.UTF-8', 'C.UTF-8', 'UTF-8']:
            try:
                locale.setlocale(locale.LC_ALL, loc)
                break
            except locale.Error:
                continue
        else:
            # for/else: reached only when no candidate was accepted.
            print("Warning: Could not set UTF-8 locale. Unicode display may be limited.")
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, and show the cause.
        print(f"Warning: Locale setting failed. Unicode display may be limited. ({exc!r})")

print("Unicode encoding setup complete.")

Unicode encoding setup complete.


In [2]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
import os
from tqdm import tqdm
import time
import json

# Ollama model tags: one for answer generation, one for embeddings.
LLM_MODEL = "llama3.1:8b"
EMBEDDING_MODEL = "mxbai-embed-large:335m"

# Root folder that is scanned recursively for .txt files to index.
# Set the RAG_RAW_FILES environment variable to run the notebook on another
# machine without editing this cell; the original location remains the default.
RAW_FILES = os.environ.get(
    "RAG_RAW_FILES",
    r"C:\GitRepos\LangChainCourse\documentation_assistant\raw_documents",
)

# 1. Set up a basic RAG pipeline

The plan is to build the best possible RAG architecture with local Ollama models, but to use OpenAI for evaluation, since evaluation is the much harder task.

In [None]:
persist_directory = "./chroma_db"
top_k_retrieved_documents = 3
collection_name = "rag_collection"

# Initialize Ollama embeddings
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Reuse the persisted vector store when one exists; embedding the whole corpus
# is the slow step of this notebook.
# NOTE: any non-empty directory counts as "existing", including one left behind
# by an interrupted build. Delete the directory to force a rebuild.
if os.path.exists(persist_directory) and os.listdir(persist_directory):
    print("Loading existing vector store...")
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
        collection_name=collection_name
    )
    print(f"Loaded existing vector store with {vector_store._collection.count()} documents")
else:
    print("Creating new vector store...")
    # Load documents from directory with subfolders
    loader = DirectoryLoader(
        path=RAW_FILES,
        glob="**/*.txt",  # This will find all .txt files in all subdirectories
        loader_cls=TextLoader,  # Use TextLoader for .txt files
        show_progress=True  # Optional: shows loading progress
    )
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
    chunks = text_splitter.split_documents(documents)

    print(f"Loaded {len(documents)} documents and created {len(chunks)} chunks")

    # Fail early with a clear message: with no chunks nothing would be indexed
    # and the chain set-up below would fail on a missing vector store.
    if not chunks:
        raise ValueError(f"No .txt documents found under {RAW_FILES!r}; nothing to index.")

    start_time = time.time()
    # Open the (empty) persistent collection once, then fill it batch by batch
    # so the progress bar advances while embeddings are being computed.
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
        collection_name=collection_name
    )
    batch_size = 100
    with tqdm(total=len(chunks), desc="Creating embeddings") as pbar:
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vector_store.add_documents(batch)
            pbar.update(len(batch))
    elapsed_time = time.time() - start_time
    print(f"Vector store created and persisted in {elapsed_time:.2f} seconds")

# Initialize Ollama LLM
llm = Ollama(model=LLM_MODEL)

# Set up RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    # Retrieve the top-k chunks (k = top_k_retrieved_documents)
    retriever=vector_store.as_retriever(search_kwargs={"k": top_k_retrieved_documents})
)

# Test query
query = "What did Satoshi say about quantum computing? Is it a threat?"
# `invoke` replaces the deprecated `Chain.run`; RetrievalQA takes the question
# under "query" and returns the answer under "result".
result = qa_chain.invoke({"query": query})["result"]
print("Query:", query)
print("Answer:", result)

Loading existing vector store...


  vector_store = Chroma(
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  llm = Ollama(model=LLM_MODEL)
  result = qa_chain.run(query)


Loaded existing vector store with 1067 documents


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Query: What did Satoshi say about quantum computing? Is it a threat?
Answer: The answer is not directly stated in the provided sources. However, I can look for any mentions of quantum computing by Satoshi Nakamoto.

In one email (Source: https://satoshi.nakamotoinstitute.org/emails/cryptography/11/), there's a discussion about the Byzantine Generals problem and its solution using proof-of-work chain. Quantum computing is not mentioned in this context.

In another post on BitcoinTalk (Source: https://satoshi.nakamotoinstitute.org/posts/bitcointalk/418/), Satoshi Nakamoto discusses the concept of an alert system for the Bitcoin network but doesn't mention quantum computing.

Unfortunately, I don't have any information about Satoshi's views on quantum computing. If you're looking for a more detailed answer or confirmation from other sources, I recommend searching further.


# 2. Testing the RAG pipeline with Goldens
Dataset obtained from notebooks 1 and 2.


### Loading goldens, preparing test cases, generating predictions.

In [None]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRecallMetric, ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# JSON file with the goldens that load_and_prepare_goldens() reads.
# NOTE(review): the file name suggests the goldens were filtered at quality 0.8
# and similarity 0.85 - confirm against the notebook that produced the file.
goldens_file = 'filtered_goldens_quality_0.8_similarity_0.85.json'


def load_and_prepare_goldens(json_file_path, max_samples=None):
    """Load goldens from a JSON file and wrap them as deepeval LLMTestCase objects.

    The file must contain a JSON list of records shaped like
    {"input": query, "expected_output": ground_truth, "context": [contexts]}.

    Args:
        json_file_path: Path of the JSON file to read (decoded as UTF-8).
        max_samples: Keep only the first `max_samples` records. None (the
            default) keeps all of them; 0 yields an empty list.

    Returns:
        A list with one LLMTestCase per kept record, carrying input,
        expected_output and context. actual_output / retrieval_context are
        not set here.

    Raises:
        KeyError: If a record lacks "input", "expected_output" or "context".
    """
    # Decode explicitly as UTF-8: without it open() falls back to the locale
    # encoding (e.g. a legacy code page on Windows), which can garble or
    # reject non-ASCII text in the goldens.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        goldens_data = json.load(f)

    print(f"Loaded {len(goldens_data)} goldens from {json_file_path}")

    # Compare against None instead of truthiness so that max_samples=0 means
    # "no samples" rather than "no limit".
    data_to_process = goldens_data if max_samples is None else goldens_data[:max_samples]

    test_cases = [
        LLMTestCase(
            input=item["input"],
            expected_output=item["expected_output"],
            context=item["context"]  # In Confident-AI will show up as Context. The ideal document to be found.
        )
        for item in data_to_process
    ]

    print(f"Prepared {len(test_cases)} LLMTestCase objects")
    return test_cases


# Optional: from deepeval.tracing import observe, then place the @observe()  decorator above this function:
def generate_predictions(test_cases, qa_chain, vector_store, top_k=None):
    """Run the RAG pipeline on every test case and record its answer and retrieved context.

    Each test case is updated in place: `actual_output` receives the chain's
    answer and `retrieval_context` the text of the retrieved chunks (shown in
    Confident-AI under "Retrieved Context").

    Args:
        test_cases: Objects with an `input` string; mutated in place.
        qa_chain: RetrievalQA-style chain. It is called with {"query": ...}
            and must return a mapping containing "result".
        vector_store: Store used to look up the retrieved chunks when the
            chain does not return its own "source_documents".
        top_k: Number of chunks to retrieve. Defaults to the notebook-level
            `top_k_retrieved_documents`, so the evaluation uses the same k as
            the chain's retriever.

    Returns:
        The same `test_cases` list that was passed in.
    """
    if top_k is None:
        top_k = top_k_retrieved_documents

    # Build the retriever once; it does not depend on the individual query.
    retriever = vector_store.as_retriever(search_kwargs={"k": top_k})
    total = len(test_cases)

    for i, test_case in enumerate(test_cases):
        print(f"Generating prediction {i+1}/{total}: {test_case.input[:60]}...")

        try:
            # `invoke` replaces the deprecated `Chain.run`.
            response = qa_chain.invoke({"query": test_case.input})
            answer = response["result"]

            # Prefer the documents the chain itself used (present when it was
            # built with return_source_documents=True). Otherwise repeat the
            # retrieval with the same k; `invoke` replaces the deprecated
            # `get_relevant_documents`.
            retrieved_docs = response.get("source_documents")
            if retrieved_docs is None:
                retrieved_docs = retriever.invoke(test_case.input)
            retrieved_contexts = [doc.page_content for doc in retrieved_docs]

            # Populate the test case only after both steps have succeeded
            test_case.actual_output = answer
            test_case.retrieval_context = retrieved_contexts

        except Exception as e:
            # Best effort: keep going so one failing query does not lose the
            # whole run. The error text is stored as the answer, so such cases
            # will be scored as ordinary (bad) outputs by the metrics.
            print(f"Error generating prediction for query {i+1}: {e}")
            test_case.actual_output = f"Error: {str(e)}"
            test_case.retrieval_context = []

    print(f"Generated predictions for {total} test cases")
    return test_cases

test_cases = load_and_prepare_goldens(goldens_file, max_samples=5)  # For debugging: don't run hundreds of goldens yet, start small.

Loaded 181 goldens from filtered_goldens_quality_0.8_similarity_0.85.json
Prepared 5 LLMTestCase objects


In [5]:
# Fills in actual_output and retrieval_context on each test case; the function
# returns the list it was given.
test_cases = generate_predictions(test_cases, qa_chain, vector_store)

Generating prediction 1/5: What is the significance of the Bitcoin v0.1 release?...


  retrieved_docs = vector_store.as_retriever().get_relevant_documents(test_case.input)


Generating prediction 2/5: What is the main idea behind Bitcoin?...
Generating prediction 3/5: What is the main idea behind Satoshi Nakamoto's proposal?...
Generating prediction 4/5: What is the main concern regarding CPU power in the Bitcoin ...
Generating prediction 5/5: What happens if a broadcast transaction does not reach all n...
Generated predictions for 5 test cases


### Running evaluation 
It is strongly recommended to use an OpenAI LLM for evaluation.

In [8]:
from deepeval.models import GPTModel

# Judge model: passed as `model=` to every metric built in run_evaluation().
llm_gpt = GPTModel(
    model="gpt-4o-mini"
)

def run_evaluation(test_cases, batch_size=2, delay_between_batches=2):
    """Run the deepeval metrics over `test_cases` in small, spaced-out batches.

    Every batch is sent through `evaluate` with the same four metrics
    (answer relevancy, faithfulness, contextual recall, contextual
    precision), all judged by `llm_gpt`. A batch that raises is reported and
    skipped, so the function returns whatever results were collected.

    Args:
        test_cases: Test cases with predictions already filled in.
        batch_size: Number of test cases per `evaluate` call; must be >= 1.
        delay_between_batches: Seconds to sleep between consecutive batches,
            whether or not the previous batch succeeded.

    Returns:
        An object with `test_results` (the concatenated results of all
        successful batches) and `confident_link` (taken from the last
        successful batch, or None if no batch succeeded).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently evaluate nothing; zero would fail
        # later with a less helpful range() error.
        raise ValueError("batch_size must be a positive integer")

    metrics = [
        AnswerRelevancyMetric(model=llm_gpt),
        FaithfulnessMetric(model=llm_gpt),
        ContextualRecallMetric(model=llm_gpt),
        ContextualPrecisionMetric(model=llm_gpt)
    ]

    print(f"Running evaluation on {len(test_cases)} test cases with {len(metrics)} metrics...")
    print(f"Using batch size: {batch_size}, delay between batches: {delay_between_batches}s")

    total_batches = (len(test_cases) + batch_size - 1) // batch_size  # ceiling division
    all_results = []
    # Tracked explicitly so it is well defined even when every batch fails.
    # NOTE(review): each evaluate() call reports its own link, so this
    # presumably points at the last successful batch only - confirm.
    confident_link = None

    # Process in batches to avoid rate limits
    for batch_num, start in enumerate(range(0, len(test_cases), batch_size), start=1):
        batch = test_cases[start:start + batch_size]

        print(f"\nProcessing batch {batch_num}/{total_batches} (test cases {start+1}-{min(start+batch_size, len(test_cases))})...")

        try:
            batch_results = evaluate(
                test_cases=batch,
                metrics=metrics
            )
            all_results.extend(batch_results.test_results)
            confident_link = getattr(batch_results, 'confident_link', None)
        except Exception as e:
            # Continue with next batch rather than failing completely
            print(f"Error in batch {batch_num}: {e}")

        # Pause after failed batches too: a rate-limit error is exactly the
        # case in which the next request must not be sent immediately.
        if batch_num < total_batches:
            print(f"Waiting {delay_between_batches} seconds before next batch...")
            time.sleep(delay_between_batches)

    # Lightweight stand-in for deepeval's result object, covering all batches
    class MockResults:
        def __init__(self, test_results, confident_link=None):
            self.test_results = test_results
            self.confident_link = confident_link

        def __str__(self):
            return f"EvaluationResults(test_results={len(self.test_results)})"

    results = MockResults(all_results, confident_link)

    print("\n" + "="*50)
    print("EVALUATION RESULTS")
    print("="*50)
    print(f"Total test results: {len(results.test_results)}")
    if results.confident_link:
        print(f"Confident link: {results.confident_link}")
    return results


# Run evaluation (automatically traced with Confident-AI)
# Uses the function's defaults: batch_size=2 and delay_between_batches=2.
results = run_evaluation(test_cases)

Running evaluation on 5 test cases with 4 metrics...
Using batch size: 2, delay between batches: 2s

Processing batch 1/3 (test cases 1-2)...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because the response directly addresses the main idea behind Bitcoin without any irrelevant statements., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions present, indicating that the actual output aligns perfectly with the retrieval context., error: None)
  - ✅ Contextual Recall (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because all sentences in the expected output are directly supported by the information in node(s) in retrieval context, clearly articulating the core concepts of Bitcoin without any contradictions., error: None)
  - ✅ Contextual Precision (score: 0.8333333333333333, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 0.83 b

Waiting 2 seconds before next batch...

Processing batch 2/3 (test cases 3-4)...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because the response directly addresses the main concern regarding CPU power in the Bitcoin network without any irrelevant statements., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions present, indicating that the actual output aligns perfectly with the retrieval context., error: None)
  - ❌ Contextual Recall (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 0.00 because none of the sentences in the expected output can be attributed to any node(s) in the retrieval context, as they all introduce concepts not present in the context., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 0.00 because all 

Waiting 2 seconds before next batch...

Processing batch 3/3 (test cases 5-5)...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because the response directly addresses the question about broadcast transactions without any irrelevant statements., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions present, indicating that the actual output aligns perfectly with the retrieval context., error: None)
  - ✅ Contextual Recall (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because all sentences in the expected output are directly supported by the information from node(s) in retrieval context, clearly explaining how transactions are handled in the blockchain., error: None)
  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because all 


EVALUATION RESULTS
Total test results: 5
Confident link: https://app.confident-ai.com/project/cmfcas3kc0gmz3zfa5jqdn7ts/evaluation/test-runs/cmg7w1w3s09ob9532w9475xqp/compare-test-results
