In [None]:
# IMPORTANT: Patch OpenAI for tracing BEFORE any deepeval imports
# NOTE(review): the `deepeval.openai.patch` import below is itself a deepeval import, so
# "before any deepeval imports" presumably means before the deepeval evaluation/metrics
# imports used later in this notebook -- confirm the intended ordering.
import openai
from deepeval.openai.patch import patch_openai
patch_openai(openai)

In [5]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
import os
from tqdm import tqdm
import time
import json

# Model names passed to the Ollama LLM and the Ollama embeddings client respectively.
LLM_MODEL = "llama3.1:8b"
EMBEDDING_MODEL = "mxbai-embed-large:335m"

# Root folder of the raw documents to index. The default is a machine-specific path;
# set the RAG_RAW_FILES environment variable to point elsewhere without editing the notebook.
RAW_FILES = os.environ.get(
    "RAG_RAW_FILES",
    r"C:\GitRepos\LangChainCourse\documentation_assistant\raw_documents",
)

# 1. Set up a basic RAG pipeline

In [2]:
persist_directory = "./chroma_db"
collection_name = "rag_collection"

# Initialize Ollama embeddings
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Reuse the persisted vector store when one exists; otherwise build it from RAW_FILES.
# isdir() (rather than exists()) avoids calling listdir() on a plain file.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    print("Loading existing vector store...")
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
        collection_name=collection_name
    )
    print(f"Loaded existing vector store with {vector_store._collection.count()} documents")
else:
    print("Creating new vector store...")
    # Load documents from directory with subfolders
    loader = DirectoryLoader(
        path=RAW_FILES,
        glob="**/*.txt",  # This will find all .txt files in all subdirectories
        loader_cls=TextLoader,  # Use TextLoader for .txt files
        show_progress=True  # Optional: shows loading progress
    )
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
    chunks = text_splitter.split_documents(documents)

    print(f"Loaded {len(documents)} documents and created {len(chunks)} chunks")

    # Fail early with a clear message instead of leaving vector_store unset and
    # crashing later when the retriever is built.
    if not chunks:
        raise ValueError(f"No .txt content found under {RAW_FILES!r}; nothing to index.")

    start_time = time.time()
    # Create the (empty) persistent store once, then add chunks in batches so the
    # progress bar advances while embeddings are computed.
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
        collection_name=collection_name
    )
    batch_size = 100
    with tqdm(total=len(chunks), desc="Creating embeddings") as pbar:
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vector_store.add_documents(batch)
            pbar.update(len(batch))

    # Timing is reported only on the build path: start_time does not exist when an
    # existing store was loaded, and nothing was "created" in that case.
    elapsed_time = time.time() - start_time
    print(f"Vector store created and persisted in {elapsed_time:.2f} seconds")

# Initialize Ollama LLM
llm = Ollama(model=LLM_MODEL)

# Set up RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve the top 3 chunks
)

# Test query
query = "What did Satoshi say about quantum computing? Is it a threat?"
result = qa_chain.run(query)
print("Query:", query)
print("Answer:", result)

Creating new vector store...


100%|██████████| 617/617 [00:00<00:00, 5932.78it/s]


Loaded 617 documents and created 1067 chunks


Creating embeddings:   0%|          | 0/1067 [00:00<?, ?it/s]Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Creating embeddings: 100%|██████████| 1067/1067 [00:16<00:00, 65.39it/s]
  llm = Ollama(model=LLM_MODEL)
  result = qa_chain.run(query)
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Vector store created and persisted in 16.32 seconds
Query: What did Satoshi say about quantum computing? Is it a threat?
Answer: I don't know. The provided sources do not mention anything related to quantum computing or its potential threat to Bitcoin. They primarily discuss the concept of proof-of-work, the Byzantine Generals' Problem, and the security features of Bitcoin.


# 2. Testing the RAG pipeline with Goldens
The golden dataset used here was obtained from notebooks 1 and 2.


### Loading goldens, preparing test cases, generating predictions.

In [11]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRecallMetric, ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

goldens_file = 'filtered_goldens_quality_0.8_similarity_0.85.json'


def load_and_prepare_goldens(json_file_path, max_samples=None):
    """Load goldens from a JSON file and convert them to deepeval LLMTestCase objects.

    Args:
        json_file_path: Path to a JSON file holding a list of golden records. Each
            record must provide the keys "input", "expected_output" and "context".
        max_samples: Maximum number of goldens to convert, taken from the start of
            the list. None (the default) converts all of them; 0 converts none.

    Returns:
        A list of LLMTestCase objects with input, expected_output and context set.
        actual_output and retrieval_context are not populated here.

    Raises:
        KeyError: If a golden record lacks one of the required keys.
    """
    # JSON is UTF-8 by specification; be explicit so the platform default codec
    # (e.g. cp1252 on Windows) is never used to decode it.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        goldens_data = json.load(f)

    print(f"Loaded {len(goldens_data)} goldens from {json_file_path}")

    # Compare against None explicitly: a plain truthiness check would treat
    # max_samples=0 as "no limit" and return every golden.
    data_to_process = goldens_data if max_samples is None else goldens_data[:max_samples]

    # Golden JSON: {"input": query, "expected_output": ground_truth, "context": [contexts]}
    # LLMTestCase takes the same three fields (context as a list of strings).
    test_cases = [
        LLMTestCase(
            input=item["input"],
            expected_output=item["expected_output"],
            context=item["context"]
        )
        for item in data_to_process
    ]

    print(f"Prepared {len(test_cases)} LLMTestCase objects")
    return test_cases

def generate_predictions(test_cases, qa_chain, vector_store):
    """Run the RAG pipeline on every test case and record its answer and retrieved context.

    Each test case is mutated in place: actual_output receives the chain's answer and
    retrieval_context receives the page_content of the retrieved documents.

    Args:
        test_cases: Objects exposing an `input` string and writable `actual_output` /
            `retrieval_context` attributes.
        qa_chain: Chain exposing `run(query)`. If it has a `retriever` attribute, that
            retriever is used to collect the retrieval context.
        vector_store: Fallback source of a retriever (via `as_retriever()`) when the
            chain does not expose one.

    Returns:
        The same `test_cases` list, after mutation.
    """
    # Use the chain's own retriever so the recorded context is fetched with the same
    # search settings (e.g. k) the chain used to answer; a fresh default retriever
    # from the vector store may return a different number of documents.
    retriever = getattr(qa_chain, "retriever", None)
    if retriever is None:
        retriever = vector_store.as_retriever()

    total = len(test_cases)
    for position, test_case in enumerate(test_cases, start=1):
        print(f"Generating prediction {position}/{total}: {test_case.input[:60]}...")

        try:
            # Answer produced by the RAG pipeline
            answer = qa_chain.run(test_case.input)

            # Documents the retriever returns for the same query
            retrieved_docs = retriever.get_relevant_documents(test_case.input)
            retrieved_contexts = [doc.page_content for doc in retrieved_docs]

            # Assign only after both steps succeeded so a failure never leaves a
            # half-populated test case.
            test_case.actual_output = answer
            test_case.retrieval_context = retrieved_contexts

        except Exception as e:
            # Best effort: record the failure on the test case and keep going.
            print(f"Error generating prediction for query {position}: {e}")
            test_case.actual_output = f"Error: {str(e)}"
            test_case.retrieval_context = []

    print(f"Generated predictions for {total} test cases")
    return test_cases

test_cases = load_and_prepare_goldens(goldens_file, max_samples=5)  # For debugging: start small; don't run hundreds of goldens yet.

Loaded 181 goldens from filtered_goldens_quality_0.8_similarity_0.85.json
Prepared 5 LLMTestCase objects


### Running evaluation
Using an OpenAI LLM as the evaluation model is strongly recommended.

In [12]:
# NOTE: `vector_store` and `qa_chain` must already be defined (or imported) in this
# kernel before this cell runs.
test_cases = generate_predictions(test_cases, qa_chain, vector_store)



Generating prediction 1/5: What is the significance of the Bitcoin v0.1 release?...
Generating prediction 2/5: What is the main idea behind Bitcoin?...
Generating prediction 3/5: What is the main idea behind Satoshi Nakamoto's proposal?...
Generating prediction 4/5: What is the main concern regarding CPU power in the Bitcoin ...
Generating prediction 5/5: What happens if a broadcast transaction does not reach all n...
Generated predictions for 5 test cases


In [10]:
from deepeval.models import GPTModel

# Judge model handed to the deepeval metrics in run_evaluation below.
llm_gpt = GPTModel(
    model="gpt-4o-mini"
)


def run_evaluation(test_cases, model=None):
    """Score the test cases with deepeval's RAG metrics and print the result.

    Args:
        test_cases: LLMTestCase objects whose actual_output (and retrieval_context)
            have already been populated, e.g. by generate_predictions.
        model: Optional judge model passed to every metric. None leaves each metric
            on deepeval's default model.

    Returns:
        The object returned by deepeval's evaluate().

    Raises:
        ValueError: If any test case has no actual_output, i.e. predictions were not
            generated before evaluation.
    """
    # Fail fast when predictions are missing: evaluating such cases cannot produce
    # meaningful scores.
    missing = [
        position
        for position, test_case in enumerate(test_cases, start=1)
        if getattr(test_case, "actual_output", None) is None
    ]
    if missing:
        raise ValueError(
            f"Test cases at positions {missing} have no actual_output; "
            "run generate_predictions before run_evaluation."
        )

    metrics = [
        AnswerRelevancyMetric(model=model),
        FaithfulnessMetric(model=model),
        ContextualRecallMetric(model=model),
        ContextualPrecisionMetric(model=model)
    ]

    print("Running evaluation...")
    results = evaluate(
        test_cases=test_cases,
        metrics=metrics
    )

    print("\n" + "="*50)
    print("EVALUATION RESULTS")
    print("="*50)
    print(results)

    # An empty result set is easy to overlook in the printed repr; call it out.
    if not getattr(results, "test_results", None):
        print("WARNING: evaluation returned no test results; check the evaluator "
              "model configuration and that test_cases is not empty.")
    return results


# Run evaluation with the judge model configured above
results = run_evaluation(test_cases, model=llm_gpt)

Running evaluation...


Output()


EVALUATION RESULTS
test_results=[] confident_link=None


In [13]:
results

EvaluationResult(test_results=[], confident_link=None)