In [1]:
%load_ext autoreload
%autoreload 2

# The Problem

Figure out a Metrics-driven approach to make sense of this
![](https://media.licdn.com/dms/image/D4D22AQEgjWxKXokOPA/feedshare-shrink_800/0/1708498751086?e=1711584000&v=beta&t=xaT95vKS8m4qTybofpKqQfXOGoFs8lQXBuOk2Fr45AE)

## Our Solution: `Metrics Driven Development with Ragas`

![](https://docs.ragas.io/en/latest/_static/imgs/component-wise-metrics.png)

In [2]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

## The Data

In [3]:
# DirectoryLoader is imported from langchain_community (the package this
# notebook already uses for its other document loader and the vector store)
# rather than the deprecated langchain.document_loaders re-export.
from langchain_community.document_loaders import DirectoryLoader

# Load the files under ./data/ (using DirectoryLoader's default settings)
# into a list of LangChain documents.
loader = DirectoryLoader("./data/")
documents = loader.load()

In [12]:
# Mirror each document's 'source' metadata value under a 'file_name' key.
# NOTE(review): presumably needed by the Ragas testset generator -- confirm.
for document in documents:
    document.metadata.update({'file_name': document.metadata['source']})

In [13]:
# `docs` is an alias for the loaded `documents` list (same object, no copy);
# the cell displays how many documents were loaded.
docs = documents
len(docs)

26

In [14]:
# Split the documents into overlapping chunks (chunk_size=1000, chunk_overlap=200;
# units are whatever the splitter's length function measures -- characters by
# default, TODO confirm), then embed the chunks with OpenAIEmbeddings() and
# index them in a Chroma vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

## Testset Generation

In [61]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Build a Ragas TestsetGenerator through its with_openai() constructor, called
# here with no arguments (so it uses that constructor's defaults).
generator = TestsetGenerator.with_openai()

# Generate a synthetic testset of 10 samples from the loaded documents.
# `distributions` weights the question evolution types: half `simple`,
# a quarter `reasoning` and a quarter `multi_context` (weights sum to 1.0).
testset = generator.generate_with_langchain_docs(
    documents, 
    test_size=10, 
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25}
)

embedding nodes:   0%|          | 0/120 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [62]:
# Convert the generated testset to a pandas DataFrame and display it.
# `test_df` is later uploaded to LangSmith as the evaluation dataset.
test_df = testset.to_pandas()
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,How do the 6-week or 8-week cycles and the con...,[How We Work\n\nCycles\n\nWe work in 6-week or...,The 6-week or 8-week cycles and the concept of...,simple,True
1,"What does it mean when it is said that ""market...",[37signals Is You\n\nEveryone working at 37sig...,"When it is said that ""marketing is everyone's ...",simple,True
2,"What is the meaning of ""PDI"" and how is it use...",[Vocabulary\n\n37signals has over time develop...,"The meaning of ""PDI"" in the context of project...",simple,True
3,How does 37signals recognize mastery and how d...,"[ how we recognize mastery, it’s by no means a...",37signals recognizes mastery by using titles t...,simple,True
4,How does 37signals determine pay and promotion...,"[ how we recognize mastery, it’s by no means a...",37signals determines pay and promotions based ...,simple,True
5,What are the criteria used to assess the level...,[Titles for Designers\n\nWe use the following ...,The criteria used to assess the level and titl...,reasoning,True
6,How does 37signals contribute to the success o...,[37signals Is You\n\nEveryone working at 37sig...,37signals contributes to the success of open s...,reasoning,True
7,How does Basecamp help small businesses with p...,"[ thinking. Since day one, we’ve always done t...","Basecamp helps small businesses with projects,...",multi_context,True
8,"""What is the connection between 37signals' ope...",[37signals Is You\n\nEveryone working at 37sig...,The connection between 37signals' open source ...,multi_context,True
9,What are some of the key influences that have ...,[What Influenced Us\n\nIf you want to learn th...,Some of the key influences that have shaped th...,simple,True


In [63]:
# upload to langsmith

from langsmith import Client
from langsmith.utils import LangSmithNotFoundError

client = Client()
dataset_name = "basecamp"
dataset_desc = "Synthetic testset data for basecamp"

# Reuse the LangSmith dataset if it already exists; otherwise create it from
# the generated testset so the cell can be re-run safely.
try:
    # check if dataset exists
    dataset = client.read_dataset(dataset_name=dataset_name)
    print("using existing dataset: ", dataset.name)
except LangSmithNotFoundError:
    # Catch the exception class that is actually imported in this cell. The
    # previous `except LangSmithError` referenced an undefined name, so a
    # missing dataset surfaced as NameError instead of reaching this branch.
    # If the dataset is not found, create a new one from the generated examples:
    # "question" is the input, "ground_truth" the reference output.
    dataset = client.upload_dataframe(
        df=test_df,
        name=dataset_name,
        input_keys=["question"],
        output_keys=["ground_truth"],
        description=dataset_desc
    )

    print("Created a new dataset: ", dataset.name)

Created a new dataset:  basecamp


Failed to batch ingest runs: LangSmithError('Failed to post https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Request body is not valid JSON"}\')')
Failed to batch ingest runs: LangSmithError('Failed to post https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Request body is not valid JSON"}\')')
Failed to batch ingest runs: LangSmithError('Failed to post https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Request body is not valid JSON"}\')')


Now you have a nice way to view the generated test examples in LangSmith and select the ones you want.

## Baselines

Now let's build two baselines and compare them using the metrics available through Ragas. The first metric will be `AnswerCorrectness`.

The question I'm curious about is:
> Is RAG actually better than a plain LLM for this data distribution?

Well, let's compare, shall we? We'll take two examples:
1. Vanilla RAG from Langchain
2. GPT 3.5 and my humble prompts

First, create both.

### RAG

In [9]:
# Building blocks for the RAG baseline: a retriever over the Chroma vector
# store built from the ./data/ documents, the "rlm/rag-prompt" prompt pulled
# from the LangChain hub, and a gpt-3.5-turbo chat model with temperature=0.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    """Join the ``page_content`` of every item in ``docs`` with blank lines.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents separated by two newlines
        (an empty string when ``docs`` is empty).
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)


# RAG pipeline: the input question is sent to the retriever (whose documents
# are flattened to text by format_docs) to fill "context", and is passed
# through unchanged as "question"; the filled prompt goes to the LLM and the
# reply is parsed by StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [109]:
# Smoke-test the RAG chain on the first generated test question.
q = testset.to_pandas()["question"][0]
print("Q: ", q)
resp = rag_chain.invoke(q)
print("A: ", resp)

Q:  How do the 6-week or 8-week cycles and the concept of a "scope hammer" contribute to the work process at 37signals?
A:  The 6-week or 8-week cycles at 37signals provide a fixed cadence that creates an internal sense of urgency and serves as a scope hammer to prevent projects from expanding. The concept helps break big projects into smaller ones that can be completed within the designated time frame and encourages bundling smaller tasks into a manageable scope of work for discussion. This approach is particularly important for product teams to designate work with scope in mind upfront and avoid projects exceeding the budgeted time frame.


But we also want the source documents returned alongside the answer.

In [96]:
from langchain_core.runnables import RunnableParallel

# Answer-generation sub-chain: takes a dict that has a "context" entry
# (retrieved documents), replaces that entry with the text produced by
# format_docs, then runs prompt -> LLM -> string parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Build {"context": <retrieved documents>, "question": <input>} and then add
# an "answer" key computed by rag_chain_from_docs, so the result carries the
# source documents together with the generated answer.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

# NOTE(review): this sample query is not one of the generated testset
# questions shown in this notebook -- it looks like a tutorial leftover;
# consider invoking with a question from the testset instead.
resp = rag_chain_with_source.invoke("What is Task Decomposition")
resp

{'context': [Document(page_content='All that being said, you should still ensure that there is ample overlap with the people you work with most of the time. While most roadblocks can just as well be cleared in 15-30-60 minutes, they become real annoying if it’s a one-day turn-around every time.\n\nIn certain departments, like Support and Ops, it’s even more important that people are dependently available when they say they will be. That work has a lot of interrupt-based jobs that simply needs to be done right here, right now. So what applies to almost all work for design and programming and QA may well apply a little less frequently there.\n\nIn self-sufficient, independent teams\n\nOrganizational theory is thick with descriptions of the trade-offs between functional and project company structures. We seek to be more project than functional. This means a single project team should be able to go from idea to deploy as independently as possible.', metadata={'source': 'data/how-we-work.md

### LLM

In [38]:
from langchain_core.prompts import PromptTemplate

# Prompt for the LLM-only baseline (no retrieval step).
# NOTE(review): the template tells the model to use "the following pieces of
# context" but has no {context} placeholder -- only {question} is filled in.
# It also asks the model to append "thanks for asking!", which ends up in the
# answers that are later scored. Confirm both are intended for this baseline.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

Question: {question}

Helpful Answer:"""
llm_prompt = PromptTemplate.from_template(template)

# LLM-only chain: the raw input is passed through as "question", formatted
# with llm_prompt, sent to the same `llm` used by the RAG chain, and parsed
# by StrOutputParser.
just_llm = (
    {"question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)

In [39]:
# Smoke-test the LLM-only chain on the first generated test question.
# NOTE(review): the saved output of this cell shows a different question than
# the RAG smoke-test cell although both read question[0]; the two outputs were
# probably produced from different testset generations -- re-run to confirm.
q = testset.to_pandas()["question"][0]
print("Q: ", q)
resp = just_llm.invoke(q)
print("A: ", resp)

Q:  What are the criteria for assessing the level and title of ops at 37signals, specifically for the Junior Site Reliability Engineer position?
A:  The criteria for assessing the level and title of ops at 37signals, specifically for the Junior Site Reliability Engineer position, are based on technical skills, experience with relevant tools and technologies, and the ability to work effectively in a team. Thanks for asking!


## Evaluate

In [98]:
from ragas.metrics import faithfulness, answer_correctness, answer_similarity
from ragas.llms import llm_factory
from ragas.embeddings import embedding_factory

In [83]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator

In [106]:
# Configure the shared Ragas metric objects before they are used for scoring:
# - answer_similarity gets the embeddings returned by embedding_factory()
# - answer_correctness gets the LLM returned by llm_factory()
# - answer_correctness is pointed at the configured answer_similarity metric
# Both factories are called with no arguments, i.e. with their defaults.
answer_similarity.embeddings=embedding_factory()
answer_correctness.llm = llm_factory()
answer_correctness.answer_similarity = answer_similarity

def _context_text(doc):
    """Return the text of one retrieved context item.

    Accepts either an object exposing ``page_content`` or a dict with a
    "page_content" key (assumption: documents may have been serialized to
    dicts before reaching the evaluator -- TODO confirm against the tracer).
    """
    if isinstance(doc, dict):
        return doc.get("page_content", "")
    return doc.page_content


@run_evaluator
def ragas_scores(run, example) -> EvaluationResult:
    """Score one traced run with the Ragas ``answer_correctness`` metric.

    Args:
        run: the traced run; ``run.outputs`` holds the chain result. A plain
            string result is read from the "output" key. A dict result with
            "context" and "answer" keys is accepted either at the top level
            of ``run.outputs`` or nested under "output".
        example: the dataset example; ``example.inputs["question"]`` and
            ``example.outputs["ground_truth"]`` are read.

    Returns:
        EvaluationResult with key "answer_correctness" and the metric score.

    Raises:
        KeyError: if the example lacks "question" or "ground_truth".
    """
    # run.outputs can be missing (e.g. a failed run); treat that as no answer.
    outputs = run.outputs or {}
    pred = outputs.get("output", outputs) or ""
    question = example.inputs["question"]
    ground_truth = example.outputs["ground_truth"]

    if isinstance(pred, str):
        # LLM-only chain: there are no retrieved contexts.
        contexts = [""]
        answer = pred
    else:
        # Chain that returns sources: pull the context texts and the answer
        # string out of the result dict (fixes the `page_context` typo and the
        # attribute access on a dict in the previous version).
        contexts = [_context_text(d) for d in pred.get("context") or []] or [""]
        answer = pred.get("answer") or ""

    # Pass the extracted answer text (not the raw prediction object) to Ragas.
    score = answer_correctness.score({
        "question": question,
        "contexts": contexts,
        "answer": answer,
        "ground_truth": ground_truth,
    })
    return EvaluationResult(key="answer_correctness", score=score)

In [110]:
# Evaluation config that applies the custom ragas_scores evaluator to each run.
eval_config = RunEvalConfig(
    custom_evaluators=[ragas_scores],
)
# Run the LLM-only baseline over every example of the LangSmith dataset and
# score it with the evaluator configured above.
# NOTE(review): only `just_llm` is evaluated in this notebook as shown; the
# RAG chain would need its own run for the promised comparison.
run = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=just_llm,
    evaluation=eval_config,
    verbose=True,
    # Any experiment metadata can be specified here
    project_metadata={"version": "1.0.0"},
)

View the evaluation results for project 'ordinary-building-39' at:
https://smith.langchain.com/o/9bfbddc5-b88e-41e5-92df-2a62f0c64b4b/datasets/f3408987-0cab-46c2-9219-3dc52f525f0a/compare?selectedSessions=6d8cf192-919f-43c3-bf7e-4236ab739439

View all tests for Dataset basecamp at:
https://smith.langchain.com/o/9bfbddc5-b88e-41e5-92df-2a62f0c64b4b/datasets/f3408987-0cab-46c2-9219-3dc52f525f0a
[------------------------------------------------->] 10/10

Unnamed: 0,feedback.answer_correctness,error,execution_time,run_id
count,10.0,0.0,10.0,10
unique,,0.0,,10
top,,,,7810e0e1-0e31-4f56-a403-f9a72a2b79d0
freq,,,,1
mean,0.536848,,1.2802,
std,0.172317,,0.213458,
min,0.215803,,0.997127,
25%,0.42228,,1.086276,
50%,0.533798,,1.278777,
75%,0.694764,,1.465362,


## Need help with improving RAG at your company? -> we are conducting Office-Hours

More info will be available on bookface shortly but chat with us if you want to know more.