# Evaluations with LangSmith

In [1]:
# Read settings from a local .env file into the process environment;
# later cells fetch OPENAI_API_KEY and ATLAS_CONNECTION_STRING via os.getenv.
# Dependency: python-dotenv  (%pip install -U python-dotenv)
import os
from dotenv import load_dotenv

load_dotenv(encoding="utf-8")

True

## Create a simple RAG pipeline

In [2]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from pymongo import MongoClient

In [3]:
# Fail fast with a readable message when the key was not loaded from .env:
# assigning None into os.environ would otherwise raise an opaque TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the .env file or the environment."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Chat model with library defaults; a later cell rebinds `llm` to an explicit
# gpt-3.5-turbo / temperature=0 configuration.
llm = ChatOpenAI()

# NOTE(review): disallowed_special=() presumably stops the tokenizer from
# rejecting text that contains special-token strings — confirm.
embedding_model = OpenAIEmbeddings(disallowed_special=())

In [4]:
# Read the connection string once and fail fast with a readable message if it
# is missing: assigning None into os.environ would raise an opaque TypeError.
atlas_connection_string = os.getenv("ATLAS_CONNECTION_STRING")
if atlas_connection_string is None:
    raise RuntimeError(
        "ATLAS_CONNECTION_STRING is not set; add it to the .env file or the environment."
    )
os.environ["ATLAS_CONNECTION_STRING"] = atlas_connection_string

# MongoDB client; note that a later cell rebinds `client` to a LangSmith Client,
# so keep using `atlas_collection` for database access.
client = MongoClient(atlas_connection_string)
db_name = "tech_innovators_db"
collection_name = "tech_innovators_collection"
atlas_collection = client[db_name][collection_name]

# Name of the vector search index used by the retriever.
index_name = "vector_index_erp"

In [5]:
def get_vector_store_retriver(index_name, embedding_model, collection, k=10):
  """Build an Atlas Vector Search store and a similarity retriever over it.

  Args:
    index_name: Name of the vector search index to query.
    embedding_model: Embeddings object handed to the vector store.
    collection: MongoDB collection backing the vector store.
    k: Number of documents requested per query (defaults to 10, the value
      that was previously hard-coded).

  Returns:
    A ``(vector_store, retriever)`` tuple.
  """
  # Use the collection supplied by the caller; the previous version silently
  # ignored this argument and read the notebook-level `atlas_collection`.
  vector_store = MongoDBAtlasVectorSearch(
      embedding=embedding_model,
      collection=collection,
      index_name=index_name,
  )

  retriever = vector_store.as_retriever(
      search_type="similarity",
      search_kwargs={"k": k},
  )

  return vector_store, retriever

# Build the store/retriever pair for the ERP vector index over the Atlas collection.
vector_store, retriever = get_vector_store_retriver(
    index_name="vector_index_erp",
    embedding_model=embedding_model,
    collection=atlas_collection,
)

In [6]:
# Get one example question from the dataset to smoke-test the chain
from langsmith import Client

# NOTE: this rebinds `client`, which held the MongoClient in an earlier cell.
client = Client()
examples = list(client.list_examples(dataset_name="hr test"))

# An empty dataset would otherwise surface as a bare IndexError below.
if not examples:
    raise ValueError('LangSmith dataset "hr test" contains no examples.')

# Inputs of the first example, e.g. {"question": "..."}
q = examples[0].inputs
q

{'question': "What information should be included in the policies and procedures section of the company's welcome message?"}

In [7]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain import hub

In [8]:
# Retriever over the Atlas vector store: similarity search returning 10 documents per query.
similarity_top_10 = {"search_type": "similarity", "search_kwargs": {"k": 10}}
vectorstore_retriever = vector_store.as_retriever(**similarity_top_10)

# RAG prompt pulled from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

# Chat model used for generation (replaces the default-configured `llm` from above).
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    """Merge the ``page_content`` of each document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


def ragas_output_parser(docs):
    """Collect the ``page_content`` of every document into a list, in order."""
    contexts = []
    for document in docs:
        contexts.append(document.page_content)
    return contexts

  prompt = loads(json.dumps(prompt_object.manifest))


In [9]:
from langchain_core.runnables import RunnableParallel

# Generation half of the pipeline: prompt -> chat model -> plain string.
generator = prompt | llm | StrOutputParser()

# Accept either a bare question or a dict carrying it under "question"
# (the shape of the dataset example inputs), and pass on just the question.
filter_langsmith_dataset = RunnableLambda(
    lambda x: x["question"] if isinstance(x, dict) else x
)

# Retrieval half: fetch documents, flatten them into one context string, and
# forward the question unchanged.
# NOTE: this rebinds `retriever`, which earlier held the vector-store retriever.
retriever = RunnableParallel(
    {
        "context": vectorstore_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
)

# The two branches below each call the vector store, so every invocation of
# rag_chain performs retrieval twice (once for the answer, once for contexts).
answer_branch = filter_langsmith_dataset | retriever | generator
contexts_branch = filter_langsmith_dataset | vectorstore_retriever | ragas_output_parser

# Final chain output: {"question": ..., "answer": ..., "contexts": [...]}.
rag_chain = RunnableParallel(
    {
        "question": filter_langsmith_dataset,
        "answer": answer_branch,
        "contexts": contexts_branch,
    }
)

In [10]:
# Smoke test: run the sample question through the whole chain and display
# only the generated answer text.
get_answer = RunnableLambda(lambda x: x["answer"])
answer_only_chain = rag_chain | get_answer
resp = answer_only_chain.invoke(q)
resp

"The policies and procedures section of the company's welcome message should include information on employee conduct, dress code, attendance expectations, and any other important guidelines for working at the company. It should also cover topics such as safety protocols, data security measures, and communication channels within the organization. Additionally, the policies and procedures section should outline the process for reporting any issues or concerns to the appropriate personnel."

## Open a test dataset in LangSmith

In [11]:
# look up the evaluation dataset (this cell does not create it; it must already exist in LangSmith)
from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()
dataset_name = "hr test"

try:
    # Look the dataset up by name; the handler below covers the case where
    # the lookup raises LangSmithError.
    dataset = client.read_dataset(dataset_name=dataset_name)
    print("using existing dataset: ", dataset.name)
except LangSmithError:
    # `dataset` is never bound when read_dataset fails, so report the name we
    # asked for instead of `dataset.name` (which raised NameError here).
    print("No dataset exist: ", dataset_name)

using existing dataset:  hr test


## Create RAGAS evaluation chain

In [None]:
#from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
#from ragas.integrations.langchain import EvaluatorChain
#from ragas.metrics import (
#    faithfulness,
#    answer_relevancy,
#    context_precision,
#    context_recall,
#)

# create evaluation chains
#faithfulness_chain = EvaluatorChain(metric=faithfulness)
#answer_rel_chain = EvaluatorChain(metric=answer_relevancy)
#context_rel_chain = EvaluatorChain(metric=context_precision)
#context_recall_chain = EvaluatorChain(metric=context_recall)

In [None]:
#from langchain.smith import RunEvalConfig, run_on_dataset

#evaluation_config = RunEvalConfig(
#    custom_evaluators=[
#        faithfulness_chain,
#        answer_rel_chain,
#        context_rel_chain,
#        context_recall_chain,
#    ],
#    prediction_key="result",
#)

#result = run_on_dataset(
#    client,
#    dataset_name,
#    create_qa_chain,
#    evaluation=evaluation_config,
#    input_mapper=lambda x: x,
#)

## Run evaluation on LangSmith

### Baseline (Without RAG)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda

# Prompt for the no-retrieval baseline: the model answers from the question alone.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

Question: {question}

Helpful Answer:"""
llm_prompt = PromptTemplate.from_template(template)

just_llm = (
    # Dataset examples arrive as {"question": "..."}; unwrap them the same way
    # rag_chain does so the prompt receives the question text rather than the
    # string form of the whole dict. Plain string inputs pass through unchanged.
    {"question": filter_langsmith_dataset}
    | llm_prompt
    | ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    | StrOutputParser()
    # Shape the output like rag_chain's ("answer" + "contexts"); the baseline
    # retrieves nothing, so contexts is a single empty string.
    | RunnableParallel(
        {
            "answer": RunnablePassthrough(),
            "contexts": RunnableLambda(lambda _: [""]),
        }
    )
)

In [None]:
from ragas.integrations.langchain import EvaluatorChain
# the metric we will be using
from ragas.metrics import answer_correctness
from ragas.integrations.langsmith import evaluate

# Score the no-retrieval baseline on the dataset with the answer_correctness metric.
baseline_metrics = [answer_correctness]
run = evaluate(
    dataset_name=dataset_name,
    llm_or_chain_factory=just_llm,
    experiment_name="just_llm_1",
    metrics=baseline_metrics,
    verbose=True,
)

### With RAG

In [18]:
from ragas.integrations.langchain import EvaluatorChain
# the metric we will be using
from ragas.metrics import answer_correctness
from ragas.integrations.langsmith import evaluate
#from ragas import evaluate

In [20]:
dataset_name = "hr test"

# Evaluate the RAG chain on the dataset.
# No `metrics` argument is passed (metrics=[answer_correctness] is disabled),
# so evaluate() uses whatever its defaults are.
# NOTE(review): the baseline experiment passes metrics=[answer_correctness];
# confirm whether both experiments are meant to be scored the same way.
# NOTE(review): the recorded output of this cell shows the evaluators failing
# with "There is no current event loop in thread ..." — the run completed but
# the evaluator feedback appears to be missing; verify before trusting results.
rag_eval_options = dict(
    dataset_name=dataset_name,
    llm_or_chain_factory=rag_chain,
    experiment_name="rag_chain_5",
    verbose=True,
)
run = evaluate(**rag_eval_options)

View the evaluation results for project 'rag_chain_5' at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/f04f14f3-f165-48c3-8d94-dbf759844c7d/compare?selectedSessions=da581b53-3f5c-4dac-a714-f328ea6f9693

View all tests for Dataset hr test at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/f04f14f3-f165-48c3-8d94-dbf759844c7d
[>                                                 ] 0/9

Error evaluating run bbf5e936-5fea-436f-89e3-3e217e5a5b1d with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_1'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHu

[----->                                            ] 1/9

Error evaluating run 8f3c4545-8bcf-414b-9b0b-12cfa58a6d16 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_3'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHu

[---------->                                       ] 2/9

Error in EvaluatorCallbackHandler.on_chain_end callback: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_4'.")
Error evaluating run 4c8eb819-b5e6-4562-a7c3-32500687d470 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_0'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\ve

[--------------------->                            ] 4/9

Error in EvaluatorCallbackHandler.on_chain_end callback: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_2'.")


[--------------------------->                      ] 5/9

Error evaluating run bc81cee9-46a8-4166-8928-24e1d6f2a6c7 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_1'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHu

[-------------------------------->                 ] 6/9

Error evaluating run 27e06bc8-ac04-47ac-bf05-31e9a4848db6 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_0'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHu

[------------------------------------------->      ] 8/9

Error evaluating run b30a0e8d-1712-429d-9f54-e3b36face358 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-40_3'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHu

[------------------------------------------------->] 9/9

Unnamed: 0,error,execution_time,run_id
count,0.0,9.0,9
unique,0.0,,9
top,,,4c8eb819-b5e6-4562-a7c3-32500687d470
freq,,,1
mean,,1.931884,
std,,0.455481,
min,,1.218915,
25%,,1.609093,
50%,,1.936864,
75%,,2.215408,


In [1]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Grading prompt (answer vs. reference) pulled from the LangChain Hub.
grade_prompt_answer_accuracy = hub.pull("langchain-ai/rag-answer-vs-reference")
# `prompt` is bound to the same object, which replaces the RAG prompt pulled
# in an earlier cell — re-run that cell before rebuilding the RAG chain.
prompt = grade_prompt_answer_accuracy

def answer_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer accuracy.

    Sends the question, the reference answer and the chain's answer to an LLM
    grader and returns its score.

    Args:
        run: Run whose ``outputs["answer"]`` holds the chain's answer.
        example: Dataset example. The question is read from
            ``inputs["input_question"]`` or, failing that, ``inputs["question"]``
            (the key used by this notebook's dataset). The reference answer is
            read from ``outputs["output_answer"]`` or ``outputs["ground_truth"]``.

    Returns:
        ``{"key": "answer_v_reference_score", "score": <grader score>}``.

    Raises:
        KeyError: If none of the accepted keys is present, or the run has no
            ``"answer"`` output.
    """

    def _first_present(mapping, keys):
        # Return the value stored under the first of `keys` found in `mapping`.
        for key in keys:
            if key in mapping:
                return mapping[key]
        raise KeyError(f"none of {keys!r} found; available keys: {sorted(mapping)!r}")

    # Get question, ground truth answer, RAG chain answer.
    # NOTE(review): "ground_truth" is assumed as the alternative reference key —
    # confirm against the actual dataset schema.
    input_question = _first_present(example.inputs, ("input_question", "question"))
    reference = _first_present(example.outputs, ("output_answer", "ground_truth"))
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt piped into the grader model
    answer_grader = grade_prompt_answer_accuracy | llm

    # Run evaluator; the result is indexed by "Score" below, so the grader is
    # presumably returning structured (dict-like) output — confirm.
    score = answer_grader.invoke({"question": input_question,
                                  "correct_answer": reference,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_v_reference_score", "score": score}