# Visualising the Evaluations with LangSmith

In [1]:
# Load the .env file
#pip install -U python-dotenv
import os
from dotenv import load_dotenv
load_dotenv(encoding='utf-8')

True

## Create a simple RAG pipeline

In [21]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from pymongo import MongoClient
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv
load_dotenv(encoding='utf-8')

True

In [22]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI()
embedding_model=OpenAIEmbeddings(disallowed_special=())

In [23]:
os.environ["ATLAS_CONNECTION_STRING"] = os.getenv("ATLAS_CONNECTION_STRING")
client = MongoClient(os.environ["ATLAS_CONNECTION_STRING"])
db_name = "tech_innovators_db"
collection_name = "tech_innovators_collection"
atlas_collection = client[db_name][collection_name]
index_name = "vector_index_erp"

In [24]:
def get_vector_store_retriver(index_name, embedding_model, collection):

  vector_store = MongoDBAtlasVectorSearch(
      embedding = embedding_model,
      collection = atlas_collection,
      index_name = index_name
  )

  retriever = vector_store.as_retriever(
      search_type = "similarity",
      search_kwargs = { "k": 10 }
  )

  return(vector_store, retriever)

vector_store, retriever = get_vector_store_retriver("vector_index_erp", embedding_model, atlas_collection)

In [25]:
# Instantiate Atlas Vector Search as a retriever
retriever = vector_store.as_retriever(
   search_type = "similarity",
   search_kwargs = { "k": 10 }
)

In [27]:
# Define a prompt template
template = """
Imagine you are an expert in corporate assistant and try to answer the below question
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

def format_docs(docs):
   return "\n\n".join(doc.page_content for doc in docs)

# Construct a chain to answer questions on your data
rag_chain = (
   { "context": retriever | format_docs, "question": RunnablePassthrough()}
   | custom_rag_prompt
   | llm
   | StrOutputParser()
)

In [28]:
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever, return_source_documents=True
)

In [29]:
# factory function that return a new qa chain
def create_qa_chain(return_context=True):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vector_store.as_retriever(search_type = "similarity",search_kwargs = { "k": 10 }),
        return_source_documents=return_context,
    )
    return qa_chain

## Open a test dataset in LangSmith

In [18]:
# dataset creation
from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()
dataset_name = "hr test"

try:
    # check if dataset exists
    dataset = client.read_dataset(dataset_name=dataset_name)
    print("using existing dataset: ", dataset.name)
except LangSmithError:
    # if not create a new one with the generated query examples
    #dataset = client.create_dataset(
    #    dataset_name=dataset_name, description="HR department test dataset"
    #)
    #for q in question:
    #   client.create_example(
    #        inputs={"query": q},
    #       dataset_id=dataset.id,
    #    )
    print("No dataset exist: ", dataset.name)

using existing dataset:  hr test


## Create RAGAS evaluation chain

In [19]:
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# create evaluation chains
faithfulness_chain = EvaluatorChain(metric=faithfulness)
answer_rel_chain = EvaluatorChain(metric=answer_relevancy)
context_rel_chain = EvaluatorChain(metric=context_precision)
context_recall_chain = EvaluatorChain(metric=context_recall)

In [30]:
from langchain.smith import RunEvalConfig, run_on_dataset

evaluation_config = RunEvalConfig(
    custom_evaluators=[
        faithfulness_chain,
        answer_rel_chain,
        context_rel_chain,
        context_recall_chain,
    ],
    prediction_key="result",
)

result = run_on_dataset(
    client,
    dataset_name,
    create_qa_chain,
    evaluation=evaluation_config,
    input_mapper=lambda x: x,
)

def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)


TypeError: 'Database' object is not callable

# Run evaluation on LangSmith

In [33]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda

template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

Question: {question}

Helpful Answer:"""
llm_prompt = PromptTemplate.from_template(template)

just_llm = (
    {"question": RunnablePassthrough()}
    | llm_prompt
    | ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    | StrOutputParser()
    | RunnableParallel(
        {
            "answer": RunnablePassthrough(),
            "contexts": RunnableLambda(lambda _: [""]),
        }
    )
)

In [34]:
from ragas.integrations.langchain import EvaluatorChain

# the metric we will be using
from ragas.metrics import answer_correctness
from ragas.integrations.langsmith import evaluate

In [35]:
dataset_name = "hr test"
# evaluate rag_chain
run = evaluate(
    dataset_name=dataset_name,
    llm_or_chain_factory=just_llm,
    experiment_name="just_llm_1",
    metrics=[answer_correctness],
    verbose=True,
)

View the evaluation results for project 'just_llm_1' at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/f04f14f3-f165-48c3-8d94-dbf759844c7d/compare?selectedSessions=3a32c6dc-6e03-4644-9502-6312370d0c8b

View all tests for Dataset hr test at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/f04f14f3-f165-48c3-8d94-dbf759844c7d
[>                                                 ] 0/9

Error evaluating run 807be56d-fc72-441f-95ea-3c2634e8a2c3 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-4_2'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHub

[---------------->                                 ] 3/9

Error evaluating run a91cf5be-4187-4097-9912-daea18202b74 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-4_1'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHub

[--------------------------->                      ] 5/9

Error evaluating run c1fb5128-3f57-4a8b-9dd4-6c8c75899feb with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-4_2'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHub

[-------------------------------------->           ] 7/9

Error evaluating run f57bae72-f8b1-4221-b9b7-397e2fd595c6 with EvaluatorChain: RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-4_0'.")
Traceback (most recent call last):
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain_core\tracers\evaluation.py", line 127, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\ragas\integrations\langchain.py", line 210, in evaluate_run
    eval_output = self.invoke(chain_eval, include_run_info=True)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 164, in invoke
    raise e
  File "d:\Document\GitHub\cohere-rag-eval\venv\Lib\site-packages\langchain\chains\base.py", line 154, in invoke
    self._call(inputs, run_manager=run_manager)
  File "d:\Document\GitHub

[------------------------------------------------->] 9/9

Unnamed: 0,error,execution_time,run_id
count,0.0,9.0,9
unique,0.0,,9
top,,,82276ec0-037e-4ae6-a10a-f231f47bd712
freq,,,1
mean,,0.940501,
std,,0.180954,
min,,0.666629,
25%,,0.785931,
50%,,0.968467,
75%,,1.107662,
