# Evaluating LangChain QA Chains

- build a chain
- generate questions
- evaluate with ragas
- save it

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator

# Wrap the NYC Wikipedia text file in a loader and hand it to a default
# VectorstoreIndexCreator; `index.vectorstore` is what later cells turn into
# a retriever.
loader = TextLoader(file_path="./nyc_wikipedia/nyc_text.txt")
index = VectorstoreIndexCreator().from_loaders(loaders=[loader])

In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Build a RetrievalQA chain on top of the index's vector store.
# return_source_documents=True is required downstream: the ragas evaluators
# look for a "source_documents" entry in each prediction.
llm = ChatOpenAI()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    return_source_documents=True,
    retriever=index.vectorstore.as_retriever(),
)

# Smoke-test the chain with one question and display only the answer text.
question = "How did New York City get its name?"
result = qa_chain({"query": question})
result["result"]

'New York City was named after the Duke of York, who would later become King James II of England. In 1664, when England seized the territory of New Netherland from Dutch control, King Charles II appointed the Duke as the proprietor of the former territory, including the city of New Amsterdam. The Duke of York then renamed the city to New York.'

In [4]:
def create_qa_chain(return_context=True):
    """Build a new RetrievalQA chain from the notebook-level `llm` and `index`.

    Args:
        return_context: Forwarded to `RetrievalQA.from_chain_type` as
            `return_source_documents`. Defaults to True.

    Returns:
        The chain object created by `RetrievalQA.from_chain_type`.
    """
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=index.vectorstore.as_retriever(),
        return_source_documents=return_context,
    )

**TODO**
- Also support async chains.

In [5]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.evaluation import DatasetGenerator

# Read the source article explicitly as UTF-8. Without `encoding=`, open()
# falls back to the platform's locale encoding, which can fail or garble
# non-ASCII characters on systems whose default is not UTF-8.
with open("./nyc_wikipedia/nyc_text.txt", encoding="utf-8") as f:
    docs = [Document(text=f.read())]

# Generate evaluation questions from the single-document corpus; the argument
# 5 matches the five questions shown in the recorded output.
question_generator = DatasetGenerator.from_documents(docs)
eval_questions = question_generator.generate_questions_from_nodes(5)

eval_questions

['What is the population of New York City as of 2020?',
 'Which borough of New York City has the highest population?',
 'What is the economic significance of New York City?',
 'How did New York City get its name?',
 'What is the significance of the Statue of Liberty in New York City?']

In [6]:
from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy

In [7]:
# One RagasEvaluatorChain per metric, created in the same order as the names
# on the left-hand side (faithfulness, answer relevancy, context relevancy).
faithfulness_chain, answer_rel_chain, context_rel_chain = (
    RagasEvaluatorChain(metric=metric)
    for metric in (faithfulness, answer_relevancy, context_relevancy)
)

In [15]:
# Rebuild the chain with source documents enabled instead of relying on
# whatever `qa_chain` currently holds in the kernel: the ragas evaluators fail
# with "Missing some input keys: {'source_documents'}" when the prediction was
# produced by a chain that does not return its retrieved context.
qa_chain = create_qa_chain(return_context=True)
result = qa_chain({"query": eval_questions[0]})

In [16]:
# Score the single QA result with every metric. Collecting the outputs in one
# dict makes all three visible; written as three bare statements, only the
# last call's value is displayed and the first two are silently discarded.
{
    "faithfulness": faithfulness_chain(result),
    "answer_relevancy": answer_rel_chain(result),
    "context_relevancy": context_rel_chain(result),
}

[]

ValueError: Missing some input keys: {'source_documents'}

In [13]:
from tqdm import tqdm

# Use a chain that is guaranteed to return its source documents.
# `faithfulness_chain.evaluate` rejects predictions without a
# "source_documents" entry, so do not depend on the kernel's current
# `qa_chain` having been created with that option.
qa_chain = create_qa_chain(return_context=True)

# One input dict per generated question, in the same order as eval_questions.
examples = [{"query": question} for question in eval_questions]

# generate answers (tqdm shows progress; each call queries the LLM)
predictions = [qa_chain(example) for example in tqdm(examples)]

100%|████████████████████████████████████████████████████████████| 5/5 [00:21<00:00,  4.29s/it]


In [14]:
# Batch-score faithfulness: examples and predictions are index-aligned lists
# built in the previous cell. The bare name on the last line displays the result.
r = faithfulness_chain.evaluate(examples=examples, predictions=predictions)
r

ValueError: "source_documents" is required in each prediction for the metric[faithfulness] you have chosen.

## Evaluate with LangSmith

In [35]:
# dataset creation

from langsmith import Client
from langsmith.utils import LangSmithError

client = Client()
dataset_name = "NYC test"

# Get-or-create: reuse the LangSmith dataset if it can be read, otherwise
# create it and seed it with one example per generated question.
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
except LangSmithError:
    # The read failed, so treat the dataset as missing and build it from the
    # generated queries.
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="NYC test dataset",
    )
    for q in eval_questions:
        client.create_example(inputs={"query": q}, dataset_id=dataset.id)
    print("Created a new dataset: ", dataset.name)
else:
    print("using existing dataset: ", dataset.name)

using existing dataset:  NYC test


In [37]:
from langchain.smith import RunEvalConfig, run_on_dataset

# Register the three ragas evaluator chains as custom evaluators.
# prediction_key names the chain output that holds the answer text ("result",
# the same key read from the chain output earlier in the notebook).
evaluation_config = RunEvalConfig(
    prediction_key="result",
    custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain],
)

# `create_qa_chain` is passed uncalled, as a factory, rather than as a chain
# instance. The input mapper is the identity: dataset examples already have
# the {"query": ...} shape the chain expects.
result = run_on_dataset(
    client,
    dataset_name,
    create_qa_chain,
    input_mapper=lambda example: example,
    evaluation=evaluation_config,
)

View the evaluation results for project '2023-08-18-08-23-27-RetrievalQA' at:
https://smith.langchain.com/projects/p/b0cbcd8e-dd06-4f00-95cb-11195ea2b0b7?eval=true
