# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [None]:
import os

from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file

In [None]:
# account for deprecation of LLM model
import datetime

# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

## Create our QandA application

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file, encoding="utf-8")
data = loader.load()

In [None]:
index = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch).from_loaders(
    [loader]
)

In [None]:
llm = ChatOpenAI(temperature=0.0, model=llm_model)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    verbose=True,
    chain_type_kwargs={"document_separator": "<<<<>>>>>"},
)

### Coming up with test datapoints

In [None]:
data[10]

In [None]:
data[11]

### Hard-coded examples

Below we are setting up hard-coded test cases.  We've sampled the data and contrived some questions that should return answers for the above products from our dataset.


In [112]:
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set\
        have side pockets?",
        "answer": "No",
    },
    {
        "query": "What collection is the Ultra-Lofty \
        850 Stretch Down Hooded Jacket from?",
        "answer": "The Drown Tech collection",
    },
]

### LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain

In [97]:
gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

In [98]:
gen_examples = gen_chain.apply_and_parse([{"doc": t} for t in data[:5]])

In [100]:
gen_examples[0]

{'qa_pairs': {'query': "What is the weight of each pair of Women's Campside Oxfords?",
  'answer': "The approximate weight of each pair of Women's Campside Oxfords is 1 lb. 1 oz."}}

In [None]:
data[0]

In [113]:
qa.run(examples[0]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'The Cozy Comfort Pullover Set, Stripe has side pockets.'

### Combine examples

In [107]:
examples += gen_examples
examples

[{'qa_pairs': {'query': 'Do the Cozy Comfort Pullover Set        have side pockets?',
   'answer': 'Yes'}},
 {'qa_pairs': {'query': 'What collection is the Ultra-Lofty         850 Stretch Down Hooded Jacket from?',
   'answer': 'The Drown Tech collection'}},
 {'qa_pairs': {'query': "What is the weight of each pair of Women's Campside Oxfords?",
   'answer': "The approximate weight of each pair of Women's Campside Oxfords is 1 lb. 1 oz."}},
 {'qa_pairs': {'query': 'What are the dimensions of the small and medium Recycled Waterhog Dog Mats?',
   'answer': 'The dimensions of the small mat are 18" x 28" and the dimensions of the medium mat are 22.5" x 34.5".'}},
 {'qa_pairs': {'query': "What are some features of the Infant and Toddler Girls' Coastal Chill Swimsuit?",
   'answer': "The swimsuit has bright colors, ruffles, and exclusive whimsical prints. The fabric is four-way-stretch and chlorine-resistant, and has a UPF rating of 50+ to block 98% of the sun's harmful rays. The crossover no

## Manual Evaluation

This shows how to debug.  With debug turned on, you can see the thought and retrieval process that the LLM is doing.

In [None]:
import langchain

langchain.debug = True

In [108]:
qa.run(examples[1]["qa_pairs"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.'

In [None]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation


In [114]:
predictions = qa.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [115]:
from langchain.evaluation.qa import QAEvalChain

In [116]:
llm = ChatOpenAI(temperature=0, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

In [117]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [118]:
graded_outputs

[{'results': 'INCORRECT'}, {'results': 'INCORRECT'}]

In [119]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]["query"])
    print("Real Answer: " + predictions[i]["answer"])
    print("Predicted Answer: " + predictions[i]["result"])
    print("Predicted Grade: " + graded_outputs[i]["results"])
    print()

Example 0:
Question: Do the Cozy Comfort Pullover Set        have side pockets?
Real Answer: No
Predicted Answer: The Cozy Comfort Pullover Set, Stripe has side pockets on the pull-on pants.
Predicted Grade: INCORRECT

Example 1:
Question: What collection is the Ultra-Lofty         850 Stretch Down Hooded Jacket from?
Real Answer: The Drown Tech collection
Predicted Answer: The Ultra-Lofty 850 Stretch Down Hooded Jacket is from the DownTek collection.
Predicted Grade: INCORRECT



In [120]:
graded_outputs[0]

{'results': 'INCORRECT'}

## LangChain evaluation platform

The LangChain evaluation platform, LangChain Plus, can be accessed here https://www.langchain.plus/.  
Use the invite code `lang_learners_2023`