## LangChain Evaluation/Benchmarking using TruLens

### Init the Environment

In [None]:
! pip3 install -U trulens-eval # includes lang-chain as a dependency
! pip3 install -U ipython ipywidgets # required for the trulens UI to run from inside the notebook
! pip3 install -U llama-index # for the llamaindex-cli tool to download datasets
! pip3 install -U astrapy # to access AstraDB vector store

You should restart your environment after installing the above.

In [None]:
# Load configuration and secrets from a local .env file.
# This notebook assumes the following env vars exist in that .env file:
#
# ASTRA_DB_API_ENDPOINT=https://<uuid>-<region>.apps.astra.datastax.com
# ASTRA_DB_APPLICATION_TOKEN=AstraCS:<secret>:<secret>
# AZURE_OPENAI_ENDPOINT=https://<domain>.openai.azure.com/
# AZURE_OPENAI_API_KEY=<secret>
# OPENAI_API_TYPE=azure
# OPENAI_API_VERSION=2023-05-15

# Optionally, set this var as well to use an external database for TruLens:
# TRULENS_DB_CONN_STRING=<db connection string>

from dotenv import load_dotenv

load_dotenv()

In [None]:
# Name of the AstraDB collection used by the vector store created below.
astra_db_collection_name = "langchain_openai"

### Setup Azure LLMs for LangChain

In [None]:
# Setup Azure-based models
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings

# Both Azure deployments below are addressed with the same API version.
azure_api_version = "2023-05-15"

# Chat model used to answer questions in the RAG chain (temperature fixed at 0.0).
gpt_35_turbo = AzureChatOpenAI(
    temperature=0.0,
    model_version="0613",
    openai_api_version=azure_api_version,
    azure_deployment="gpt-35-turbo",
)

# Embedding model used by the vector store for the document chunks.
open_ai_embeddings = AzureOpenAIEmbeddings(
    openai_api_version=azure_api_version,
    azure_deployment="text-embedding-ada-002",
)

### Init an AstraDB vector store

In [None]:
from langchain_astradb import AstraDBVectorStore
import os

# Connection settings are read from the environment; os.getenv yields None for
# any variable that is not set.
astra_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
astra_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT")

# Vector store over the configured AstraDB collection, embedding with the
# Azure OpenAI embedding model defined earlier.
astra_db_vstore = AstraDBVectorStore(
    api_endpoint=astra_endpoint,
    token=astra_token,
    embedding=open_ai_embeddings,
    collection_name=astra_db_collection_name,
)

### Download Data

**Mini Squad V2**

Description: This is a subset of the original SquadV2 dataset. In particular, it considers only the top 10 Wikipedia pages in terms of having questions about them.

Number Of Examples: 195

Examples Generated By: Human

| Baseline | Context Similarity | Correctness | Faithfulness | Relevancy | LLM | Chunk Size | Similarity Top-K | Embed Model |
| ---      | ---                | ---          | ---           | ---        | --- | ---        | ---              | ---         |
| [llamaindex](https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_squadv2/llamaindex_baseline.py) | 0.878 | 3.464 | 0.815 | 0.697 | gpt-3.5-turbo | 1024 | 2 | text-embedding-ada-002 |

Source(s): https://huggingface.co/datasets/squad_v2

In [None]:
# Download the MiniSquadV2 dataset into ./data/mini_squad_v2 with the llamaindex-cli tool.
! llamaindex-cli download-llamadataset MiniSquadV2Dataset --download-dir ./data/mini_squad_v2

### Load documents into memory, chunk, create embeddings, store in AstraDB

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import TokenTextSplitter

# Read every .txt source file of the dataset. The encoding is pinned to UTF-8 so
# that loading does not depend on the platform's default text encoding.
loader = DirectoryLoader(
    'data/mini_squad_v2/source_files',
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

# Split into chunks of 512 tokens with no overlap between consecutive chunks.
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=0)

# Add the chunks to the AstraDB vector store.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm before re-running against an already populated collection.
astra_db_vstore.add_documents(splitter.split_documents(loader.load()))

### Get the list of queries and build the golden set truths

In [None]:
import json

# Read the benchmark examples. The encoding is given explicitly so the JSON is
# decoded as UTF-8 regardless of the platform's default text encoding.
with open("./data/mini_squad_v2/rag_dataset.json", encoding="utf-8") as f:
    examples = json.load(f)['examples']

# Questions to send through the RAG chain, in dataset order.
queries = [e["query"] for e in examples]

# Reference answers keyed by question, used later for the answer-correctness feedback.
golden_set = [
    {"query": e["query"], "response": e["reference_answer"]}
    for e in examples
]

### Build a LCEL chain

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

# Prompt text; the {context} and {question} placeholders are filled by the chain.
prompt_template = """
Answer the question based only on the supplied context. If you don't know the answer, say: "I don't know".
Context: {context}
Question: {question}
Your answer:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# The retriever over the AstraDB store supplies {context}; the chain's input is
# passed through unchanged as {question}.
retriever = astra_db_vstore.as_retriever()
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# inputs -> prompt -> chat model -> plain string output
rag_chain = chain_inputs | prompt | gpt_35_turbo | StrOutputParser()

In [None]:
# Smoke-test the chain with a single question before setting up the evaluation.
sample_question = "What show in New Zealand was the inspiration for the British Series Pop Idol?"
response = rag_chain.invoke(sample_question)
print(response)

## Setup Evaluation 

### Init TruLens

In [None]:
from trulens_eval import Tru

# Use the external TruLens database when a connection string is configured;
# otherwise construct Tru with its defaults.
trulens_db_url = os.getenv("TRULENS_DB_CONN_STRING")
tru = Tru(database_url=trulens_db_url) if trulens_db_url else Tru()

### Optional: Reset the database

In [None]:
# tru.reset_database()

### Optional: Start the Dashboard UI

Note that the dashboard may error on the first attempt. It should start on the 2nd try.

In [None]:
# Start the TruLens dashboard UI (optional step; re-run this cell if the first attempt errors).
tru.run_dashboard()

### Initialize Feedback Functions 

In [None]:
from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.feedback import Groundedness, GroundTruthAgreement
from trulens_eval import TruChain, Feedback
from trulens_eval.app import App
import numpy as np
# Provider that scores the feedback functions, backed by the Azure deployment.
azureOpenAI = AzureOpenAI(deployment_name="gpt-35-turbo")

# Selector for the retrieved context inside the RAG chain.
context = App.select_context(rag_chain)

# Groundedness: compares the collected context against the chain's output.
grounded = Groundedness(groundedness_provider=azureOpenAI)
f_groundedness = (
    Feedback(
        grounded.groundedness_measure_with_cot_reasons,
        name="groundedness",
    )
    .on(context.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: overall question versus overall answer.
f_answer_relevance = (
    Feedback(
        azureOpenAI.relevance_with_cot_reasons,
        name="answer_relevance",
    )
    .on_input_output()
)

# Context relevance: question versus each context chunk, averaged with np.mean.
f_context_relevance = (
    Feedback(
        azureOpenAI.qs_relevance_with_cot_reasons,
        name="context_relevance",
    )
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

# Answer correctness: the chain's answer versus the golden-set reference answer.
ground_truth_collection = GroundTruthAgreement(golden_set, provider=azureOpenAI)
f_answer_correctness = (
    Feedback(
        ground_truth_collection.agreement_measure,
        name="answer_correctness",
    )
    .on_input_output()
)

## Run Evaluation

We use the deferred mode, to ensure all evaluations run to completion.

1. Start the script `tru_evaluate.py` in a terminal. Note that this file depends on `tru_shared.py` and your `.env` file.
1. Run the code below to initiate the evaluation process
1. Wait for the script to claim it is finished. Stop the Script.
   * You are looking for something like `✅✅✅ Finished evaluating deferred feedback functions.`
   * The first few times this message appears, it may be premature: the initial deferred functions have finished, but the script still needs to re-check whether more remain to be evaluated.

Notes:
* You will see this warning often: `Callback class OpenAICallback is registered for handling create but there are no endpoints waiting to receive the result.`
  * It is a known issue and doesn't impact the results
* It will take about 20 minutes to finish the evaluations for the 195 queries in the dataset

In [None]:
# Identifier under which this run is recorded and later queried in TruLens.
app_id = "langchain_astra_512"

In [None]:
# Feedback functions attached to every recorded call.
feedback_functions = [
    f_answer_relevance,
    f_context_relevance,
    f_groundedness,
    f_answer_correctness,
]

# Recorder wrapping the RAG chain; feedback is evaluated in "deferred" mode.
tru_recorder = TruChain(
    rag_chain,
    app_id=app_id,
    feedbacks=feedback_functions,
    feedback_mode="deferred",
)

# Invoke the chain once per benchmark query, each inside its own recording context.
for query in queries:
    with tru_recorder as recording:
        rag_chain.invoke(query)

## Perform Analysis

### Download Result Data

In [None]:
# This downloads the full set of records from the database for the given app(s).
# dfRecords is used below as a DataFrame, and feedbackColumns as the list of
# its feedback column names.
dfRecords, feedbackColumns = tru.get_records_and_feedback([app_id])

In [None]:
# The downloaded records also carry the full trace information of each call;
# only the columns needed for the analysis are retained, to save memory.

# note that token & cost data collection is currently broken with AzureOpenAI

columns_to_keep = feedbackColumns + [
    "record_id",
    "input",
    "output",
    "tags",
    "latency",
    "total_tokens",
    "total_cost",
]

# Every column not explicitly retained is dropped in place.
retained = set(columns_to_keep)
columns_to_drop = [name for name in dfRecords.columns if name not in retained]

dfRecords.drop(columns=columns_to_drop, inplace=True)

dfRecords

### Analysis

Compute the mean, median, 95th percentile, and 99th percentile of the evaluations.

In [None]:
import numpy as np
import pandas as pd

# Columns to summarize: every feedback score plus the latency/token/cost metrics.
tests = feedbackColumns + ["latency", "total_tokens", "total_cost"]

summary_columns = ["records", "mean", "median", "95th_percentile", "99th_percentile"]

# One summary row per test, keyed by the test (column) name.
summary_rows = {}
for test in tests:
    # Missing values mark evaluations that have not produced a result (yet).
    data = dfRecords[test].dropna().to_numpy(dtype=float)

    if data.size == 0:
        # No results for this test so far: report zero records and NaN
        # statistics rather than calling the numpy reductions on empty input.
        summary_rows[test] = [0, np.nan, np.nan, np.nan, np.nan]
        continue

    percentile_95, percentile_99 = np.percentile(data, [95, 99])
    summary_rows[test] = [
        data.size,
        np.mean(data),
        np.median(data),
        percentile_95,
        percentile_99,
    ]

# Build the frame once from all rows instead of growing it row by row.
results = pd.DataFrame.from_dict(
    summary_rows, orient="index", columns=summary_columns
).astype({"records": "int"})

results

If the number of records differs between tests, the evaluations have not yet completed.