In [None]:
! pip3 install -U trulens-eval

In [None]:
# NOTE(review): this removes trulens-eval (installed by the cell above) together with
# langchain and llama-index. On a top-to-bottom run the later `langchain` and
# `trulens_eval` imports would then fail unless the packages are reinstalled —
# confirm whether this cell is only meant to be run when resetting the environment.
! pip3 uninstall -y langchain llama-index trulens-eval

In [None]:
# List the installed packages whose name contains "trulens".
! pip3 list | grep trulens

In [None]:
# Name of the Astra DB collection backing the vector store; it is also embedded
# in the TruLens app ids built in the evaluation cells.
collection_name = "open_ai_512"

In [None]:
from dotenv import load_dotenv

# Load the variables from the .env file into the process environment.
load_dotenv()

# this notebook assumes the following env vars exist in a .env file:
#
# ASTRA_DB_ENDPOINT
# ASTRA_DB_TOKEN
# AZURE_OPENAI_ENDPOINT
# AZURE_OPENAI_API_KEY
# OPENAI_API_VERSION
# TRULENS_DB_CONN_STRING  (read via os.getenv when TruLens is initialized)

## Setup Azure LLMs for LangChain

In [None]:
# Setup Azure-based models
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings

# Sampling temperature shared by every chat model defined below.
temperature = 0.0


def _azure_chat(deployment, model_version):
    """Build an AzureChatOpenAI client for one Azure deployment.

    Every chat model in this notebook uses the same API version and the
    module-level `temperature`; only the deployment name and model version vary.
    """
    return AzureChatOpenAI(
        azure_deployment=deployment,
        openai_api_version="2023-05-15",
        model_version=model_version,
        temperature=temperature,
    )


# Chat models, one per Azure deployment.
gpt_35_turbo = _azure_chat("gpt-35-turbo", "0613")
gpt_35_turbo_16k = _azure_chat("gpt-35-turbo-16k", "0613")
gpt_4 = _azure_chat("gpt-4", "1106-preview")
gpt_4_32k = _azure_chat("gpt-4-32k", "0613")

# Embedding model handed to the vector store.
embeddings = AzureOpenAIEmbeddings(
    openai_api_version="2023-05-15",
    azure_deployment="text-embedding-ada-002",
)

## Init an AstraDB vector store

In [None]:
from langchain.vectorstores.astradb import AstraDB
import os

# Vector store over the `collection_name` collection; endpoint and token are
# read from the environment populated by load_dotenv().
astra_db_vstore = AstraDB(
    api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
    token=os.getenv("ASTRA_DB_TOKEN"),
    embedding=embeddings,
    collection_name=collection_name,
)

## Load Datasets

In [None]:
import json

base_path = "./data/"

# datasets:   dataset directory name -> list of query strings
# golden_set: flat list of {"query", "response"} reference pairs across all datasets
datasets = {}
golden_set = []

# os.listdir returns entries in arbitrary order; sorting keeps the dataset order
# (and therefore the order of every evaluation loop below) reproducible.
for name in sorted(os.listdir(base_path)):
    dataset_dir = os.path.join(base_path, name)
    if not os.path.isdir(dataset_dir):
        continue
    # Read the JSON explicitly as UTF-8 instead of relying on the platform default.
    with open(os.path.join(dataset_dir, "rag_dataset.json"), encoding="utf-8") as f:
        examples = json.load(f)["examples"]
    datasets[name] = [e["query"] for e in examples]
    golden_set.extend(
        {"query": e["query"], "response": e["reference_answer"]}
        for e in examples
    )
    print("Loaded dataset: ", name)

## Build an LCEL chain

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser


# Prompt: answer strictly from the retrieved context, otherwise admit ignorance.
prompt_template = """
Answer the question based only on the supplied context. If you don't know the answer, say: "I don't know".
Context: {context}
Question: {question}
Your answer:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# The retriever output fills {context}; the raw input string fills {question}.
retriever = astra_db_vstore.as_retriever()
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# inputs -> prompt -> chat model -> plain string
rag_chain = chain_inputs | prompt | gpt_35_turbo | StrOutputParser()

In [None]:
# Smoke-test the chain: send one sample question and print the generated answer.
response = rag_chain.invoke("What are the symptoms?")
print(response)

## Setup Evaluation 

### Init TruLens

In [None]:
from trulens_eval import Tru
# Create the TruLens handle, pointing it at the database named by TRULENS_DB_CONN_STRING.
# NOTE(review): os.getenv returns None when the variable is unset; presumably Tru then
# falls back to its default local database — confirm.
tru = Tru(database_url=os.getenv("TRULENS_DB_CONN_STRING"))

### Start Dashboard

In [None]:
# Start the TruLens dashboard for the `tru` instance created above.
tru.run_dashboard()

### Initialize Feedback Functions 

In [None]:
from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.feedback import Groundedness, GroundTruthAgreement
from trulens_eval import TruChain, Feedback
from trulens_eval.app import App
import numpy as np
# Provider whose LLM scores every feedback function defined in this cell.
azureOpenAI = AzureOpenAI(deployment_name="gpt-35-turbo")

# Selector for the context retrieved inside rag_chain.
context = App.select_context(rag_chain)

# Groundedness: the collected context is compared against the chain output.
grounded = Groundedness(groundedness_provider=azureOpenAI)
f_groundedness = Feedback(
    grounded.groundedness_measure_with_cot_reasons,
    name="groundedness",
).on(context.collect()).on_output().aggregate(grounded.grounded_statements_aggregator)

# Answer relevance: the overall question against the overall answer.
f_answer_relevance = Feedback(
    azureOpenAI.relevance_with_cot_reasons,
    name="answer_relevance",
).on_input_output()

# Context relevance: the question against each context chunk, averaged with np.mean.
f_context_relevance = Feedback(
    azureOpenAI.qs_relevance_with_cot_reasons,
    name="context_relevance",
).on_input().on(context).aggregate(np.mean)

# Answer correctness: the answer against the reference answers in golden_set.
ground_truth_collection = GroundTruthAgreement(golden_set, provider=azureOpenAI)
f_answer_correctness = Feedback(
    ground_truth_collection.agreement_measure,
    name="answer_correctness",
).on_input_output()

## Run Evaluation

### Deferred

In [None]:
# Record every query of every dataset under one app id per dataset, with the
# feedback functions attached in deferred mode.
# NOTE(review): nothing in this notebook starts an evaluator; presumably one runs
# elsewhere against the same TruLens database — confirm.
for name, queries in datasets.items():
    app = f"{name}_{collection_name}"
    tru_recorder = TruChain(
        rag_chain,
        app_id=app,
        feedbacks=[
            f_answer_relevance,
            f_context_relevance,
            f_groundedness,
            f_answer_correctness,
        ],
        feedback_mode="deferred",
    )
    for query in queries:
        # Invocations made inside the recorder context are captured by tru_recorder.
        with tru_recorder:
            rag_chain.invoke(query)

### Inline

In [None]:
import time
from datetime import datetime

def waitForResults(app, index, initial_delay=7, poll_interval=2, timeout=30):
    """Poll TruLens until one record has a value in every feedback column.

    Args:
        app: app id passed to `tru.get_records_and_feedback`.
        index: label of the record's row in the returned DataFrame (looked up
            with `.loc`). NOTE(review): the caller passes the position of the
            query within the app, which assumes rows come back with a 0-based
            index in recording order — confirm.
        initial_delay: seconds to sleep before the first poll (results normally
            take about 10 seconds, so the first check is delayed).
        poll_interval: seconds to sleep before each poll.
        timeout: give up once more than this many seconds have elapsed since
            the call started.

    Returns:
        True when every feedback column holds a value for the record,
        False when the timeout was reached first.
    """
    print(f"waiting for results on app: {app} index: {index}")
    # Monotonic clock: the timeout must not be affected by wall-clock adjustments.
    start = time.monotonic()
    time.sleep(initial_delay)
    while True:
        time.sleep(poll_interval)
        df, feedbackColumns = tru.get_records_and_feedback([app])
        completeCount = 0
        # The record (or some of its feedback columns) may not be present yet;
        # treat that as "not complete" instead of raising KeyError.
        if index in df.index:
            row = df.loc[index]
            for fbCol in feedbackColumns:
                if fbCol not in row.index:
                    continue
                value = row[fbCol]
                # None (no result yet) would make np.isnan raise TypeError.
                if value is not None and not np.isnan(value):
                    completeCount += 1
        # An empty column list means no feedback has been reported at all, which
        # must not count as "everything complete".
        if len(feedbackColumns) > 0 and completeCount == len(feedbackColumns):
            return True
        print(f"index: {index} has completeCount: {completeCount}, continuing to wait")
        if time.monotonic() - start > timeout:
            print("timeout, giving up")
            return False

In [None]:
import uuid
# Number of queries evaluated so far, across all datasets.
count = 0

for name, queries in datasets.items():
    # Four characters from a random UUID keep the app id of each run distinct.
    shortUuid = str(uuid.uuid4())[9:13]
    app = f"{name}_{collection_name}_{shortUuid}"
    tru_recorder = TruChain(
        rag_chain,
        app_id=app,
        feedbacks=[
            f_answer_relevance,
            f_context_relevance,
            f_groundedness,
            f_answer_correctness,
        ],
    )
    for index, query in enumerate(queries):
        with tru_recorder:
            rag_chain.invoke(query)
        # Block until this record's feedback results arrive (or the wait times out).
        waitForResults(app, index)
        count += 1
        if count > 10:
            break
    # Propagate the inner break. The check runs after the increment with `>`,
    # so 11 queries are evaluated in total before stopping.
    if count > 10:
        break