In [None]:
%pip install -q -U ragstack-ai trulens_eval

In [10]:
! pip3 list | grep trulens

trulens                                  0.13.4
trulens-eval                             0.20.1


In [1]:
collection_name = "llama_512"

In [2]:
from dotenv import load_dotenv

load_dotenv()

# this notebook assumes the following env vars exist in a .env file:
#
# ASTRA_DB_ENDPOINT
# ASTRA_DB_TOKEN
# AZURE_OPENAI_ENDPOINT
# AZURE_OPENAI_API_KEY
# OPENAI_API_VERSION

True

## Setup Azure LLMs for LlamaIndex

In [3]:
from llama_index.llms import AzureOpenAI as AzureChatOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
import os

temperature = 0.0

gpt_35_turbo = AzureChatOpenAI(
    deployment_name="gpt-35-turbo",
    model="gpt-35-turbo",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="0613",
    temperature=temperature,
)

gpt_35_turbo_16k = AzureChatOpenAI(
    deployment_name="gpt-35-turbo-16k",
    model="gpt-35-turbo-16k",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="0613",
    temperature=temperature,
)

gpt_4 = AzureChatOpenAI(
    deployment_name="gpt-4",
    model="gpt-4",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="1106-preview",
    temperature=temperature,
)

gpt_4_32k = AzureChatOpenAI(
    deployment_name="gpt-4-32k",
    model="gpt-4-32k",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="0613",
    temperature=temperature,
)

embeddings = AzureOpenAIEmbedding(
    deployment_name="text-embedding-ada-002",
    model="text-embedding-ada-002",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    temperature=temperature,
)

## Init an AstraDB vector store

In [4]:
from llama_index.vector_stores import AstraDBVectorStore
import os

astra_db_vstore = AstraDBVectorStore(
    collection_name=collection_name,
    api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
    token=os.getenv("ASTRA_DB_TOKEN"),
    embedding_dimension=1536,
)

## Load Datasets

In [5]:
import json

base_path = "./data/"

datasets = {}
golden_set = []

for name in os.listdir(base_path):
    if os.path.isdir(os.path.join(base_path, name)):
        datasets[name] = []
        with open(os.path.join(base_path, name, "rag_dataset.json")) as f:
            examples = json.load(f)['examples']
            for e in examples:
                datasets[name].append(e["query"])
                golden_set.append({
                    "query": e["query"],
                    "response": e["reference_answer"],
                })
            print("Loaded dataset: ", name)

Loaded dataset:  patronus_ai_financebench
Loaded dataset:  uber_10k
Loaded dataset:  blockchain_solana
Loaded dataset:  covid_qa
Loaded dataset:  llama_2_paper
Loaded dataset:  evaluating_llm_survey_paper
Loaded dataset:  mini_squad_v2
Loaded dataset:  origin_of_covid_19
Loaded dataset:  braintrust_coda_help_desk
Loaded dataset:  paul_grahman_essay


## Setup Query Engine

In [6]:
from llama_index import get_response_synthesizer, VectorStoreIndex, StorageContext, ServiceContext
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor

service_context = ServiceContext.from_defaults(
    llm=gpt_35_turbo,
    embed_model=embeddings,
)

storage_context = StorageContext.from_defaults(
    vector_store=astra_db_vstore,
)

index = VectorStoreIndex.from_vector_store(
    vector_store=astra_db_vstore,
    service_context=service_context,
)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=4,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    service_context=service_context
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

In [7]:
# try a query
response = query_engine.query("What are the symptoms?")
print(response)

The symptoms of COVID-19 include persistent pain or pressure in the chest, high temperature (above 38°C), irritability, confusion, reduced consciousness, anxiety, depression, sleep disorders, headache, muscle or joint pain, different types of skin rash, nausea or vomiting, diarrhea, chills, dizziness, shortness of breath, loss of appetite, loss of taste or smell, nasal congestion, conjunctivitis (red eyes), sore throat, and gastrointestinal disturbances such as diarrhea and nausea. In severe cases, symptoms may also include inflammation, organ damage, blood clots, strokes, and brain impairments.


## Setup Evaluation 

### Init TruLens

In [8]:
from trulens_eval import Tru
tru = Tru(database_url=os.getenv("TRULENS_DB_CONN_STRING"))

🦑 Tru initialized with db url postgresql://postgres:***@127.0.0.1:5432 .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


### Start Dashboard

In [None]:
tru.run_dashboard()

### Initialize Feedback Functions 

In [9]:
from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.feedback import Groundedness, GroundTruthAgreement
from trulens_eval import TruLlama, Feedback
from trulens_eval.app import App
import numpy as np
# Initialize provider class
azureOpenAI = AzureOpenAI(deployment_name="gpt-35-turbo")

context = App.select_context(query_engine)

# Define a groundedness feedback function
grounded = Groundedness(groundedness_provider=azureOpenAI)
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="groundedness")
    .on(context.collect()).on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(azureOpenAI.relevance_with_cot_reasons, name="answer_relevance")
    .on_input_output()
)

# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(azureOpenAI.qs_relevance_with_cot_reasons, name="context_relevance")
    .on_input().on(context)
    .aggregate(np.mean)
)

# GroundTruth for comparing the Answer to the Ground-Truth Answer
ground_truth_collection = GroundTruthAgreement(golden_set, provider=azureOpenAI)
f_answer_correctness = (
    Feedback(ground_truth_collection.agreement_measure, name="answer_correctness")
    .on_input_output()
)

✅ In groundedness_measure_with_cot_reasons, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In groundedness_measure_with_cot_reasons, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In relevance_with_cot_reasons, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance_with_cot_reasons, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In qs_relevance_with_cot_reasons, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In qs_relevance_with_cot_reasons, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In agreement_measure, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In agreement_measure, input response will be set to __record__.main_output or `Select.RecordOutput` .


## Run Evaluation

### Deferred

In [None]:
for name in datasets:
    app = f"{name}_{collection_name}"
    tru_recorder = TruLlama(
        query_engine,
        app_id=app,
        feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness, f_answer_correctness],
        feedback_mode="deferred",
    )
    for query in datasets[name]:
        with tru_recorder as recording:
            query_engine.query(query)

### Inline

In [None]:
import time
from datetime import datetime

def waitForResults(app, index):
    # it normally takes about 10 seconds to get results
    # so delay until that time, and then check more frequently
    print(f"waiting for results on app: {app} index: {index}")
    start = datetime.now()
    time.sleep(7)
    while True:
        time.sleep(2)
        df, feedbackColumns = tru.get_records_and_feedback([app])
        row = df.loc[index]
        completeCount = 0
        for fbCol in feedbackColumns:
            if not np.isnan(row[fbCol]):
                completeCount += 1
        if completeCount == len(feedbackColumns):
            return
        else:
            print(f"index: {index} has completeCount: {completeCount}, continuing to wait")
        if (datetime.now() - start).total_seconds() > 30:
            print("timeout, giving up")
            return

In [None]:
import uuid
count = 0

for name in datasets:
    shortUuid = str(uuid.uuid4())[9:13]
    app = f"{name}_{collection_name}_{shortUuid}"
    tru_recorder = TruLlama(
        query_engine,
        app_id=app,
        feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness, f_answer_correctness],
    )
    index = 0
    for query in datasets[name]:
        with tru_recorder as recording:
            query_engine.query(query)
        waitForResults(app, index)
        index +=1
        count +=1
        if count > 10:
            break
    if count > 10:
        break