In [None]:
%pip install -q -U ragstack-ai trulens_eval

In [None]:
# Name of the Astra DB collection passed to the vector store below; it is also
# used as the suffix of every TruLens app id in the evaluation loop.
collection_name = "llama_512"

In [None]:
from dotenv import load_dotenv

# Load environment variables before anything reads them with os.getenv(...)
# (Astra DB endpoint/token, Azure OpenAI key) - presumably from a local .env file.
load_dotenv()

## Init an AstraDB vector store

In [None]:
from llama_index.vector_stores import AstraDBVectorStore
import os

# Connection details come from the environment; os.getenv yields None when a
# variable is not set.
astra_endpoint = os.getenv("ASTRA_DB_ENDPOINT")
astra_token = os.getenv("ASTRA_DB_TOKEN")

# embedding_dimension=1536 presumably matches the output size of the
# text-embedding-ada-002 model configured further down - TODO confirm.
astra_db_store = AstraDBVectorStore(
    embedding_dimension=1536,
    token=astra_token,
    api_endpoint=astra_endpoint,
    collection_name=collection_name,
)

## Setup Azure LLMs

In [None]:
from llama_index.llms import AzureOpenAI as AzureOpenAIChat
from llama_index.embeddings import AzureOpenAIEmbedding

# Sampling temperature shared by every chat model below.
temperature = 0.0

# Settings common to all Azure OpenAI clients in this notebook.
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_api_version = "2023-05-15"


def _azure_chat_llm(deployment, model_version):
    """Build an Azure OpenAI chat client for a single deployment.

    Args:
        deployment: Used as both the Azure deployment name and the model name
            (every chat model in this notebook uses the same string for both).
        model_version: Model version string forwarded to the client.

    Returns:
        An ``AzureOpenAIChat`` instance configured with the shared API key,
        API version and the notebook-wide ``temperature``.
    """
    return AzureOpenAIChat(
        deployment_name=deployment,
        model=deployment,
        api_key=azure_api_key,
        api_version=azure_api_version,
        model_version=model_version,
        temperature=temperature,
    )


gpt_35_turbo = _azure_chat_llm("gpt-35-turbo", "0613")
gpt_35_turbo_16k = _azure_chat_llm("gpt-35-turbo-16k", "0613")
gpt_4 = _azure_chat_llm("gpt-4", "1106-preview")
gpt_4_32k = _azure_chat_llm("gpt-4-32k", "0613")

# Embedding models have no sampling temperature, so none is passed here.
embed_model = AzureOpenAIEmbedding(
    deployment_name="text-embedding-ada-002",
    model="text-embedding-ada-002",
    api_key=azure_api_key,
    api_version=azure_api_version,
)

## Load the documents

In [None]:
from llama_index import SimpleDirectoryReader

# Only files with these extensions are picked up from the data directory.
supported_exts = [".pdf", ".md", ".html", ".txt"]

# Search "data" and its sub-directories for matching files.
reader = SimpleDirectoryReader(
    input_dir="data",
    required_exts=supported_exts,
    recursive=True,
)

documents = reader.load_data()
# Display how many documents the reader produced.
len(documents)

The document count above was unexpected: it looks like the reader split the PDFs into one document per page. Leaving this as is for now, but we should revisit it later to make a fairer comparison with LangChain.

## Split the docs into nodes and load into vector store

In [None]:
from llama_index.node_parser import TokenTextSplitter
from llama_index.ingestion import IngestionPipeline

# Chunking parameters: 512-token chunks with no overlap between neighbours.
chunk_settings = {"chunk_size": 512, "chunk_overlap": 0}

splitter = TokenTextSplitter(**chunk_settings)
# The pipeline's only transformation is the splitter.
pipeline = IngestionPipeline(transformations=[splitter])

nodes = pipeline.run(documents=documents)
# Display how many nodes the documents were split into.
len(nodes)

In [None]:
from llama_index import  VectorStoreIndex, StorageContext, ServiceContext

# Bundle the chat model and the embedding model that the index and the
# response synthesizer are configured with later on.
service_context = ServiceContext.from_defaults(llm=gpt_35_turbo, embed_model=embed_model)

# Storage context backed by the Astra DB vector store.
storage_context = StorageContext.from_defaults(vector_store=astra_db_store)

In [None]:
# Ingestion is intentionally disabled: the documents were already loaded into
# the vector store on an earlier run, so this does not need to happen again.
# Un-comment the block below to (re)populate the collection from `nodes`.
# index = VectorStoreIndex(
#     nodes=nodes,
#     storage_context=storage_context,
#     service_context=service_context,
# )

## Setup Query Engine

In [None]:
from llama_index import get_response_synthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor

# Build the index on top of the existing vector store rather than from nodes.
index = VectorStoreIndex.from_vector_store(
    service_context=service_context,
    vector_store=astra_db_store,
)

# Retriever: fetch the 4 most similar nodes for each query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=4)

# Response synthesizer, sharing the service context with the index.
response_synthesizer = get_response_synthesizer(service_context=service_context)

# Query engine = retriever + response synthesizer; the similarity cut-off
# post-processor is currently left disabled.
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=retriever,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

In [None]:
# try a query
# Smoke test: run a single question through the query engine and print the
# response it returns.
response = query_engine.query("What are the symptoms?")
print(response)

## Setup Evaluation 

### Init TruLens

In [None]:
# Set up TruLens
from trulens_eval import Tru
# Create the TruLens object and start its dashboard.
tru = Tru()
tru.run_dashboard()

### Load Datasets

In [None]:
import json

# Directory containing one sub-directory per evaluation dataset.
base_path = "./data/"

# datasets:   dataset name -> list of query strings
# golden_set: flat list of {"query", "response"} reference pairs across all datasets
datasets = {}
golden_set = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the dataset
# order, and therefore which queries fall inside the evaluation cap later on,
# identical from run to run.
for name in sorted(os.listdir(base_path)):
    dataset_dir = os.path.join(base_path, name)
    if not os.path.isdir(dataset_dir):
        continue

    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding.
    with open(os.path.join(dataset_dir, "rag_dataset.json"), encoding="utf-8") as f:
        examples = json.load(f)["examples"]

    datasets[name] = [e["query"] for e in examples]
    golden_set.extend(
        {"query": e["query"], "response": e["reference_answer"]}
        for e in examples
    )
    print("Loaded dataset: ", name)

### Initialize Feedback Functions 

In [None]:
from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.feedback import Groundedness, GroundTruthAgreement
from trulens_eval import TruLlama, Feedback
from trulens_eval.app import App
import numpy as np
# Initialize provider class
# (this AzureOpenAI is the TruLens feedback provider, not the LlamaIndex LLM,
# which this notebook imports under the alias AzureOpenAIChat)
azureOpenAI = AzureOpenAI(deployment_name="gpt-35-turbo")

# Selector for the query engine's context - presumably the retrieved chunks;
# it is reused by the groundedness and context-relevance feedbacks below.
context = App.select_context(query_engine)

grounded = Groundedness(groundedness_provider=azureOpenAI)
# Define a groundedness feedback function
# The first argument is the collected context, the second is the app output.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = Feedback(azureOpenAI.relevance_with_cot_reasons).on_input_output()
# Question/statement relevance between question and each context chunk.
# The scores are aggregated with their mean.
f_context_relevance = (
    Feedback(azureOpenAI.qs_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

# GroundTruth for comparing the Answer to the Ground-Truth Answer
# golden_set supplies the reference {"query", "response"} pairs.
ground_truth_collection = GroundTruthAgreement(golden_set, provider=azureOpenAI)
f_answer_correctness = (
    Feedback(ground_truth_collection.agreement_measure)
    .on_input_output()
)

### Run Evaluation

In [None]:
# Number of queries recorded so far, across all datasets.
count = 0

for name, queries in datasets.items():
    # One recorder per dataset; the app id combines dataset and collection name.
    app = f"{name}_{collection_name}"
    tru_recorder = TruLlama(
        query_engine,
        app_id=app,
        feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness, f_answer_correctness],
        #feedback_mode="deferred",
    )

    # Becomes True once count exceeds 10, i.e. after the 11th query overall.
    limit_reached = False
    for query in queries:
        with tru_recorder as recording:
            query_engine.query(query)
            count += 1
        limit_reached = count > 10
        if limit_reached:
            break

    # Propagate the stop to the outer loop so no further dataset is started.
    if limit_reached:
        break