In [10]:
!pip install "arize-phoenix[experimental,llama-index]" "openai>=1" getpass4  llama-index-callbacks-arize-phoenix

Collecting llama-index-callbacks-arize-phoenix
  Downloading llama_index_callbacks_arize_phoenix-0.1.6-py3-none-any.whl (2.2 kB)
Installing collected packages: llama-index-callbacks-arize-phoenix
Successfully installed llama-index-callbacks-arize-phoenix-0.1.6


In [6]:
import os
import openai
import pandas as pd
import phoenix as px
from getpass import getpass
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, set_global_handler
from llama_index.llms.openai import OpenAI
from phoenix.evals import (
    OpenAIModel,
    run_evals,
)
from tqdm import tqdm

In [7]:
session = px.launch_app()

🌍 To view the Phoenix app in your browser, visit https://udxwftg8ds1-496ff2e9c6d22116-6006-colab.googleusercontent.com/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [8]:
if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
    openai_api_key = getpass("Enter your OpenAI API key: ")
openai.api_key = openai_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key

Enter your OpenAI API key: ··········


In [11]:
set_global_handler("arize_phoenix")

In [12]:
# Loading the data from the directory using SimpleDirectoryReader and building a VectorStoreIndex
documents = SimpleDirectoryReader("data").load_data()
vector_index = VectorStoreIndex.from_documents(documents)

# Initialising a query engine
query_engine = vector_index.as_query_engine()

In [13]:
# Passing our queries (prompts) based on the data we have loaded
queries = ["What is meant by the term Attention Mechanism?", "What are decoder-only transformers?"]
queries

['What is meant by the term Attention Mechanism?',
 'What are decoder-only transformers?']

In [14]:
for query in tqdm(queries):
    query_engine.query(query)

100%|██████████| 2/2 [00:03<00:00,  1.52s/it]


In [15]:
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)

In [16]:
queries_df = get_qa_with_reference(session)
queries_df

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f9abe045d3b86c9f,What is meant by the term Attention Mechanism?,"The term ""Attention Mechanism"" refers to a com...",Attention Visualizations\nInput-Input Layer5\n...
15d50c98d8d93d63,What are decoder-only transformers?,Decoder-only transformers are models that cons...,"1 Introduction\nRecurrent neural networks, lon..."


In [17]:
retrieved_documents_df = get_retrieved_documents(session)
retrieved_documents_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28633eaebebaa156,0,ed76ff616d4315e4579ab7469a4d7e37,What is meant by the term Attention Mechanism?,Attention Visualizations\nInput-Input Layer5\n...,0.793238
28633eaebebaa156,1,ed76ff616d4315e4579ab7469a4d7e37,What is meant by the term Attention Mechanism?,"Provided proper attribution is provided, Googl...",0.787569
4d4b75610b63eb10,0,806e11e4889d41b7e553074afb2d03be,What are decoder-only transformers?,"1 Introduction\nRecurrent neural networks, lon...",0.789966
4d4b75610b63eb10,1,806e11e4889d41b7e553074afb2d03be,What are decoder-only transformers?,Table 4: The Transformer generalizes well to E...,0.783031


In [18]:
eval_model = OpenAIModel(model_name = "gpt-3.5-turbo")
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

hallucination_eval_df, qa_correctness_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
relevance_eval_df = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)[0]

px.log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
)
px.log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df))

The `model_name` field is deprecated. Use `model` instead.                 This will be removed in a future release.


WARNI [phoenix.evals.executors] 🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/4 (0.0%) | ⏳ 00:00<? | ?it/s

WARNI [phoenix.evals.executors] 🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/4 (0.0%) | ⏳ 00:00<? | ?it/s

WARNI [phoenix.session.evaluation] This `log_evaluations` function is deprecated and will be removed in a future release. Please use `px.Client().log_evaluations(*evaluations)` instead.
WARNI [phoenix.session.evaluation] This `log_evaluations` function is deprecated and will be removed in a future release. Please use `px.Client().log_evaluations(*evaluations)` instead.
