In [1]:
import os
import tqdm
import numpy as np
import pandas as pd

# API Setup

In [2]:
from dotenv import load_dotenv

# API keys for the LLM clients below are read from the project-level .env file
ENV_PATH = "../.env"
load_dotenv(dotenv_path=ENV_PATH)

True

# Load Data

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model handed to the vector index as its embed_model
embedding_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=32,
)



In [4]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.lancedb import LanceDBVectorStore

# Point at the LanceDB table on disk, then build the index on top of it
vector_store = LanceDBVectorStore(uri="./lancedb", table_name="pipeline_test")
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embedding_model)

# Set up LLM

In [5]:
from llama_index.llms.openai import OpenAI

# OpenAI client - the API key is handled in your .env file
openai_llm = OpenAI(
    model="gpt-4o",
    temperature=0.1,
    max_tokens=2048,
)

In [6]:
from llama_index.llms.anyscale import Anyscale

# Anyscale client - the API key is handled in your .env file
anyscale_llm = Anyscale(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    temperature=0.1,
    max_tokens=2048,
)

# Setup RAG

In [7]:
from llama_index.core.prompts import PromptTemplate

# Raw QA prompt text. {context_str} and {query_str} are the template
# placeholders; trailing spaces after some lines are part of the prompt.
qa_prompt_str = (
    "You are a helpful AI assistant that answers questions after carefully reading "
    "all the provided context. You always cite sources (document titles) and also "
    "quote the relevant snippets. Information may be spread across multiple documents. "
    "If the information is not present in any of the contexts, you will say 'I don't know'.\n"
    "-----\n"
    "Context: \n"
    "{context_str} \n"
    "-----\n"
    "Question: {query_str} \n"
    "Answer: `answer`\n"
    "Source: `source document title`\n"
    "Relevant Snippet: `snippet`\n"
)

# Wrap the text in a prompt template that llamaindex can use
prompt = PromptTemplate(template=qa_prompt_str)

In [8]:
# Both engines share the QA prompt and similarity_top_k=3; only the LLM differs
shared_engine_kwargs = dict(text_qa_template=prompt, similarity_top_k=3)

openai_query_engine = index.as_query_engine(llm=openai_llm, **shared_engine_kwargs)
anyscale_query_engine = index.as_query_engine(llm=anyscale_llm, **shared_engine_kwargs)

# Query Transformation: HyDE

We examine two scenarios - one where we don't use HyDE and one where we do. 

Note: To focus on HyDE, we've simplified and abstracted away most of the helper code we used previously.

## Without HyDE



In [9]:
# Question posed to every query engine in the HyDE comparison below
user_input = "How many points did Michael Jordan actually score in his final NBA game?"

In [10]:
# Baseline (no HyDE) answer from the OpenAI-backed engine
openai_baseline_response = openai_query_engine.query(user_input)
print(openai_baseline_response)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan
Relevant Snippet: "Jordan's final NBA game was on April 16, 2003 in Philadelphia. After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter... At 1:45, Jordan was intentionally fouled by the 76ers' Eric Snow, and stepped to the line to make both free throws."


In [11]:
# Baseline (no HyDE) answer from the Anyscale-backed engine
anyscale_baseline_response = anyscale_query_engine.query(user_input)
print(anyscale_baseline_response)

Answer: 13 points
Source: Michael Jordan
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers, 75–56."


## With HyDE

In [12]:
from llama_index.core.indices.query.query_transform import HyDEQueryTransform

# Query transforms perform query transformations such as expansion, HyDE, etc.
# One HyDE transform per LLM, both built with include_original=True
# (per the LlamaIndex docs this keeps the original query among the embedded strings).
hyde_options = {"include_original": True}
openai_hyde = HyDEQueryTransform(llm=openai_llm, **hyde_options)
anyscale_hyde = HyDEQueryTransform(llm=anyscale_llm, **hyde_options)

In [13]:
# Wrap each base query engine so a HyDE query transformation is applied
# before retrieval/answer generation. Each engine is paired with the HyDE
# transform built on the *same* LLM, so the OpenAI engine uses openai_hyde
# (it was previously wired to anyscale_hyde by mistake).
from llama_index.core.query_engine import TransformQueryEngine
anyscale_hyde_query_engine = TransformQueryEngine(anyscale_query_engine, anyscale_hyde)
openai_hyde_query_engine = TransformQueryEngine(openai_query_engine, openai_hyde)

In [14]:
# Show the HyDE prompt
hyde_prompts = anyscale_hyde.get_prompts()
print(hyde_prompts["hyde_prompt"].template)

Please write a passage to answer the question
Try to include as many key details as possible.


{context_str}


Passage:"""



In [15]:
# Anyscale: print a hypothetical document from the HyDE transform, then the
# answer from the HyDE-wrapped engine.
# NOTE(review): the engine call below presumably runs the transform again, so
# the document it uses may differ from the one printed here - confirm.
print("Hypothetical document:")
anyscale_hyde_bundle = anyscale_hyde.run(user_input)
print(anyscale_hyde_bundle.custom_embedding_strs[0])
print("-" * 90)
anyscale_hyde_response = anyscale_hyde_query_engine.query(user_input)
print(anyscale_hyde_response)

Hypothetical document:
Here is a passage answering the question:

Michael Jordan's final NBA game took place on April 18, 2003, when his Washington Wizards faced off against the Philadelphia 76ers in the first round of the Eastern Conference playoffs. In a game that would ultimately be a 107-87 loss for the Wizards, Jordan, then 40 years old, played 37 minutes and scored 15 points on 6-of-15 shooting from the field, including 1-of-3 from three-point range. He also added 4 rebounds, 2 assists, and 2 steals to his stat line. Although it wasn't one of his most impressive performances, Jordan's 15 points in his final NBA game marked a fitting conclusion to an illustrious career that saw him win six NBA championships, five MVP awards, and cement his status as one of the greatest basketball players of all time.
------------------------------------------------------------------------------------------
Answer: 13 points
Source: Michael Jordan
Relevant Snippet: "After scoring only 13 points in 

In [16]:
# OpenAI: print a hypothetical document from the HyDE transform, then the
# answer from the HyDE-wrapped engine.
print("Hypothetical document:")
openai_hyde_bundle = openai_hyde.run(user_input)
print("OpenAI:", openai_hyde_bundle.custom_embedding_strs[0])
print("-" * 90)
openai_hyde_response = openai_hyde_query_engine.query(user_input)
print(openai_hyde_response)

Hypothetical document:
OpenAI: Michael Jordan, widely regarded as one of the greatest basketball players of all time, played his final NBA game on April 16, 2003. This game was between the Washington Wizards, the team Jordan was playing for at the time, and the Philadelphia 76ers. Despite the Wizards' 107-87 loss to the 76ers, Jordan's performance was a memorable moment in sports history. In his final appearance on the NBA court, Michael Jordan scored 15 points. He played a total of 28 minutes, during which he made 6 out of 15 field goal attempts and 3 out of 4 free throws. The game was marked by a standing ovation from the crowd and heartfelt tributes from fellow players and fans, celebrating the end of an illustrious career that spanned over 15 seasons and included six NBA championships.
------------------------------------------------------------------------------------------
Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan
Relevant Snippet: "Jordan's f

# Query Rewriting

Ref: https://docs.llamaindex.ai/en/latest/examples/query_transformations/query_transform_cookbook/#query-rewriting-custom

In [17]:
# Same question as in the HyDE section, restated so this section can be read on its own
user_input = "How many points did Michael Jordan actually score in his final NBA game?"

In [18]:
# Prompt asking the LLM for {num_queries} search queries similar to {query},
# one per line and nothing else.
query_gen_str = (
    "You are a helpful assistant that generates similar search queries based on a "
    "single input query. Generate {num_queries} search queries, one on each line, "
    "related to the following input query. Do not include anything else in your response.\n"
    "Query: {query}\n"
    "Queries:\n"
)
query_gen_prompt = PromptTemplate(query_gen_str)

In [19]:
def generate_queries(query: str, llm, num_queries: int = 4):
    """Ask ``llm`` for search queries similar to ``query``.

    Args:
        query: The original user query to expand.
        llm: Object exposing ``predict(prompt, **kwargs)`` that returns the
            completion as a string.
        num_queries: Number of queries the prompt asks the LLM to generate.

    Returns:
        A list with one entry per non-empty line of the LLM response, each
        stripped of surrounding whitespace. The count is not enforced, so the
        LLM may return more or fewer than ``num_queries`` lines.
    """
    response = llm.predict(
        query_gen_prompt, num_queries=num_queries, query=query
    )
    # Assumes the LLM put each query on its own line. Blank lines (e.g. from a
    # trailing newline) are dropped so they are never sent on as empty queries.
    stripped_lines = (line.strip() for line in response.splitlines())
    return [line for line in stripped_lines if line]

In [20]:
# Anyscale: LLM-generated variants followed by the original question
anyscale_queries = [
    *generate_queries(query=user_input, llm=anyscale_llm, num_queries=4),
    user_input,
]

print("\n".join(anyscale_queries))

What was the date of Michael Jordan's final NBA game?
What team did Michael Jordan play for in his final NBA game?
What was the outcome of Michael Jordan's final NBA game?
What was Michael Jordan's highest scoring performance in his NBA career?
How many points did Michael Jordan actually score in his final NBA game?


In [21]:
# For every Anyscale-generated query keep the first retrieved node and
# print a 100-character preview of its text together with its score.
all_anyscale_results = []
for anyscale_query in anyscale_queries:
    retrieved_nodes = anyscale_query_engine.retrieve(anyscale_query)
    first_hit = retrieved_nodes[0]
    all_anyscale_results.append(first_hit)
    print(first_hit.text[:100], first_hit.score)

With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro 0.6940546035766602
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro 0.6894277334213257
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro 0.6841335296630859
Jordan also holds the top career regular season and playoff scoring averages of 30.1 and 33.4 points 0.7003832459449768
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro 0.6666018962860107


In [22]:
# OpenAI: LLM-generated variants followed by the original question
openai_queries = [
    *generate_queries(query=user_input, llm=openai_llm, num_queries=4),
    user_input,
]

print("\n".join(openai_queries))

Michael Jordan final NBA game points total
Michael Jordan last game scoring stats
Michael Jordan retirement game points scored
Michael Jordan final game performance stats
How many points did Michael Jordan actually score in his final NBA game?


In [23]:
# For every OpenAI-generated query keep the first retrieved node and
# print a 100-character preview of its text together with its score.
all_openai_results = []
for openai_query in openai_queries:
    retrieved_nodes = openai_query_engine.retrieve(openai_query)
    first_hit = retrieved_nodes[0]
    all_openai_results.append(first_hit)
    print(first_hit.text[:100], first_hit.score)

Jordan also holds the top career regular season and playoff scoring averages of 30.1 and 33.4 points 0.7295424342155457
In an injury-plagued 2001 – 02 season , he led the team in scoring ( 22.9 ppg ) , assists ( 5.2 apg  0.750728189945221
In March 1995 , Jordan decided to quit baseball due to the ongoing Major League Baseball strike , as 0.7328667044639587
Jordan averaged a league leading 33.6 ppg on 52.6 % shooting , to go with 6.9 rpg and 6.3 apg in lea 0.7255814075469971
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro 0.6666018962860107


# Rank Fusion

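[Reciprocal Rank Fusion (RRF)](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) is a commonly used rank fusion algorithm. You can find an implementation of this algorithm in [this example](https://docs.llamaindex.ai/en/stable/examples/low_level/fusion_retriever/#step-3-perform-fusion) from LlamaIndex. To keep things simple, this notebook does not implement RRF itself: the helper below just picks the node that most often appears as the top result across the generated queries.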

In [24]:
from collections import Counter

def most_common(results):
    """Return the first entry of ``results`` carrying the most frequent ``node_id``.

    Each element of ``results`` must expose a ``node_id`` attribute. When
    several ids share the highest count, the one encountered first wins
    (``Counter.most_common`` ordering). Raises ``IndexError`` when
    ``results`` is empty.
    """
    id_tally = Counter(result.node_id for result in results)
    winning_id = id_tally.most_common(1)[0][0]
    # Hand back the node object itself rather than just its id.
    return next(
        (result for result in results if result.node_id == winning_id),
        None,
    )

In [25]:
# Node that most often came back first for the Anyscale-generated queries
anyscale_top_node = most_common(all_anyscale_results)
print(anyscale_top_node.text)

With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him throughout the NBA . In his final game at his old home court , the United Center in Chicago , Jordan received a four-minute standing ovation . The Miami Heat retired the number 23 jersey on April 11 , 2003 , even though Jordan never played for the team . At the 2003 All-Star Game , Jordan was offered a starting spot from Tracy McGrady and Allen Iverson , but refused both . In the end he accepted the spot of Vince Carter , who decided to give it up under great public pressure . 
 Jordan 's final NBA game was on April 16 , 2003 in Philadelphia . After scoring only 13 points in the game , Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers , 75 – 56 . Just after the start of the fourth quarter , the First Union Center crowd began chanting " We want Mike ! " . After much encouragement from coach Doug Collins , Jo

In [26]:
# Node that most often came back first for the OpenAI-generated queries
openai_top_node = most_common(all_openai_results)
print(openai_top_node.text)

Jordan also holds the top career regular season and playoff scoring averages of 30.1 and 33.4 points per game , respectively . By 1998 , the season of his Finals-winning shot against the Jazz , he was well known throughout the league as a clutch performer . In the regular season , Jordan was the Bulls ' primary threat in the final seconds of a close game and in the playoffs , Jordan would always demand the ball at crunch time . Jordan 's total of 5,987 points in the playoffs is the highest in NBA history . He retired with 32,292 points in regular season play , placing him fourth on the NBA 's all-time scoring list behind Kareem Abdul-Jabbar , Karl Malone , and Kobe Bryant . 
 With five regular-season MVPs ( tied for second place with Bill Russell ; only Kareem Abdul-Jabbar has won more , six ) , six Finals MVPs ( NBA record ) , and three All-Star MVPs , Jordan is the most decorated player ever to play in the NBA . Jordan finished among the top three in regular-season MVP voting a recor