### Hypothetical Document Embeddings (HyDE)

HyDE is an innovative approach that transforms query questions into hypothetical documents containing the answer, aiming to bridge the gap between query and document distributions in vector space.


In this tutorial, we will focus on the following topics:
- Generate a hypothetical document from the query using the language model.
- Use the hypothetical document as the search query in the vector store.
- Retrieve the most similar documents to this hypothetical document.

---
tools:
* [anthropic](https://github.com/anthropics/anthropic-sdk-python)
* [chromadb](https://github.com/chroma-core/chroma)

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file one directory up.
load_dotenv(dotenv_path='../.env')

# Read the Anthropic API key; os.getenv returns None when the variable is unset.
api_key = os.getenv('ANTHROPIC_API_KEY')

# Never echo the secret itself: notebook outputs are saved with the file and
# would leak the key. Report only whether a non-empty key was found.
print("api_key loaded -> ", bool(api_key))

In [None]:
import chromadb
import pprint
from chromadb.utils import embedding_functions

# Name of the ChromaDB collection used by the rest of this notebook.
collection_name = "hyde_collection"

# Embedding function built on the all-MiniLM-L6-v2 sentence-transformers model.
default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Persistent client rooted at ./chromadb/.
chroma_client = chromadb.PersistentClient(path="./chromadb/")

# Reuse the collection when it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=default_embedding_function,
)

# Read back whatever the collection currently holds, for inspection.
result = collection.get()

print(f"Collection {collection_name} created successfully")
pprint.pprint(result)

In [None]:
import pprint

def load_txt_from_dir(dir_path):
    """Read every ``.txt`` file located directly inside *dir_path*.

    Args:
        dir_path: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        A list of ``{"text": <file contents>}`` dicts, one per ``.txt`` file,
        ordered by file name.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps each file's
    # position (used downstream to build chunk ids) stable across runs.
    for filename in sorted(os.listdir(dir_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(dir_path, filename)
        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue
        # Explicit encoding: the default text encoding is platform-dependent.
        with open(file_path, "r", encoding="utf-8") as file:
            documents.append({"text": file.read()})
    return documents

# Relative path of the directory holding the source .txt documents.
directory_path = "../files/txt"

# load documents from directory
txt_files = load_txt_from_dir(directory_path)

# Report how many files were read, then dump their contents for inspection.
print(f" {len(txt_files)} files loaded")
pprint.pprint(txt_files)

In [None]:
import pprint

def split_text(
    text,
    chunk_size=200,
    chunk_overlap=20
    ):
    """Split *text* into fixed-size character chunks that overlap.

    Consecutive chunks share ``chunk_overlap`` characters, so each chunk
    starts ``chunk_size - chunk_overlap`` characters after the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        chunk_overlap: Number of characters shared between neighbouring
            chunks (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Returns:
        A list of chunk strings; empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap >= chunk_size would never advance the window (endless loop);
    # a negative overlap would silently skip characters between chunks.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size"
        )

    step = chunk_size - chunk_overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once the window reached the end of the text; stepping again
        # would only emit a tail already contained in this chunk.
        if end >= text_length:
            break
        start += step
    return chunks

# split text into chunks
# Each entry is {'id': "<file index>-<chunk index>", 'text': <chunk text>}.
chunked_txt = []

for file_id, txt_file in enumerate(txt_files):
    chunks = split_text(txt_file["text"])
    for chunk_id, chunk in enumerate(chunks):
        chunked_txt.append(
            {
                'id': f"{file_id}-{chunk_id}",
                'text': chunk,
            }
        )

print(f"Split in to {len(chunked_txt)} chunks\n\n")

# Preview one sample chunk. Position 11 is used when it exists; for smaller
# corpora fall back to the last chunk instead of raising IndexError, and
# print nothing at all when no chunks were produced.
if chunked_txt:
    sample_index = min(11, len(chunked_txt) - 1)
    pprint.pprint(chunked_txt[sample_index])

In [None]:
# upsert documents with embeddings to collection ChromaDB
# Chunks are sent in batches instead of one call per chunk, so the embedding
# function processes many documents per call. With no chunks, no call is made.
UPSERT_BATCH_SIZE = 100

for batch_start in range(0, len(chunked_txt), UPSERT_BATCH_SIZE):
    batch = chunked_txt[batch_start:batch_start + UPSERT_BATCH_SIZE]
    collection.upsert(
        ids=[chunk['id'] for chunk in batch],
        documents=[chunk['text'] for chunk in batch],
    )

# Read the collection back to confirm what is stored.
result = collection.get()

print(f"Collection {collection_name} has {len(result['ids'])} documents")
pprint.pprint(result)

---
### Augmented query generation

<img width=50% src="https://raw.githubusercontent.com/NirDiamant/RAG_Techniques/d5cf5b72cb213da587c7fac4d14766ad890177fb/images/HyDe.svg">


Use cases:
- Information Retrieval
- Question Answering Systems
- E-commerce Search
- Academic Research

### HYPOTHETICAL ANSWER <- Augment query

In [None]:
from anthropic import Anthropic

# Anthropic API client, authenticated with the api_key variable set earlier.
client = Anthropic(api_key=api_key)

def augment_query(query, *, model="claude-3-haiku-20240307", max_tokens=1024):
    """Ask the language model for a hypothetical answer to *query*.

    The query is sent followed by an instruction asking for a hypothetical
    answer; the reply text serves as the "hypothetical document" in HyDE.

    Args:
        query: The user's question.
        model: Anthropic model identifier to call.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the first content block of the model's reply.
    """
    prompt = "Provide an hypothetical answer to the given question."
    user_prompt = query + "\n" + prompt
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
            "role": "user",
            "content": user_prompt
        }]
    )
    return response.content[0].text

# Example: generate and display a hypothetical answer for a sample question.
query = "How much cost synthesizer Labyrinth?"
augment = augment_query(query)
print(augment)

In [7]:
import pprint

# function to query collection
def query_collection(question, n_results=5):
    """Query the collection and return the matching chunk texts.

    Args:
        question: Text passed to the collection as ``query_texts``.
        n_results: Number of results requested from the collection.

    Returns:
        A flat list built by concatenating every sub-list found under the
        ``"documents"`` key of the query result.
    """
    hits = collection.query(
        query_texts=question,
        n_results=n_results,
    )

    # "documents" holds a list of lists; flatten it into a single list.
    flattened = []
    for documents_group in hits["documents"]:
        flattened.extend(documents_group)

    return flattened


# function to generate a response with the Anthropic API
def api_response(query, relevant_chunks, *, model="claude-3-haiku-20240307", max_tokens=2048):
    """Answer *query* with the language model, grounded in retrieved chunks.

    The chunks are joined with blank lines and embedded in the prompt as the
    context documents; the request is sent with ``temperature=0``.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of chunk strings to use as context.
        model: Anthropic model identifier to call.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the first content block of the model's reply.
    """
    context = "\n\n".join(relevant_chunks)

    user_prompt = (f"""
            You have been tasked with helping us to answer the following query: 
            <query>
            {query}
            </query>
            You have access to the following documents which are meant to provide context as you answer the query:
            <documents>
            {context}
            </documents>
            Please remain faithful to the underlying context, and only deviate from it if you are 100% sure that you know the answer already. 
            Answer the question now, and avoid providing preamble such as 'Here is the answer', etc
            """
            )

    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{
                "role": "user",
                "content": user_prompt
                }],
        temperature=0
    )
    return response.content[0].text

In [None]:
# End-to-end HyDE run: augment the question, retrieve, then answer.

question = "tell me how much cost Labyrinth?"

# Hypothetical answer produced by the language model.
augmented_result = augment_query(question)

# Search with the question followed by its hypothetical answer.
hyde_search_text = question + "\n" + augmented_result
relevant_chunks = query_collection(hyde_search_text)

# Final answer, grounded in the retrieved chunks.
answer = api_response(question, relevant_chunks)

print("\n------------------------------------\n")
print("answer ->", answer)

---

well done!

#### Good things to know



#### Now let's cleanup db

---
### list collections

In [None]:
# Ask the ChromaDB client for its collections and display them.
list_collections = chroma_client.list_collections()

print(list_collections)

---
### delete collection

In [None]:
# Delete the collection used in this notebook.
chroma_client.delete_collection(collection_name)

# List the collections again to check the result of the deletion.
list_collections = chroma_client.list_collections()

print(list_collections)

---
made with <3 by 
[dima dem](https://github.com/dimadem/) |42London