In [16]:
import getpass
import os

In [17]:
# Prompt for the Groq key only when it is missing or empty in the environment,
# so the secret is typed interactively instead of being stored in the notebook.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ API Key: ")

In [18]:
from langchain_openai import AzureOpenAIEmbeddings

from chunking_evaluation.chunking import ClusterSemanticChunker

In [19]:
# Make sure every Azure embedding setting is present in the environment.
# Each entry pairs a variable name with the prompt shown when that variable is
# unset or empty; values are read with getpass so they are not echoed.
for _var_name, _prompt in (
    ("AZURE_EMBEDDING_MODEL", "Enter your AZURE_EMBEDDING_MODEL: "),
    ("AZURE_EMBEDDING_ENDPOINT", "Enter your AZURE_EMBEDDING_ENDPOINT: "),
    ("AZURE_EMBEDDING_API_KEY", "Enter your AZURE_EMBEDDING_API_KEY: "),
):
    if not os.environ.get(_var_name):
        os.environ[_var_name] = getpass.getpass(_prompt)



In [20]:

# LangChain client for the Azure OpenAI embedding deployment, configured
# entirely from the environment variables ensured in the previous cell.
embeddings = AzureOpenAIEmbeddings(
    model=os.environ.get("AZURE_EMBEDDING_MODEL"),
    azure_endpoint=os.environ.get("AZURE_EMBEDDING_ENDPOINT"),
    api_key=os.environ.get("AZURE_EMBEDDING_API_KEY"),
)

# Bound method exposed as a plain callable.
# NOTE(review): no later cell in this view references `ef` (the chunker and
# the collection both use `azure_openai_ef`) - confirm whether it is needed.
ef = embeddings.embed_documents

In [50]:
from chromadb.utils import embedding_functions

# Chroma embedding function backed by the Azure OpenAI embedding deployment.
# It is passed both to the chunker and to the Chroma collection below.
#
# AZURE_EMBEDDING_BASE is honoured when set, but no cell in this notebook
# prompts for it; the setup cell only guarantees AZURE_EMBEDDING_ENDPOINT.
# Falling back to the endpoint keeps `api_base` from being None on a fresh
# kernel where only the prompted variables exist.
azure_openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_base=os.getenv("AZURE_EMBEDDING_BASE") or os.getenv("AZURE_EMBEDDING_ENDPOINT"),
    deployment_id=os.getenv("AZURE_EMBEDDING_MODEL"),
    api_type="azure",
    api_version="2023-05-15",
    api_key=os.getenv("AZURE_EMBEDDING_API_KEY"),
)

## Reading the data

In [21]:
# Load the whole book into memory as one string.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which would make the decoded text (and therefore the
# chunks) differ between machines.
# NOTE(review): assumes this copy of the text is UTF-8 encoded - confirm.
with open(
    "context_data/the-man-who-was-thursday-a-nightmare.txt",
    "r",
    encoding="utf-8",
) as file:
    data = file.read()

In [66]:
# Semantic chunker that uses the Chroma/Azure embedding function defined above
# and is configured with a maximum chunk size of 200.
chunker = ClusterSemanticChunker(
    embedding_function=azure_openai_ef,
    max_chunk_size=200,
)

In [62]:
# Split the full book text into chunks with the chunker configured above.
semantic_chunks = chunker.split_text(data)

In [67]:
# Display how many chunks the split produced.
len(semantic_chunks)

757

In [70]:
# Spot-check one chunk (index 1) to see what the chunker emits.
semantic_chunks[1]

'Title: The Man Who Was Thursday: A Nightmare\n\nAuthor: G. K. Chesterton\n\nRelease date: April 1, 1999 [eBook #1695]\n                Most recently updated: February 5, 2024 Language: English\n\nCredits: Harry Plantinga and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK THE MAN WHO WAS THURSDAY: A NIGHTMARE ***\n\n\n\n\nThe Man Who Was Thursday A Nightmare\n\nby G. K. Chesterton\n\n\nContents\n\n A WILD, MAD, HILARIOUS AND PROFOUNDLY MOVING TALE\n THE MAN WHO WAS THURSDAY'

In [65]:
# for i in range(0, 10):
#     print(f"Chunk {i+1}: {semantic_chunks[i]}")
#     print("-" * 80)

## Load the chunks into ChromaDB

In [55]:
import chromadb
# Connect over HTTP to the Chroma server expected at localhost:8088.
client = chromadb.HttpClient(
    host="localhost",
    port=8088,
)

In [64]:
#client.delete_collection(name="demo_rag_collection")

In [71]:
# Open the demo collection, creating it on first use, and attach the Azure
# embedding function so documents are embedded with the same model as the chunker.
collection = client.get_or_create_collection(
    name="demo_rag_collection",
    embedding_function=azure_openai_ef,
)

# Display the number of records currently stored in the collection.
collection.count()

196

In [58]:
# Store every chunk in the collection, keyed by its position in the list.
# upsert() is used instead of add() because the ids are positional
# ("0".."n-1") and therefore repeat on every run: re-running this cell against
# an already populated collection now overwrites each id with the current
# chunk instead of failing or keeping a stale document under that id.
# NOTE(review): ids left over from an earlier, longer chunk list are not
# removed by this call - delete the collection first if the chunk count shrank.
collection.upsert(
    documents=semantic_chunks,
    ids=[str(i) for i in range(len(semantic_chunks))],
)

In [None]:
# resp = collection.query(
#     query_texts=["Who is the writer of the book The Man Who Was Thursday: A Nightmare?"]
# )

#print(resp["documents"][0][0])