In [None]:
import fitz  # PyMuPDF

In [None]:
def extract_text_slide_by_slide(file_path):
    """Extract the text of every page of a PDF, treating each page as one slide.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``"slide_number"`` (1-based page index) and ``"text"`` (the result of
        ``page.get_text()`` for that page).
    """
    # The context manager closes the document, releasing its file handle,
    # even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return [
            {"slide_number": slide_number, "text": page.get_text()}
            for slide_number, page in enumerate(doc, start=1)
        ]

# Path of the PDF deck to ingest, relative to the notebook's working directory.
filename = './documents/algo.pdf'
# Build one {"slide_number", "text"} record per page; later cells read `slides`.
slides = extract_text_slide_by_slide(filename)
# Prints every record in full, including the complete text of each page.
print(slides)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text, chunk_size=300, chunk_overlap=50):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size`` (default 300).
        chunk_overlap: Passed to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    # Separators, coarsest first: paragraph break, line break, space, and
    # finally the empty string.
    separator_priority = ["\n\n", "\n", " ", ""]
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": separator_priority,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Chunk each slide's text, echo the chunks, then attach them to the record.
for slide in slides:
    chunks = chunk_text(slide["text"])
    print(f"Slide {slide['slide_number']} has {len(chunks)} chunks:")
    for position, chunk in enumerate(chunks, start=1):
        print(f"Chunk {position}:\n{chunk}\n")
    # Stored on the record itself; store_embeddings_in_chromadb reads this key.
    slide["chunks"] = chunks

In [None]:
# Re-inspect the slide records; once the chunking loop has run, each record
# also carries its "chunks" list.
print(slides)

In [1]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from langchain_openai import AzureChatOpenAI

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Azure OpenAI client used by generate_embeddings.
# os.getenv returns None when a variable is unset, so a missing
# EMBED_API_KEY / EMBED_ENDPOINT is not reported by these lookups.
client = AzureOpenAI(
    api_key=os.getenv("EMBED_API_KEY"),  # API key read from the environment
    api_version="2023-05-15",  # API version (check Azure portal for the latest)
    azure_endpoint=os.getenv("EMBED_ENDPOINT")  # Endpoint read from the environment
)

# Embedding deployment name; passed as `model` to client.embeddings.create.
model_name = os.getenv("EMBED_DEPLOY_NAME")

def generate_embeddings(texts, deployment_name):
    """Return one embedding vector per entry of ``texts``, in input order.

    Each text is sent in its own ``client.embeddings.create`` request through
    the module-level ``client``.

    Args:
        texts: Iterable of inputs to embed.
        deployment_name: Value passed as ``model`` on every request.

    Returns:
        A list holding ``response.data[0].embedding`` for each input.
    """
    def embed_one(text):
        # One request per text; only the first item of the response is used.
        reply = client.embeddings.create(input=text, model=deployment_name)
        return reply.data[0].embedding

    return [embed_one(text) for text in texts]

# for slide in slides:
#     chunks = slide["chunks"]
#     embeddings = generate_embeddings(chunks, model_name)
#     print(embeddings)
#     slide["embeddings"] = embeddings  # Add embeddings to the slide dictionary

In [None]:
# Inspect the first slide record only.
print(slides[0])

In [12]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

# Single source of truth for the on-disk location of the Chroma store; this is
# the directory the persistent client opens.
persist_directory = "./vector_store/chroma_db_data"
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Create the collection on first use, or load it if it already exists.
collection = chroma_client.get_or_create_collection(name="course_doc")

def store_embeddings_in_chromadb(slides):
    """Write every slide's chunk embeddings into the module-level ``collection``.

    Each chunk is stored under the ID ``slide_<slide_number>_chunk_<k>`` (``k``
    is 1-based) with the metadata keys ``slide_number``, ``chunk_number`` and
    ``text``. The chunk text is also stored as the record's document so that
    ``results["documents"]`` carries it on queries.

    Args:
        slides: Iterable of dicts with the keys ``"slide_number"``,
            ``"chunks"`` and ``"embeddings"``; chunk ``k`` is paired with
            embedding ``k``.

    Raises:
        KeyError: If a slide lacks one of the required keys.
        ValueError: If a slide's chunk and embedding counts differ.
    """
    for slide in slides:
        slide_number = slide["slide_number"]
        chunks = list(slide["chunks"])
        embeddings = list(slide["embeddings"])

        # Pairing by position is only meaningful when both lists line up;
        # fail loudly instead of silently dropping the surplus entries.
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Slide {slide_number} has {len(chunks)} chunks but "
                f"{len(embeddings)} embeddings"
            )
        # Nothing to write for a slide without text; skip the call entirely.
        if not chunks:
            continue

        chunk_numbers = range(1, len(chunks) + 1)
        # One batched call per slide instead of one call per chunk.
        collection.add(
            ids=[f"slide_{slide_number}_chunk_{number}" for number in chunk_numbers],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[
                {"slide_number": slide_number, "chunk_number": number, "text": chunk}
                for number, chunk in zip(chunk_numbers, chunks)
            ],
        )

# store_embeddings_in_chromadb(slides)

# # Query ChromaDB
# query_embedding = generate_embeddings(["What is a sample text?"], os.getenv("EMBEDDING_DEPLOYMENT_NAME"))[0]
# results = collection.query(
#     query_embeddings=[query_embedding],
#     n_results=2  # Number of results to retrieve
# )

# # Print results
# for result in results["metadatas"][0]:
#     print(result["text"])

In [13]:
# List every collection in the store and print the metadata of its records.
for listed in chroma_client.list_collections():
    # Depending on the installed chromadb release, list_collections() yields
    # either Collection objects or bare names; resolve both to the name.
    collection_name = getattr(listed, "name", None) or str(listed)
    # NOTE: this rebinds the notebook-level `collection`. After the loop it
    # refers to the last collection listed, and query_chromadb uses it.
    collection = chroma_client.get_collection(name=collection_name)
    print(collection_name)
    results = collection.get(include=["metadatas"])
    for id_, meta in zip(results["ids"], results["metadatas"]):
        print(f"{id_}: Slide {meta['slide_number']} Chunk {meta['chunk_number']}")
        print(meta["text"])
        print("---")


algo_chunks
slide_1_chunk_1: Slide 1 Chunk 1
SC2079/CE3004/CZ3004
MULTIDISCIPLINARY DESIGN PROJECT
Algorithms Briefing
Huang Shell Ying
(assyhuang@ntu.edu.sg)
---
slide_2_chunk_1: Slide 2 Chunk 1
MDP TASK
Build a robotic system that can
• Autonomously traverse a fixed area with 
known obstacles to recognize images pasted 
on the obstacles
• Transmit and receive control signals from 
mobile device
• Simulate the physical robot and algorithms in 
software
2
MDP Algorithm Briefing
SC2079/CX3004
---
slide_2_chunk_2: Slide 2 Chunk 2
software
2
MDP Algorithm Briefing
SC2079/CX3004
---
slide_3_chunk_1: Slide 3 Chunk 1
Robot’s Environment
• A square area of 200cm x 200cm with virtual boundaries.
• There are five obstacles.  Each is a block with 10cm x 
10cm footprint scattered in the area.
• The obstacles are in the orientation that is parallel to the 
sides of the square area.
---
slide_3_chunk_2: Slide 3 Chunk 2
sides of the square area.
• An image is displayed on one of the four sides of an

In [None]:
def query_chromadb(query_text, n_results=5):
    """Embed ``query_text`` and query the module-level ``collection`` with it.

    Args:
        query_text: Text embedded via ``generate_embeddings`` using the
            module-level ``model_name``.
        n_results: Passed to ``collection.query`` as ``n_results`` (default 5).

    Returns:
        The value returned by ``collection.query``.
    """
    # A one-element batch goes in; its first vector is the query embedding.
    query_vectors = generate_embeddings([query_text], model_name)
    return collection.query(
        query_embeddings=[query_vectors[0]],
        n_results=n_results,
    )

# Example usage: print the stored text of each result returned for one query.
query_text = "images"
results = query_chromadb(query_text)
# Only one query embedding was sent, so read the first (only) metadata list.
for result in results["metadatas"][0]:
    heading = f"Slide {result['slide_number']}, Chunk {result['chunk_number']}:"
    print(f"{heading}\n{result['text']}\n")

In [None]:
import chromadb
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

# Load environment variables from a .env file into the process environment.
# This cell rebinds the notebook-level names chroma_client, collection,
# client and model_name.
load_dotenv()

# Initialize ChromaDB with persistent storage.
# NOTE(review): this path differs from the "./vector_store/chroma_db_data"
# path opened by an earlier cell — confirm which store is intended.
persist_directory = "./chroma_db_data"  # Replace with your persistent directory
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Load the existing collection.
# NOTE(review): no visible cell creates a collection named "slide_chunks"
# (an earlier cell creates "course_doc") — confirm the name before running.
collection_name = "slide_chunks"  # Replace with your collection name
collection = chroma_client.get_collection(name=collection_name)

# Initialize Azure OpenAI client for generating embeddings.
# os.getenv returns None when a variable is unset, so a missing
# EMBED_API_KEY / EMBED_ENDPOINT is not reported by these lookups.
client = AzureOpenAI(
    api_key=os.getenv("EMBED_API_KEY"),  # Load API key from .env
    api_version="2023-05-15",  # API version (check Azure portal for the latest)
    azure_endpoint=os.getenv("EMBED_ENDPOINT")  # Load endpoint from .env
)

# Embedding deployment name; passed as `model` to client.embeddings.create.
model_name = os.getenv("EMBED_DEPLOY_NAME")

# Function to generate embeddings for a query
def generate_embeddings(texts, deployment_name):
    """Return one embedding vector per entry of ``texts``, in input order.

    Each text is sent in its own ``client.embeddings.create`` request through
    the module-level ``client``.

    Args:
        texts: Iterable of inputs to embed.
        deployment_name: Value passed as ``model`` on every request.

    Returns:
        A list holding ``response.data[0].embedding`` for each input.
    """
    def embed_one(text):
        # One request per text; only the first item of the response is used.
        reply = client.embeddings.create(input=text, model=deployment_name)
        return reply.data[0].embedding

    return [embed_one(text) for text in texts]

# Function to query ChromaDB
def query_chromadb(query_text, n_results=5):
    """Embed ``query_text`` and query the module-level ``collection`` with it.

    Args:
        query_text: Text embedded via ``generate_embeddings`` using the
            module-level ``model_name``.
        n_results: Passed to ``collection.query`` as ``n_results`` (default 5).

    Returns:
        The value returned by ``collection.query``.
    """
    # A one-element batch goes in; its first vector is the query embedding.
    query_vectors = generate_embeddings([query_text], model_name)
    return collection.query(
        query_embeddings=[query_vectors[0]],
        n_results=n_results,
    )

# Example usage
query_text = "weak entity"  # Replace with your query text
results = query_chromadb(query_text)

# Print query results. The chunk text is read from each hit's metadata, so the
# "documents" field of the response is not needed here.
print(f"Query: {query_text}")
for i, metadata in enumerate(results["metadatas"][0]):
    print(f"Result {i + 1}:")
    print(f"Slide {metadata['slide_number']}, Chunk {metadata['chunk_number']}:")
    print(f"Text: {metadata['text']}\n")

In [None]:
# Open the store at ./chroma_db_data and print the collections it contains.
# This rebinds the notebook-level `chroma_client`.
chroma_client = chromadb.PersistentClient(path="./chroma_db_data")
print(chroma_client.list_collections())