In [2]:
import os
import torch
import hashlib
import pickle
import time
import ollama
from ollama import chat
from ollama import Client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from langchain_qdrant import Qdrant
from qdrant_client.http import models

In [3]:
DATA_DIR = "./course_materials"
METADATA_DIR = "./course_materials_metadata"
METADATA_FILE = os.path.join(METADATA_DIR, "metadata.pkl")
COLLECTION_NAME = "student_coursework"
QDRANT_URL = "http://localhost:6333"

os.makedirs(METADATA_DIR, exist_ok=True)

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu" 

In [5]:
index = None
chunks = None

In [6]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})


In [7]:
ollama_client = Client(
    host='http://localhost:1134', 
    headers={'x-some-header':'some-value'}
)

In [8]:
q_client = QdrantClient(url=QDRANT_URL, prefer_grpc=False)

# docker run -p 6333:6333 -p 6334:6334 \
#     -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#     qdrant/qdrant

In [9]:
vectore_store = Qdrant(
    client = q_client,
    collection_name=COLLECTION_NAME,
    embeddings=embedding_model,
)

  vectore_store = Qdrant(


In [24]:
def compute_pdf_hash():
    hasher = hashlib.sha256()
    pdf_files = sorted([f for f in os.listdir(DATA_DIR) if f.endswith('.pdf')])

    for pdf in pdf_files:
        with open(os.path.join(DATA_DIR, pdf), 'rb') as f:
            hasher.update(f.read())
    return hasher.hexdigest()


In [25]:
def is_db_outdated():
    if not os.path.exists(METADATA_FILE):
        return True
    
    try:
        with open(METADATA_FILE, 'rb') as f:
            saved_hash = pickle.load(f).get("pdf_hash", None)

        current_hash = compute_pdf_hash()

        return saved_hash != current_hash
    
    except Exception as e:
        return True

In [15]:
def build_collection():
    global index, chunks

    start_time = time.time()

    if not is_db_outdated():
        print("Database is up to date.")
        return
    
    print("Building new collection...")

    loader = DirectoryLoader(DATA_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader, use_multithreading=True)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)

    print(f"{len(chunks)} text chunks extracted.")

    existing_collections = [col.name for col in q_client.get_collections().collections]

    if COLLECTION_NAME in existing_collections:
        q_client.update_collection(
            collection_name=COLLECTION_NAME,
            optimizers_config=models.OptimizersConfigDiff(indexing_threshold=10000),
        )
    else:
        q_client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
        )
    
    vectore_store.add_documents(chunks)

    with open(METADATA_FILE, 'wb') as f:
        pickle.dump({"pdf_hash": compute_pdf_hash()}, f)
        
    print(f"Collection {COLLECTION_NAME} built in {time.time() - start_time:.2f} seconds.")

In [10]:
from typing import List

def contains_img_tags(context: str) -> bool:
    """Check if the context contains <IMG src> and <IMG> tags."""
    tags = ["<IMG src", "<IMG>"]
    return all(tag in context for tag in tags)

In [11]:
def generate_answer(query):
    results = vectore_store.similarity_search(query, k=5)

    context = "\n".join([doc.page_content for doc in results])

    print("Context for the query:", context)

    print("Contains images in the context", contains_img_tags(context))

    prompt = f"use the provided course material to answer the question: {query}\n\nContext:\n{context}\n\nAnswer:"

    response = ollama.chat(
        model="llama3.2",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ])
    
    return response['message']['content']

In [17]:
def main():
    build_collection()

    while True:
        query = input("Enter your question (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break

        start_time = time.time()
        print("Processing query...")

        answer = generate_answer(query)
        print("\nAnswer:\n", answer)

        print(f"Query processed in {time.time() - start_time:.2f} seconds.")

In [18]:
if __name__ == "__main__":
    main()

Database is up to date.


Processing query...
Context for the query: 48 2 • Early Humans
Access for free at openstax.org
48 2 • Early Humans
Access for free at openstax.org
was once believed the images were designed to be popularly admired as interesting decorations, not unlike the
2.1 • Early Human Evolution and Migration 47
was once believed the images were designed to be popularly admired as interesting decorations, not unlike the
2.1 • Early Human Evolution and Migration 47
exchange was vital for cooperation. But did they also have a type of written communication? Some researchers
think it’s possible that seemingly abstract signs preserved in caves represent just that. Watch this short video
about fascinating scholarship around these intriguing cave signs (https://openstax.org/l/77CaveSigns) to learn
more.
Ice, Ice, and More Ice
Scientists who study the changes that have occurred on Earth over billions of years have identi]ed at least ]ve

Answer:
 Unfortunately, you didn't provide a specific question relat

# Multimodal RAG

## 1. Converting everything into single modality

In [12]:
import pymupdf
from PIL import Image
import io

In [13]:
text_data = []
image_data = []

**Extracting images**

In [14]:
def extract_images_from_pdfs():
    if not os.path.exists("extracted_images"):
        os.makedirs("extracted_images")
        
    for file_name in os.listdir(DATA_DIR):
        file_path = os.path.join(DATA_DIR, file_name)
        
        if os.path.isfile(file_path) and file_name.endswith('.pdf'):
            print(f"Processing {file_name}...")

            with pymupdf.open(file_path) as doc:
                if not os.path.exists("extracted_images"):
                    os.makedirs("extracted_images")
                
            #loop through all documents and all pages
                for page_num in range(len(doc)):
                    page = doc[page_num]

                    text = page.get_text().strip()
                    text_data.append({"response": text, "name": page_num+1})

                    images = page.get_images(full=True)
                    
                    # Extract images
                    for img_index, img in enumerate(images, start=0):
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]

                        image_filename = f"extracted_images/page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
                        
                        # with open(image_filename, "wb") as img_file:
                        #     img_file.write(image_bytes)

                        # image_data.append(image_filename)

                        image = Image.open(io.BytesIO(image_bytes))
                        image.save(image_filename)


In [15]:
extract_images_from_pdfs()

Processing Apollo 11 - Wikipedia.pdf...


**Image captioning**

In [19]:
#llama vision
for img in os.listdir("extracted_images_2"):
    # image = Image.open(f"extracted_images/{img}")
    print(f"extracted_images_2/{img}")

    response = ollama.chat(
        model='llama3.2-vision',
        messages=[{
            'role': 'user',
            'content': 'You are an assistant tasked with summarizing tables, images and text NASA website for retrieval. \
                        These summaries will be embedded and used to retrieve the raw text or table elements \
                        Give a concise summary of the table or text that is well optimized for retrieval.',
            'images': [f"extracted_images_2/{img}"],
        }]
    )

    response.message.content = f"<IMG src=extracted_images_2/{img}>" + response.message.content + "<IMG>"

    print(response)

    image_data.append({"response": response, "name": img})

extracted_images_2/page_10_img_1.jpeg
model='llama3.2-vision' created_at='2025-04-17T15:42:17.324808Z' done=True done_reason='stop' total_duration=20954653916 load_duration=35406208 prompt_eval_count=63 prompt_eval_duration=3810827333 eval_count=238 eval_duration=17018100375 message=Message(role='assistant', content='<IMG src=extracted_images_2/page_10_img_1.jpeg>The image depicts a person in a white spacesuit inside a spacecraft, with their back against the wall and their head facing towards the camera. The suit appears to be made of a thick, white material, possibly Kevlar or other synthetic fibers.\n\n*   **Spacesuit:**\n    *   Color: White\n    *   Material: Thick, possibly Kevlar or synthetic fibers\n    *   Purpose: Likely for spacewalks or extravehicular activities\n*   **Spacecraft Interior:**\n    *   Background: Gray or metallic color\n    *   Lighting: Possibly fluorescent or LED lights\n    *   Equipment: Various instruments and controls visible on the walls and ceiling\n 

In [None]:
# def generate_image_captions():
#     global text_data, image_data

#     for img in os.listdir(IMAGE_DIR):
#         img_path = os.path.join(IMAGE_DIR, img)
#         start_time = time.time()

#         if os.path.isfile(img_path):
#             print(f"\nGenerating caption for {img}...\n")

#             response = chat(
#                 model="llama3.2-vision",
#                 messages=[
#                     {
#                         "role": "system",
#                         "content": "You are a helpful assistant that generates captions for images."
#                     },
#                     {
#                         "role": "user",
#                         "content": "You are an assistant tasked with summarizing tables, images and text NASA website for retrieval. \
#                                     These summaries will be embedded and used to retrieve the raw text or table elements \
#                                     Give a concise summary of the table or text that is well optimized for retrieval.",
#                         "images": [f"{IMAGE_DIR}/{img}"],
#                     }
#                 ],
#             )

#             response.message.content = f"<IMG src=extracted_images/{img}>" + response.message.content + "<IMG>"
#             image_data.append({"response": response, "name": img})

#     doc_list = [Document(page_content=text['response'], metadata={"name": text['name']}) for text in text_data]
#     img_list = [Document(page_content=img['response'].message['content'], metadata={"name": img['name']}) for img in image_data]
#     print(img_list)
#     print(f"Generated image captions in {time.time() - start_time:.2f} seconds.\n")
        
#     return doc_list, img_list       


In [20]:
image_data

[{'response': ChatResponse(model='llama3.2-vision', created_at='2025-04-17T15:34:11.556089Z', done=True, done_reason='stop', total_duration=94277953750, load_duration=5931930542, prompt_eval_count=106, prompt_eval_duration=74189990000, eval_count=195, eval_duration=14058777125, message=Message(role='assistant', content="The image shows a person in a white spacesuit floating inside an aircraft, with various instruments and controls visible on the walls and ceiling.\n\n* A person in a white spacesuit is floating inside an aircraft.\n\t+ The person appears to be a pilot or astronaut, judging by their attire and position within the cockpit.\n\t+ They are wearing a white spacesuit with a helmet and gloves, indicating that they are prepared for space travel.\n* Various instruments and controls are visible on the walls and ceiling of the aircraft.\n\t+ The instruments appear to be electronic devices, possibly monitoring the aircraft's systems or displaying information to the pilot.\n\t+ The c

In [21]:
from langchain_core.documents import Document

docs_list = [Document(page_content=text['response'], metadata={"name": text['name']}) for text in text_data]
img_list = [Document(page_content=img['response'].message['content'], metadata={"name": img['name']}) for img in image_data]

In [22]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
img_splits = text_splitter.split_documents(img_list)

In [26]:
existing_collections = [col.name for col in q_client.get_collections().collections]

if COLLECTION_NAME in existing_collections:
    q_client.update_collection(
        collection_name=COLLECTION_NAME,
        optimizers_config=models.OptimizersConfigDiff(indexing_threshold=10000),
    )
else:
    q_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
    )

documents = doc_splits + img_splits

vectore_store.add_documents(documents)

print("Documents and images added to the collection.")

with open(METADATA_FILE, 'wb') as f:
        pickle.dump({"pdf_hash": compute_pdf_hash()}, f)


Documents and images added to the collection.


In [27]:

query = input("Enter your question")

start_time = time.time()
print("Processing query...")

answer = generate_answer(query)

print("\n\n\nAnswer:\n", answer)

print(f"Query processed in {time.time() - start_time:.2f} seconds.")

Processing query...
Context for the query: Apollo 11 insignia
The Apollo 11 mission emblem was designed by Collins, who
wanted a symbol for "peaceful lunar landing by the United States".
At Lovell's suggestion, he chose the bald eagle, the national bird of
the United States, as the symbol. Tom Wilson, a simulator
instructor, suggested an olive branch in its beak to represent their
peaceful mission. Collins added a lunar background with the Earth
in the distance. The sunlight in the image was coming from the
wrong direction; the shadow should have been in the lower part of
the Earth instead of the left. Aldrin, Armstrong and Collins
decided the Eagle and the Moon would be in their natural colors,
and decided on a blue and gold border. Armstrong was concerned
that "eleven" would not be understood by non-English speakers, so
they went with "Apollo 11",[70] and they decided not to put their
names on the patch, so it would "be representative of everyone
who had worked toward a lunar landing

---

## 2. Multimodal Embeddings stored in a single vector db

In [1]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('clip-ViT-B-32')
processor = SentenceTransformer('clip-ViT-B-32')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

In [3]:
def embed_with_clip(image, text):
    inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    image_embeds = outputs.image_embeds
    # text_embeds = outputs.text_embeds
    return image_embeds

In [10]:
def create_collection(collection_name):
    q_client.create_collection(
        collection_name,
        vectors_config={
        "text_embedding": models.VectorParams(
                size=384,  # Dimension of text embeddings
                distance=models.Distance.COSINE,  # Cosine similarity is used for comparison
            ),
        'image_embedding': models.VectorParams(
            size = 512,
            distance=models.Distance.COSINE,
        ),
        },
    )

create_collection("image_text_collection")

In [11]:
def ingest_data(points):
    operation_info = q_client.upsert(
        collection_name="image_text_collection",
        points=points,
    )

In [24]:
from langchain_core.documents import Document

docs_list = [Document(page_content=text['response'], metadata={"name": text['name']}) for text in text_data]

In [25]:
vectore_store_text = Qdrant(
    client = q_client,
    collection_name="image_text_collection",
    embeddings=embedding_model,
    vector_name="text_embedding"
)

In [26]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
vectore_store_text.add_documents(doc_splits)

UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Wrong input: Vector dimension error: expected dim: 1536, got 384"},"time":0.047737625}'

In [27]:
image_splits = []

for img in os.listdir("extracted_images"):
    try:
        image = Image.open(f"extracted_images/{img}")
        image_embedding = embed_with_clip(image, "Describe the image content")
        
        image_splits.append({
            "id": img,
            "vector": image_embedding.detach().numpy()[0],
            "payload": {"name": img}
        })
    except Exception as e:
        print(f"Error processing image {img}: {e}")

Error processing image page_24_img_2.jpeg: name 'embed_with_clip' is not defined
Error processing image page_21_img_1.png: name 'embed_with_clip' is not defined
Error processing image page_29_img_2.jpeg: name 'embed_with_clip' is not defined
Error processing image page_10_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_27_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_9_img_1.png: name 'embed_with_clip' is not defined
Error processing image page_22_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_15_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_29_img_3.jpeg: name 'embed_with_clip' is not defined
Error processing image page_16_img_2.jpeg: name 'embed_with_clip' is not defined
Error processing image page_18_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_28_img_1.jpeg: name 'embed_with_clip' is not defined
Error processing image page_25_

In [None]:
vector_store_image = Qdrant(
    client=q_client,
    collection_name="image_text_collection",
    embeddings=CLIPModel,  # No embeddings for the image, using custom embedding function
    vector_name="image_embedding"
)

In [None]:
from langchain.docstore.document import Document

# Convert each dictionary into a Document.
docs = [
    Document(page_content="", metadata=img_dict)
    for img_dict in image_splits
]

vector_store_image.add_documents(docs)

In [None]:
vector_store_image.add_documents(image_splits)

In [None]:
from typing import List, Union, Tuple
import base64
import numpy as np

In [None]:
def get_image_paths(directory: str, number: int = None) -> List[str]:
    image_paths = []
    count = 0
    for filename in os.listdir(directory):
        if filename.endswith('.jpeg'):
            image_paths.append(os.path.join(directory, filename))
            if number is not None and count == number:
                return [image_paths[-1]]
            count += 1
    return image_paths
direc = 'extracted_images/'
image_paths = get_image_paths(direc)


In [None]:
def get_features_from_image_path(image_paths):
  images = [processor(Image.open(image_path).convert("RGB")) for image_path in image_paths]
  image_input = torch.tensor(np.stack(images))
  with torch.no_grad():
    image_features = model.encode_image(image_input).float()
  return image_features
image_features = get_features_from_image_path(image_paths)