# Installing Dependencies

In [6]:
!python3 -m pip install pymupdf openai chromadb tqdm nltk tiktoken python-docx langchain langsmith langchain_openai moviepy

from openai import OpenAI
from tqdm import tqdm
import os


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


## Setting up Langsmith for Evaluation

In [3]:
os.environ["OPENAI_API_KEY"] = "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"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "Java TA Chatbot"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Extract Text Function

In [3]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    return full_text

## Break the full text into overlapping chunks for better retrieval

In [6]:
import nltk

# Fix SSL issue (macOS sometimes has missing certs)
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download punkt
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fahadalsaud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
import nltk

# Explicitly add the punkt path
# nltk.data.path.append("/Users/mohsintanveer/nltk_data")

from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

# Confirm it works
text = "Hello world. This is a test sentence. Let's see if it works."
print(sent_tokenize(text))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/fahadalsaud/nltk_data...


['Hello world.', 'This is a test sentence.', "Let's see if it works."]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
import nltk
import tiktoken

nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Load tokenizer for OpenAI's embedding model
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

def chunk_text(text, max_tokens=500, overlap=50):
    sentences = sent_tokenize(text)  # Break the text into individual sentences
    chunks = []                     # To store final chunks
    current_chunk = []              # Temporarily hold sentences for a chunk
    current_tokens = 0              # Running token count for the current chunk

    for sentence in sentences:
        token_count = len(encoding.encode(sentence))  # Tokens in this sentence

        # If adding this sentence exceeds the token limit:
        if current_tokens + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))  # Save current chunk

            # Handle overlap: keep the last few sentences for context
            overlap_sents = []
            overlap_tokens = 0
            for s in reversed(current_chunk):  # Go backwards through the chunk
                s_tokens = len(encoding.encode(s))
                if overlap_tokens + s_tokens <= overlap:
                    overlap_sents.insert(0, s)  # Add sentence at the beginning
                    overlap_tokens += s_tokens
                else:
                    break  # Stop when overlap token limit is reached

            # Start a new chunk with overlapping sentences + the new one
            current_chunk = overlap_sents + [sentence]
            current_tokens = sum(len(encoding.encode(s)) for s in current_chunk)
        else:
            # Add this sentence to the current chunk
            current_chunk.append(sentence)
            current_tokens += token_count

    # Don't forget to save the last chunk if there's anything left
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/fahadalsaud/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Parse questions from exercise sheets and generate markschemes

In [1]:
import re

def parse_questions_from_exercises(text):
    """
    Extracts exercise questions that begin with patterns like Q1., Q2., Q5a., etc.
    Returns a list of cleaned question strings.
    """
    lines = text.split('\n')
    questions = []
    current_question = ""

    # Regex to match question identifiers like Q1., Q2., Q5a., Q8.
    question_start = re.compile(r"^(Q\d+(\.\d+)?[a-z]?\.)\s+(.*)")

    for line in lines:
        line = line.strip()
        if not line:
            continue

        match = question_start.match(line)
        if match:
            if current_question:
                questions.append(current_question.strip())
            current_question = match.group(3)  # the question text
        else:
            current_question += " " + line  # append continuation lines

    if current_question:
        questions.append(current_question.strip())

    return questions

In [6]:
text = extract_text_from_pdf("Exercises1-2.pdf")
text += " \n\n" + extract_text_from_pdf("Exercises2.pdf")
text += " \n\n" + extract_text_from_pdf("Exercises3.pdf")
text += " \n\n" + extract_text_from_pdf("Exercises4.pdf")
text += " \n\n" + extract_text_from_pdf("Exercises5.pdf")
questions = parse_questions_from_exercises(text)

In [24]:
# import fitz  # PyMuPDF
# import base64
# from openai import OpenAI
# from docx import Document
# from tqdm import tqdm

# client = OpenAI(api_key="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")

# def answer_questions_from_pdf(pdf_path, output_docx="java_answers.docx"):
#     doc = fitz.open(pdf_path)
#     answers = []

#     system_msg = (
#         "You are a helpful and patient Java teaching assistant at UCL. "
#         "You are given a PDF page with Java exam questions and UML diagrams. "
#         "For every question on the page, write a full beginner-friendly answer. "
#         "Explain code structure, Java concepts (like classes, inheritance, constructors, etc.), "
#         "and relate UML diagrams to code. If a UML diagram is present, describe and interpret it carefully."
#     )

#     word_doc = Document()
#     word_doc.add_heading("Answers to Java Exam Questions", level=1)

#     for page_num in tqdm(range(len(doc)), desc="Answering pages"):
#         page = doc[page_num]
#         text = page.get_text().strip()

#         # Convert page to image and encode
#         pix = page.get_pixmap(dpi=300)
#         image_bytes = pix.tobytes("png")
#         base64_img = base64.b64encode(image_bytes).decode("utf-8")

#         # Create messages for GPT-4o
#         messages = [
#             {"role": "system", "content": system_msg},
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "text", "text": text},
#                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_img}"}}
#                 ]
#             }
#         ]

#         # Generate answer
#         response = client.chat.completions.create(
#             model="gpt-4o",
#             messages=messages,
#             temperature=0.3
#         )

#         answer = response.choices[0].message.content.strip()
#         answers.append((f"Page {page_num + 1}", text, answer))

#         # 🔹 Print nicely
#         print(f"\n{'='*30}\n📝 Page {page_num + 1}\n{'='*30}")
#         print(f"\n📄 Questions:\n{text[:1000]}")  # Truncate if very long
#         print(f"\n✅ Answer:\n{answer}\n")

#         # 🔹 Add to Word doc
#         word_doc.add_heading(f"Page {page_num + 1}", level=2)
#         word_doc.add_paragraph("📝 Questions:", style="Intense Quote")
#         word_doc.add_paragraph(text)
#         word_doc.add_paragraph("✅ Answer:", style="Intense Quote")
#         word_doc.add_paragraph(answer)
#         word_doc.add_paragraph("\n")

#     word_doc.save(output_docx)
#     print(f"\n✅ All answers saved to: {output_docx}")

#     return answers

In [7]:
import fitz  # PyMuPDF
import base64
import re
from openai import OpenAI
from tqdm import tqdm
from docx import Document
from pathlib import Path
from typing import List

client = OpenAI(api_key="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")


# A page must contain EITHER "[xx marks]" OR a line beginning with "n."
QUESTION_PAGE_RE = re.compile(
    r"\[\d+\s*marks?\]"          #  [10 marks]   or [2 mark]
    r"|"                             #  …or…
    r"^\s*\d+\.\s",              #  1. 2. 3. etc. at line start
    re.IGNORECASE | re.MULTILINE
)

# Instruction verbs / OOP keywords for a secondary sanity check
VALID_QUESTION_PATTERNS = [
    r"\b(write|explain|describe|compare|implement|calculate|state|list|give)\b",
    r"\b(java|uml|class|method|object|interface|recursion)\b",
]
VALID_Q_RE = re.compile("|".join(VALID_QUESTION_PATTERNS), re.IGNORECASE)


def extract_pages(pdf_path):
    """Extracts text + images from each page"""
    doc = fitz.open(pdf_path)
    page_data = []

    for i, page in enumerate(doc):
        text = page.get_text().strip()
        pix = page.get_pixmap(dpi=300)
        img_bytes = pix.tobytes("png")
        base64_img = base64.b64encode(img_bytes).decode("utf-8")
        page_data.append({"page": i + 1, "text": text, "image": base64_img})
    return page_data


def extract_questions_only(pdf_path: str | Path) -> List[str]:
    """
    Return a list of *question blocks* from the given PDF, skipping
    cover pages and general instructions.
    """
    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)
    all_pages = [page.get_text().strip() for page in doc]

    question_pages = [t for t in all_pages if QUESTION_PAGE_RE.search(t)]
    raw_blocks = re.split(r"\n(?=\d+\.)", "\n".join(question_pages))

    questions = [
        block.strip()
        for block in raw_blocks
        if block.strip() and VALID_Q_RE.search(block)
    ]
    return questions


def find_pages_for_question(question_text, pages):
    """Estimate which page(s) the question came from based on matching text"""
    matched = []
    for page in pages:
        if question_text[:40] in page["text"]:
            matched.append(page)
    return matched


def answer_exam_questions(pdf_path, output_docx="java_exam_answers.docx"):
    pages = extract_pages(pdf_path)
    question_pages = [p for p in pages if QUESTION_PAGE_RE.search(p["text"])]
    question_blocks = extract_questions_only(pdf_path)

    word_doc = Document()
    word_doc.add_heading("Answers to Java Exam Questions", level=1)
    answers = []

    for idx, question_text in tqdm(
            enumerate(question_blocks, 1),
            total=len(question_blocks),
            desc="Answering questions"):

        matched_pages = find_pages_for_question(question_text, question_pages)
        image_base64 = matched_pages[0]["image"] if matched_pages else None

        messages = [
            {"role": "system", "content": (
                "You are a helpful and patient Java tutor at UCL. "
                "You are answering an exam question from a past paper. "
                "Explain all parts clearly, using beginner-friendly reasoning, and refer to UML diagrams if visible in the image."
                "For coding answers: 1) Add thorough but not redundant commenting explaining important code logic. 2) Try to use as beginner-friendly code keeping in mind this is for university level programming students."
            )},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}} if image_base64 else {"type": "text", "text": "(No diagram found for this question.)"}
                ]
            }
        ]

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                temperature=0.3
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            answer = f"⚠️ Error generating answer: {str(e)}"

        answers.append((f"Question {idx}", question_text, answer))

        print(f"\n\n{'='*40}\n📝 Question {idx}\n{'='*40}")
        print(f"\n📄 {question_text[:300]}...\n\n✅ {answer[:500]}...\n")

        word_doc.add_heading(f"Question {idx}", level=2)
        word_doc.add_paragraph("📝 Question Text:", style="Intense Quote")
        word_doc.add_paragraph(question_text)
        word_doc.add_paragraph("✅ Answer:", style="Intense Quote")
        word_doc.add_paragraph(answer)
        word_doc.add_paragraph("\n")

    word_doc.save(output_docx)
    print(f"\n✅ All answers saved to: {output_docx}")
    return answers

In [8]:
answer_exam_questions("./knowledge_base_data/past_papers/COMP1008_2007.pdf", output_docx="java_2007_answers.docx")

Answering questions:  17%|█▋        | 1/6 [00:16<01:21, 16.25s/it]



📝 Question 1

📄 .>
COMPlO08 Object-Oriented
Programming
Answer ALL of SECfION
A and TWO questions from SECfION
B
(THREE questions in total)
Marks for each part of each question are indicated in square brackets.
Calculators are NOT allowed.
Java code given in your answers does not have to be syntactically perfect bu...

✅ To solve this problem, we need to write Java classes based on the given UML diagram.

### a) Class `Item` as an Abstract Class

The `Item` class is abstract and has a constructor and an abstract method `getShippingCost`. Here's how you can write it in Java:

```java
// Abstract class Item
public abstract class Item {
    // Attributes
    private String description;
    private int idCode;

    // Constructor
    public Item(String description, int idCode) {
        this.description = descriptio...



Answering questions:  33%|███▎      | 2/6 [00:34<01:10, 17.53s/it]



📝 Question 2

📄 1. Consider this UML class diagram:
Order
Item
1...

✅ To solve this problem, we need to write Java classes based on the UML diagram provided.

### a) Abstract Class `Item`

The `Item` class is abstract, which means it cannot be instantiated directly and may contain abstract methods that must be implemented by subclasses.

Here's how you can implement the `Item` class:

```java
public abstract class Item {
    private String description;
    private int idCode;

    // Constructor
    public Item(String description, int idCode) {
        this.descri...



Answering questions:  50%|█████     | 3/6 [00:59<01:01, 20.66s/it]



📝 Question 3

📄 0..* -description : String
+addltem( anltem : Item) : void
-idCode : int
+calculateShippingCostO
: int
+includeslfem( idCode: int): boolean
+gefShlppingCost() : tnt
+getDescriptionO : String
+getidCode() : int
.
,;
l';>
I
I
Heavyltem
Lightltem
-weight: int
+getShippingCost() : int
+getShippingCost()...

✅ Let's tackle each part of the question step by step.

### a) Abstract Class `Item`

The `Item` class is abstract with a constructor and an abstract method `getShippingCost`. Here's how you can implement it in Java:

```java
public abstract class Item {
    private String description;
    private int idCode;

    // Constructor
    public Item(String description, int idCode) {
        this.description = description;
        this.idCode = idCode;
    }

    // Abstract method to be implemented by ...



Answering questions:  67%|██████▋   | 4/6 [01:23<00:44, 22.01s/it]



📝 Question 4

📄 2.
a) Concisely explain the following:
protected, polymorphism,
generic method, final, static method, null
[2 marks each, total of 121
b) Write a Java class Queue that stores a queue of objects. A Queue has:
•
A maximum length, which is set when a Queue object is created.
•
A method add, to add an o...

✅ ### a) Concisely explain the following:

1. **Protected**:
   - A Java access modifier that allows access to the members of a class within its own package and by subclasses. It provides more accessibility than private but is more restrictive than public.

2. **Polymorphism**:
   - A core concept in object-oriented programming that allows objects to be treated as instances of their parent class. The most common use is method overriding, where a subclass provides a specific implementation of a met...



Answering questions:  83%|████████▎ | 5/6 [01:57<00:26, 26.47s/it]



📝 Question 5

📄 3.
a) Describe the properties of a well-designed,
cohesive class.
[6 marks!
b) Consider this specification:
"Write a program to manage a small library. The program should:
•
Hold a complete list of all the books belonging to the library.
•
Store the author, title, publisher, publication year and pag...

✅ ### 3. a) Properties of a Well-Designed, Cohesive Class

1. **Single Responsibility**: A class should have one responsibility or purpose. This makes it easier to understand and maintain.

2. **Encapsulation**: The internal state of the class should be hidden from the outside. Use private fields and provide public methods to access and modify them.

3. **Clear Interface**: The class should have a well-defined interface with methods that clearly describe the actions the class can perform.

4. **Mi...



Answering questions: 100%|██████████| 6/6 [02:22<00:00, 23.80s/it]



📝 Question 6

📄 4.
a) "Ruby is a dynamically typed language". Explain what this means and
compare type checking in Ruby with type checking in Java, pointing out any
advantages or disadvantages of their respective approaches to type checking.
(10 marks (
b) Explain what a C++ pointer is and compare pointers to refer...

✅ Sure, let's break down each part of the question.

### a) Ruby is a dynamically typed language

**Dynamic Typing in Ruby:**
- In Ruby, variables do not have a fixed type. The type is determined at runtime based on the value assigned to the variable.
- Example: 
  ```ruby
  x = 10      # x is an Integer
  x = "Hello" # x is now a String
  ```

**Type Checking in Ruby vs. Java:**

- **Ruby (Dynamic Typing):**
  - **Advantages:**
    - Flexibility: You can easily change the type of a variable.
    ...


✅ All answers saved to: java_2007_answers.docx





[('Question 1',
  '.>\nCOMPlO08 Object-Oriented\nProgramming\nAnswer ALL of SECfION\nA and TWO questions from SECfION\nB\n(THREE questions in total)\nMarks for each part of each question are indicated in square brackets.\nCalculators are NOT allowed.\nJava code given in your answers does not have to be syntactically perfect but should,\nat least, be a good approximation.\nSection A\nAnswer ALL of this Section',
  "To solve this problem, we need to write Java classes based on the given UML diagram.\n\n### a) Class `Item` as an Abstract Class\n\nThe `Item` class is abstract and has a constructor and an abstract method `getShippingCost`. Here's how you can write it in Java:\n\n```java\n// Abstract class Item\npublic abstract class Item {\n    // Attributes\n    private String description;\n    private int idCode;\n\n    // Constructor\n    public Item(String description, int idCode) {\n        this.description = description;\n        this.idCode = idCode;\n    }\n\n    // Abstract method\

In [9]:
from openai import OpenAI
client = OpenAI(api_key="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")

def generate_exercise_answer(question, context_prompt=None):
    system_msg = (
        "You are a helpful Java tutor that gives clear, accurate answers "
        "to university-level Java exercises."
    )

    if context_prompt:
        system_msg += " " + context_prompt

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": question}
        ]
    )
    return response.choices[0].message.content.strip()


In [10]:
qa_chunks = []

for q in tqdm(questions, desc="Generating answers"):
    answer = generate_exercise_answer(q)
    qa_chunks.append(f"Q: {q}\nA: {answer}")

In [6]:
import chromadb

# Point to the same persistent storage directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Load the existing collection
collection = chroma_client.get_collection(name="knowledge-base6")

In [23]:
from docx import Document

def save_qa_chunks_to_docx(qa_chunks, filename="qa_output.docx"):
    doc = Document()
    doc.add_heading("Exercise Questions and Answers", level=1)

    for chunk in qa_chunks:
        if chunk.startswith("Q:") and "\nA:" in chunk:
            q_part, a_part = chunk.split("\nA:", 1)
            question = q_part[2:].strip()
            answer = a_part.strip()

            doc.add_paragraph(f"Q: {question}", style="List Number")
            doc.add_paragraph(f"A: {answer}")
            doc.add_paragraph("\n\n")  # Add extra spacing between entries
        else:
            print("Skipping malformed chunk:", chunk)

    doc.save(filename)
    print(f"Saved {len(qa_chunks)} Q&A pairs to {filename}")

In [24]:
save_qa_chunks_to_docx(qa_chunks)

Saved 23 Q&A pairs to qa_output.docx


In [17]:
from tqdm import tqdm

# 1. Embed new Q&A chunks with progress bar
embeddings = [
    client.embeddings.create(input=chunk, model="text-embedding-3-small").data[0].embedding
    for chunk in tqdm(qa_chunks, desc="Embedding Q&A chunks")
]

# 2. Add to existing collection with progress bar
for chunk, embedding in tqdm(zip(qa_chunks, embeddings), total=len(qa_chunks), desc="Storing in ChromaDB"):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        ids=[str(hash(chunk))]  # Optional: use uuid5 if you want stability across sessions
    )

Embedding Q&A chunks: 100%|██████████| 23/23 [00:18<00:00,  1.23it/s]
Storing in ChromaDB: 100%|██████████| 23/23 [00:00<00:00, 57.71it/s]


# Embed and Store in Vector DB

In [6]:
import os
from openai import OpenAI
import chromadb

client = OpenAI(api_key="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")
# Create/load local persistent DB folder
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.create_collection("knowledge-base6")

# Get all PDFs in your directory
pdf_files = [f for f in os.listdir() if f.endswith(".pdf")]

all_chunks = []
all_metadatas = []

# Loop through all PDF files
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(text)

    all_chunks.extend(chunks)
    # Save which file each chunk came from
    all_metadatas.extend([{"source": pdf_file}] * len(chunks))

# Embed all chunks
embedding_model = "text-embedding-3-small"
embeddings = [
    client.embeddings.create(input=chunk, model=embedding_model).data[0].embedding
    for chunk in tqdm(all_chunks, desc="Embedding Chunks")
]

# Store all in ChromaDB
for chunk, embedding, metadata in tqdm(zip(all_chunks, embeddings, all_metadatas), total=len(all_chunks), desc="Storing in DB"):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        ids=[str(hash(chunk))],
        metadatas=[metadata]
    )

Processing PDFs: 100%|██████████| 61/61 [00:01<00:00, 51.51it/s]
Embedding Chunks: 100%|██████████| 490/490 [03:52<00:00,  2.11it/s]
Storing in DB: 100%|██████████| 490/490 [00:03<00:00, 132.97it/s]


## Generate an Answer with RAG

In [32]:
from langsmith import traceable
@traceable(name="RAG_Chatbot_Answer")
def rag_answer(query, collection, embedding_model="text-embedding-3-small", k=3):

    client = OpenAI(api_key="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")
    
    # Step 1: Embed the user's question
    query_embedding = client.embeddings.create(
        input=query,
        model=embedding_model
    ).data[0].embedding

    # Step 2: Retrieve top-k similar chunks from ChromaDB
    results = collection.query(query_embeddings=[query_embedding], n_results=k)
    relevant_chunks = results["documents"][0]

    # Step 3: Build the RAG prompt
    context = "\n\n".join(relevant_chunks)
    prompt = f"""
You are a helpful Java teaching assistant at UCL. Use the context below, which is taken from course materials, to answer the user's question. If the answer is not in the context, say you don’t know.

Context:
{context}

Question:
{query}

Answer:
"""

    # Step 4: Ask GPT to answer using the context
    response = client.chat.completions.create(
        model="gpt-4o",  # or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are a helpful Java teaching assistant at UCL."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    
    return response.choices[0].message.content

In [16]:
from langchain_openai import ChatOpenAI
from langsmith import traceable

@traceable(name="RAG_Chatbot_Answer")
def rag_answer2(query, collection, embedding_model="text-embedding-3-small", k=3):
    from openai import OpenAI
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # keep for embeddings

    # Step 1: Embed the user's question
    query_embedding = client.embeddings.create(
        input=query,
        model=embedding_model
    ).data[0].embedding

    # Step 2: Retrieve top-k chunks from Chroma
    results = collection.query(query_embeddings=[query_embedding], n_results=k)
    relevant_chunks = results["documents"][0]

    # Step 3: Construct prompt
    context = "\n\n".join(relevant_chunks)
    full_prompt = f"""
You are a helpful and expert Java teaching assistant at UCL. You assist students by answering their questions using only the course material provided in the context.
Your answers must always be:
Accurate, based solely on the context below;
Thorough, with clear explanations and examples when relevant;
Friendly and pedagogical, like a knowledgeable TA during office hours.
🔍 Context Usage Instructions:
Use only the information found in the context. Do not invent APIs, methods, definitions, or facts.
You may reformat, rename, and adapt examples from the context to answer the user’s question.
If the answer is not supported in any way by the context (no matching concept, explanation, or code), reply with:
“Sorry, I couldn’t find that in the course material I was given.”
Do not include this apology if you’ve already answered the question or explained something from the context.
📋 Answer Format:
Brief Summary
A one- or two-line direct answer to the question.
Detailed Explanation
A clear and structured explanation using the terminology and style of the UCL course.
Java Code (if relevant)
Provide working and formatted code blocks in:
```java
// Code with meaningful comments
public int square(int x) {
    'return x * x;'
}
```
Add comments or labels like // Constructor or // Method call example where helpful.
Edge Cases & Pitfalls
Briefly mention any exceptions, compiler warnings, gotchas, or common mistakes related to the topic.
Optional Extras (only if helpful)
ASCII-style diagrams for control flow, object relationships, or memory
Small tables (e.g., lifecycle states, type conversions)
Mini Quiz (optional)
Occasionally include a short quiz question to reinforce learning (e.g., “What would happen if the return type was void?”). Include answers at the end.
✏️ Formatting Rules:
Use correct Java identifier formatting (e.g., MyClass, toString(), ArrayList<Integer>)
Use bullet points or subheadings where clarity improves
Do not include material or Java APIs not explicitly referenced in the context
⚠️ Handling Common Cases:
If the user question is too vague, explain a general case using course-relevant examples (e.g., square(int x) or sayHello()).
If multiple interpretations of a question are possible, briefly list the plausible ones and address each.
If the question mentions a Java keyword (e.g., final, static, record), define it precisely and relate it to context.
If the question is about bugs, compilation errors, or design, point to patterns, methods, or design tips from the context material.
🎓 Teaching Style:
Be professional, supportive, and clear — like a trusted lab demonstrator or tutor.
Prioritize conceptual clarity over fancy language.
Avoid filler. Never speculate.
Structure your answer to help students understand, not just memorize.
🧠 Self-Check Before Answering:
Ask yourself: “Can I find any relevant example, definition, or code in the context that helps answer this question?”
If yes, adapt and use it.
If no, say: “Sorry, I couldn’t find that in the course material I was given.”

Context:
{context}

Question:
{query}

Answer:
"""

    # Step 4: LangSmith-traceable LLM call
    llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
    response = llm.invoke(full_prompt)

    return response.content

In [17]:
print(rag_answer2("Explain loops in Java", collection))

Brief Summary
Java provides three types of loops: while, do-while, and for loops, each serving different purposes for repeating code execution.

Detailed Explanation

1. **While Loop**
   - **Syntax**: 
     ```java
     while (boolean-expression) {
         // Loop body
     }
     ```
   - **Behavior**: The loop body executes zero or more times as long as the boolean expression evaluates to true.
   - **Example**:
     ```java
     int counter = 0;
     while (counter < 10) {
         System.out.println("Hello " + counter);
         counter++;
     }
     ```
   - **Use Case**: Suitable when the number of iterations is not known beforehand.

2. **Do-While Loop**
   - **Syntax**:
     ```java
     do {
         // Loop body
     } while (boolean-expression);
     ```
   - **Behavior**: The loop body executes at least once, and then continues as long as the boolean expression is true.
   - **Example**:
     ```java
     int counter = 0;
     do {
         System.out.println("Hello " + 

In [18]:
# RAG test loop: Ask questions and get grounded answers
while True:
    query = input("\nAsk a question (or type 'exit' to quit): ")
    if query.lower() in {"exit", "quit"}:
        break
    answer = rag_answer2(query, collection)
    print("\nAnswer:\n", answer)

Brief Summary
Java supports three main types of loops: `while`, `do-while`, and `for` loops, each with specific use cases and behavior.

Detailed Explanation

### While Loop
- **Syntax**: 
  ```java
  while (boolean-expression) {
      // Loop body
  }
  ```
- **Behavior**: The loop body is executed zero or more times. The `boolean-expression` is evaluated before each iteration, and if it evaluates to `true`, the loop body executes. Otherwise, the loop terminates.
- **Example**:
  ```java
  int counter = 0;
  while (counter < 10) {
      System.out.println("Hello " + counter);
      counter++;
  }
  ```
  This loop prints "Hello" followed by numbers 0 to 9.

### Do-While Loop
- **Syntax**:
  ```java
  do {
      // Loop body
  } while (boolean-expression);
  ```
- **Behavior**: The loop body is executed one or more times. The `boolean-expression` is evaluated after each iteration.
- **Example**:
  ```java
  int counter = 0;
  do {
      System.out.println("Hello " + counter);
      cou

In [29]:
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_collection(name="knowledge-base6")
print("Total documents:", collection.count())
# results = collection.get(include=["metadatas", "documents", "ids"])
results = collection.get()

# for i in range(len(results["documents"])):
#     print(f"\nID: {results['ids'][i]}")
#     print(f"Source: {results['metadatas'][i].get('source', 'N/A')}")
#     print(f"Source Type: {results['metadatas'][i].get('source_type', 'N/A')}")
#     print(f"Topic: {results['metadatas'][i].get('topic', 'N/A')}")
#     print(f"Document Snippet: {results['documents'][i][:150]}...")


# from collections import Counter

# meta_keys = [key for meta in results["metadatas"] for key in meta.keys()]
# print("Metadata fields used:", Counter(meta_keys))
print(results)

Total documents: 513


AttributeError: 'NoneType' object has no attribute 'keys'

## Transcribing Video Files

In [None]:
from moviepy import VideoFileClip
import openai
import os

openai.api_key = "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"

def transcribe_video(video_path):
    """
    Extracts audio from a video file and transcribes it using OpenAI Whisper.
    Returns the transcription as text.
    """
    # Extract audio
    audio_path = "temp_audio.mp3"
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, codec='libmp3lame')

    # Transcribe using Whisper
    with open(audio_path, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)

    # Clean up
    os.remove(audio_path)

    return transcript["text"]