# Installing Dependencies

In [22]:
!python3 -m pip install pymupdf openai chromadb tqdm nltk tiktoken python-docx

from openai import OpenAI
from tqdm import tqdm

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl.metadata (6.6 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Downloading lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lxml, python-docx
Successfully installed lxml-6.0.0 python-docx-1.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


# Extract Text Function

In [3]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of ``page.get_text()`` for each page, in page
        order, with no separator added between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

## Break the full text into overlapping chunks for better retrieval

In [6]:
import nltk

# Fix SSL issue (macOS sometimes has missing certs)
import ssl
# macOS Python builds sometimes lack root certificates, which makes the NLTK
# downloader fail TLS verification. Swap in the unverified context only for the
# duration of this download and restore the previous default afterwards, so the
# override does not stay active for the rest of the kernel session.
# NOTE(review): the download itself is still unverified; installing the
# system/certifi certificates is the proper fix.
_default_https_context = getattr(ssl, "_create_default_https_context", None)
_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context
try:
    # Download punkt
    punkt_downloaded = nltk.download('punkt')
finally:
    if _default_https_context is not None:
        ssl._create_default_https_context = _default_https_context

# Keep the download result as the cell's displayed value.
punkt_downloaded

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fahadalsaud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
import nltk
import os

# Explicitly add the punkt path
# nltk.data.path.append("/Users/mohsintanveer/nltk_data")

from nltk.tokenize import sent_tokenize
# Fetch the 'punkt_tab' resource before calling sent_tokenize below.
nltk.download('punkt_tab')

# Smoke test: tokenize a short three-sentence string and print the resulting
# list of sentences.
text = "Hello world. This is a test sentence. Let's see if it works."
print(sent_tokenize(text))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/fahadalsaud/nltk_data...


['Hello world.', 'This is a test sentence.', "Let's see if it works."]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
import nltk
import tiktoken

# Fetch the 'punkt_tab' resource (also downloaded in an earlier cell).
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Load tokenizer for OpenAI's embedding model.
# chunk_text reads this module-level object to count tokens per sentence.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

def chunk_text(text, max_tokens=500, overlap=50):
    """Split text into sentence-aligned chunks with a token-based overlap.

    Sentences (from ``sent_tokenize``) are accumulated until adding the next
    one would push the running token count above ``max_tokens``. The current
    chunk is then emitted, and the next chunk starts with as many trailing
    sentences of the previous chunk as fit within ``overlap`` tokens, followed
    by the sentence that triggered the split. Token counts are measured per
    sentence with the module-level ``encoding``.

    Args:
        text: The text to split. Empty or ``None`` input yields ``[]``.
        max_tokens: Token budget checked before a sentence is added to the
            current chunk. A single sentence longer than the budget is kept
            whole, and a chunk that starts with carried-over sentences can
            exceed the budget by up to ``overlap`` tokens.
        overlap: Maximum number of tokens (in whole sentences) repeated from
            the end of one chunk at the start of the next.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. No empty chunk is emitted.
    """
    if not text:
        return []

    chunks = []          # Finished chunks
    current_chunk = []   # Sentences of the chunk being built
    current_counts = []  # Token count of each sentence in current_chunk
    current_tokens = 0   # Running token total for current_chunk

    for sentence in sent_tokenize(text):
        token_count = len(encoding.encode(sentence))

        if current_tokens + token_count > max_tokens:
            # Only emit when something was accumulated; otherwise an oversized
            # first sentence would produce an empty leading chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))

            # Walk backwards over the finished chunk to find how many trailing
            # sentences fit in the overlap budget. The cached counts avoid
            # re-encoding every sentence.
            keep = 0
            overlap_tokens = 0
            for count in reversed(current_counts):
                if overlap_tokens + count > overlap:
                    break
                overlap_tokens += count
                keep += 1

            # Slice from an explicit start index: lst[-0:] would keep the
            # whole list instead of nothing.
            start = len(current_chunk) - keep
            current_chunk = current_chunk[start:] + [sentence]
            current_counts = current_counts[start:] + [token_count]
            current_tokens = overlap_tokens + token_count
        else:
            current_chunk.append(sentence)
            current_counts.append(token_count)
            current_tokens += token_count

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/fahadalsaud/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [19]:
# def chunk_text(text, chunk_size=500, overlap=100):
#     chunks = []
#     start = 0
#     while start < len(text):
#         end = min(start + chunk_size, len(text))
#         chunk = text[start:end]
#         chunks.append(chunk.strip())
#         start += chunk_size - overlap
#     return chunks

## Parse questions from exercise sheets and generate markschemes

In [1]:
import re

def parse_questions_from_exercises(text):
    """
    Extracts exercise questions that begin with patterns like Q1., Q2., Q5a., etc.

    A question starts on a line whose first token is an identifier such as
    ``Q1.``, ``Q5a.`` or ``Q1.2.`` and runs until the next identifier. Blank
    lines are ignored and continuation lines are joined with single spaces.
    Text that appears before the first identifier (titles, headers) is not part
    of any question and is discarded. An identifier may stand alone on its line
    with the question text following on the next lines.

    Args:
        text: Raw text of one or more exercise sheets.

    Returns:
        A list of cleaned question strings without their identifiers, in
        document order. Questions with no text are omitted.
    """
    # Identifier like Q1., Q2., Q5a., Q1.2. — optionally followed by the
    # question text on the same (already stripped) line.
    question_start = re.compile(r"^(Q\d+(\.\d+)?[a-z]?\.)(?:\s+(.*))?$")

    questions = []
    current_parts = None  # None until the first identifier has been seen

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        match = question_start.match(line)
        if match:
            # A new identifier closes the question being collected, if any.
            if current_parts:
                questions.append(" ".join(current_parts))
            first_line = match.group(3)
            current_parts = [first_line] if first_line else []
        elif current_parts is not None:
            current_parts.append(line)  # continuation of the current question
        # else: preamble before the first question — ignored

    if current_parts:
        questions.append(" ".join(current_parts))

    return questions

In [6]:
# Read every exercise sheet in order and glue the texts together with the
# " \n\n" separator, then pull the individual questions out of the result.
text = " \n\n".join(
    extract_text_from_pdf(pdf_name)
    for pdf_name in (
        "Exercises1-2.pdf",
        "Exercises2.pdf",
        "Exercises3.pdf",
        "Exercises4.pdf",
        "Exercises5.pdf",
    )
)
questions = parse_questions_from_exercises(text)

In [9]:
from openai import OpenAI
# Never hardcode the API key in the notebook. OpenAI() reads it from the
# OPENAI_API_KEY environment variable. The key that was previously embedded
# here should be treated as compromised and revoked.
client = OpenAI()

def generate_exercise_answer(question, context_prompt=None):
    """Ask the chat model for an answer to a single Java exercise question.

    Uses the module-level ``client`` and the "gpt-4o" model.

    Args:
        question: The exercise text, sent as the user message.
        context_prompt: Optional extra instructions appended (after a space)
            to the system message.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    system_msg = (
        "You are a helpful Java tutor that gives clear, accurate answers "
        "to university-level Java exercises."
    )

    if context_prompt:
        system_msg += " " + context_prompt

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": question}
        ]
    )

    # message.content is optional in the API response; calling .strip() on
    # None would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()


In [10]:
# Build one "Q: ...\nA: ..." string per parsed question, asking the model for
# each answer in turn while tqdm reports progress.
qa_chunks = []

for question in tqdm(questions, desc="Generating answers"):
    qa_chunks.append(f"Q: {question}\nA: {generate_exercise_answer(question)}")

In [16]:
import chromadb

# Point to the same persistent storage directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Load the collection, creating it if it does not exist yet, so this cell does
# not depend on another cell having created "knowledge-base6" beforehand.
collection = chroma_client.get_or_create_collection(name="knowledge-base6")

In [23]:
from docx import Document

def save_qa_chunks_to_docx(qa_chunks, filename="qa_output.docx"):
    """Write Q&A chunks to a Word document.

    A chunk is accepted when it starts with "Q:" and contains a newline
    immediately followed by "A:". Everything before that newline (minus the
    "Q:" prefix) is the question and everything after the "A:" is the answer.
    Chunks that do not match are reported on stdout and skipped.

    Args:
        qa_chunks: Iterable of Q&A strings.
        filename: Path of the .docx file to write.

    Returns:
        None. Saves the document and prints how many pairs were written.
    """
    doc = Document()
    doc.add_heading("Exercise Questions and Answers", level=1)

    saved_count = 0
    for chunk in qa_chunks:
        if not (chunk.startswith("Q:") and "\nA:" in chunk):
            print("Skipping malformed chunk:", chunk)
            continue

        q_part, a_part = chunk.split("\nA:", 1)
        question = q_part[2:].strip()  # drop the leading "Q:"
        answer = a_part.strip()

        doc.add_paragraph(f"Q: {question}", style="List Number")
        doc.add_paragraph(f"A: {answer}")
        doc.add_paragraph("\n\n")  # Add extra spacing between entries
        saved_count += 1

    doc.save(filename)
    # Report the pairs actually written, not the number of input chunks, so
    # skipped malformed chunks are not counted as saved.
    print(f"Saved {saved_count} Q&A pairs to {filename}")

In [None]:
# Write the generated Q&A pairs to the default output file, qa_output.docx
save_qa_chunks_to_docx(qa_chunks)

In [21]:
# NOTE(review): save_qa_chunks_to_excel is not defined anywhere in the visible
# notebook — presumably it came from a cell that was later removed. Confirm and
# restore its definition before running this on a fresh kernel.
save_qa_chunks_to_excel(qa_chunks)

Saved 23 Q&A pairs to qa_output.xlsx


In [17]:
import hashlib

from tqdm import tqdm

# 1. Embed new Q&A chunks with progress bar
embeddings = [
    client.embeddings.create(input=chunk, model="text-embedding-3-small").data[0].embedding
    for chunk in tqdm(qa_chunks, desc="Embedding Q&A chunks")
]

# 2. Add to existing collection with progress bar
# IDs are the SHA-256 digest of the chunk text. The built-in hash() is salted
# per process for strings, so it would give the same chunk a different ID in
# every kernel session.
for chunk, embedding in tqdm(zip(qa_chunks, embeddings), total=len(qa_chunks), desc="Storing in ChromaDB"):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        ids=[hashlib.sha256(chunk.encode("utf-8")).hexdigest()]
    )

Embedding Q&A chunks: 100%|██████████| 23/23 [00:18<00:00,  1.23it/s]
Storing in ChromaDB: 100%|██████████| 23/23 [00:00<00:00, 57.71it/s]


# Embed and Store in Vector DB

In [6]:
import os
from openai import OpenAI
import chromadb

# Never hardcode the API key in the notebook. OpenAI() reads it from the
# OPENAI_API_KEY environment variable. The key that was previously embedded
# here should be treated as compromised and revoked.
client = OpenAI()
# Create/load local persistent DB folder
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# get_or_create_collection lets this cell be re-run against an existing
# ./chroma_db; create_collection raises once the collection already exists.
collection = chroma_client.get_or_create_collection("knowledge-base6")

# Get all PDFs in your directory. os.listdir() returns entries in arbitrary
# order, so sort them to make the chunk order the same on every run.
pdf_files = sorted(f for f in os.listdir() if f.endswith(".pdf"))

all_chunks = []
all_metadatas = []

# Loop through all PDF files
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(text)

    all_chunks.extend(chunks)
    # Save which file each chunk came from. Build one dict per chunk:
    # `[d] * n` would repeat a single shared dict, so changing one entry's
    # metadata would change them all.
    all_metadatas.extend({"source": pdf_file} for _ in chunks)

import hashlib

# Embed all chunks
embedding_model = "text-embedding-3-small"
embeddings = [
    client.embeddings.create(input=chunk, model=embedding_model).data[0].embedding
    for chunk in tqdm(all_chunks, desc="Embedding Chunks")
]

# Store all in ChromaDB
# IDs are the SHA-256 digest of the chunk text. The built-in hash() is salted
# per process for strings, so it would give the same chunk a different ID in
# every kernel session.
for chunk, embedding, metadata in tqdm(zip(all_chunks, embeddings, all_metadatas), total=len(all_chunks), desc="Storing in DB"):
    collection.add(
        documents=[chunk],
        embeddings=[embedding],
        ids=[hashlib.sha256(chunk.encode("utf-8")).hexdigest()],
        metadatas=[metadata]
    )

Processing PDFs: 100%|██████████| 61/61 [00:01<00:00, 51.51it/s]
Embedding Chunks: 100%|██████████| 490/490 [03:52<00:00,  2.11it/s]
Storing in DB: 100%|██████████| 490/490 [00:03<00:00, 132.97it/s]


## Generate an Answer with RAG

In [9]:
def rag_answer(query, collection, embedding_model="text-embedding-3-small", k=3, client=None):
    """Answer a question using chunks retrieved from a vector collection.

    The query is embedded, the ``k`` nearest documents are fetched from
    ``collection``, and the chat model ("gpt-4o") is asked to answer from that
    context.

    Args:
        query: The user's question.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``
            and returning a mapping whose ``"documents"`` entry holds one list
            of document strings per query embedding.
        embedding_model: Name of the embedding model used for the query.
        k: Number of chunks to retrieve.
        client: Optional OpenAI client to reuse. When omitted, a client is
            created with ``OpenAI()``, which takes the API key from the
            ``OPENAI_API_KEY`` environment variable.

    Returns:
        The content of the model's first reply.
    """
    # No API key literal here: a key embedded in source is leaked to everyone
    # who can read the notebook.
    if client is None:
        client = OpenAI()

    # Step 1: Embed the user's question
    query_embedding = client.embeddings.create(
        input=query,
        model=embedding_model
    ).data[0].embedding

    # Step 2: Retrieve top-k similar chunks from ChromaDB
    results = collection.query(query_embeddings=[query_embedding], n_results=k)
    relevant_chunks = results["documents"][0]  # results for the single query

    # Step 3: Build the RAG prompt
    context = "\n\n".join(relevant_chunks)
    prompt = f"""
You are a helpful Java teaching assistant at UCL. Use the context below, which is taken from course materials, to answer the user's question. If the answer is not in the context, say you don’t know.

Context:
{context}

Question:
{query}

Answer:
"""

    # Step 4: Ask GPT to answer using the context
    response = client.chat.completions.create(
        model="gpt-4o",  # or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are a helpful Java teaching assistant at UCL."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )

    return response.choices[0].message.content

In [10]:
# Smoke test: run one query against the collection and print the answer
print(rag_answer("Explain loops in Java", collection))

Java provides three types of loops: the while loop, the do-while loop, and the for loop, each serving different purposes and use cases.

1. **While Loop**: 
   - The while loop repeatedly executes a block of statements as long as a specified boolean expression evaluates to true. 
   - The loop body can be executed zero or more times, depending on the condition.
   - Syntax:
     ```java
     while (boolean-expression) {
       // Loop body
     }
     ```
   - Example:
     ```java
     int counter = 0;
     while (counter < 10) {
       System.out.println("Hello " + counter);
       counter++;
     }
     ```

2. **Do-While Loop**:
   - The do-while loop is similar to the while loop, but it guarantees that the loop body will be executed at least once, as the condition is evaluated after the loop body.
   - Syntax:
     ```java
     do {
       // Loop body
     } while (boolean-expression);
     ```
   - Example:
     ```java
     int counter = 0;
     do {
       System.out.println("

In [11]:
# RAG test loop: Ask questions and get grounded answers
while True:
    # Strip surrounding whitespace so inputs like "exit " still end the loop.
    query = input("\nAsk a question (or type 'exit' to quit): ").strip()
    if query.lower() in {"exit", "quit"}:
        break
    if not query:
        # Nothing to embed: prompt again instead of sending an empty request.
        continue
    answer = rag_answer(query, collection)
    print("\nAnswer:\n", answer)


Answer:
 The context provided does not include specific information on how to create a function in Java. However, based on general Java knowledge, I can guide you on how to create a simple function (method) in Java.

Here's a basic example of a function in Java:

```java
public class MyClass {

    // This is a simple function that takes an integer as a parameter and returns its square
    public int square(int number) {
        return number * number;
    }

    public static void main(String[] args) {
        MyClass myClass = new MyClass();
        int result = myClass.square(5);
        System.out.println("The square of 5 is: " + result);
    }
}
```

In this example, the `square` method is a function that takes an integer parameter and returns the square of that integer. The `main` method demonstrates how to call this function and print the result.

Answer:
 I don’t know.

Answer:
 Polymorphism in Java refers to the ability of a single function or method to operate on different t