In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code an API key in a notebook or source file. Any key
# that was ever committed here must be treated as leaked and revoked.
# Use the key already present in the environment; otherwise prompt for it
# interactively so it is never written to disk.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
# -------------------------------------------------
# 🚀 Install Required Libraries (Notebook Only)
# -------------------------------------------------
!pip install openai==0.28.0 faiss-cpu numpy pymupdf ipywidgets tiktoken pytesseract rank_bm25
!apt-get install tesseract-ocr

# -------------------------------------------------
# 📌 Import Dependencies
# -------------------------------------------------
import openai
import os
import faiss
import numpy as np
import fitz  # PyMuPDF
import json
import ipywidgets as widgets
from IPython.display import display, clear_output
import tiktoken
import re
import pytesseract
from PIL import Image
import io
from rank_bm25 import BM25Okapi  # 🔥 Hybrid Retrieval

# -------------------------------------------------
# ✅ Set OpenAI API Key
# -------------------------------------------------
# Read the key from the environment; os.getenv returns None when it is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# -------------------------------------------------
# 📌 Global Variables & Filenames
# -------------------------------------------------
feedback_history = []   # User feedback entries (persisted by save_feedback)
improvement_history = []  # GPT-generated improvement notes, joined into later exam prompts
last_generated_exam = ""  # Text of the most recent exam, passed to ai_self_evaluate
FEEDBACK_FILE = "feedback_history.json"  # JSON file holding both history lists

# Per-student data: JSON file path and the in-memory mapping loaded from it.
# Entries are created by track_student_progress, keyed by student id.
STUDENT_DATA_FILE = "student_data.json"
student_profiles = {}

# -------------------------------------------------
# 📌 Feedback / Student Data Persistence
# -------------------------------------------------
def save_feedback():
    """Write both history lists to FEEDBACK_FILE as a single JSON object.

    The object has the keys "feedback_history" and "improvement_history".
    Any existing file is overwritten.
    """
    with open(FEEDBACK_FILE, "w") as out_file:
        json.dump(
            dict(
                feedback_history=feedback_history,
                improvement_history=improvement_history,
            ),
            out_file,
        )

def load_feedback():
    """Load feedback and improvement history from FEEDBACK_FILE.

    Rebinds the module-level ``feedback_history`` and ``improvement_history``.
    Both fall back to empty lists when the file is missing, is not valid
    JSON (for example an interrupted write), or does not hold a JSON object,
    so a damaged file cannot abort notebook startup.
    """
    global feedback_history, improvement_history
    try:
        with open(FEEDBACK_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        data = {}
    except json.JSONDecodeError as err:
        print(f"Warning: could not parse {FEEDBACK_FILE} ({err}); starting with empty history.")
        data = {}

    # Guard against a well-formed file with the wrong top-level type.
    if not isinstance(data, dict):
        data = {}

    feedback_history = data.get("feedback_history", [])
    improvement_history = data.get("improvement_history", [])

def load_student_profiles():
    """Load student performance data from STUDENT_DATA_FILE.

    Rebinds the module-level ``student_profiles``. It falls back to an empty
    dict when the file is missing, is not valid JSON, or does not hold a
    JSON object, so a damaged file cannot abort notebook startup.
    """
    global student_profiles
    try:
        with open(STUDENT_DATA_FILE, "r") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First run: no student data has been saved yet.
        loaded = {}
    except json.JSONDecodeError as err:
        print(f"Warning: could not parse {STUDENT_DATA_FILE} ({err}); starting with no student data.")
        loaded = {}

    # The rest of the module indexes this by student id, so it must be a dict.
    student_profiles = loaded if isinstance(loaded, dict) else {}

def save_student_profiles():
    """Serialize the in-memory ``student_profiles`` mapping to STUDENT_DATA_FILE.

    The file is opened in write mode, so previous contents are replaced.
    """
    with open(STUDENT_DATA_FILE, "w") as profile_file:
        json.dump(student_profiles, profile_file)

def track_student_progress(student_id, feedback):
    """
    Record a piece of feedback in the given student's profile and persist it.

    A profile (empty feedback list, "Medium" difficulty) is created the first
    time a student id is seen. If the feedback text contains "too easy" the
    stored difficulty preference becomes "Hard"; if it contains "too hard"
    it becomes "Easy" (case-insensitive, "too easy" checked first).
    Not currently wired into the UI.
    """
    profile = student_profiles.setdefault(
        student_id, {"feedback": [], "difficulty_preference": "Medium"}
    )
    profile["feedback"].append(feedback)

    # Nudge the difficulty in the opposite direction of the complaint.
    lowered = feedback.lower()
    if "too easy" in lowered:
        profile["difficulty_preference"] = "Hard"
    elif "too hard" in lowered:
        profile["difficulty_preference"] = "Easy"

    save_student_profiles()

def get_student_difficulty(student_id):
    """Return the stored difficulty preference for a student, or "Medium" if the id is unknown."""
    if student_id not in student_profiles:
        return "Medium"  # no profile yet, so use the default level
    return student_profiles[student_id]["difficulty_preference"]

# Load persisted state on startup: both calls rebind module-level globals
# (student_profiles, feedback_history, improvement_history) from their JSON files.
load_student_profiles()
load_feedback()

# -------------------------------------------------
# 📌 Function: Extract Text from PDF (Uses OCR if Needed)
# -------------------------------------------------
def extract_text_with_ocr(pdf_path):
    """
    Return the text of a PDF, one page after another, joined with newlines.

    Each page's embedded text layer is tried first. When that is empty after
    stripping, the page is rendered to an image and run through Tesseract.
    Pages whose resulting text is 50 characters or shorter are dropped.
    """
    kept_pages = []
    with fitz.open(pdf_path) as document:
        for pdf_page in document:
            page_text = pdf_page.get_text("text").strip()
            if page_text == "":
                # No text layer: rasterize the page and OCR it instead.
                pixmap = pdf_page.get_pixmap()
                page_image = Image.open(io.BytesIO(pixmap.tobytes()))
                page_text = pytesseract.image_to_string(page_image)
            if len(page_text) <= 50:
                # Treat near-empty pages as noise.
                continue
            kept_pages.append(page_text)
    return "\n".join(kept_pages)

# -------------------------------------------------
# 📌 Function: Token-Aware Chunking (splits on whitespace-delimited words, not sentences)
# -------------------------------------------------
def chunk_text(text, max_tokens=500):
    """
    Split text into chunks of whitespace-delimited words, sized by token count.

    Words are accumulated until adding the next one would push the running
    total above ``max_tokens``; the running total is the sum of per-word
    token counts from the tiktoken 'cl100k_base' encoding.

    Args:
        text: The text to split. Empty or whitespace-only text yields [].
        max_tokens: Token budget per chunk.

    Returns:
        A list of non-empty strings, each made of words joined by single
        spaces. A single word that alone exceeds ``max_tokens`` is emitted
        as its own oversized chunk rather than being split.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    chunks = []
    current_words = []
    token_count = 0

    for word in text.split():
        word_tokens = len(encoding.encode(word))
        # Flush only when something has been accumulated. Otherwise a word
        # larger than the budget would emit an empty "" chunk ahead of it.
        if current_words and token_count + word_tokens > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            token_count = 0
        current_words.append(word)
        token_count += word_tokens

    # Emit whatever is left after the last word.
    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

# -------------------------------------------------
# 📌 Function: Generate OpenAI Embeddings with Caching
# -------------------------------------------------
# In-memory memo of text -> embedding so repeated texts cost one API call.
embedding_cache = {}
def generate_embedding(text):
    """Return the 'text-embedding-ada-002' embedding for ``text``, using the cache when possible."""
    try:
        return embedding_cache[text]
    except KeyError:
        pass  # cache miss, so fall through to the API call

    api_response = openai.Embedding.create(input=[text], model="text-embedding-ada-002")
    vector = api_response["data"][0]["embedding"]
    embedding_cache[text] = vector
    return vector

# -------------------------------------------------
# 📌 Function: Store Practice Exams in a Single FAISS Index
#    (Each chunk is stored individually)
# -------------------------------------------------
def store_exams_faiss(pdf_paths):
    """
    Build one FAISS L2 index over the chunks of every practice-exam PDF.

    Returns:
      (index, exam_chunks) where exam_chunks is a list of dicts:
         {
           "file_name": str,   # basename of the source PDF
           "chunk_text": str,  # the chunk that was embedded
           "full_text": str    # extracted text of the whole PDF
         }
      Row i of the index corresponds to exam_chunks[i].
    """
    embedding_dim = 1536  # output size of text-embedding-ada-002
    index = faiss.IndexFlatL2(embedding_dim)
    exam_chunks = []

    # Phase 1: extract and chunk every PDF before any embedding call is made.
    for path in pdf_paths:
        full_text = extract_text_with_ocr(path)
        exam_chunks.extend(
            {
                "file_name": os.path.basename(path),
                "chunk_text": piece,
                "full_text": full_text,
            }
            for piece in chunk_text(full_text)
        )

    # Phase 2: embed each chunk in order and load the vectors into the index.
    vectors = [generate_embedding(entry["chunk_text"]) for entry in exam_chunks]
    index.add(np.array(vectors).astype("float32"))
    return index, exam_chunks

# -------------------------------------------------
# 📌 Function: Store Course Material in FAISS + BM25
#    (Again, chunk-level approach)
# -------------------------------------------------
def store_course_material_faiss(pdf_path):
    """
    Index one course-material PDF for retrieval.

    Returns a tuple (index, chunks, bm25): a FAISS L2 index over the chunk
    embeddings, the list of chunk strings (row i of the index is chunks[i]),
    and a BM25Okapi model built from the whitespace-tokenized chunks.
    """
    index = faiss.IndexFlatL2(1536)  # 1536 = text-embedding-ada-002 dimension

    chunks = chunk_text(extract_text_with_ocr(pdf_path))
    vectors = np.array([generate_embedding(piece) for piece in chunks])
    index.add(vectors.astype("float32"))

    # Keyword index over the same chunks, tokenized on whitespace.
    bm25 = BM25Okapi([piece.split() for piece in chunks])

    return index, chunks, bm25

# -------------------------------------------------
# 📌 Function: Retrieve Practice Exam
# -------------------------------------------------
def retrieve_practice_exam(query, exam_index, exam_chunks, top_k=1):
    """
    Return the full text of the PDF whose chunk best matches ``query``.

    Args:
        query: Text to embed and search with.
        exam_index: FAISS index whose row i corresponds to exam_chunks[i].
        exam_chunks: List of dicts with at least a "full_text" key.
        top_k: Number of neighbours to request; only the best valid one is used.

    Returns:
        The "full_text" of the best-matching chunk. If the search yields no
        valid hit, the first chunk's full text is returned instead.
    """
    query_emb = np.array([generate_embedding(query)]).astype("float32")
    _, indices = exam_index.search(query_emb, top_k)

    # FAISS always returns top_k labels and pads missing results with -1,
    # so validate each label instead of checking for an empty result row.
    for idx in indices[0]:
        if 0 <= idx < len(exam_chunks):
            return exam_chunks[idx]["full_text"]

    # Nothing usable came back: default to the first chunk's full text.
    return exam_chunks[0]["full_text"]

# -------------------------------------------------
# 📌 Function: Retrieve Course Material (Hybrid: FAISS + BM25)
# -------------------------------------------------
def retrieve_course_material(query, course_index, course_chunks, bm25, top_k=3):
    """
    Return up to ``top_k`` course chunks relevant to ``query``.

    The FAISS index is searched first. If it yields no valid hit, the BM25
    model is used as a keyword fallback.

    Args:
        query: Text to search with.
        course_index: FAISS index whose row i corresponds to course_chunks[i].
        course_chunks: List of chunk strings.
        bm25: BM25Okapi model built over the same chunks.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings. It may be shorter than ``top_k`` when the
        index holds fewer chunks.
    """
    query_emb = np.array([generate_embedding(query)]).astype("float32")
    _, indices = course_index.search(query_emb, top_k)

    # FAISS pads missing neighbours with -1, which would otherwise index the
    # last chunk through Python's negative indexing. Keep only real row ids.
    hits = [course_chunks[i] for i in indices[0] if 0 <= i < len(course_chunks)]

    if not hits:
        # Vector search found nothing usable, so fall back to keyword ranking.
        return bm25.get_top_n(query.split(), course_chunks, n=top_k)

    return hits

# -------------------------------------------------
# 📌 Function: Generate AI-Powered Practice Exam
#    (With Streaming if desired)
# -------------------------------------------------
def generate_practice_exam_realtime(course, difficulty):
    """
    Generate a practice exam with GPT, streaming it to stdout as it arrives.

    The prompt combines the best-matching past exam, the top course-material
    chunks, and all stored improvement notes. Reads the module-level
    exam_index, practice_exams, course_index, course_chunks, bm25 and
    improvement_history, and stores the result in ``last_generated_exam``.

    Args:
        course: Course name, used both as the retrieval query and in the prompt.
        difficulty: Difficulty label inserted into the prompt (e.g. "Easy", "Hard").

    Returns:
        The complete generated exam text, including a final line that does
        not end with a newline.
    """
    global last_generated_exam

    # Retrieve practice exam reference & relevant course chunks
    exam_text = retrieve_practice_exam(course, exam_index, practice_exams)
    course_material = retrieve_course_material(course, course_index, course_chunks, bm25, top_k=3)

    # Combine improvements from negative feedback
    combined_improvements = "\n".join(improvement_history)

    # Build the context from relevant text
    # NOTE(review): exam_text is the full text of a PDF; for long exams this
    # may exceed the model's context window. Confirm and truncate if needed.
    context = (
        f"--- Past Exam Reference ---\n{exam_text}\n\n"
        f"--- Relevant Course Chunks ---\n" + "\n\n".join(course_material)
    )

    # ---------------------------------------------
    # Advanced Prompt Engineering
    # ---------------------------------------------
    system_prompt = """\
You are a highly specialized AI in creating university-level computer science exams.
Your primary objective:
1. Closely match the tone, style, and structure of the provided past exams. produce the same number of questions, if not, more questions than whats included in the practice exams.
2. Incorporate relevant course content from the retrieved chunks to ensure coverage of essential topics.
3. Make the resulting exam as comprehensive and realistic as possible, so that a diligent student can ACE their real exam.
4. Use rigorous academic standards, but remain accessible and properly structured.
5. Keep your chain-of-thought internal. Output only the final refined exam text.

Formatting Requirements:
- Use headings for different sections (e.g., Section A: Multiple Choice, Section B: Short Answer).
- Provide clear instructions and question numbering.
- For coding problems (Java), use valid fenced code blocks: ```java ... ```
- If needed, illustrate data structures or algorithms in text or ASCII.
- Keep explanations minimal in the final output to mirror real exam conditions unless a solution or answer key is explicitly requested (not included here by default).
"""

    # User prompt: incorporate improvements, difficulty, and context
    user_prompt = f"""\
You are tasked with generating a **{difficulty}**-level practice exam for the course: **{course}**.

**Incorporate the following improvement suggestions (if any)**:
{combined_improvements}

**Key Instructions**:
1. The exam should reflect the style and difficulty of the past exam provided in 'Past Exam Reference.'
2. Use relevant details from 'Relevant Course Chunks' to ensure proper coverage of the course material.
3. Include Multiple Choice, Short Answer, and Coding/Problem-Solving questions (with Java code blocks if appropriate).
4. Tailor the difficulty to {difficulty}:
   - For 'Easy', ensure clarity and fundamental coverage;
   - For 'Hard', push complexity in problem-solving and advanced concepts.
5. The ultimate goal: help the student fully prepare to ACE their exam.

Here is the context to inspire your practice exam:
{context}

Now, please create a single, cohesive practice exam below (without revealing your internal reasoning).
"""

    # Create a streaming ChatCompletion request
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or "gpt-4" if available
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        stream=True,
        temperature=0.7,
        max_tokens=1500
    )

    # ---------------------------------------------
    # Streaming Response Handling
    # ---------------------------------------------
    exam_output = ""
    buffer = ""

    for chunk in response:
        # Some deltas carry no text (e.g. the role announcement); skip them.
        piece = chunk["choices"][0]["delta"].get("content")
        if not piece:
            continue
        buffer += piece
        # Print only when a full line break is detected
        if buffer.endswith("\n"):
            # rstrip (not strip) so leading indentation in code blocks survives.
            print(buffer.rstrip())
            exam_output += buffer
            buffer = ""

    # The stream rarely ends on a newline: flush the remainder so the last
    # line is neither lost from the display nor from the returned text.
    if buffer:
        print(buffer.rstrip())
        exam_output += buffer

    last_generated_exam = exam_output
    return exam_output

# -------------------------------------------------
# 📌 (Optional) AI Self-Evaluate Function
# -------------------------------------------------
def ai_self_evaluate(last_exam):
    """
    Ask the chat model for three improvements to a poorly rated exam.

    Returns the model's reply with surrounding whitespace removed. If the
    request or response handling raises, the error is printed and the
    string "Failed to get improvements." is returned instead.
    """
    improvement_prompt = f"""
    The user marked the last exam as 'bad'.
    Here is the exam text:
    -------------------
    {last_exam}
    -------------------
    Please provide 3 specific improvements to make future exams better:
    - Focus on question variety
    - Level of detail in solutions
    - Formatting or clarity changes

    Format your response as plain text bullet points.
    """

    chat_messages = [
        {"role": "system", "content": "You are an AI specialized in creating and refining university exams."},
        {"role": "user", "content": improvement_prompt},
    ]

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            temperature=0.7,
        )
        # Hand the cleaned-up reply straight back to the caller.
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"Failed to generate improvements: {e}")
        return "Failed to get improvements."

# -------------------------------------------------
# 📌 Build FAISS Indexes
# -------------------------------------------------
# Past-exam PDFs to index. NOTE(review): these are hard-coded absolute paths
# (they look like a Colab /content upload location); adjust for other environments.
pdf_exam_paths = ["/content/content/EXAM_COSC_2P03_JULY_2007.pdf"]
# practice_exams is the list of per-chunk dicts returned alongside the index.
exam_index, practice_exams = store_exams_faiss(pdf_exam_paths)

# Course textbook: indexed into FAISS (embeddings) and BM25 (keywords).
course_pdf_path = "/content/content/Mark Allen Weiss - Data structures and algorithm analysis in Java-Pearson  (2012).pdf"
course_index, course_chunks, bm25 = store_course_material_faiss(course_pdf_path)

# -------------------------------------------------
# 📌 UI Components
# -------------------------------------------------
# Input widgets. Only course_input and difficulty_dropdown are read by
# on_generate_exam; university_input is displayed but its value is not used.
university_input = widgets.Text(placeholder="Enter University Name", description="University:")
course_input = widgets.Text(placeholder="Enter Course Name", description="Course:")
difficulty_dropdown = widgets.Dropdown(options=["Easy", "Medium", "Hard"], description="Difficulty:")
generate_button = widgets.Button(description="Generate Exam", button_style="success")
output_area = widgets.Output()  # Receives the generated exam text

# Feedback Buttons (handlers are attached with on_click further down)
thumbs_up = widgets.Button(description="👍 Good", button_style="success")
thumbs_down = widgets.Button(description="👎 Bad", button_style="danger")

def thumbs_up_feedback(b):
    """Record positive feedback for the last exam and persist it immediately.

    Args:
        b: The button instance passed by ipywidgets on click (unused).
    """
    feedback_history.append("✅ Exam was well received.")
    save_feedback()  # 🔥 Save feedback to JSON
    print("👍 AI will reinforce this exam style.")

def thumbs_down_feedback(b):
    """Run AI self-evaluation on the last exam and persist the improvement notes.

    Nothing is stored when no exam has been generated yet, or when the
    self-evaluation call fails. Otherwise the failure placeholder would be
    saved and fed into every later exam prompt as an "improvement".

    Args:
        b: The button instance passed by ipywidgets on click (unused).
    """
    print("👎 AI will analyze and improve next time.")

    # There is nothing to critique until an exam has been generated.
    if not last_generated_exam:
        print("No exam has been generated yet, so there is nothing to evaluate.")
        return

    # Call AI self-evaluation
    improvement_notes = ai_self_evaluate(last_generated_exam)

    # ai_self_evaluate signals failure with this exact string; do not persist it.
    if improvement_notes == "Failed to get improvements.":
        print("Self-evaluation failed; no improvement notes were saved.")
        return

    # Store improvement feedback and save
    improvement_history.append(improvement_notes)
    save_feedback()  # 🔥 Save feedback to JSON

    print("\n🔧 AI Improvement Plan:")
    print(improvement_notes)

# Attach the feedback handlers; ipywidgets calls each with the clicked button.
thumbs_up.on_click(thumbs_up_feedback)
thumbs_down.on_click(thumbs_down_feedback)

def on_generate_exam(b):
    """Click handler for the Generate button: build and show a practice exam.

    Does nothing when the course field is blank. Otherwise the cell output
    is cleared, the widgets are redrawn, and the exam is generated inside
    ``output_area``.
    NOTE: generate_practice_exam_realtime already prints lines as they
    stream, so the final print below shows the exam text a second time.
    """
    course = course_input.value.strip()
    difficulty = difficulty_dropdown.value
    if not course:
        return

    # Redraw the whole UI so stale output from a previous run disappears.
    clear_output(wait=True)
    display(university_input, course_input, difficulty_dropdown, generate_button, output_area, thumbs_up, thumbs_down)

    with output_area:
        output_area.clear_output()
        print("\n📚 AI-Generated Practice Exam:\n")
        print(generate_practice_exam_realtime(course, difficulty))

# Attach the generate handler; ipywidgets calls it with the clicked button.
generate_button.on_click(on_generate_exam)

# Display UI: initial render of all inputs, the output area, and the feedback buttons.
display(university_input, course_input, difficulty_dropdown, generate_button, output_area, thumbs_up, thumbs_down)

Text(value='brock', description='University:', placeholder='Enter University Name')

Text(value='2p03', description='Course:', placeholder='Enter Course Name')

Dropdown(description='Difficulty:', index=2, options=('Easy', 'Medium', 'Hard'), value='Hard')

Button(button_style='success', description='Generate Exam', style=ButtonStyle())

Output()

Button(button_style='success', description='👍 Good', style=ButtonStyle())

Button(button_style='danger', description='👎 Bad', style=ButtonStyle())

**debugging**