### ============================================================
### AI Tutor Pipeline - Step 2: Quiz Generation from Curriculum
### ------------------------------------------------------------
### This script takes a curriculum skeleton and a PDF textbook,
### matches topics to content, and uses Gemini LLM to generate
### MCQs for each topic. Output: curriculum JSON with quizzes.
### ============================================================


In [26]:
import os
import json
import re
import math
from typing import List, Dict, Any
from difflib import SequenceMatcher
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

### ----------- FILE PATHS AND ENVIRONMENT SETUP -----------

In [27]:
# Input/output locations and the question budget for the whole run.
PDF_PATH = "files/science-textbook-grade-5.pdf"
CURRICULUM_JSON_PATH = "json_files/learning_path_skeleton_5th.json"
OUT_JSON_PATH = "quiz_files/quiz_output_5th.json"
MAX_QUESTIONS = 20

# Read the Gemini key from a local file; the context manager guarantees the
# handle is closed instead of relying on garbage collection.
with open("api_key_paid.txt") as key_file:
    os.environ["GOOGLE_API_KEY"] = key_file.read().strip()
# SECURITY(review): this LangSmith credential is hard-coded in source. It should
# be rotated and loaded from the environment or a secrets file like the key
# above. Left in place here so that the script's behaviour is unchanged.
os.environ["LANGSMITH_API_KEY"] = "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"
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "ai-tutur-cg"  # or your project name

### ----------------------- PDF CHUNKING -----------------------

In [None]:
def extract_chunks_from_pdf(pdf_path: str) -> List[Dict[str, Any]]:
    """Load a PDF and return one chunk per document produced by the loader.

    Args:
        pdf_path: Path of the PDF file handed to ``PyMuPDFLoader``.

    Returns:
        A list of ``{"text": ..., "page": ...}`` dicts, where "text" is the
        document's ``page_content`` and "page" is the "page" entry of its
        metadata (``None`` when the loader did not provide one).
    """
    pages = PyMuPDFLoader(pdf_path).load()
    chunks = []
    for page in pages:
        chunks.append(
            {
                "text": page.page_content,
                "page": page.metadata.get("page", None),
            }
        )
    return chunks

### FUNCTION: normalize / fuzzy_ratio / find_matching_content

In [29]:
def normalize(text):
    """Return *text* lower-cased, stripped of punctuation, with tidy spacing.

    Only the characters ``a-z`` and ``0-9`` survive; every run of whitespace
    (spaces, tabs, newlines, ...) collapses to a single space and leading or
    trailing whitespace is dropped. Treating newlines as separators matters
    for PDF text: deleting them would fuse the last word of one line with the
    first word of the next and defeat both substring and word matching.
    """
    # Drop punctuation/symbols but keep whitespace so word boundaries survive.
    kept = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(kept.split())

def fuzzy_ratio(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of *a* and *b*."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

In [30]:
# # ----------------- MAPPING NODE TO CONTENT ------------------
# def find_matching_content(title: str, chunks: List[Dict[str, Any]]) -> str:
#     """Finds the first chunk containing the title as a heading. Fallback: returns longest chunk containing keywords, else empty string."""
#     title_clean = title.lower().replace('.', '').replace('?', '')
#     candidates = [c["text"] for c in chunks if title_clean in c["text"].lower()]
#     if candidates:
#         # Best: chunk where title is a heading
#         return candidates[0]
#     # Fallback: any chunk with most keyword overlap
#     for c in chunks:
#         if any(word in c["text"].lower() for word in title_clean.split()):
#             return c["text"]
#     return ""  # If not found

def find_matching_content(title: str, chunks: List[Dict[str, Any]]) -> str:
    """Return the text of the chunk that best matches a curriculum title.

    Strategies are tried in order and the first one that hits wins:

    1. The normalized title occurs verbatim inside a normalized chunk
       (the first such chunk is returned).
    2. At least 60% of the title's distinct words (rounded up, never fewer
       than one) occur among the chunk's words; the longest qualifying chunk
       is returned.
    3. The chunk with the highest ``fuzzy_ratio`` against the title, provided
       that ratio is above 0.5.

    Args:
        title: Curriculum topic title to look up.
        chunks: Dicts carrying at least a "text" key.

    Returns:
        The original (un-normalized) text of the matching chunk, or ``""``
        after printing a warning when nothing matches.
    """
    norm_title = normalize(title)
    title_words = set(norm_title.split())
    if not title_words:
        # A title with no usable characters would otherwise "match" the first
        # chunk, because an empty string is a substring of everything.
        print(f"Warning: No content found for topic '{title}'")
        return ""

    # 1. Exact normalized substring match. Normalized texts are remembered so
    #    the later passes do not have to normalize every chunk again.
    normalized = []
    for chunk in chunks:
        norm_chunk = normalize(chunk["text"])
        if norm_title in norm_chunk:
            return chunk["text"]
        normalized.append((chunk["text"], norm_chunk))

    # 2. Partial word overlap: at least 60% of title words in chunk.
    #    (3 * n + 4) // 5 is ceil(0.6 * n) in integer arithmetic; rounding
    #    down would let e.g. one word out of three (33%) qualify.
    required = max(1, (3 * len(title_words) + 4) // 5)
    candidates = [
        text
        for text, norm_chunk in normalized
        if len(title_words & set(norm_chunk.split())) >= required
    ]
    if candidates:
        return max(candidates, key=len)

    # 3. Fuzzy match.
    # NOTE(review): the ratio is 2 * matches / (len(title) + len(chunk)), so it
    # cannot exceed 0.5 once a chunk is at least three times as long as the
    # title. This pass therefore only ever selects very short chunks.
    best_score = 0
    best_chunk = ""
    for text, norm_chunk in normalized:
        score = fuzzy_ratio(norm_title, norm_chunk)
        if score > best_score:
            best_score = score
            best_chunk = text
    if best_score > 0.5:
        return best_chunk

    # 4. Fallback: warn and return empty
    print(f"Warning: No content found for topic '{title}'")
    return ""


### ------------- FLATTEN NODES FOR ALLOCATION -----------------

In [None]:
def flatten_nodes(node, path=()):
    """Flatten a curriculum node and its descendants into ``(path, node)`` pairs.

    The result lists, in order: the node itself (only when it has a non-empty
    title), one synthetic ``{"title", "brief"}`` node per entry of
    "sub_titles" (inheriting the parent's brief), and then the flattened
    "subsections" in depth-first order. ``path`` is the tuple of ancestor
    titles; each returned path ends with the entry's own title.
    """
    title = node.get("title", "")
    own_path = path + (title,)

    entries = [(own_path, node)] if title else []
    entries += [
        (own_path + (sub_title,), {"title": sub_title, "brief": node.get("brief", "")})
        for sub_title in node.get("sub_titles", [])
    ]
    for child in node.get("subsections", []):
        entries += flatten_nodes(child, own_path)
    return entries

### ------------- ALLOCATE QUESTION COUNTS ---------------------

In [None]:
def allocate_questions(nodes: List[tuple], chunks, max_qs=20):
    """Decide how many questions each curriculum node should receive.

    Each node's content is looked up with ``find_matching_content``; when
    nothing is found the node's "brief" is used instead. Questions are then
    shared out in proportion to word count.

    Args:
        nodes: ``(path, node)`` pairs; every node must have a "title" key.
        chunks: PDF chunks forwarded to ``find_matching_content``.
        max_qs: Question budget used for the proportional split.

    Returns:
        One dict per node with keys "path", "node", "content", "words" and
        "q_count". Nodes without any content get ``q_count == 0``. Because
        every node with content is guaranteed at least one question, the
        total can exceed ``max_qs`` when there are more such nodes than the
        budget.
    """
    # Compute content length for each node (use text or brief as fallback)
    node_infos = []
    total_words = 0
    for path, node in nodes:
        # `or ""` also covers a brief that is present but null.
        content = find_matching_content(node["title"], chunks) or node.get("brief") or ""
        word_count = len(content.split())
        node_infos.append({"path": path, "node": node, "content": content, "words": word_count})
        total_words += word_count

    # Allocate by word count (min 1 per node with content)
    allocations = []
    left = max_qs
    for info in node_infos:
        if info["words"] == 0:
            allocations.append(0)
            continue
        # Proportional, rounded down, at least 1 if any content.
        # total_words is necessarily > 0 here because this node has words.
        count = max(1, int((info["words"] / total_words) * max_qs))
        allocations.append(count)
        left -= count

    # Distribute leftovers round-robin over the nodes that have content.
    # Restricting the cycle to those nodes keeps this from spinning forever
    # (or dividing by zero) when no node has any content at all.
    eligible = [i for i, info in enumerate(node_infos) if info["words"] > 0]
    if eligible:
        for k in range(max(left, 0)):
            allocations[eligible[k % len(eligible)]] += 1

    for info, count in zip(node_infos, allocations):
        info["q_count"] = count

    return node_infos

In [33]:
def clean_llm_output(output):
    """Strip a surrounding Markdown code fence from an LLM response.

    Handles an opening fence with or without a ``json`` language tag (in any
    letter case) and a closing fence, and trims surrounding whitespace. Text
    without fences is returned stripped but otherwise unchanged.
    """
    text = output.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
        text = text.strip()
    if text.endswith("```"):
        text = text[:-3].strip()
    return text

### --------------- QUIZ GENERATION VIA LLM --------------------

In [None]:
def generate_mcqs_for_content(llm, content, topic, n_questions=1):
    """Ask the LLM for multiple-choice questions about *topic*.

    Args:
        llm: Chat model placed in the middle of the prompt | llm | parser chain.
        content: Source text the questions should be based on.
        topic: Topic title quoted in the prompt and in log messages.
        n_questions: Number of questions requested; also the maximum returned.

    Returns:
        A list of at most ``n_questions`` question dicts. An empty list is
        returned (after printing the raw output) when the response is not
        valid JSON or does not contain a list of questions. Items that are
        not JSON objects are discarded so callers can rely on getting dicts.
    """
    prompt_text = f"""
    You are an expert MCQ generator.
    Using the content below, generate {n_questions} multiple-choice questions (MCQs) of mixed difficulty (basic, intermediate, advanced) for the topic "{topic}".
    - Each question should have 4 answer options, labelled A, B, C, D.
    - Indicate the correct answer using a single uppercase letter in the "answer" field.
    - For each question, ADD a "difficulty" field with the value "easy", "intermediate", or "hard".
    - **DO NOT reference section numbers, headings, or chapter titles in your questions or answer options.**
    - Ask only conceptual, practical, or general questions that test understanding of the topic itself, not knowledge of where material appears in a file.
    - Output ONLY valid JSON, no explanations or markdown, in this format:
    [{{"question": "...", "options": ["A) ...", "B) ...", "C) ...", "D) ..."], "answer": "B", "difficulty": "easy"}}, ...]
    Content:
    {content}
    """

    # The fully formatted prompt is supplied as a partial variable of a
    # one-placeholder template (presumably so that braces inside the content
    # are not parsed as template fields -- confirm against LangChain docs).
    chain = PromptTemplate.from_template("{prompt}").partial(prompt=prompt_text) | llm | StrOutputParser()
    output = chain.invoke({})

    try:
        questions = json.loads(clean_llm_output(output))
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Could not parse output for topic {topic}: {output}")
        return []

    if isinstance(questions, dict):  # sometimes LLM returns a dict with a 'questions' field
        questions = questions.get('questions', [])
    if not isinstance(questions, list):
        print(f"Could not parse output for topic {topic}: {output}")
        return []

    # Keep only JSON objects: anything else cannot be enriched downstream.
    questions = [q for q in questions if isinstance(q, dict)][:n_questions]
    print(f"Generated {len(questions)} MCQs for topic '{topic}'")
    return questions

### --------------- MAIN PIPELINE FUNCTION ---------------------

In [None]:
def main(pdf_path, curriculum_json_path, out_json_path, max_questions=20):
    """Run the pipeline: curriculum + PDF -> MCQs -> curriculum JSON with quizzes.

    Args:
        pdf_path: Textbook PDF to extract content from.
        curriculum_json_path: Curriculum skeleton JSON; must contain a
            top-level "sections" list.
        out_json_path: Where the curriculum with attached quizzes is written.
            Its parent directory is created if it does not exist.
        max_questions: Question budget passed to ``allocate_questions``.

    Returns:
        A flat list of every generated MCQ. Each one is enriched with
        "section_title", "subsection_title" and "sub_title" taken from the
        first three elements of its curriculum path ("" when absent).
    """
    # Load curriculum
    with open(curriculum_json_path, "r", encoding="utf-8") as f:
        curriculum = json.load(f)

    # Load and chunk PDF
    print("Extracting chunks from PDF...")
    chunks = extract_chunks_from_pdf(pdf_path)

    # Flatten all nodes
    print("Flattening curriculum...")
    all_nodes = []
    for section in curriculum["sections"]:
        all_nodes += flatten_nodes(section)

    # Allocate questions
    node_infos = allocate_questions(all_nodes, chunks, max_questions)

    # Set up Gemini (Google Generative AI)
    llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0.7)

    # Generate quizzes, keyed by curriculum path so they can be attached below
    print("Generating MCQs...")
    path2questions = {}
    for info in node_infos:
        if not (info["q_count"] and info["content"]):
            continue
        qs = generate_mcqs_for_content(llm, info["content"], info["node"]["title"], info["q_count"])

        # Topic mapping derived from the node's position in the curriculum
        path_titles = info["path"]
        topic_fields = {
            "section_title": path_titles[0] if len(path_titles) > 0 else "",
            "subsection_title": path_titles[1] if len(path_titles) > 1 else "",
            "sub_title": path_titles[2] if len(path_titles) > 2 else "",
        }

        # Enrich a copy of every MCQ with the topic mapping
        enriched_qs = []
        for q in qs:
            q_enriched = dict(q)
            q_enriched.update(topic_fields)
            enriched_qs.append(q_enriched)
        path2questions[info["path"]] = enriched_qs

    all_mcqs = [q for qlist in path2questions.values() for q in qlist]

    # Helper to recursively attach quizzes (mutates the curriculum in place)
    def attach_quizzes(node, path=()):
        node_path = path + (node.get("title", ""),)
        quizzes = path2questions.get(node_path, [])
        if quizzes:
            node["quizzes"] = quizzes
        for st in node.get("sub_titles", []):
            st_quizzes = path2questions.get(node_path + (st,), [])
            if st_quizzes:
                node.setdefault("sub_title_quizzes", {})[st] = st_quizzes
        for sub in node.get("subsections", []):
            attach_quizzes(sub, node_path)

    for section in curriculum["sections"]:
        attach_quizzes(section, ())

    # Save output. Create the target directory first so that a missing folder
    # does not discard the results of every LLM call made above.
    out_dir = os.path.dirname(out_json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print(f"Saving to {out_json_path}")
    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump(curriculum, f, indent=2, ensure_ascii=False)
    print("✅ Done!")

    return all_mcqs


In [36]:
import random
from collections import defaultdict

def select_quiz(mcq_list, num_easy=4, num_intermediate=3, num_hard=3, total=10):
    """Pick a shuffled quiz of at most *total* questions from *mcq_list*.

    Up to ``num_easy`` / ``num_intermediate`` / ``num_hard`` questions are
    sampled from the matching difficulty buckets. If that yields fewer than
    *total*, the quiz is topped up with random questions of any difficulty
    that are not already in it. The result is shuffled and never longer than
    *total*, even when the three quotas add up to more.

    Args:
        mcq_list: Question dicts; "difficulty" is read case-insensitively and
            a missing or null value counts as "easy".
        num_easy: Target number of "easy" questions.
        num_intermediate: Target number of "intermediate" questions.
        num_hard: Target number of "hard" questions.
        total: Maximum (and target) size of the returned quiz.

    Returns:
        A list of question dicts taken from *mcq_list*.
    """
    by_diff = defaultdict(list)
    for q in mcq_list:
        # LLM output is not guaranteed to carry a clean string here.
        diff = str(q.get("difficulty") or "easy").strip().lower()
        by_diff[diff].append(q)

    quiz = []
    quotas = (("easy", num_easy), ("intermediate", num_intermediate), ("hard", num_hard))
    for level, quota in quotas:
        bucket = by_diff[level]
        quiz += random.sample(bucket, min(quota, len(bucket)))

    # Fill up to 'total' with random questions from any difficulty (excluding those already picked)
    while len(quiz) < total:
        leftovers = [q for bucket in by_diff.values() for q in bucket if q not in quiz]
        if not leftovers:
            break
        quiz.append(random.choice(leftovers))

    random.shuffle(quiz)
    # Enforce the cap when the per-difficulty quotas exceed `total`.
    return quiz[:max(total, 0)]

# Run the whole pipeline, then draw a 10-question quiz from the generated pool
# (4 easy, 3 intermediate, 3 hard).
all_mcqs = main(
    pdf_path=PDF_PATH,
    curriculum_json_path=CURRICULUM_JSON_PATH,
    out_json_path=OUT_JSON_PATH,
    max_questions=MAX_QUESTIONS,
)
quiz_to_display = select_quiz(
    all_mcqs,
    num_easy=4,
    num_intermediate=3,
    num_hard=3,
    total=10,
)

Extracting chunks from PDF...
Flattening curriculum...
Generating MCQs...
Generated 1 MCQs for topic 'Structure of Living Things'
Generated 1 MCQs for topic 'Cells'
Generated 1 MCQs for topic 'What are plants and animals made of?'
Generated 1 MCQs for topic 'How can cells be seen?'
Generated 1 MCQs for topic 'What are the parts of cells?'
Generated 1 MCQs for topic 'From Cells to Organisms'
Generated 1 MCQs for topic 'How are living things organized?'
Generated 1 MCQs for topic 'How do cells work together?'
Generated 1 MCQs for topic 'Diversity of Organisms'
Generated 1 MCQs for topic 'How are living things grouped together?'
Generated 1 MCQs for topic 'What do animals have in common?'
Generated 1 MCQs for topic 'What are plants?'
Generated 1 MCQs for topic 'What are fungi?'
Generated 1 MCQs for topic 'What are bacteria?'
Generated 1 MCQs for topic 'What are protists?'
Generated 1 MCQs for topic 'Plant Structure and Functions'
Generated 1 MCQs for topic 'Vascular Plants'
Generated 1 MC

In [37]:
# Bundle the selected quiz with the paths it was produced from.
quiz_output = dict(
    quiz=quiz_to_display,
    total_questions=len(quiz_to_display),
    source_pdf=PDF_PATH,
    curriculum_file=CURRICULUM_JSON_PATH,
    output_file=OUT_JSON_PATH,
)

In [38]:
# Persist the selected quiz as pretty-printed UTF-8 JSON.
with open("quiz_files/filtered_quiz_5th.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(quiz_output, indent=2, ensure_ascii=False))

In [39]:
# Notebook display: a bare expression so the cell echoes the quiz payload.
quiz_output

{'quiz': [{'question': 'An alloy is BEST described as a:',
   'options': ['A) Pure metal solidified after melting.',
    'B) Mixture of two or more metals and/or nonmetals.',
    'C) Compound formed by the chemical reaction of two metals.',
    'D) Metal that has been strengthened by heat treatment.'],
   'answer': 'B',
   'difficulty': 'intermediate',
   'section_title': 'Changes in Matter',
   'subsection_title': 'Metals and Alloys',
   'sub_title': 'What are alloys?'},
  {'question': "If the Sun's gravitational pull suddenly disappeared, what would happen to a planet in orbit?",
   'options': ['A) It would continue orbiting in the same path.',
    'B) It would spiral inwards towards where the Sun was.',
    'C) It would travel in a straight line tangent to its original orbit.',
    'D) It would be pulled towards the nearest star.'],
   'answer': 'C',
   'difficulty': 'intermediate',
   'section_title': 'The Solar System',
   'subsection_title': 'Gravity and Orbit',
   'sub_title': '