In [31]:
import os
import json
import re
import math
from typing import List, Dict, Any
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [32]:
PDF_PATH = "mi-intro.pdf"
CURRICULUM_JSON_PATH = "learning_path_skeleton_ml.json"
OUT_JSON_PATH = "quiz_output.json"
MAX_QUESTIONS = 20

os.environ["GOOGLE_API_KEY"] = open("api_key_paid.txt").read().strip()

In [33]:
# ----------------------- PDF CHUNKING -----------------------
def extract_chunks_from_pdf(pdf_path: str) -> List[Dict[str, Any]]:
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    # Each doc has page_content and metadata (page number)
    chunks = [{"text": doc.page_content, "page": doc.metadata.get("page", None)} for doc in docs]
    return chunks

In [34]:
# ----------------- MAPPING NODE TO CONTENT ------------------
def find_matching_content(title: str, chunks: List[Dict[str, Any]]) -> str:
    """Finds the first chunk containing the title as a heading. Fallback: returns longest chunk containing keywords, else empty string."""
    title_clean = title.lower().replace('.', '').replace('?', '')
    candidates = [c["text"] for c in chunks if title_clean in c["text"].lower()]
    if candidates:
        # Best: chunk where title is a heading
        return candidates[0]
    # Fallback: any chunk with most keyword overlap
    for c in chunks:
        if any(word in c["text"].lower() for word in title_clean.split()):
            return c["text"]
    return ""  # If not found


In [35]:
# ------------- FLATTEN NODES FOR ALLOCATION -----------------
def flatten_nodes(node, path=()):
    flat = []
    title = node.get("title", "")
    sub_titles = node.get("sub_titles", [])
    if title:
        flat.append((path + (title,), node))
    for st in sub_titles:
        flat.append((path + (title, st), {"title": st, "brief": node.get("brief", "")}))
    for sub in node.get("subsections", []):
        flat.extend(flatten_nodes(sub, path + (title,)))
    return flat

In [36]:
# ------------- ALLOCATE QUESTION COUNTS ---------------------
def allocate_questions(nodes: List[tuple], chunks, max_qs=20):
    # Compute content length for each node (use text or brief as fallback)
    node_infos = []
    total_words = 0
    for path, node in nodes:
        content = find_matching_content(node["title"], chunks)
        if not content:
            content = node.get("brief", "")
        word_count = len(content.split())
        node_infos.append({"path": path, "node": node, "content": content, "words": word_count})
        total_words += word_count

    # Allocate by word count (min 1 per node with content)
    allocations = []
    left = max_qs
    for info in node_infos:
        if info["words"] == 0:
            allocations.append(0)
            continue
        # Proportional, rounded down, at least 1 if any content
        count = max(1, int((info["words"] / total_words) * max_qs)) if total_words else 1
        allocations.append(count)
        left -= count

    # Distribute leftovers
    idx = 0
    while left > 0:
        if node_infos[idx % len(node_infos)]["words"] > 0:
            allocations[idx % len(node_infos)] += 1
            left -= 1
        idx += 1

    for i, info in enumerate(node_infos):
        info["q_count"] = allocations[i]

    return node_infos

In [37]:
def clean_llm_output(output):
    # Remove triple backticks and leading/trailing whitespace
    output = output.strip()
    if output.startswith("```json"):
        output = output[len("```json"):].strip()
    if output.endswith("```"):
        output = output[:-3].strip()
    return output

In [38]:
# --------------- QUIZ GENERATION VIA LLM --------------------
def generate_mcqs_for_content(llm, content, topic, n_questions=1):
    prompt_text = f"""
You are an expert MCQ generator.
Using the content below, generate {n_questions} multiple-choice questions (MCQs) of mixed difficulty (basic, intermediate, advanced) for the topic "{topic}".
- Each question should have 4 answer options, labelled A, B, C, D.
- Indicate the correct answer using a single uppercase letter in the "answer" field.
- Output ONLY valid JSON, no explanations or markdown, in this format:
[{{"question": "...", "options": ["A) ...", "B) ...", "C) ...", "D) ..."], "answer": "B"}}, ...]
Content:
{content}
"""
    chain = PromptTemplate.from_template("{prompt}").partial(prompt=prompt_text) | llm | StrOutputParser()
    output = chain.invoke({})
    try:
        clean_output = clean_llm_output(output)
        questions = json.loads(clean_output)
        if isinstance(questions, dict):  # sometimes LLM returns a dict with a 'questions' field
            questions = questions.get('questions', [])
        print(f"Generated {len(questions[:n_questions])} MCQs for topic '{topic}'")
        return questions[:n_questions]
    except Exception:
        print(f"Could not parse output for topic {topic}: {output}")
        return []

In [39]:
# --------------- MAIN PIPELINE FUNCTION ---------------------
def main(pdf_path, curriculum_json_path, out_json_path, max_questions=20):
    # Load curriculum
    with open(curriculum_json_path, "r", encoding="utf-8") as f:
        curriculum = json.load(f)

    # Load and chunk PDF
    print("Extracting chunks from PDF...")
    chunks = extract_chunks_from_pdf(pdf_path)

    # Flatten all nodes
    print("Flattening curriculum...")
    all_nodes = []
    for section in curriculum["sections"]:
        all_nodes += flatten_nodes(section)

    # Allocate questions
    node_infos = allocate_questions(all_nodes, chunks, max_questions)

    # Set up Gemini (Google Generative AI)
    llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0)

    # Generate quizzes and attach to curriculum (in-place)
    print("Generating MCQs...")
    path2questions = {}
    for info in node_infos:
        if info["q_count"] and info["content"]:
            qs = generate_mcqs_for_content(llm, info["content"], info["node"]["title"], info["q_count"])
            path2questions[info["path"]] = qs

    # Helper to recursively attach quizzes
    def attach_quizzes(node, path=()):
        node_path = path + (node.get("title", ""),)
        quizzes = path2questions.get(node_path, [])
        if quizzes:
            node["quizzes"] = quizzes
        for st in node.get("sub_titles", []):
            sub_path = node_path + (st,)
            st_quizzes = path2questions.get(sub_path, [])
            if st_quizzes:
                if "sub_title_quizzes" not in node:
                    node["sub_title_quizzes"] = {}
                node["sub_title_quizzes"][st] = st_quizzes
        for sub in node.get("subsections", []):
            attach_quizzes(sub, node_path)

    for section in curriculum["sections"]:
        attach_quizzes(section, ())

    # Save output
    print(f"Saving to {out_json_path}")
    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump(curriculum, f, indent=2, ensure_ascii=False)
    print("✅ Done!")


In [40]:
# ------------------ RUN SCRIPT AS MAIN ----------------------
if __name__ == "__main__":
    main(PDF_PATH, CURRICULUM_JSON_PATH, OUT_JSON_PATH, MAX_QUESTIONS)

Extracting chunks from PDF...
Flattening curriculum...
Generating MCQs...
Generated 1 MCQs for topic 'Preliminaries'
Generated 1 MCQs for topic 'Introduction'
Generated 1 MCQs for topic 'What is Machine Learning?'
Generated 1 MCQs for topic 'Wellsprings of Machine Learning'
Generated 1 MCQs for topic 'Varieties of Machine Learning'
Generated 1 MCQs for topic 'Learning Input-Output Functions'
Generated 1 MCQs for topic 'Types of Learning'
Generated 1 MCQs for topic 'Input Vectors'
Generated 1 MCQs for topic 'Outputs'
Generated 1 MCQs for topic 'Training Regimes'
Generated 1 MCQs for topic 'Noise'
Generated 1 MCQs for topic 'Performance Evaluation'
Generated 1 MCQs for topic 'Learning Requires Bias'
Generated 1 MCQs for topic 'Sample Applications'
Generated 1 MCQs for topic 'Sources'
Generated 1 MCQs for topic 'Boolean Functions'
Generated 1 MCQs for topic 'Representation'
Generated 1 MCQs for topic 'Boolean Algebra'
Generated 1 MCQs for topic 'Diagrammatic Representations'
Generated 1 M

In [44]:
import json
from langchain_google_genai import ChatGoogleGenerativeAI

# 1. Load your JSON file
with open("learning_path_skeleton_ml.json", "r", encoding="utf-8") as f:
    curriculum = json.load(f)

# 2. Collect all unique topics (sections, subsections, sub_titles)
def collect_topics(node):
    topics = set()
    if node.get("title"):
        topics.add(node["title"])
    if node.get("sub_titles"):
        for st in node["sub_titles"]:
            topics.add(st)
    for sub in node.get("subsections", []):
        topics |= collect_topics(sub)
    return topics

all_topics = set()
for section in curriculum["sections"]:
    all_topics |= collect_topics(section)
all_topics = sorted(list(all_topics))

# 3. Ask the LLM to select the most important ones
llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0)
prompt = f"""
You are a curriculum designer for a machine learning course.
Here is a list of all topics and subtopics covered:
{all_topics}
From these, select the 8 most critical for a well-rounded understanding of machine learning.
Output just the list of topic names as a Python list.
"""

response = llm.invoke(prompt)
# Now parse the LLM's response (should be a list of topic strings)
try:
    important_topics = eval(response)
    print("LLM selected topics:", important_topics)
except Exception:
    print("LLM response could not be parsed:", response)


LLM response could not be parsed: content="```python\n['What is Machine Learning?', 'Supervised Learning', 'Unsupervised Learning', 'Decision Trees', 'Neural Networks', 'Overfitting', 'PAC Learning', 'Temporal-Difference Learning']\n```" additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-pro-002', 'safety_ratings': []} id='run--85931906-5536-40c8-8fd5-7198205e923b-0' usage_metadata={'input_tokens': 836, 'output_tokens': 45, 'total_tokens': 881, 'input_token_details': {'cache_read': 0}}
