### ===============================================================
### AI Tutor Pipeline - Step 4: Personalized Content Generation
### ---------------------------------------------------------------
### This script:
###   - Reads a personalized learning path for a user (with user zone)
###   - Matches each topic to the most relevant textbook chunk
###   - Classifies each topic as basic/intermediate/advanced (via LLM)
###   - Paraphrases all content using an LLM, adapting to user zone
###   - Outputs personalized, student-ready revision notes as JSON
### ===============================================================


In [9]:
# !pip install langchain langchain_community pymupdf pandas
import os
import json
from typing import List, Dict, Any
import concurrent.futures
import pandas as pd

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# LangSmith setup
os.environ["LANGSMITH_API_KEY"] = "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"
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "ai-tutur-cg" 

### PDF loading and chuncking

In [10]:
def chunk_pdf(pdf_path: str, chunk_size=1200, chunk_overlap=200) -> List:
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)

In [11]:
def find_relevant_chunks(topic: str, chunks: List, window=2) -> str:
    topic_clean = topic.lower().replace('.', '').replace('?', '')
    matching_indices = [i for i, c in enumerate(chunks) if topic_clean in c.page_content.lower()]
    if not matching_indices:
        return ""
    selected_texts = []
    for idx in matching_indices:
        start = max(0, idx - window)
        end = min(len(chunks), idx + window + 1)
        for i in range(start, end):
            selected_texts.append(chunks[i].page_content)
    return "\n\n".join(list(dict.fromkeys(selected_texts)))

### Classifying topic difficulty using LLM

In [17]:
def classify_topic_difficulty_llm(topic: str, content: str, llm) -> str:
    prompt = f"""
You are an expert educational content classifier.
Given the following topic and its content from a user-uploaded document, decide if the topic should be considered 'basic' (introductory, suitable for beginners) or 'advanced' (complex, detailed, for experienced learners).

Instructions:
- If the topic is fundamental, introductory, or general, classify as 'basic'.
- If the topic is technical, specialized, or complex, classify as 'advanced'.
- Output one word only: either 'basic' or 'advanced' or intermediate.

Topic: {topic}
Content:
\"\"\"{content}\"\"\"
Answer:
"""
    chain = (
        PromptTemplate.from_template("{prompt}")
        .partial(prompt=prompt)
        | llm
        | StrOutputParser()
    )
    result = chain.invoke({}, config={"run_name": f"ClassifyLevel-{topic[:20]}"})
    result = result.strip().lower()
    if "basic" in result:
        return "basic"
    if "advanced" in result:
        return "advanced"
    return "intermediate"  # fallback if unclear


In [13]:
def adaptive_paraphrase(content: str, topic: str, user_zone: str, topic_type: str, llm) -> str:
    prompt = f"""
You are a world-class tutor. ONLY use the provided content to explain the topic: "{topic}".
Your student is a {user_zone.upper()}.
The topic type is: {topic_type.upper()}.

INSTRUCTIONS:
- If user is "SLOW-LEARNER AND BEGINNER":
    * Use simple, clear language.
    * Explain every term and step.
    * Give detailed, step-by-step guidance and plenty of examples.
    * Use analogies and visual descriptions where possible.
    * Repeat or reword important points for extra clarity.
- If user is "SLOW-LEARNER AND ADVANCED":
    * For basic topics: explain every term, clarify basics patiently.
    * For advanced topics: go very deep, elaborate, break down complex ideas, highlight pitfalls, show edge cases.
- If user is "FAST-LEARNER AND BEGINNER":
    * Cover all basics, but avoid unnecessary repetition.
    * Use concise, efficient explanations and examples.
    * Move at a brisk pace but ensure every concept is still clear.
- If user is "FAST-LEARNER AND ADVANCED":
    * Give concise, high-level, technical summaries.
    * Assume the student knows all basic concepts.
    * Focus on big-picture ideas, shortcuts, and advanced nuances.
    * Highlight the most important details for revision.

RULES:
- Do NOT add, invent, or import examples/explanations from outside the provided content.
- Do NOT reference other chapters or say "see section X".
- Only write about what is present in the content block.

CONTENT:
\"\"\"{content}\"\"\"

Output: Write student-ready, revision-style notes for this topic as described above.
"""
    chain = (
        PromptTemplate.from_template("{prompt}")
        .partial(prompt=prompt)
        | llm
        | StrOutputParser()
    )
    return chain.invoke({}, config={"run_name": f"Paraphrase-{user_zone}-{topic[:20]}"})


In [14]:
def parallel_adaptive_paraphrase(content_list, topic_list, user_zone, topic_type_list, llm):
    def single_paraphrase(content, topic, topic_type):
        return adaptive_paraphrase(content, topic, user_zone, topic_type, llm)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(single_paraphrase, content_list, topic_list, topic_type_list))
    return results

In [15]:
def deliver_personalized_content(
    pdf_path: str,
    learning_path: List[Dict[str, Any]],
    user_zone: str,
    gemini_api_key: str
) -> List[Dict[str, str]]:
    os.environ["GOOGLE_API_KEY"] = gemini_api_key
    llm = ChatGoogleGenerativeAI(
        model="models/gemini-1.5-pro-latest",
        temperature=0.3,
        project="personalized-learning-content",  # For LangSmith tracking
    )

    # Step 1: Chunk the PDF
    chunks = chunk_pdf(pdf_path)

    # Step 2: Prepare all topics and their content + classify topic difficulty
    personalized_content_list = []
    content_list, topic_list, topic_type_list = [], [], []
    for item in learning_path:
        topic = item["topic"]
        days = item.get("pace", "")
        content = find_relevant_chunks(topic, chunks)
        if not content.strip():
            personalized_text = "[No content found for this topic in your document.]"
            personalized_content_list.append({
                "topic": topic,
                "topic_type": "advanced",
                "days": days,
                "review_needed": item.get("review_needed", False),
                "missed_mcqs": item.get("missed_mcqs", 0),
                "content": personalized_text
            })
        else:
            # Classify topic difficulty on the fly (LLM call)
            topic_type = classify_topic_difficulty_llm(topic, content, llm)
            content_list.append(content)
            topic_list.append(topic)
            topic_type_list.append(topic_type)
            personalized_content_list.append({
                "topic": topic,
                "topic_type": topic_type,
                "days": days,
                "review_needed": item.get("review_needed", False),
                "missed_mcqs": item.get("missed_mcqs", 0),
                "content": ""  # to be filled in next step
            })

    # Step 3: Paraphrase all topics with content (parallel for speed)
    if content_list:
        paraphrased_texts = parallel_adaptive_paraphrase(content_list, topic_list, user_zone, topic_type_list, llm)
        idx = 0
        for i in range(len(personalized_content_list)):
            if personalized_content_list[i]["content"] == "":
                personalized_content_list[i]["content"] = paraphrased_texts[idx]
                idx += 1

    return personalized_content_list


In [20]:
if __name__ == "__main__":
    PDF_PATH = "files/science-textbook-grade-5.pdf"
    LEARNING_PATH_JSON = "path/learning_path_personalized_5th.json"
    GEMINI_API_KEY = open("api_key_paid.txt").read().strip()

    # Load learning path and user zone
    with open(LEARNING_PATH_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)
    user_zone = data["user_zone"]
    learning_path = data["learning_path"]

    # Deliver personalized content
    content_plan = deliver_personalized_content(
        pdf_path=PDF_PATH,
        learning_path=learning_path,
        user_zone=user_zone,
        gemini_api_key=GEMINI_API_KEY
    )

    # Save as JSON for further UI or visualization
    with open("content/personalized_content_plan_5th.json", "w", encoding="utf-8") as f:
        json.dump(content_plan, f, indent=2, ensure_ascii=False)

    # Print output for review/debug
    for entry in content_plan:
        marker = "⭐ REVIEW ⭐" if entry.get("review_needed") else ""
        print(f"\n=== {entry['topic']} ({entry['days']}) {marker}\nType: {entry['topic_type']}\n{entry['content']}\n")


Unexpected argument 'project' provided to ChatGoogleGenerativeAI.
                project was transferred to model_kwargs.
                Please confirm that project is what you intended.
  content_plan = deliver_personalized_content(



=== Structure of Living Things (2 min) 
Type: basic
## Structure of Living Things - Revision Notes

**Chapter 1: Core Concepts**

* **Cells:** Fundamental units of life.  Plant cells have cell walls, chloroplasts, and large vacuoles; animal cells lack these. Key components include: cell membrane (selective barrier), cytoplasm (internal environment), nucleus (control center), mitochondria (energy production), vacuoles (storage).
* **Organization:** Cells → Tissues → Organs → Organ Systems → Organism.
* **Diversity:** Six kingdoms classify organisms. Focus on plant and animal distinctions (food production, mobility).  Within animals: vertebrates (backbone) vs. invertebrates. Within plants: vascular (transport systems) vs. nonvascular.  Fungi (absorb food), bacteria (lack nucleus), and protists (diverse, often single-celled) complete the kingdoms.


**Chapter 2: Plant-Specific Details**

* **Reproduction:** Spores (single-celled reproductive units) vs. seeds (undeveloped plant with store