### ================================================
### AI Tutor Pipeline - Step 1: Curriculum Extraction
### ------------------------------------------------
### This script takes a PDF (e.g., a textbook),
### splits it into chunks, and uses Gemini LLM
### to generate a curriculum skeleton (learning path) in JSON format.
### ================================================


In [85]:
# ! pip install langgraph langchain langchain-google-genai langsmith PyMuPDF langchain-community

In [1]:
# ==================== 1. Imports and Environment ====================
import os
import json
import re
from typing import TypedDict, Optional, Dict, Any, List

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langgraph.graph import StateGraph

In [2]:
# ---- Set up API keys and environment variables ----
# The Google key is read from a local file so it never appears in the notebook.
# A context manager is used so the file handle is closed right after reading.
with open("api_key_paid.txt", encoding="utf-8") as key_file:
    os.environ["GOOGLE_API_KEY"] = key_file.read().strip()

# SECURITY: LangSmith API keys used to be hard-coded in this cell. Treat those
# keys as leaked and rotate them. Provide the key through the
# LANGSMITH_API_KEY environment variable (shell, .env loader, secret manager).
if os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_PROJECT"] = "ai-tutur-cg"  # or your project name
else:
    # No key available: turn tracing off rather than tracing without
    # credentials. The rest of the pipeline does not depend on LangSmith.
    os.environ["LANGSMITH_TRACING"] = "false"
    print("LANGSMITH_API_KEY is not set; LangSmith tracing is disabled.")

In [3]:
# ---- 2. PDF to Chunks ----
def langchain_load_and_chunk(pdf_path, chunk_size=1200, chunk_overlap=200):
    """Load a PDF and split it into chunked documents.

    Args:
        pdf_path: Path handed to ``PyMuPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
    """
    loaded_pages = PyMuPDFLoader(pdf_path).load()
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(loaded_pages)


In [4]:
# ---- 3. Build Prompt Template ----
# Prompt text sent to the LLM. `{chunks}` is the only placeholder; the doubled
# braces in the example become literal braces once the template is formatted.
# The example is kept as strict JSON (no inline `//` comment) because the
# model's reply is parsed with json.loads, so an imitated example must parse.
prompt_template = """
You are an expert curriculum designer.
Given these document chunks, generate a Learning Path structure in JSON format.

Format Example:
{{
  "sections": [
    {{
      "section_id": "S1",
      "title": "<Section Title>",
      "brief": "<Short 2–3 line description>",
      "subsections": [
        {{
          "subsection_id": "S1.1",
          "title": "<Subsection Title>",
          "sub_titles": ["<Sub-title 1>", "<Sub-title 2>"],
          "brief": "<Short 2–3 line description>"
        }}
      ]
    }}
  ]
}}

For each subsection, include a 'sub_titles' list with relevant sub-headings found in the chunk (use an empty list [] if none are present). Always include the 'brief' field with a 2–3 line summary.

Document Chunks:
{chunks}

Return only the JSON structure, no additional text or formatting:
"""

# Wrap the template text in a PromptTemplate; it is later piped into the LLM
# and filled through the "chunks" key.
prompt = PromptTemplate.from_template(prompt_template)

In [5]:
# ---- 4. Set up Gemini Model ----
# Chat model shared by the pipeline; authentication presumably comes from the
# GOOGLE_API_KEY environment variable set earlier (confirm against the
# langchain-google-genai documentation).
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-pro-latest",
    temperature=0,
)


In [6]:
# ---- 5. LangGraph Nodes ----
from typing import Dict, Any, List, TypedDict

class State(TypedDict):
    """Shared state dictionary passed between the LangGraph nodes."""

    # Path of the PDF to process; read by load_and_chunk_node.
    pdf_path: str
    # Chunked documents written by load_and_chunk_node.
    docs: List[Any]
    # All chunk texts joined into one labelled string; fed to the prompt.
    chunk_texts: str
    # Parsed learning-path JSON, or None when call_llm_node could not parse
    # the model output.
    skeleton: Optional[Dict[str, Any]]

def load_and_chunk_node(state: State) -> State:
    """Graph node: chunk the PDF at ``state["pdf_path"]`` and record the result.

    Stores the chunked documents under ``"docs"`` and a single string under
    ``"chunk_texts"`` in which every chunk is prefixed with ``Chunk <n>:``
    (numbered from 1) and chunks are separated by a blank line. The same
    ``state`` dict is mutated and returned.
    """
    documents = langchain_load_and_chunk(state["pdf_path"])

    labelled_chunks = []
    for number, document in enumerate(documents, start=1):
        labelled_chunks.append(f"Chunk {number}:\n{document.page_content}")

    state["docs"] = documents
    state["chunk_texts"] = "\n\n".join(labelled_chunks)
    return state

# Where the parsed skeleton is written. The status messages use the same
# constant so the reported path always matches the file actually written.
_SKELETON_OUTPUT_PATH = "json_files/learning_path_skeleton_5th.json"


def _extract_json_from_markdown(text):
    """Return the JSON payload of *text*, unwrapping a markdown code fence.

    A fenced block tagged ``json`` is preferred. Otherwise the first fenced
    block is used if its content starts with ``{`` or ``[``. If neither
    applies, *text* is returned stripped of surrounding whitespace.
    """
    fenced_json = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if fenced_json:
        return fenced_json.group(1).strip()

    fenced_any = re.search(r'```\s*(.*?)\s*```', text, re.DOTALL)
    if fenced_any:
        candidate = fenced_any.group(1).strip()
        # Only accept an untagged fence when it plausibly holds JSON.
        if candidate.startswith(('{', '[')):
            return candidate

    return text.strip()


def call_llm_node(state: State) -> State:
    """Graph node: ask the LLM for a learning-path skeleton and parse it.

    Runs ``prompt | llm | StrOutputParser()`` on ``state["chunk_texts"]``,
    strips any markdown code fence from the reply, and parses it as JSON.

    On success the parsed value is printed, stored in ``state["skeleton"]``
    and written to ``_SKELETON_OUTPUT_PATH``. A failure to write the file is
    reported but does not discard the parsed skeleton. If the reply is not
    valid JSON, diagnostics are printed and ``state["skeleton"]`` is ``None``.

    Returns:
        The same ``state`` dict, mutated in place.
    """
    chain = prompt | llm | StrOutputParser()
    output = chain.invoke({"chunks": state["chunk_texts"]})
    cleaned_output = _extract_json_from_markdown(output)

    try:
        skeleton = json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        print(f"\n❌ JSON parsing failed: {e}")
        print("\nCleaned output that failed to parse:")
        print(cleaned_output)
        print("\nOriginal Gemini output:")
        print(output)
        state["skeleton"] = None
        return state

    print("✅ JSON parsed successfully!")
    print(json.dumps(skeleton, indent=2, ensure_ascii=False))
    # Record the result before touching the disk so an I/O problem cannot
    # throw away a successfully parsed skeleton.
    state["skeleton"] = skeleton

    try:
        # The output directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(_SKELETON_OUTPUT_PATH), exist_ok=True)
        with open(_SKELETON_OUTPUT_PATH, "w", encoding="utf-8") as f:
            json.dump(skeleton, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"\n❌ Could not save Learning Skeleton to '{_SKELETON_OUTPUT_PATH}': {e}")
    else:
        print(f"\n✅ Learning Skeleton saved as '{_SKELETON_OUTPUT_PATH}'")

    return state

In [7]:
# ---- 6. Build and Run the LangGraph Workflow ----
# Two-node linear graph: start -> load_and_chunk -> call_llm.
_graph_builder = StateGraph(state_schema=State)
_graph_builder.add_node("load_and_chunk", load_and_chunk_node)
_graph_builder.add_node("call_llm", call_llm_node)
_graph_builder.add_edge("__start__", "load_and_chunk")
_graph_builder.add_edge("load_and_chunk", "call_llm")
workflow = _graph_builder.compile()

In [9]:
# Seed every State key; the nodes overwrite everything except "pdf_path".
initial_state = {
    "pdf_path": "files/science-textbook-grade-5.pdf",  # Update with your actual PDF path
    "docs": [],
    "chunk_texts": "",
    "skeleton": {}
}

result = workflow.invoke(initial_state)

# call_llm_node leaves "skeleton" as None when the model output cannot be
# parsed, and valid JSON is not necessarily an object, so only list keys when
# a non-empty dict came back.
final_skeleton = result["skeleton"]
if isinstance(final_skeleton, dict) and final_skeleton:
    skeleton_keys = list(final_skeleton.keys())
else:
    skeleton_keys = 'None'
print(f"\n🎉 Workflow completed! Final skeleton keys: {skeleton_keys}")

✅ JSON parsed successfully!
{
  "sections": [
    {
      "section_id": "S1",
      "title": "Structure of Living Things",
      "brief": "This section explores the fundamental building blocks of life, starting from cells and progressing to the organization of organisms. It covers the diversity of life forms and their classification into kingdoms.",
      "subsections": [
        {
          "subsection_id": "S1.1",
          "title": "Cells",
          "sub_titles": [
            "What are plants and animals made of?",
            "How can cells be seen?",
            "What are the parts of cells?"
          ],
          "brief": "This subsection introduces the concept of cells as the basic units of life. It discusses the differences between plant and animal cells, their internal structures, and their functions."
        },
        {
          "subsection_id": "S1.2",
          "title": "From Cells to Organisms",
          "sub_titles": [
            "How are living things organized