In [2]:
# ! pip install langchain langchain-google-genai langsmith PyMuPDF
! pip install langchain langchain-google-genai langsmith PyMuPDF langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.12.7-cp312-cp312-win_amd64.whl.metadata (7.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Using cached httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Using cached aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting attrs>=17.3.0 (from aiohttp<4.0.0

In [16]:
import os
import json

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnableSequence
from langchain_core.output_parsers import StrOutputParser

In [17]:
# ---- 1. Set up API keys and environment variables ----
# Replace with your actual keys
os.environ["GOOGLE_API_KEY"] = open("api_key_paid.txt").read().strip()
os.environ["LANGSMITH_API_KEY"] = "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"
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "ai-tutur-cg"  # or your project name


In [18]:
# ---- 2. PDF to Chunks ----
def langchain_load_and_chunk(pdf_path, chunk_size=1200, chunk_overlap=200):
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)


In [19]:
# # ---- 3. Build Prompt Template ----
prompt_template = """
You are an expert curriculum designer.
Given these document chunks, generate a Learning Path structure in JSON format.

Format Example:
{{
  "sections": [
    {{
      "section_id": "S1",
      "title": "<Section Title>",
      "brief": "<Short 2–3 line description>",
      "subsections": [
        {{
          "subsection_id": "S1.1",
          "title": "<Subsection Title>",
          "sub_titles": ["<Sub-title 1>", "<Sub-title 2>"],  // Use [] if none
          "brief": "<Short 2–3 line description>"
        }}
      ]
    }}
  ]
}}

For each subsection, include a 'sub_titles' list with relevant sub-headings found in the chunk (use an empty list [] if none are present). Always include the 'brief' field with a 2–3 line summary.

Document Chunks:
{chunks}
"""

prompt = PromptTemplate.from_template(prompt_template)

# prompt_template = """
# You are an expert curriculum designer.
# Given these document chunks, generate a Learning Path structure in JSON format.

# Format Example:
# {{
#   "sections": [
#     {{
#       "section_id": "S1",
#       "title": "<Section Title>",
#       "brief": "<Short 2–3 line description>",
#       "subsections": [
#         {{
#           "subsection_id": "S1.1",
#           "title": "<Subsection Title>",
#           "sub_titles": ["<Sub-title 1>", "<Sub-title 2>"],  // Use [] if none
#           "brief": "<Short 2–3 line description>"
#         }}
#       ]
#     }}
#   ]
# }}

# For each subsection, include a 'sub_titles' list with relevant sub-headings found in the chunk (use an empty list [] if none are present).
# **Sort the sub_titles list in the logical order a student should learn them, from foundational concepts to more advanced or specific topics.**  # NEW
# Always include the 'brief' field with a 2–3 line summary.

# Document Chunks:
# {chunks}
# """

# prompt = PromptTemplate.from_template(prompt_template)

In [20]:
# ---- 4. Set up Gemini Model ----
llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0)


In [21]:
# ---- 5. Agent Orchestrator ----
def run_langchain_structuring_agent(pdf_path):
    print("Loading and splitting document...")
    docs = langchain_load_and_chunk(pdf_path)
    chunk_texts = "\n\n".join([doc.page_content[:800] for doc in docs])  # Use only first 800 chars per chunk for prompt size

    chain = (
        prompt
        | llm
        | StrOutputParser()
    )

    print("\nPrompt preview (truncated):\n", prompt.format(chunks=chunk_texts)[:1200], "\n...")
    print("\nCalling Gemini via LangChain...\n")
    output = chain.invoke({"chunks": chunk_texts})

    try:
        skeleton = json.loads(output)
        print(json.dumps(skeleton, indent=2, ensure_ascii=False))
        with open("learning_path_skeleton.json", "w", encoding="utf-8") as f:
            json.dump(skeleton, f, indent=2, ensure_ascii=False)
        print("\n✅ Learning Skeleton saved as 'learning_path_skeleton.json'")
    except Exception:
        print("\nGemini output was not valid JSON. Here is the raw output:\n")
        print(output)
        skeleton = None
    return skeleton


In [22]:
# ---- 6. Example Usage ----
run_langchain_structuring_agent("mi-intro.pdf")

Loading and splitting document...

Prompt preview (truncated):
 
You are an expert curriculum designer.
Given these document chunks, generate a Learning Path structure in JSON format.

Format Example:
{
  "sections": [
    {
      "section_id": "S1",
      "title": "<Section Title>",
      "brief": "<Short 2–3 line description>",
      "subsections": [
        {
          "subsection_id": "S1.1",
          "title": "<Subsection Title>",
          "sub_titles": ["<Sub-title 1>", "<Sub-title 2>"],  // Use [] if none
          "brief": "<Short 2–3 line description>"
        }
      ]
    }
  ]
}

For each subsection, include a 'sub_titles' list with relevant sub-headings found in the chunk (use an empty list [] if none are present). Always include the 'brief' field with a 2–3 line summary.

Document Chunks:
INTRODUCTION
TO
MACHINE LEARNING
AN EARLY DRAFT OF A PROPOSED
TEXTBOOK
Nils J. Nilsson
Robotics Laboratory
Department of Computer Science
Stanford University
Stanford, CA 94305
e-mail: