# 📓 0. Setup and Initialization

In [1]:
# !pip install -q python-dotenv haystack-ai==2.2.4 haystack-experimental==0.1.0 google-generativeai pdfplumber

In [None]:
# !pip install pypdf

In [None]:

import os
import warnings
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import google.generativeai as genai
from haystack import Pipeline, Document, component
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack.components.builders import PromptBuilder
from haystack.components.writers import DocumentWriter

warnings.filterwarnings('ignore')
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_NAME = "gemini-2.0-flash"
PAID_MODE = True

def configure_gemini():
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel(model_name=MODEL_NAME)
    return model

model = configure_gemini()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =============================================
# 📓 11. TaskManagerAgent & UsageTrackerAgent (Utilities Setup)
# =============================================

def estimate_tokens(text):
    return max(1, len(text) // 4)

def estimate_output_tokens(text):
    return max(1, len(text) // 4)

def load_usage(usage_file="gemini_usage.json"):
    if os.path.exists(usage_file):
        with open(usage_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {
        'date': datetime.now().strftime('%Y-%m-%d'),
        'minute': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'requests_today': 0,
        'requests_this_minute': 0,
        'tokens_this_minute': 0,
        'input_tokens_today': 0,
        'output_tokens_today': 0,
        'total_cost_today': 0.0
    }

def save_usage(usage, usage_file="gemini_usage.json"):
    with open(usage_file, 'w', encoding='utf-8') as f:
        json.dump(usage, f, indent=2)

def update_usage(usage, input_tokens, output_tokens, usage_file="gemini_usage.json", cost=None):
    now = datetime.now()
    today = now.strftime('%Y-%m-%d')
    this_minute = now.strftime('%Y-%m-%d %H:%M')
    if usage['date'] != today:
        usage['date'] = today
        usage['requests_today'] = 0
        usage['input_tokens_today'] = 0
        usage['output_tokens_today'] = 0
        usage['total_cost_today'] = 0.0
    if usage['minute'] != this_minute:
        usage['minute'] = this_minute
        usage['requests_this_minute'] = 0
        usage['tokens_this_minute'] = 0
    usage['requests_today'] += 1
    usage['requests_this_minute'] += 1
    usage['tokens_this_minute'] += input_tokens
    usage['input_tokens_today'] += input_tokens
    usage['output_tokens_today'] += output_tokens
    if 'total_cost_today' not in usage:
        usage['total_cost_today'] = 0.0
    if cost is not None:
        usage['total_cost_today'] += cost
    save_usage(usage, usage_file)
    return usage

def enforce_limits(usage, paid_mode=False, free_rpm=15, free_tpm=1_000_000):
    if usage['requests_today'] >= 1500:
        msg = f"Daily request limit (1500) reached."
        if paid_mode:
            print(f"WARNING: {msg}")
        else:
            print(f"{msg} Exiting.")
            exit(1)

    if usage['requests_this_minute'] >= free_rpm:
        msg = f"Minute request limit ({free_rpm}) reached."
        if paid_mode:
            print(f"WARNING: {msg}")
        else:
            print(f"{msg} Waiting...")
            while True:
                time.sleep(1)
                now = datetime.now().strftime('%Y-%m-%d %H:%M')
                if now != usage['minute']:
                    break

    if usage['tokens_this_minute'] >= free_tpm:
        msg = f"Minute token limit ({free_tpm}) reached."
        if paid_mode:
            print(f"WARNING: {msg}")
        else:
            print(f"{msg} Waiting...")
            while True:
                time.sleep(1)
                now = datetime.now().strftime('%Y-%m-%d %H:%M')
                if now != usage['minute']:
                    break


In [4]:
import google.generativeai as genai

# Load API key from file and configure Gemini
with open("api_key_paid.txt", "r") as f:
    api_key = f.read().strip()

genai.configure(api_key=api_key)

In [6]:
# =============================================
# 📓 1. StructuringAgent – Building Learning Skeleton
# =============================================

# Create Ingestion Pipeline
def create_ingestion_pipeline():
    document_store = InMemoryDocumentStore()
    pdf_converter = PyPDFToDocument()
    splitter = DocumentSplitter(split_by="word", split_length=300, split_overlap=50)
    writer = DocumentWriter(document_store=document_store)

    pipeline = Pipeline()
    pipeline.add_component("converter", pdf_converter)
    pipeline.add_component("splitter", splitter)
    pipeline.add_component("writer", writer)

    pipeline.connect("converter", "splitter")
    pipeline.connect("splitter", "writer")

    return pipeline, document_store

# Build Prompt for Learning Skeleton
def build_learning_skeleton_prompt():
    prompt_template = """
    You are an expert curriculum designer.
    Given extracted document chunks, generate a Learning Path structure:

    JSON Format Only:
    {
      "sections": [
        {
          "section_id": "S1",
          "title": "<Section Title>",
          "brief": "<Short 2–3 line description>",
          "subsections": [
            {
              "subsection_id": "S1.1",
              "title": "<Subsection Title>",
              "brief": "<Short 2–3 line description>"
            }
          ]
        }
      ]
    }

    Chunks:
    {% for document in documents %}
    {{ document.content }}
    {% endfor %}
    """
    return PromptBuilder(template=prompt_template)

# Call Gemini to generate learning skeleton
def call_gemini_learning_skeleton(prompt_text, usage_file="gemini_usage.json"):
    usage = load_usage(usage_file)
    enforce_limits(usage, paid_mode=PAID_MODE)
    time.sleep(4)  # Smart delay for free tier
    response = model.generate_content(prompt_text)
    output_text = response.text
    track_and_update_usage(prompt_text, output_text, usage_file, paid_mode=PAID_MODE)
    return output_text

def track_and_update_usage(prompt_text, output_text, usage_file="gemini_usage.json", paid_mode=True):
    input_tokens = estimate_tokens(prompt_text)
    output_tokens = estimate_output_tokens(output_text)

    if paid_mode:
        pricing = {'input': 0.15/1_000_000, 'output': 0.60/1_000_000}
    else:
        pricing = {'input': 0.0, 'output': 0.0}

    cost = input_tokens * pricing['input'] + output_tokens * pricing['output']
    usage = load_usage(usage_file)
    update_usage(usage, input_tokens, output_tokens, usage_file, cost)

# Run StructuringAgent
def run_structuring_agent(pdf_path):
    ingestion_pipeline, document_store = create_ingestion_pipeline()
    ingestion_pipeline.run({"converter": {"sources": [pdf_path]}})
    all_documents = document_store.filter_documents()

    prompt_builder = build_learning_skeleton_prompt()
    prompt = prompt_builder.run({"documents": all_documents})["prompt"]

    print("Calling Gemini to generate Learning Skeleton...")
    gemini_response = call_gemini_learning_skeleton(prompt)

    try:
        structured_output = json.loads(gemini_response)
    except json.JSONDecodeError:
        print("Gemini response was not valid JSON. Printing raw output:")
        print(gemini_response)
        return None

    print(json.dumps(structured_output, indent=2, ensure_ascii=False))

    with open("learning_path_skeleton.json", "w", encoding="utf-8") as f:
        json.dump(structured_output, f, indent=2, ensure_ascii=False)

    print("✅ Learning Skeleton saved as 'learning_path_skeleton.json'")
    return structured_output

# Example usage (uncomment to test):
run_structuring_agent("mi-intro.pdf")


Calling Gemini to generate Learning Skeleton...
Gemini response was not valid JSON. Printing raw output:
Okay, I'm ready. Please provide the document chunks you want me to use to generate the Learning Path structure in JSON format. I will analyze the chunks and create a logical learning progression with appropriate sections, subsections, titles, and briefs.



In [None]:
# =============================================
# 📓 2. ContentExtractionAgent – Retrieving Full Detailed Content
# =============================================

def run_content_extraction_agent(document_store, skeleton_json_path="learning_path_skeleton.json"):
    print("🔲 ContentExtractionAgent is a placeholder for now.")
    print("Will retrieve detailed content for each section/subsection based on skeleton.")


In [None]:
# =============================================
# 📓 3. ContentSkillAssessorAgent (First Round)
# =============================================

def run_content_skill_assessor_agent():
    print("🔲 ContentSkillAssessorAgent is a placeholder for now.")
    print("Will generate pre-learning content skill assessments.")


In [None]:
# =============================================
# 📓 4. AptitudeSkillAssessorAgent
# =============================================

def run_aptitude_skill_assessor_agent():
    print("🔲 AptitudeSkillAssessorAgent is a placeholder for now.")
    print("Will create aptitude tests independent of content.")


In [None]:
# =============================================
# 📓 5. CustomizedLearningPathAgent (First Pass)
# =============================================

def run_customized_learning_path_agent():
    print("🔲 CustomizedLearningPathAgent is a placeholder for now.")
    print("Will create a customized learning path based on content + aptitude assessment results.")


In [None]:
# =============================================
# 📓 6. QuizAgent
# =============================================

def run_quiz_agent():
    print("🔲 QuizAgent is a placeholder for now.")
    print("Will generate quizzes matching the user's skill level based on customized learning path.")


In [None]:
# =============================================
# 📓 7. TooltipAgent
# =============================================

def run_tooltip_agent():
    print("🔲 TooltipAgent is a placeholder for now.")
    print("Will create glossary tooltips for important terms based on the customized learning path.")


In [None]:
# =============================================
# 📓 8. AssistantAgent
# =============================================

def run_assistant_agent():
    print("🔲 AssistantAgent is a placeholder for now.")
    print("Will create a Section-specific Q&A chatbot to assist the user during learning.")


In [None]:
# =============================================
# 📓 9. ContentSkillAssessorAgent (Reassessment Round)
# =============================================

def run_content_skill_reassessor_agent():
    print("🔲 ContentSkillAssessorAgent (Reassessment) is a placeholder for now.")
    print("Will reassess the user's mastery level after completing the first learning path.")


In [None]:
# =============================================
# 📓 10. CustomizedLearningPathAgent (Iteration Phase)
# =============================================

def run_learning_path_iteration_agent():
    print("🔲 CustomizedLearningPathAgent (Iteration Phase) is a placeholder for now.")
    print("Will adapt and modify the learning path based on reassessment results if needed.")
