### ================================================================
### AI Tutor Pipeline - Step 3: Personalized Learning Path Builder
### ---------------------------------------------------------------
### This script:
###   - Matches curriculum topics to PDF content
###   - Simulates (or loads) quiz results for a user
###   - Computes the user learning zone (beginner/advanced, fast/slow)
###   - Calculates review pace per topic
###   - Outputs a personalized learning path (JSON) with pacing info
### ================================================================


In [99]:
import json
import pandas as pd
import random
from collections import defaultdict
import re
from difflib import SequenceMatcher
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### ========== 1. Load Curriculum Skeleton ==========

In [23]:
with open("json_files/learning_path_skeleton_5th.json", "r", encoding="utf-8") as f:
    skeleton = json.load(f)

### ========== 2. Chunk PDF ==========

In [24]:
PDF_PATH = "files/science-textbook-grade-5.pdf"   # <-- update this!
loader = PyMuPDFLoader(PDF_PATH)
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
chunks = splitter.split_documents(docs)  # List of Document objects


### ========== 3. Content Matching Functions ==========

In [25]:
def normalize(text):
    return re.sub(r'[^a-z0-9 ]', '', text.lower())

def fuzzy_ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()

def find_matching_content(topic, chunks):
    norm_title = normalize(topic)
    candidates = []
    # 1. Exact normalized substring match
    for c in chunks:
        norm_chunk = normalize(c.page_content)
        if norm_title in norm_chunk:
            return c.page_content
    # 2. Partial word overlap: at least 60% of words
    title_words = set(norm_title.split())
    for c in chunks:
        chunk_words = set(normalize(c.page_content).split())
        if len(title_words & chunk_words) >= max(1, int(0.6 * len(title_words))):
            candidates.append(c.page_content)
    if candidates:
        return max(candidates, key=len)
    # 3. Fuzzy matching: best ratio
    best_score = 0
    best_chunk = ""
    for c in chunks:
        norm_chunk = normalize(c.page_content)
        score = fuzzy_ratio(norm_title, norm_chunk)
        if score > best_score:
            best_score = score
            best_chunk = c.page_content
    if best_score > 0.5:
        return best_chunk
    # 4. Fallback
    print(f"Warning: No content found for topic '{topic}'")
    return ""

### ========== 4. Topic Extraction & Mapping ==========

In [26]:
def extract_topics(skeleton):
    topics = []
    for section in skeleton["sections"]:
        topics.append(section["title"])  # Always add chapter/section
        for sub in section.get("subsections", []):
            topics.append(sub["title"])
            for st in sub.get("sub_titles", []):
                topics.append(st)
    return topics  # NO set() so duplicate sub-title topics show in both chapter & sub

topics = extract_topics(skeleton)
dependency_map = {t: [] for t in topics}  # No prereqs for now


In [27]:
# def get_topic_text_map(topics, chunks):
#     topic_text_map = {}
#     for topic in topics:
#         matched_chunks = [c.page_content for c in chunks if topic.lower() in c.page_content.lower()]
#         topic_text_map[topic] = "\n".join(matched_chunks)
#     return topic_text_map

# topic_text_map = get_topic_text_map(topics, chunks)

def get_topic_text_map(topics, chunks):
    topic_text_map = {}
    for topic in topics:
        topic_text_map[topic] = find_matching_content(topic, chunks)
    return topic_text_map

topic_text_map = get_topic_text_map(topics, chunks)

### ========== 5. Content Type Analysis ==========

In [28]:
def detect_content_types(text):
    code_lines = 0
    math_lines = 0
    table_lines = 0
    word_count = 0
    for line in text.splitlines():
        line_stripped = line.strip()
        # Code detection (SQL, Python, general)
        if (line_stripped.upper().startswith(("SELECT", "INSERT", "UPDATE", "DELETE", "CREATE", "ALTER", "DROP", "def ", "class ", "import ", "#", "--")) or
            line_stripped.endswith(";") or
            line_stripped.startswith("```")):
            code_lines += 1
        # Math detection
        if (any(s in line_stripped for s in ("$", "\\(", "\\)", "=", "+", "-", "*", "/")) and
            sum(1 for c in line_stripped if c in "=+-*/") >= 2):
            math_lines += 1
        # Table detection
        if (line_stripped.count('|') >= 2 or line_stripped.count(',') >= 4 or '\t' in line_stripped):
            table_lines += 1
        word_count += len(line_stripped.split())
    return word_count, code_lines, math_lines, table_lines


### ========== 6. Simulate (or Load) Quiz Results ==========

In [29]:
with open("quiz_files/filtered_quiz_5th.json", "r", encoding="utf-8") as f:
    quiz_data = json.load(f)
questions = quiz_data["quiz"]

In [75]:
def generate_user_answers(questions, topics_to_get_wrong=None, randomize=False):
    answers = []
    for q in questions:
        topic = q.get("subsection_title") or q.get("sub_title") or q.get("section_title") or "Unknown"
        if randomize:
            chosen = random.choice(['A', 'B', 'C', 'D'])
        elif topics_to_get_wrong and topic in topics_to_get_wrong:
            options = ['A', 'B', 'C', 'D']
            if q['answer'] in options:
                options.remove(q['answer'])
            chosen = random.choice(options)
        else:
            chosen = q['answer']
        answers.append({
            "question": q["question"],
            "topic": topic,
            "selected_answer": chosen,
            "correct_answer": q["answer"],
            "correct": (chosen == q["answer"]),
            "difficulty": q.get("difficulty", "unknown")
        })
    return answers

user_quiz_results = generate_user_answers(questions, randomize=True)  # Set to False for real user input


### ========== 7. User Zone Inference ==========

In [89]:
def learning_zone(aptitude, quiz, threshold=70):
    if aptitude >= threshold and quiz >= threshold:
        return "fast-learner and advanced"
    elif aptitude >= threshold and quiz < threshold:
        return "fast-learner and beginner"
    elif aptitude < threshold and quiz >= threshold:
        return "slow-learner and advanced"
    else:
        return "slow-learner and beginner"

correct = sum(1 for q in user_quiz_results if q["correct"])
# quiz_score = correct / len(user_quiz_results) * 100
quiz_score = 80
print("Quiz score:", quiz_score)
aptitude_score = 80  # Or real user score
user_zone = learning_zone(aptitude_score, quiz_score)
print("User zone:", user_zone)


Quiz score: 80
User zone: fast-learner and advanced


### ========== 8. Pace Estimation Functions ==========

In [90]:
def calculate_pace(word_count, code_lines, math_lines, table_lines, user_zone, missed_mcqs=0):
    base_wpm = 200
    base_time_min = (word_count / base_wpm) + (code_lines * 0.7) + (math_lines * 1) + (table_lines * 1)
    zone_multiplier = {
        "fast-learner and advanced": 0.7,
        "fast-learner and beginner": 1.0,
        "slow-learner and advanced": 1.15,
        "slow-learner and beginner": 1.3
    }[user_zone]
    review_time = missed_mcqs * 5
    total_minutes = base_time_min * zone_multiplier + review_time
    if total_minutes < 60:
        return f"{round(total_minutes)} min"
    elif total_minutes < 8 * 60:
        return f"{round(total_minutes/60,1)} hours"
    else:
        return f"{round(total_minutes/60/8,1)} days"


In [97]:
def get_prereqs(topic, deps_map, seen=None):
    if seen is None:
        seen = set()
    for pre in deps_map.get(topic, []):
        if pre not in seen:
            seen.add(pre)
            get_prereqs(pre, deps_map, seen)
    return seen

In [98]:
def build_learning_path_with_prereqs(quiz_results, user_zone, deps_map, topic_text_map):
    perf = defaultdict(list)
    for q in quiz_results:
        perf[q["topic"]].append(q)
    review_needed = set()
    missed_mcq_map = {}
    for topic, qs in perf.items():
        missed = [q for q in qs if not q["correct"]]
        if missed:
            review_needed.add(topic)
            review_needed.update(get_prereqs(topic, deps_map))
            missed_mcq_map[topic] = len(missed)
    learning_path = []
    for topic in deps_map.keys():
        text = topic_text_map.get(topic, "")
        word_count, code_lines, math_lines, table_lines = detect_content_types(text)
        missed_mcqs = missed_mcq_map.get(topic, 0)
        needs_review = topic in review_needed
        pace = calculate_pace(word_count, code_lines, math_lines, table_lines, user_zone, missed_mcqs)
        learning_path.append({
            "topic": topic,
            "pace": pace,
            "review_needed": needs_review,
            "missed_mcqs": missed_mcqs,
            "word_count": word_count,
            "code_lines": code_lines,
            "math_lines": math_lines,
            "table_lines": table_lines
        })
    return learning_path

learning_path = build_learning_path_with_prereqs(
    user_quiz_results,
    user_zone,
    dependency_map,
    topic_text_map
)

### ========== 9. Output Personalized Learning Path ==========

In [94]:
df = pd.DataFrame(learning_path)
# print(df)
learning_path_payload = {
    "user_zone": user_zone,
    "learning_path": learning_path
}
with open("path/learning_path_personalized_5th.json", "w", encoding="utf-8") as f:
    json.dump(learning_path_payload, f, indent=2, ensure_ascii=False)


### ========== 10. Chapter-Wise Pace Rollup ==========

In [95]:
def pace_to_minutes(pace_str):
    if "min" in pace_str:
        return int(pace_str.split()[0])
    elif "hour" in pace_str:
        return int(float(pace_str.split()[0]) * 60)
    elif "day" in pace_str:
        return int(float(pace_str.split()[0]) * 8 * 60)
    else:
        return 0

chapter_times = []
for section in skeleton["sections"]:
    chapter_title = section["title"]
    total_minutes = 0

    # --- NEW: Chapter-level content (section title itself)
    topic_row = next((t for t in learning_path if t["topic"] == chapter_title), None)
    if topic_row:
        total_minutes += pace_to_minutes(topic_row["pace"])

    for sub in section.get("subsections", []):
        topic = sub.get("title")
        topic_row = next((t for t in learning_path if t["topic"] == topic), None)
        if topic_row:
            total_minutes += pace_to_minutes(topic_row["pace"])
        # If you want to include sub_titles, you can loop those here too!

    if total_minutes < 60:
        pace_str = f"{total_minutes} min"
    elif total_minutes < 8 * 60:
        pace_str = f"{round(total_minutes / 60, 1)} hours"
    else:
        pace_str = f"{round(total_minutes / 60 / 8, 1)} days"
    chapter_times.append({"chapter": chapter_title, "pace": pace_str})
df_chapter = pd.DataFrame(chapter_times)
print("\nCHAPTER-WISE ESTIMATED PACE:")
print(df_chapter)



CHAPTER-WISE ESTIMATED PACE:
                         chapter    pace
0     Structure of Living Things   8 min
1  Plant Structure and Functions  13 min
2             Human Body Systems  21 min
3                  Earth's Water   5 min
4                Earth's Weather   6 min
5               The Solar System   9 min
6                Types of Matter  16 min
7              Changes in Matter   4 min
