In [7]:
# <Full script with requested updates>

# Complete integrated scraper with screenshot-on-failure and robust click handling.
# Paste into one Jupyter cell and run. Then call `df = await run_scraping(example)` in another cell.

import os
import re
import json
import time
import asyncio
import traceback
from pathlib import Path
from typing import List, Dict, Set
from urllib.parse import (
    urlparse, urljoin, urlunparse, parse_qsl, urlencode, unquote
)

import pandas as pd

from playwright.async_api import (
    async_playwright,
    Page,
    Error as PlaywrightError,
    TimeoutError as PlaywrightTimeoutError,
)


# ------------------------- CONFIG -------------------------

HEADLESS = False
PAGE_LOAD_TIMEOUT = 60000
EXTRA_WAIT_MS = 2000

OUTPUT_EXCEL = "accreditation_links_2_0_copy_copy.xlsx"
EXCEL_INPUT_FILE = "Engineering_Colleges_2875_valid_URL_part_1.xlsx"
SHEET_NAME = "Sheet1"

# ------------------------- GLOBAL EXECUTION CONTEXT -------------------------

CURRENT_COLLEGE_CONTEXT = {
    "college_name": None,
    "url": None,
}

# ------------------------- KEYWORDS (UPDATED) -------------------------
'''
CATEGORY_KEYWORDS = {
    "mandatory_disclosure": [
        "mandatory","mandatory disclosure",
        "mandatory_disclosure", "mandatory_disclosures",
        "statutory_disclosure", "statutory_disclosures",
        "aicte_mandatory_disclosure", "aicte_disclosure",
        "mandatory_discloser", "mandatory_discloure",
        "disclosure",
    ],
    "naac": [
        "naac", "assessment & accreditation", "assessment and accreditation","assessment_accreditation","assessment_and_accreditation"
        "naac_accreditation","ssr","dvv","nacc","qim","qnm","nacc"
    ],
    "nba": [
        "nba", "national board of accreditation","national_board_of_accreditation",
        "nba_accreditation",
    ],
    "nirf": [
        "nirf", "national institutional ranking framework", "nirf ranking","national_institutional_ranking_framework", "nirf_ranking",
        "nirf india ranking",
    ],
    "iqac": [
        "iqac", "internal quality assurance cell", "quality_assurance_cell", 
        "quality assurance cell", "quality assurance committee",
    ],
    "aicte": [
        "aicte", "all_india_council_for_technical education",
        "aicte approval", "aicte extension of approval",
    ],
    "aqar": [
        "aqar", "annual quality assurance report",
    ],
    "ariia": [
        "ariia", "atal ranking","atal ranking of institutions on innovation achievements", "atal"
    ],
    "accreditation": [
        "accreditation", "accredited", "accreditations", "download","affiliation"
    ],
    # explicit criteria category
    "criteria": [
        "crit","criteria", "criteria1", "criteria2", "criteria3", "criteria4", "criteria5", "criteria6", "criteria7"

    ],
    "criteria_1": ["criteria-1", "criteria 1", "criterion 1","criterion1","criteria1","criterion-1"], #,"1.2.1","1.2.2","1.3.2","1.4.1"
    "criteria_2": ["criteria-2", "criteria 2", "criterion 2","criterion2","criteria2","criterion-2"], #,"2.1.1","2.1.2","2.2.1","2.4.1","2.4.2","2.6.3"
    "criteria_3": ["criteria-3", "criteria 3", "criterion 3","criterion3","criteria3","criterion-3"], #,"3.1.1","3.2.2","3.3.1","3.3.2",'3.4.3',"3.5.1"
    "criteria_4": ["criteria-4", "criteria 4", "criterion 4","criterion4","criteria4","criterion-4"], #,"4.1.2","4.3.2","4.4.1"
    "criteria_5": ["criteria-5", "criteria 5", "criterion 5","criterion5","criteria5","criterion-5"], #,"5.1.1","5.2.1","5.1.3","5.1.4","5.2.1","5.2.2","5.3.1","5.3.2"
    "criteria_6": ["criteria-6", "criteria 6", "criterion 6","criterion6","criteria6","criterion-6"], #,"6.2.2","6.3.2","6.3.3","6.5.2"
    "criteria_7": ["criteria-7", "criteria 7", "criterion 7","criterion7","criteria7","criterion-7"], #,"7.1.2","7.1.3"
}

''' 
CATEGORY_KEYWORDS = {
    "mandatory_disclosure": ["mandatory", "mandatory disclosure", "mandatory_disclosure", "mandatory disclosures", "mandatory_disclosures", "statutory disclosure", "statutory_disclosure", "statutory disclosures", "statutory_disclosures", "aicte mandatory disclosure", "aicte_mandatory_disclosure", "aicte disclosure", "aicte_disclosure", "mandatory discloser", "mandatory_discloser", "mandatory discloure", "mandatory_discloure", "disclosure"],

    "naac": ["naac", "naac accreditation", "naac_accreditation", "assessment & accreditation", "assessment and accreditation", "assessment_accreditation", "assessment_and_accreditation", "ssr", "dvv", "nacc", "qim", "qnm"],

    "nba": ["nba", "nba accreditation", "nba_accreditation", "national board of accreditation", "national_board_of_accreditation"],

    "nirf": ["nirf", "nirf ranking", "nirf_ranking", "nirf india ranking", "nirf_india_ranking", "national institutional ranking framework", "national_institutional_ranking_framework"],

    "iqac": ["iqac", "internal quality assurance cell", "internal_quality_assurance_cell", "quality assurance cell", "quality_assurance_cell", "quality assurance committee", "quality_assurance_committee"],

    "aicte": ["aicte", "all india council for technical education", "all_india_council_for_technical_education", "aicte approval", "aicte_approval", "aicte extension of approval", "aicte_extension_of_approval"],

    "aqar": ["aqar", "annual quality assurance report", "annual_quality_assurance_report"],

    "ariia": ["ariia", "atal", "atal ranking", "atal_ranking", "atal ranking of institutions on innovation achievements", "atal_ranking_of_institutions_on_innovation_achievements"],

    "accreditation": ["accreditation", "accredited", "accreditations", "download", "affiliation"],

    "criteria": ["crit", "criteria", "criterion", "criteria1", "criteria2", "criteria3", "criteria4", "criteria5", "criteria6", "criteria7"],

    "criteria_1": ["criteria 1", "criteria-1", "criteria_1", "criterion 1", "criterion-1", "criterion_1", "criterion1", "criteria1"],

    "criteria_2": ["criteria 2", "criteria-2", "criteria_2", "criterion 2", "criterion-2", "criterion_2", "criterion2", "criteria2"],

    "criteria_3": ["criteria 3", "criteria-3", "criteria_3", "criterion 3", "criterion-3", "criterion_3", "criterion3", "criteria3"],

    "criteria_4": ["criteria 4", "criteria-4", "criteria_4", "criterion 4", "criterion-4", "criterion_4", "criterion4", "criteria4"],

    "criteria_5": ["criteria 5", "criteria-5", "criteria_5", "criterion 5", "criterion-5", "criterion_5", "criterion5", "criteria5"],

    "criteria_6": ["criteria 6", "criteria-6", "criteria_6", "criterion 6", "criterion-6", "criterion_6", "criterion6", "criteria6"],

    "criteria_7": ["criteria 7", "criteria-7", "criteria_7", "criterion 7", "criterion-7", "criterion_7", "criterion7", "criteria7"],
}

CRITERION_METRIC_REGEX = re.compile(
    r"""
    (?:
        \b([1-7])\.\d+\.\d+\b |          # 2.6.3 , 7.1.11
        \b([1-7])[\-_ ]\d+[\-_ ]\d+\b |  # 2-6-3 , 2_6_3
        \b([1-7])\.\d+\b |               # 2.6 (key indicator)
        metric\s*[-:]?\s*(\d{3,4}) |     # Metric 263 / 7211
        criterion\s*[-‚Äì]?\s*(i{1,3}|iv|v|vi|vii)  # Criterion IV
    )
    """,
    re.IGNORECASE | re.VERBOSE
)

ROMAN_TO_INT = {
    "i":1,"ii":2,"iii":3,"iv":4,
    "v":5,"vi":6,"vii":7
}


OUTPUT_DIR = Path("output_college_info_6_7")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
ERROR_LOG_FILE = OUTPUT_DIR / "failed_colleges_errors.log"

def log_college_error(college_name: str, base_url: str, exc: Exception):
    """
    Append any college-level error to a persistent log file.
    Never raises.
    """
    try:
        with open(ERROR_LOG_FILE, "a", encoding="utf-8") as f:
            f.write("\n" + "=" * 100 + "\n")
            f.write(f"TIMESTAMP : {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"COLLEGE   : {college_name}\n")
            f.write(f"BASE URL  : {base_url}\n")
            f.write("ERROR     :\n")
            f.write("".join(traceback.format_exception(type(exc), exc, exc.__traceback__)))
            f.write("\n")
    except Exception:
        # Logging must NEVER break execution
        pass


def infer_criteria_column(text: str, url: str) -> str | None:
    combined = f"_{text}_{url}_".lower()
    m = CRITERION_METRIC_REGEX.search(combined)
    if not m:
        return None

    # Roman numerals
    for r, v in ROMAN_TO_INT.items():
        if r in m.group(0).lower():
            return f"criteria_{v}"

    # Numeric cases
    digit = re.search(r"[1-7]", m.group(0))
    if digit:
        return f"criteria_{digit.group()}"

    return None

IQAC_ADMIN_SUBSTRINGS = (
    # governance
    "committ", "member", "compos", "struct",
    "chair", "coordin", "authorit", "power", "duti","strateg",

    # philosophy / intent
    "vision", "mission", "object", "goal",

    # planning / monitoring
    "plan", "monitor", "evaluat",

    # functions / operations
    "function", "activ", "process", "mechan",

    # quality language
    "enhanc", "improv",

    # policies / systems
    "framework", "internal",

    # feedback / stakeholders
    "stakehold",

    # meetings / records (non-doc)
    "minute", "meet",

    # events / training
    "workshop", "seminar", "training","benefit","development",

)


def setup_asyncio_exception_logger():
    loop = asyncio.get_event_loop()

    def handle_async_exception(loop, context):
        exc = context.get("exception")
        msg = context.get("message", "Async exception occurred")

        college = CURRENT_COLLEGE_CONTEXT.get("college_name")
        url = CURRENT_COLLEGE_CONTEXT.get("url")

        try:
            with open(ERROR_LOG_FILE, "a", encoding="utf-8") as f:
                f.write("\n" + "=" * 100 + "\n")
                f.write(f"TIMESTAMP : {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"COLLEGE   : {college or 'UNKNOWN'}\n")
                f.write(f"URL       : {url or 'UNKNOWN'}\n")
                f.write("ASYNC ERROR:\n")
                f.write(msg + "\n")

                if exc:
                    f.write(
                        "".join(
                            traceback.format_exception(
                                type(exc), exc, exc.__traceback__
                            )
                        )
                    )
        except Exception:
            pass  # logging must never break execution

    loop.set_exception_handler(handle_async_exception)



'''
NAAC_INTERNAL_KEYWORDS = [
    "iqac",
    "iiqa",
    "ssr",
    "dvv",
    "qif",
    "quality indicator framework",
    "extended profile",
    "criterion",
    "criteria",
    "metric",
    "cycle",
]
'''
'''
NAVIGATION_KEYWORDS = [
    "naac", "nba", "nirf", "iqac",
    "mandatory", "disclosure", "accreditation", "ariia", "ranking",
    "quality", "aicte", "aqar", "annual quality assurance",
]

INTERESTING_KEYWORDS_IN_URL = [
    "naac", "nba", "nirf", "aicte", "aqar",
    "accreditation", "accreditations", "accredited",
    "approval", "approvals", "extension-of-approval",
    "mandatory", "disclosure", "disclosures", "statutory",
    "iqac", "internal-quality-assurance", "quality-assurance",
    "annual-quality-assurance-report", "annual-report",
    "aqar", "report", "reports",
    "ariia", "atal-ranking", "innovation-ranking",
    "link ", "links ", "quick-link", "quick-links", "quick links",
    "useful-links", "important-links",
    "download", "downloads", "forms", "notices", "notice",
    "circular", "circulars", "document", "documents",
    "ranking", "nirf-ranking", "nirf-india-ranking",
    "certificate", "certificates", "approval-letter",
    "brochure", "prospectus", "placement", "placements",
    "fee-structure", "fees",
]
'''
# ------------------------- GLOBAL STATE -------------------------
PROGRESS_ROWS = {}
#GLOBAL_URL_OWNERS = {}
GLOBAL_SEEN_URLS = set()
GLOBAL_SEEN_ARTIFACTS = set()   # typed dedupe keys
GLOBAL_SEEN_PAGES = set()      # raw normalized page URLs

GLOBAL_EXCEL_URLS = set()
GLOBAL_BFS_URLS = set()

from collections import defaultdict

DOMAIN_FAILURE_COUNT = defaultdict(int)
DOMAIN_SEMAPHORES = defaultdict(lambda: asyncio.Semaphore(1))




def should_accept_context(existing: str | None, new: str) -> bool:
    if existing is None:
        return True

    existing = existing.strip().lower()
    if existing in ("", "network pdf", "pdf document", "iframe pdf"):
        return True

    return False


# ------------------------- URL NORMALIZATION / HELPERS -------------------------
'''
async def reveal_hidden_dom(page):
    """
    Click elements that ONLY reveal DOM (menus, accordions),
    NOT expecting PDFs.
    """
    revealers = page.locator(
        '[data-bs-toggle="collapse"],'
        '[aria-expanded="false"],'
        'details > summary'
    )

    for i in range(await revealers.count()):
        el = revealers.nth(i)
        try:
            fingerprint = await get_element_fingerprint(el)
            if fingerprint in page._clicked_elements:
                continue

            try:
                if not await el.is_visible():
                    continue
                await el.scroll_into_view_if_needed(timeout=1500)
                await el.click(timeout=1500)
                await safe_load_wait(page)
            except Exception:
                continue


            page._clicked_elements.add(fingerprint)
            await page.wait_for_timeout(300)
        except Exception:
            continue
'''

TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
    "fbclid", "gclid", "mc_cid", "mc_eid"
}

def is_iqac_admin_page(combined: str) -> bool:
    combined = combined.lower()

    if "iqac" not in combined:
        return False

    iqac_pos = combined.find("iqac")

    for sub in IQAC_ADMIN_SUBSTRINGS:
        sub_pos = combined.find(sub)

        if sub_pos != -1 and abs(sub_pos - iqac_pos) < 50:
            return True

    return False


def normalize_url(url: str, text: str = "") -> str:
    if not url:
        return url
    try:
        p = urlparse(url)
    except Exception:
        return url

    # üîí Force HTTPS to avoid http/https duplicates
    scheme = p.scheme or "http"

    # üîí Normalize domain
    netloc = (p.netloc or "").lower().lstrip("www.")

    path = p.path or "/"

    # üîí Collapse multiple slashes
    while "//" in path:
        path = path.replace("//", "/")

    # üîí Remove trailing slash except root
    if path.endswith("/") and path != "/":
        path = path.rstrip("/")

    try:
        qs = parse_qsl(p.query, keep_blank_values=True)
        qs = [(k, v) for (k, v) in qs if k not in TRACKING_PARAMS]
        qs.sort()
        query = urlencode(qs, doseq=True)
    except Exception:
        query = ""

    return urlunparse((
        scheme,
        netloc,
        path,
        "",      # params
        query,
        ""       # fragment
    ))



def canonicalize_for_dedupe(url: str) -> str:
    """
    Canonical URL used ONLY for deduplication (BFS + Excel).
    
    Rule:
    - Normalize URL
    - If path ends with .php AND there is NO query string ‚Üí strip .php
    - If query string exists ‚Üí DO NOT strip .php
    """
    if not url:
        return url

    norm = normalize_url(url, "")

    try:
        p = urlparse(norm)

        # üîí If query parameters exist ‚Üí do NOT collapse
        if p.query:
            return norm

        path = p.path or ""

        # üîí Collapse ONLY pure .php endings
        if path.lower().endswith(".php"):
            path = path[:-4] or "/"

        return urlunparse((
            "https",                 # üîí force single scheme
            p.netloc,
            path,
            "",
            "",
            ""
        ))


    except Exception:
        return norm


def make_dedupe_key(url: str, bucket: str | None = None) -> str:
    """
    Dedupe must depend on semantic type.
    bucket: 'pdf' | 'nonpdf' | None
    """
    norm = normalize_url(url, "")
    '''
    if bucket:
        return f"{bucket}::{norm}"
    '''
    return norm

def is_same_domain(base_url: str, target_url: str) -> bool:
    try:
        base = urlparse(base_url)
        target = urlparse(target_url)
        base_netloc = base.netloc.lower().lstrip("www.")
        target_netloc = target.netloc.lower().lstrip("www.")

        return target_netloc == "" or target_netloc == base_netloc
    except Exception:
        return False
'''
def claim_url_once(url: str, text: str, category: str, depth: int) -> bool:
    norm = normalize_url(url, "")
    if norm in GLOBAL_URL_OWNERS:
        return False  # ‚ùå already owned ‚Üí do nothing

    GLOBAL_URL_OWNERS[norm] = {
        "category": category,
        "text": text,
        "depth": depth,
        "is_pdf": is_pdf(url)
    }
    return True


def should_follow_link(text: str, href: str) -> bool:
    combined = ((text or "") + " " + (href or "")).lower()
    return any(kw in combined for kw in NAVIGATION_KEYWORDS)
'''
def is_interesting_url(url: str, text: str = "") -> bool:
    if not url:
        return False
    url_stripped = url.strip()
    if url_stripped.startswith(("javascript:", "mailto:", "tel:", "#")):
        return False
    parsed = urlparse(url_stripped)
    if parsed.scheme not in ("http", "https"):
        return False
    return True
'''
def url_matches_keywords(url: str, text: str = "") -> bool:
    if not url and not text:
        return False
    combined = ((url or "") + " " + (text or "")).lower()
    for kws in CATEGORY_KEYWORDS.values():
        for kw in kws:
            if kw and kw.lower() in combined:
                return True
    for kw in INTERESTING_KEYWORDS_IN_URL:
        if kw.lower() in combined:
            return True
    return False
'''
# ------------------------- CONTEXT HELPER (RETAINED) -------------------------
# Original combined getter replaced with a getter that returns both own_text and parent_heading separately.
'''
def accept_url(norm_url: str) -> bool:
    """
    Global BFS-level dedupe.
    Raw URLs are never touched here.
    """
    if norm_url in GLOBAL_SEEN_URLS:
        return False
    GLOBAL_SEEN_URLS.add(norm_url)
    return True

def accept_url(dedupe_key: str) -> bool:
    if dedupe_key in GLOBAL_SEEN_URLS:
        return False
    GLOBAL_SEEN_URLS.add(dedupe_key)
    return True
''' 
def accept_url(dedupe_key: str) -> bool:
    if dedupe_key in GLOBAL_SEEN_ARTIFACTS:
        return False
    GLOBAL_SEEN_ARTIFACTS.add(dedupe_key)
    return True


def detect_criteria_category(text: str, url: str):
    combined = f"_{text}_{url}_".lower()

    for i in range(1, 8):
        if re.search(rf"\b(criteria|criterion)[\s\-]*{i}\b", combined):
            return f"criteria_{i}"

    if "criteria" in combined or "criterion" in combined:
        return "criteria"

    return None

GET_ELEMENT_OWN_AND_PARENT_JS = r"""
(el) => {

    function safeText(n) {
        try {
            return (
                typeof n?.innerText === 'string'
                    ? n.innerText
                    : (n?.textContent || '')
            ).trim();
        } catch (e) {
            return '';
        }
    }

    function safeClass(n) {
        try {
            if (typeof n?.className === 'string') return n.className.toLowerCase();
            if (n?.className && typeof n.className.baseVal === 'string')
                return n.className.baseVal.toLowerCase();
        } catch (e) {}
        return '';
    }

    function safeTag(n) {
        try {
            return n?.tagName ? n.tagName.toUpperCase() : '';
        } catch (e) {
            return '';
        }
    }

    function findHeading(e) {
        let p = e;

        // üîç walk up parents (guarded)
        for (let depth = 0; depth < 5 && p; depth++) {
            const tag = safeTag(p);
            if (/H[1-6]/.test(tag)) {
                const txt = safeText(p);
                if (txt) return txt;
            }

            const cls = safeClass(p);
            if (
                cls &&
                (
                    cls.includes('title') ||
                    cls.includes('heading') ||
                    cls.includes('section') ||
                    cls.includes('content') ||
                    cls.includes('course') ||
                    cls.includes('post')
                )
            ) {
                const txt = safeText(p);
                if (txt) return txt;
            }

            if (p.getAttribute) {
                const aria = (p.getAttribute('aria-label') || '').trim();
                if (aria) return aria;
            }

            p = p.parentElement;
        }

        // üîç previous siblings
        let s = e;
        for (let i = 0; i < 5 && s; i++) {
            s = s.previousElementSibling;
            if (!s) break;

            const tag2 = safeTag(s);
            if (/H[1-6]/.test(tag2)) {
                const txt = safeText(s);
                if (txt) return txt;
            }

            const cls2 = safeClass(s);
            if (
                cls2 &&
                (
                    cls2.includes('title') ||
                    cls2.includes('heading') ||
                    cls2.includes('section') ||
                    cls2.includes('content') ||
                    cls2.includes('course') ||
                    cls2.includes('post')
                )
            ) {
                const txt = safeText(s);
                if (txt) return txt;
            }
        }

        return '';
    }

    function getOwnText(el) {
        try {
            const own = safeText(el);
            if (own) return own;

            if (el.getAttribute) {
                const title = (el.getAttribute('title') || '').trim();
                if (title) return title;

                const aria = (el.getAttribute('aria-label') || '').trim();
                if (aria) return aria;

                const alt = (el.getAttribute('alt') || '').trim();
                if (alt) return alt;

                const href = el.getAttribute('href');
                if (href) {
                    try {
                        const seg = href.split('/').pop();
                        if (seg)
                            return decodeURIComponent(seg)
                                .replace(/\+|%20/g, ' ')
                                .trim();
                    } catch (e) {}
                }
            }

            return '';
        } catch (e) {
            return '';
        }
    }

    try {
        const own = getOwnText(el);
        const heading = findHeading(el);
        return {
            own: own || '',
            heading: heading || ''
        };
    } catch (e) {
        return { own: '', heading: '' };
    }
}
"""


async def get_element_own_and_parent(elem):
    try:
        res = await safe_elem_evaluate(
            elem,
            GET_ELEMENT_OWN_AND_PARENT_JS,
            default={}
        )
        if not isinstance(res, dict):
            return "", ""
        return (res.get("own", "") or "").strip(), (res.get("heading", "") or "").strip()
    except Exception:
        try:
            own = await elem.inner_text()
            return (own or "").strip(), ""
        except Exception:
            return "", ""
        
async def get_element_fingerprint(elem):
    try:
        return await safe_elem_evaluate(
            elem,
            """
            el => {
                const role = el.getAttribute('role') || '';
                const aria = el.getAttribute('aria-label') || '';
                const txt = (
                    typeof el.innerText === 'string'
                        ? el.innerText
                        : (el.textContent || '')
                ).trim().slice(0, 50);

                const path = [];
                let p = el;
                let depth = 0;
                while (p && depth < 3) {
                    path.push(p.tagName + '.' + (p.className || ''));
                    p = p.parentElement;
                    depth++;
                }
                return [role, aria, txt, path.join('>')].join('|');
            }
            """,
            default = None
        )
    except Exception:
        return None


# ------------------------- CLICK-BASED DISCOVERY (PATCHED to attach keyword/text) -------------------------

CLICK_KEYWORDS = [
    "naac", "nba", "nirf", "aicte", "aqar", "ariia",
    "accreditation", "accredited",
    "accreditations", "certificate",
    "mandatory", "disclosure", "iqac",
    "download", "downloads", "links", "quick links",
]

GET_ELEMENT_CONTEXT_JS = r"""
(el) => {

    function safeText(n) {
        try {
            return (
                typeof n?.innerText === 'string'
                    ? n.innerText
                    : (n?.textContent || '')
            ).trim();
        } catch (e) {
            return '';
        }
    }

    function safeClass(n) {
        try {
            if (typeof n?.className === 'string') return n.className.toLowerCase();
            if (n?.className && typeof n.className.baseVal === 'string')
                return n.className.baseVal.toLowerCase();
        } catch (e) {}
        return '';
    }

    function safeTag(n) {
        try {
            return n?.tagName ? n.tagName.toUpperCase() : '';
        } catch (e) {
            return '';
        }
    }

    function findHeading(e) {
        let p = e;

        // üîç walk up parents
        for (let depth = 0; depth < 5 && p; depth++) {
            const tag = safeTag(p);
            if (/H[1-6]/.test(tag)) {
                const txt = safeText(p);
                if (txt) return txt;
            }

            const cls = safeClass(p);
            if (
                cls &&
                (
                    cls.includes('title') ||
                    cls.includes('heading') ||
                    cls.includes('section') ||
                    cls.includes('content') ||
                    cls.includes('course') ||
                    cls.includes('post')
                )
            ) {
                const txt = safeText(p);
                if (txt) return txt;
            }

            if (p.getAttribute) {
                const aria = (p.getAttribute('aria-label') || '').trim();
                if (aria) return aria;
            }

            p = p.parentElement;
        }

        // üîç previous siblings
        let s = e;
        for (let i = 0; i < 10 && s; i++) {
            s = s.previousElementSibling;
            if (!s) break;

            const tag2 = safeTag(s);
            if (/H[1-6]/.test(tag2)) {
                const txt = safeText(s);
                if (txt) return txt;
            }

            const cls2 = safeClass(s);
            if (
                cls2 &&
                (
                    cls2.includes('title') ||
                    cls2.includes('heading') ||
                    cls2.includes('section') ||
                    cls2.includes('content') ||
                    cls2.includes('course') ||
                    cls2.includes('post')
                )
            ) {
                const txt = safeText(s);
                if (txt) return txt;
            }
        }

        return '';
    }

    try {
        const own = safeText(el);
        const heading = findHeading(el);

        if (heading && own) return (heading + ' ' + own).trim();
        if (heading) return heading;
        return own;
    } catch (e) {
        try {
            return safeText(el);
        } catch (e2) {
            return '';
        }
    }
}
"""


async def get_element_context_text(elem):
    """
    Return the combined heading + element text for a Playwright element handle (locator.nth(i)).
    Falls back to element.inner_text() if evaluation fails.
    """
    try:
        txt = await safe_elem_evaluate(elem, GET_ELEMENT_CONTEXT_JS, default="")
        return (txt or "").strip()
    except Exception:
        try:
            return (await elem.inner_text()).strip()
        except Exception:
            return ""

NON_CRAWLABLE_EXTENSIONS = (
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".svg",
    ".ico",
    ".mp4", ".webm", ".avi", ".mov", ".mkv",
    ".mp3", ".wav", ".ogg",
    ".zip", ".rar", ".7z", ".tar", ".gz",
    ".xlsx", ".xls", ".csv", ".ods" 
)

def is_non_crawlable_url(url: str) -> bool:
    if not url:
        return True
    clean = url.lower().split("?", 1)[0].split("#", 1)[0]
    return clean.endswith(NON_CRAWLABLE_EXTENSIONS)



# ------------------------- DOM URL COLLECT -------------------------
async def collect_dom_urls(page):
    js = """
    () => {
        const urls = new Set();
        const addUrl = (u) => {
            if (u && typeof u === 'string') {
                urls.add(u);
            }
        };

        document.querySelectorAll("a[href]").forEach(el => addUrl(el.href));
        document.querySelectorAll("img[src]").forEach(el => addUrl(el.src));
        document.querySelectorAll("iframe[src], embed[src]").forEach(el => addUrl(el.src));
        document.querySelectorAll("object[data]").forEach(el => addUrl(el.data));

        return Array.from(urls);
    }
    """
    raw = await safe_evaluate(page, js, default=[])
    abs_urls = set()
    for u in raw:
        try:
            u = u.strip()

            if u.startswith("//"):
                u = "https:" + u
            elif u.startswith("www."):
                u = "https://" + u
            elif re.match(r"^[a-z0-9\-]+\.[a-z]{2,}", u):
                u = "https://" + u

            full = urljoin(page.url, u)
        except Exception:
            continue

        if is_interesting_url(full, ""):
            abs_urls.add(full)
    '''
    for u in raw:
        try:
            full = urljoin(page.url, u)
        except Exception:
            full = u
        if is_interesting_url(full, ""):
            abs_urls.add(full)
    '''
    return abs_urls

# ------------------------- POPUP HANDLING -------------------------
async def close_popups(page):
    X_BUTTON_SELECTORS = [
        "button[aria-label*='close']",
        "button[aria-label*='dismiss']",
        "button:has-text('√ó')",
        "button:has-text('‚úï')",
        "div[role='button']:has-text('√ó')",
        "[class*='close']",
        "[id*='close']",
        ".close-btn",
        ".modal-close",
        "[data-bs-dismiss='modal']",
    ]
    CLOSE_TEXTS = [
        "close", "dismiss", "no thanks", "not now",
        "skip", "cancel", "maybe later",
    ]
    CLOSE_SELECTORS = [
        "button:has-text('Close')",
        "button:has-text('CLOSE')",
        "button:has-text('Dismiss')",
        "[role='button']:has-text('Close')",
        "[role='button']:has-text('Dismiss')",
    ]
    ACCEPT_TEXTS = [
        "accept", "agree", "i agree", "accept all",
        "got it", "continue", "ok", "okay", "allow",
    ]
    ACCEPT_SELECTORS = [
        "button:has-text('Accept')",
        "button:has-text('AGREE')",
        "button:has-text('OK')",
        "button:has-text('Got it')",
        "[id*='cookie'] button",
        "[class*='cookie'] button",
    ]

    for _ in range(4):
        clicked_something = False
        for sel in X_BUTTON_SELECTORS:
            locator = page.locator(sel)
            if await locator.count() > 0:
                count = await locator.count()
                for i in range(count):
                    try:
                        elem = locator.nth(i)
                        if await elem.is_visible() and await elem.is_enabled():
                            await elem.click(timeout=1500)
                            clicked_something = True
                            await page.wait_for_timeout(500)
                    except Exception:
                        pass
        if clicked_something:
            continue

        for sel in CLOSE_SELECTORS:
            locator = page.locator(sel)
            if await locator.count() > 0:
                count = await locator.count()
                for i in range(count):
                    try:
                        elem = locator.nth(i)
                        if await elem.is_visible() and await elem.is_enabled():
                            await elem.click(timeout=1500)
                
                            clicked_something = True
                            await page.wait_for_timeout(500)
                    except Exception:
                        pass

        for txt in CLOSE_TEXTS:
            locator = page.get_by_text(txt, exact=False)
            if await locator.count() > 0:

                count = await locator.count()
                for i in range(count):
                    try:
                        elem = locator.nth(i)
                        if await elem.is_visible() and await elem.is_enabled():
                            await elem.click(timeout=1500)
                        
                            clicked_something = True
                            await page.wait_for_timeout(500)
                    except Exception:
                        pass

        if clicked_something:
            continue

        for sel in ACCEPT_SELECTORS:
            locator = page.locator(sel)
            if await locator.count() > 0:
                count = await locator.count()
                for i in range(count):
                    try:
                        elem = locator.nth(i)
                        if await elem.is_visible() and await elem.is_enabled():
                            await elem.click(timeout=1500)
                        
                            clicked_something = True
                            await page.wait_for_timeout(500)
                    except Exception:
                        pass

        for txt in ACCEPT_TEXTS:
            locator = page.get_by_text(txt, exact=False)
            if await locator.count() > 0:
                count = await locator.count()
                for i in range(count):
                    try:
                        elem = locator.nth(i)
                        if await elem.is_visible() and await elem.is_enabled():
                            await elem.click(timeout=1500)
                    
                            clicked_something = True
                            await page.wait_for_timeout(500)
                    except Exception:
                        pass

        if not clicked_something:
            break

# ------------------------- OPPORTUNISTIC POPUP GUARD -------------------------
async def opportunistic_close_popups(page):
    """
    Lightweight, repeatable popup closer.
    Safe to call many times.
    Does NOT block execution.
    """

    selectors = [
        "button[aria-label*='close' i]",
        "button[aria-label*='dismiss' i]",
        "[data-bs-dismiss='modal']",
        ".modal-close",
        ".close-btn",
        "[class*='close']",

        # Cookie / consent
        "button:has-text('Accept')",
        "button:has-text('AGREE')",
        "button:has-text('OK')",
        "button:has-text('Got it')",
        "button:has-text('Allow')",

        # Generic dialog buttons
        "[role='dialog'] button",
    ]

    for sel in selectors:
        try:
            loc = page.locator(sel)
            if await loc.count() > 0:
                for i in range(await loc.count()):
                    btn = loc.nth(i)
                    if await btn.is_visible():
                        await btn.click(timeout=500)
                        await page.wait_for_timeout(150)
        except Exception:
            pass


# ========================= HEADER / NAV DETECTORS =========================
IS_GLOBAL_TOP_NAV_JS = r"""
(el) => {
    try {
        // Semantic fast-path
        if (el.closest('header, nav')) return true;

        const style = getComputedStyle(el);
        const pos = style.position;

        if (pos !== 'fixed' && pos !== 'sticky') return false;

        const r1 = el.getBoundingClientRect();
        const vw = window.innerWidth;

        // Must be near top and wide
        if (r1.top > 80) return false;
        if (r1.width < vw * 0.6 || r1.height < 40) return false;

        // Scroll invariance test
        window.scrollBy(0, 200);
        const r2 = el.getBoundingClientRect();
        window.scrollBy(0, -200);

        return Math.abs(r1.top - r2.top) < 2;
    } catch {
        return false;
    }
}
"""


IS_IN_HEADER_OR_FOOTER_JS = r"""
(el) => {
    try { return !!el.closest('header, footer'); }
    catch(e){ return false; }
}
"""

IS_STICKY_OR_FIXED_NAV_JS = r"""
(el) => {
    try {
        const s = getComputedStyle(el);
        if (!s || (s.position !== 'fixed' && s.position !== 'sticky')) return false;

        const r1 = el.getBoundingClientRect();
        const vh = window.innerHeight;

        const anchored = r1.top <= 5 || r1.bottom >= vh - 5;
        if (!anchored) return false;

        if (r1.height < 40 || r1.width < window.innerWidth * 0.5) return false;

        window.scrollBy(0,200);
        const r2 = el.getBoundingClientRect();
        window.scrollBy(0,-200);

        return Math.abs(r1.top - r2.top) < 2;
    } catch(e){
        return false;
    }
}
"""

async def is_in_header_or_footer(elem):
    return await safe_elem_evaluate(elem, IS_IN_HEADER_OR_FOOTER_JS, default=False)

async def is_sticky_or_fixed_nav(elem):
    return await safe_elem_evaluate(elem, IS_STICKY_OR_FIXED_NAV_JS, default=False)

async def is_global_top_nav(elem):
    try:
        return await safe_elem_evaluate(elem, IS_GLOBAL_TOP_NAV_JS, default=False)
    except Exception:
        return False

# ========================= PLAYWRIGHT HARDENING LAYER =========================



async def safe_wait(page, ms=300):
    try:
        if page.is_closed():
            return
        await page.wait_for_timeout(ms)
    except Exception:
        pass


async def safe_load_wait(page, state="domcontentloaded", timeout=10000):
    try:
        if page.is_closed():
            return False
        await page.wait_for_load_state(state, timeout=timeout)
        return True
    except Exception:
        return False


async def safe_evaluate(page, script, *, timeout_ms=10000, default=None):
    """
    GUARDED page.evaluate
    - Handles navigation
    - Handles reload
    - Handles page close
    - Never throws
    """
    try:
        if page.is_closed():
            return default

        await page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)

        return await page.evaluate(script)

    except (PlaywrightError, TimeoutError):
        return default
    except Exception:
        return default


async def safe_elem_evaluate(elem, script, default=None):
    """
    GUARDED element.evaluate
    """
    try:
        return await elem.evaluate(script)
    except Exception:
        return default


async def safe_click(elem, *, timeout=1500):
    """
    GUARDED click
    - Handles detached elements
    - Handles navigation
    """
    try:
        if not await elem.is_visible():
            return False
        if not await elem.is_enabled():
            return False

        await elem.scroll_into_view_if_needed(timeout=timeout)
        await elem.click(timeout=timeout)
        return True
    except Exception:
        return False


async def safe_select_option(select_elem, *, index=None, value=None, timeout=1000):
    """
    GUARDED select_option
    """
    try:
        if index is not None:
            await select_elem.select_option(index=index, timeout=timeout)
        elif value is not None:
            await select_elem.select_option(value=value, timeout=timeout)
        return True
    except Exception:
        return False


async def safe_inner_text(elem, default=""):
    try:
        return (await elem.inner_text()).strip()
    except Exception:
        return default


async def safe_get_attr(elem, attr):
    try:
        return await elem.get_attribute(attr)
    except Exception:
        return None


# ------------------------- AUTO SCROLL -------------------------
async def auto_scroll(page, rounds=3, delay_ms=1000): #auto-scroll with delay
    """
    Enhanced scroll to trigger lazy-loaded / intersection-observer content.
    """
    last_height = 0

    async with page_task(page) as ok:
        if not ok:
            return

        for r in range(rounds):
            if page._closing or page.is_closed():
                break

            try:
                height = await safe_evaluate(
                    page,
                    "() => document.body.scrollHeight",
                    default=0
                )
                if height == last_height:
                    break
                last_height = height

                await safe_evaluate(
                    page,
                    "() => window.scrollTo(0, document.body.scrollHeight)"
                )
                await page.wait_for_timeout(delay_ms)

                # slight scroll up to re-trigger observers
                await safe_evaluate(
                    page,
                    "() => window.scrollBy(0, -300)"
                )
                await page.wait_for_timeout(250)

                print(f"[DEBUG] Scroll round {r+1}/{rounds} completed")

            except Exception:
                break


# ------------------------- CLICK-BASED DISCOVERY (kept but unused) -------------------------
'''
async def click_and_capture_urls(page, elem, timeout_ms=5000):
    context = page.context
    original_url = page.url
    before_dom_urls = await collect_dom_urls(page)
    new_urls = set()

    try:
        async with context.expect_page(timeout=timeout_ms) as pop_info:
            await elem.click()
        popup = await pop_info.value

        try:
            await popup.wait_for_load_state("load", timeout=timeout_ms)
        except TimeoutError:
            pass

        try:
            popup_url = popup.url
        except Exception:
            popup_url = ""

        if is_interesting_url(popup_url, "") and url_matches_keywords(popup_url, ""):
            new_urls.add(popup_url)

        popup_dom_urls = await collect_dom_urls(popup)
        for u in popup_dom_urls:
            if url_matches_keywords(u, ""):
                new_urls.add(u)

        try:
            await popup.close()
        except Exception:
            pass
        return new_urls
    except TimeoutError:
        pass
    except Exception:
        return new_urls

    try:
        await page.wait_for_timeout(2000)
        new_url = page.url

        if new_url != original_url:
            if is_interesting_url(new_url, "") and url_matches_keywords(new_url, ""):
                new_urls.add(new_url)

            dom_urls = await collect_dom_urls(page)
            for u in dom_urls:
                if url_matches_keywords(u, ""):
                    new_urls.add(u)

            try:
                if page.can_go_back():
                    await page.go_back()
                    await page.wait_for_load_state("load")
            except Exception:
                pass

        else:
            after_dom_urls = await collect_dom_urls(page)
            added = after_dom_urls - before_dom_urls

            for u in added:
                if url_matches_keywords(u, ""):
                    new_urls.add(u)

    except Exception:
        return new_urls

    return new_urls
'''

'''
def is_any_category_related(text: str, url: str) -> bool:
    combined = f"{text} {url}".lower()
    for kws in CATEGORY_KEYWORDS.values():
        for kw in kws:
            kw = kw.strip().lower()
            if not kw:
                continue
            if re.search(rf"\b{re.escape(kw)}\b", combined):
                return True
    return False
'''


# ------------------------- VISIT & COLLECT (HEADING-BASED EXTRACTION IMPLEMENTED) -------------------------
from urllib.parse import unquote

def merge_text(base: str, *extras: str) -> str:
    """
    Safely append contextual text without overwriting.
    Preserves base text ALWAYS.
    """
    parts = []

    if base:
        parts.append(base.strip())

    for e in extras:
        if e and e.strip():
            parts.append(e.strip())

    return " ".join(parts)

def upsert_artifact(page, norm, raw_url, new_text):
    """
    Append text to an existing artifact if already seen.
    Never overwrites richer context.
    """
    if not new_text:
        new_text = ""

    old = page._seen_artifacts.get(norm)

    if old:
        old_text = old.get("text", "")
        merged = merge_text(old_text, new_text)

        if should_accept_context(old_text, merged):
            old["text"] = merged
    else:
        page._seen_artifacts[norm] = {
            "raw": raw_url,
            "text": new_text
        }


def is_pdf(url: str) -> bool:
    return url.lower().endswith(".pdf")

from urllib.parse import urlparse
import re

DOCUMENT_EXTENSIONS = {
    "pdf": [".pdf"],
    "excel": [".xls", ".xlsx", ".csv", ".ods",".tsv", ".xlt", ".xltx"],
    "word": [".doc", ".docx", ".rtf",".odt",".tex",".dot",".dotx"],
    "ppt": [".ppt", ".pptx", ".pps", ".ppsx"],
    "archive": [".zip", ".rar", ".7z", ".tar", ".gz", ".bz2"],
    "text": [".txt", ".log", ".dat"],
    "images": [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]
}

def classify_document_url(url: str) -> str | None:
    if not url:
        return None

    u = url.lower().split("?", 1)[0]

    # üîπ Google Drive (highest priority)
    if any(h in u for h in (
        "drive.google.com",
        "docs.google.com",
        "googleusercontent.com",
        "onedrive.live.com",
        "sharepoint.com",
        "dropbox.com",
        "box.com"
    )):
        return "google_drive"   # or rename bucket to "external_doc"


    # üîπ Normal document extensions
    for category, exts in DOCUMENT_EXTENSIONS.items():
        if any(u.endswith(ext) for ext in exts):
            return category

    '''
    # üîπ Heuristic fallback (download.php, file=, action=)
    if re.search(r"(download|file=|action=)", u):
        return "other_document"
    '''
    return None



CATEGORY_PRIORITY = [
    "criteria_1","criteria_2","criteria_3","criteria_4",
    "criteria_5","criteria_6","criteria_7",
    "mandatory_disclosure",
    "aicte",
    "naac",
    "nba",
    "nirf",
    "iqac",
    "aqar",
    "ariia",
    "accreditation",
]

def normalize_token(s: str) -> str:
    """
    Normalize string for URL-safe comparison.
    Removes all non-alphanumeric characters.
    """
    if not s:
        return ""
    return re.sub(r"[^a-z0-9]", "_", s.lower())

def pick_category(
    text: str,
    href: str,
    page_category: str | None = None
) -> str | None:

    text_l = (text or "").lower()
    href_l = (href or "").lower()

    norm_text = normalize_token(text_l)
    norm_href = normalize_token(href_l)

    keywords = CATEGORY_KEYWORDS.copy()

    # NAAC pages are special ‚Äî inherit deeper terms
    if page_category == "naac":
        keywords["naac"] = keywords["naac"] + [
            "iqac", "iiqa", "ssr", "dvv", "qif",
            "quality indicator framework",
            "criterion", "criteria", "metric", "cycle"
        ]

    for cat in CATEGORY_PRIORITY:
        for kw in keywords.get(cat, []):
            if not kw:
                continue

            kw_l = kw.lower()
            norm_kw = normalize_token(kw_l)
        
            # 1Ô∏è‚É£ STRICT TEXT MATCH (best signal)
            if text_l:
                if re.search(rf"\b{re.escape(kw_l)}\b", text_l):
                    return cat
       
            # 2Ô∏è‚É£ NORMALIZED URL MATCH (critical)
            pattern = rf"(?:^|_){re.escape(norm_kw)}(?:_|$)"
            if re.search(pattern, norm_href):
                return cat

            # 3Ô∏è‚É£ RAW URL substring fallback
            if kw_l in href_l:
                return cat

    # -----------------------------
    # 4Ô∏è‚É£ CONTEXT FALLBACK
    # -----------------------------
    return page_category


'''
def pick_category(text: str, href: str, page_category: str | None = None) -> str | None:
    combined = f"{text} {href}".lower()

    keywords = CATEGORY_KEYWORDS.copy()

    if page_category == "naac":
        keywords["naac"] = keywords["naac"] + [
            "iqac","iiqa","ssr","dvv","qif",
            "quality indicator framework",
            "criterion","criteria","metric","cycle","crit"
        ]

    for cat in CATEGORY_PRIORITY:
        for kw in keywords.get(cat, []):
            if not kw:
                continue
            pattern = rf"\b{re.escape(kw)}\b"
            if re.search(pattern, combined):
                return cat
    return None
'''
def is_document_file(url: str) -> bool:
    if not url:
        return False

    clean = url.lower()

    if "drive.google.com" in clean:
        return True

    clean = clean.split("?", 1)[0]

    for exts in DOCUMENT_EXTENSIONS.values():
        if any(clean.endswith(ext) for ext in exts):
            return True

    return False

def is_category_related(text: str, url: str) -> bool:
    # ---------- RAW combined (for regex) ----------
    raw_combined = f"_{text}_{url}_"

    # ---------- NORMALIZED combined (for keyword matching) ----------
    combined = normalize_token(raw_combined)

    # =========================================================
    # 1Ô∏è‚É£ CRITERION / METRIC DETECTION (HIGH PRIORITY)
    # =========================================================
    if CRITERION_METRIC_REGEX.search(raw_combined):
        # print(
        #     f"nonpdf-category (criterion): "
        #     f"url:{url}, text:{text}"
        # )
        return True

    # =========================================================
    # 2Ô∏è‚É£ CATEGORY KEYWORD DETECTION
    # =========================================================
    for kws in CATEGORY_KEYWORDS.values():
        for kw in kws:
            if not kw:
                continue

            norm_kw = normalize_token(kw)
            if not norm_kw:
                continue

            
            # üîí STRICT TOKEN MATCH ON NORMALIZED STRING
            pattern = rf"(?:^|_){re.escape(norm_kw)}(?:_|$)"

            if re.search(pattern, combined):
                #print(f"nonpdf-category (keyword-regex): "f"url:{url}, text:{text}, pattern:{pattern}, combined:{combined}")
                return True
            
            '''
            and norm_kw in combined:
                print(
                    f"nonpdf-category (keyword): "
                    f"url:{url}, text:{text}, "
                    f"kw:{kw}, norm_kw:{norm_kw}"
                )
                return True
            '''
    # =========================================================
    # 3Ô∏è‚É£ FALLBACK ‚Üí non-category
    # =========================================================
    # print(
    #     f"nonpdf-other: "
    #     f"url:{url}, text:{text}, combined:{combined}"
    # )
    return False

'''

def is_category_related(text: str, url: str) -> bool:
    combined = normalize_token(f"{text} {url}")

    for kws in CATEGORY_KEYWORDS.values():
        for kw in kws:
            if not kw:
                continue
            if normalize_token(kw) in combined:
                print(f"nonpdf-category: url:{url}, text:{text}, combined:{combined},kw:{kw},  normalized_token(kw):{normalize_token(kw)}\n")
                return True
    print(f"nonpdf-other: url:{url}, text:{text}, combined:{combined}\n")
    return False
'''

def classify_url(raw_url: str, text: str):
    """
    PURE ROW classification.
    """
    if is_category_related(text, raw_url):
        return "nonpdf_category_related"
    return "nonpdf_other"


YEAR_CUTOFF = 2020

YEAR_REGEX = re.compile(r"\b(20\d{2})\b")

def is_old_year_content(text: str, url: str) -> bool:
    """
    Return True if text or URL contains a year < YEAR_CUTOFF.
    """
    combined = f"_{text}_{url}_"
    years = YEAR_REGEX.findall(combined)

    if not years:
        return False  # no year ‚Üí keep

    try:
        years_int = [int(y) for y in years]
    except Exception:
        return False  # malformed ‚Üí keep safe

    return False  # max(years_int) < YEAR_CUTOFF

def accept_and_store(
    *,
    raw_url: str,
    enriched_text: str,
    page_category: str | None,
    category_links: dict,
    page,
):
    # üîí HARD STOP: discard old-year content entirely
    if is_old_year_content(enriched_text, raw_url):
        return

    # =====================================================
    # 1Ô∏è‚É£ CANONICAL URL (single identity for everything)
    # =====================================================
    norm = normalize_url(raw_url, "")

    # =====================================================
    # 2Ô∏è‚É£ MERGE CONTEXT FIRST (NO CLASSIFICATION YET)
    # =====================================================
    # If URL already seen on this page, just enrich text
    existing = page._seen_artifacts.get(norm)
    if existing:
        merged = merge_text(existing["text"], enriched_text)
        existing["text"] = merged
        return
    else:
        page._seen_artifacts[norm] = {
            "raw": raw_url,
            "text": enriched_text,
            "page_category": page_category,
        }
        return

'''

def accept_and_store(
    *,
    raw_url: str,
    enriched_text: str,
    page_category: str | None,
    category_links: dict,
    page,
):
    # üîí HARD STOP: discard old-year content entirely
    if is_old_year_content(enriched_text, raw_url):
        return

    # =====================================================
    # 1Ô∏è‚É£ DOCUMENT CLASSIFICATION (single source of truth)
    # =====================================================
    doc_type = classify_document_url(raw_url)

    if doc_type:
        bucket = doc_type
        inferred_cat = page_category or "accreditation"
    else:
        # =================================================
        # 2Ô∏è‚É£ NON-DOCUMENT CLASSIFICATION
        # =================================================
        bucket= classify_url(
            raw_url, enriched_text
        )
        inferred_cat = page_category or "accreditation"

    # =====================================================
    # 3Ô∏è‚É£ Decide column (category)
    # =====================================================
    criteria_col = infer_criteria_column(enriched_text, raw_url)
    if criteria_col:
        store_cat = criteria_col
    elif page_category:
        store_cat = page_category
    else:
        store_cat = inferred_cat or "accreditation"

    # =====================================================
    # 4Ô∏è‚É£ Typed dedupe key
    # =====================================================
    dedupe_key = make_dedupe_key(raw_url, bucket)

    # Page-level dedupe
    if dedupe_key in page._page_seen_urls:
        return
    page._page_seen_urls.add(dedupe_key)

    # Global dedupe
    if dedupe_key in GLOBAL_SEEN_ARTIFACTS:
        return
    GLOBAL_SEEN_ARTIFACTS.add(dedupe_key)

    # =====================================================
    # 5Ô∏è‚É£ STORE (NO re-classification later)
    # =====================================================
    category_links[store_cat].append(
        (bucket, f"{raw_url} || {enriched_text}")
    )

'''

'''
def assign_category(
    category_links: dict,
    *,
    url: str,
    text: str,
    depth: int,
    page_category: str | None,
):
    norm = normalize_url(url, "")

    # üîí HARD STOP: URL already seen anywhere
    if norm in GLOBAL_URL_OWNERS:
        return

    # 1Ô∏è‚É£ Criteria override
    criteria_cat = detect_criteria_category(text, url)
    if criteria_cat:
        if claim_url_once(url, text, criteria_cat, depth):
            category_links[criteria_cat].append(f"{url} || {text}")
        return

    # 2Ô∏è‚É£ PDF rule
    if is_pdf(url):
        cat = page_category or "accreditation"
        if claim_url_once(url, text, cat, depth):
            category_links[cat].append(f"{url} || {text}")
        return

    # 3Ô∏è‚É£ Inherit page category
    if page_category:
        if claim_url_once(url, text, page_category, depth):
            category_links[page_category].append(f"{url} || {text}")
        return

    # 4Ô∏è‚É£ Deterministic category pick
    cat = pick_category(text, url,page_category)
    if cat:
        if claim_url_once(url, text, cat, depth):
            category_links[cat].append(f"{url} || {text}")
        return
'''

async def wait_for_new_pdf(page, before_set, timeout_ms=5000, poll_ms=200):
    waited = 0
    while waited < timeout_ms:
        await page.wait_for_timeout(poll_ms)
        if page._local_seen_pdf - before_set:
            return True
        waited += poll_ms
    return False

'''
async def is_interactive_page(page) -> bool:
    """
    Heuristically determine whether a page requires interaction
    to reveal content.
    """
    js = """
    () => {
        return Boolean(
            document.querySelector('[role="tab"]') ||
            document.querySelector('[role="tab"]') ||
            document.querySelector('details > summary') ||
            document.querySelector('select') ||
            document.querySelector('select option') ||
            document.querySelector('[aria-controls]') 
        );
    }
    """
    try:
        return await safe_evaluate(page, js, default=False)
    except Exception:
        return False
'''
async def interact_and_capture_pdfs(page, elem,*, force_click= False):
    """
    Click an element ONLY if:
    - it has relevant keywords OR
    - it is a <button> (form submit / viewer triggers)
    """

    allow_click = False
    text = ""

    try:
        text = (await elem.inner_text()).lower()
    except:
        pass

    # 1Ô∏è‚É£ Keyword-based allow (existing logic)
    if force_click:
        allow_click = True
    else:
        if any(k in text for k in CLICK_KEYWORDS):
            allow_click = True


    # 2Ô∏è‚É£ NEW: Allow generic <button> clicks (form submits)
    try:
        tag = await elem.evaluate("el => el.tagName")
        if tag == "BUTTON":
            allow_click = True
    except Exception:
        pass

    if not allow_click:
        return set()

    before = set(page._local_seen_pdf)

    try:
        await opportunistic_close_popups(page)   # ‚úÖ ADD
        if not await elem.is_visible():
            return set()
        await elem.scroll_into_view_if_needed(timeout=1500)
        await opportunistic_close_popups(page)   # ‚úÖ ADD
        await safe_click(elem, timeout=1500)
        await safe_load_wait(page)
    except Exception:
        return set()


    await wait_for_new_pdf(page, before_set=before, timeout_ms=6000)
    return page._local_seen_pdf - before

'''
def get_resource_state(page):
    """
    The ONLY state that determines convergence.
    """
    return frozenset(page._local_seen_pdf)
'''
def get_page_resource_state(page):
    """
    Canonical convergence state:
    ANY new URL (PDF or non-PDF, DOM or network) counts as progress.
    """
    return frozenset(page._seen_artifacts.keys())

async def is_cyclic_ui_element(elem):
    """
    Detect infinite / cyclic UI components (carousels, sliders, rotators)
    and permanently blacklist them.
    """
    try:
        return await safe_elem_evaluate(
            elem,
            """
            el => {
                function isCarouselContainer(n) {
                    if (!n || n === document.body) return false;

                    const cls = (n.className || '').toLowerCase();
                    const role = (n.getAttribute && n.getAttribute('role')) || '';
                    const aria = (n.getAttribute && n.getAttribute('aria-roledescription')) || '';

                    const isCarousel =
                        aria === 'carousel' ||
                        cls.includes('carousel') ||
                        cls.includes('slider') ||
                        cls.includes('owl-') ||
                        cls.includes('swiper') ||
                        cls.includes('slick');

                    if (isCarousel) {
                        // üîí POISON PILL: permanently mark subtree
                        n.dataset.__crawler_skip = "1";
                        return true;
                    }

                    const style = getComputedStyle(n);

                    if (style.transform && style.transform !== 'none') {
                        n.dataset.__crawler_skip = "1";
                        return true;
                    }

                    if (style.overflow === 'hidden' && n.children.length > 1) {
                        n.dataset.__crawler_skip = "1";
                        return true;
                    }

                    if (n.querySelectorAll('[class*="cloned"]').length > 0) {
                        n.dataset.__crawler_skip = "1";
                        return true;
                    }

                    if (n.querySelectorAll('.active').length > 1) {
                        n.dataset.__crawler_skip = "1";
                        return true;
                    }

                    return isCarouselContainer(n.parentElement);
                }

                return isCarouselContainer(el);
            }
        """,
        default= False
        )
    except Exception:
        return False


'''
async def is_cyclic_ui_element(elem):
    """
    Detect infinite / cyclic UI components (carousels, sliders, rotators).
    Framework-agnostic.
    """
    try:
        return await elem.evaluate("""
            el => {
                function isCarouselContainer(n) {
                    if (!n || n === document.body) return false;

                    const cls = (n.className || '').toLowerCase();
                    const role = (n.getAttribute && n.getAttribute('role')) || '';
                    const aria = (n.getAttribute && n.getAttribute('aria-roledescription')) || '';

                    // Known semantic signals
                    if (aria === 'carousel') return true;
                    if (role === 'tablist' && n.querySelector('[aria-selected]')) return true;

                    // Common class indicators
                    if (
                        cls.includes('carousel') ||
                        cls.includes('slider') ||
                        cls.includes('owl-') ||
                        cls.includes('swiper') ||
                        cls.includes('slick')
                    ) return true;

                    const style = getComputedStyle(n);

                    // Sliding transforms
                    if (style.transform && style.transform !== 'none')
                        return true;

                    // Masked viewport (slides move inside)
                    if (style.overflow === 'hidden' && n.children.length > 1)
                        return true;

                    // Cloned / rotating items
                    if (n.querySelectorAll('[class*="cloned"]').length > 0)
                        return true;

                    // Multiple active elements = rotation
                    if (n.querySelectorAll('.active').length > 1)
                        return true;

                    return isCarouselContainer(n.parentElement);
                }

                return isCarouselContainer(el);
            }
        """)
    except Exception:
        return False
'''

async def is_poisoned(elem):
    try:
        return await safe_elem_evaluate(
            elem,
            "el => el.closest('[data-__crawler-skip=\"1\"]') !== null",
            default=False
        )

    except Exception:
        return False

async def handle_tabs(page,depth):

    tabs = page.locator('[role="tab"]')
    count = await tabs.count()
    if count == 0:
        return set(), False

    found_pdfs = set()
    did_progress = False

    for i in range(count):
        await opportunistic_close_popups(page)   # ‚úÖ ADD
        tab = tabs.nth(i)
        if depth >= 1:
            if await is_global_top_nav(tab):
                continue
        '''
        if depth >= 1:
            if await is_in_header_or_footer(tab): 
                continue
            if await is_sticky_or_fixed_nav(tab):
                continue
        '''
        #if await is_cyclic_ui_element(tab):
         #   continue 

        try:
            if await tab.is_disabled():
                continue
        except Exception:
            pass

        if await is_poisoned(tab):
            continue

        fingerprint = await get_element_fingerprint(tab)
        if fingerprint in page._clicked_elements:
            continue

        try:
            selected = await tab.get_attribute("aria-selected")
            if selected == "true":
                continue
        except Exception:
            pass

        before_pdfs = len(page._local_seen_pdf)
        before_dom = await get_structural_dom_signature(page)

        # üîπ capture tab text BEFORE clicking
        tab_text = await get_element_context_text(tab)

        # üîπ click and capture PDFs
        pdfs = await interact_and_capture_pdfs(page, tab, force_click=True)

        after_pdfs = len(page._local_seen_pdf)
        after_dom = await get_structural_dom_signature(page)

        # ‚úÖ ONLY NOW decide progress
        if after_pdfs > before_pdfs or after_dom != before_dom:
            did_progress = True

        # ‚úÖ ALWAYS mark as clicked (even if useless)
        page._clicked_elements.add(fingerprint)


        # üîπ NEW: enrich PDF context with tab text
        for pdf in pdfs:
            norm = normalize_url(pdf, "")
            upsert_artifact(page, norm, pdf, tab_text)

        found_pdfs |= pdfs

    return found_pdfs, did_progress



async def handle_accordions(page,depth):
    toggles = page.locator('[aria-expanded][role="button"], details > summary')

    found_pdfs = set()
    did_progress = False

    for i in range(await toggles.count()):
        await opportunistic_close_popups(page)   # ‚úÖ ADD
        el = toggles.nth(i)

        if depth >= 1:
            if await is_global_top_nav(el):
                continue
        '''
        if depth >= 1:
            if await is_in_header_or_footer(el): 
                continue
            if await is_sticky_or_fixed_nav(el):
                continue
        '''
        #if await is_cyclic_ui_element(el):
         #   continue
        # üö´ EXCLUDE SLIDERS / CAROUSELS EARLY
        try:
            cls = (await el.get_attribute("class") or "").lower()
            role = (await el.get_attribute("role") or "").lower()

        except Exception:
            pass

        # üîé Skip already-open accordions
        try:
            expanded = await el.get_attribute("aria-expanded")
            if expanded == "true":
                continue
        except Exception:
            pass

        if await is_poisoned(el):
            continue

        fingerprint = await get_element_fingerprint(el)
        if fingerprint in page._clicked_elements:
            continue

        # ‚úÖ REAL accordion discovered
        before_pdfs = len(page._local_seen_pdf)
        before_dom = await get_structural_dom_signature(page)

        pdfs = await interact_and_capture_pdfs(page, el, force_click=True)
        found_pdfs |= pdfs

        after_pdfs = len(page._local_seen_pdf)
        after_dom = await get_structural_dom_signature(page)

        if after_pdfs > before_pdfs or after_dom != before_dom:
            did_progress = True

        # ‚úÖ ALWAYS mark as clicked
        page._clicked_elements.add(fingerprint)
        

    return found_pdfs, did_progress



async def handle_dropdowns(page,depth):
    found_pdfs = set()
    did_progress = False
    selects = page.locator("select")

    for i in range(await selects.count()):
        await opportunistic_close_popups(page)   # ‚úÖ ADD
        sel = selects.nth(i)
        
        if depth >= 1:
            if await is_global_top_nav(sel):
                continue

        ''''
        if depth >= 1:
            if await is_in_header_or_footer(sel): 
                continue
            if await is_sticky_or_fixed_nav(sel):
                continue
        '''
        #if await is_cyclic_ui_element(sel):
        #    continue

        if await is_poisoned(sel):
            continue

        fingerprint = await get_element_fingerprint(sel)
        if fingerprint in page._clicked_elements:
            continue

        # ‚úÖ NEW INTERACTABLE FOUND ‚Üí PROGRESS
        before_pdfs = len(page._local_seen_pdf)
        before_dom = await get_structural_dom_signature(page)

        options = sel.locator("option")

        for j in range(await options.count()):
            try:
                opt = options.nth(j)
                if not await opt.is_enabled():
                    continue

                before = set(page._local_seen_pdf)

                try:
                    await sel.select_option(index=j, timeout=1000)
                    await safe_load_wait(page)  
                except PlaywrightError:
                    break  # üîí page closed / element invalid ‚Üí stop dropdown safely

                await wait_for_new_pdf(page, before_set=before, timeout_ms=3000)
                found_pdfs |= (page._local_seen_pdf - before)

            except Exception:
                break


        after_pdfs = len(page._local_seen_pdf)
        after_dom = await get_structural_dom_signature(page)

        if after_pdfs > before_pdfs or after_dom != before_dom:
            did_progress = True

        # ‚úÖ ALWAYS mark dropdown as clicked
        page._clicked_elements.add(fingerprint)
        
    return found_pdfs, did_progress

'''
async def handle_sliders(page):
    # Slider already tested once ‚Üí stop forever
    if getattr(page, "_slider_exhausted", False):
        return set(), False

    candidates = page.locator(
        '[aria-label*="next"], [aria-label*="Next"], button:has-text(">"), button:has-text("‚Üí")'
    )

    if await candidates.count() == 0:
        return set(), False

    page._slider_exhausted = True  # üîí critical line

    found_pdfs = set()

    for i in range(min(await candidates.count(), 2)):
        btn = candidates.nth(i)
        try:
            await btn.scroll_into_view_if_needed(timeout=1500)
            pdfs = await interact_and_capture_pdfs(page, btn)
            found_pdfs |= pdfs
        except Exception:
            pass

    # ‚ùå sliders never signal "progress"
    return found_pdfs, False

'''
async def get_structural_dom_signature(page):
    return await safe_evaluate(
        page,
        """
        () => {
            const body = document.body;
            if (!body) return 0;
            return body.querySelectorAll(
                'a[href], iframe, embed, object, details, summary, select, option'
            ).length;
        }
        """,
        default=0
    )


async def get_dom_text_signature(page):
    try:
        return await safe_evaluate(
            page,
            "() => document.body ? document.body.innerText.length : 0",
            default=0
        )
    except Exception:
        return 0


async def get_anchor_count(page):
    try:
        return await safe_evaluate(
            page,
            "() => document.querySelectorAll('a[href]').length",
            default=0
        )

    except Exception:
        return 0


async def get_iframe_count(page):
    try:
        return len(page.frames)
    except Exception:
        return 0

def log_soft_error(college_name: str, url: str, reason: str):
    try:
        with open(ERROR_LOG_FILE, "a", encoding="utf-8") as f:
            f.write("\n" + "=" * 100 + "\n")
            f.write(f"TIMESTAMP : {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"COLLEGE   : {college_name}\n")
            f.write(f"URL       : {url}\n")
            f.write("SOFT ERROR:\n")
            f.write(reason + "\n")
    except Exception:
        pass

async def goto_with_retry(
    page,
    url: str,
    *,
    retries: int = 3,
    timeout: int = PAGE_LOAD_TIMEOUT,
    wait_until: str = "domcontentloaded",
    retry_delay_ms: int = 1500,
):
    """
    Robust page.goto with retry on timeout / navigation failure.
    Returns True if navigation succeeds, False otherwise.

    First attempt: original behavior (domcontentloaded)
    Second+ attempts: warm retry using SAME page + commit
    """
    for attempt in range(1, retries + 1):
        try:
            if attempt == 1:
                # üîπ FIRST ATTEMPT ‚Äî keep EXACT existing behavior
                await page.goto(
                    url,
                    wait_until=wait_until,
                    timeout=timeout,
                )
            else:
                # üîπ RETRIES ‚Äî warm retry on SAME page
                try:
                    # server / DNS / TLS warm-up
                    await page.wait_for_timeout(8000)
                except Exception:
                    pass

                await page.goto(
                    url,
                    wait_until="commit",   # üîë ONLY CHANGE
                    timeout=timeout,
                )

            return True

        except (PlaywrightTimeoutError, PlaywrightError) as e:
            print(
                f"[WARN] goto failed (attempt {attempt}/{retries}) :: {url} :: {e}"
            )
            log_soft_error(
                CURRENT_COLLEGE_CONTEXT.get("college_name"),
                url,
                f"goto failed (attempt {attempt}/{retries}) :: {e}"
            )

            domain = urlparse(url).netloc
            DOMAIN_FAILURE_COUNT[domain] += 1

            # üîí Adaptive backoff after repeated failures on same domain
            if DOMAIN_FAILURE_COUNT[domain] >= 2:
                try:
                    await page.wait_for_timeout(15000)  # 15 seconds
                except Exception:
                    pass
                
            if attempt < retries:
                try:
                    await page.wait_for_timeout(retry_delay_ms)
                except Exception:
                    pass
            else:
                return False

        except Exception as e:
            # unexpected error ‚Üí do not loop forever
            print(f"[ERROR] fatal goto error :: {url} :: {e}")
            return False


'''
async def goto_with_retry(
    page,
    url: str,
    *,
    retries: int = 3,
    timeout: int = PAGE_LOAD_TIMEOUT,
    wait_until: str = "domcontentloaded",
    retry_delay_ms: int = 1500,
):
    """
    Robust page.goto with retry on timeout / navigation failure.
    Returns True if navigation succeeds, False otherwise.
    """
    for attempt in range(1, retries + 1):
        try:
            await page.goto(url, wait_until=wait_until, timeout=timeout)
            return True
        except (PlaywrightTimeoutError, PlaywrightError) as e:
            print(
                f"[WARN] goto failed (attempt {attempt}/{retries}) :: {url} :: {e}"
            )
            log_soft_error(
                CURRENT_COLLEGE_CONTEXT.get("college_name"),
                url,
                f"goto failed (attempt {attempt}/{retries}) :: {e}"
            )

            if attempt < retries:
                try:
                    await page.wait_for_timeout(retry_delay_ms)
                except Exception:
                    pass
            else:
                return False
        except Exception as e:
            # unexpected error ‚Üí do not loop forever
            print(f"[ERROR] fatal goto error :: {url} :: {e}")
            return False
'''        

from contextlib import asynccontextmanager

@asynccontextmanager
async def page_task(page):
    if getattr(page, "_closing", False) or page.is_closed():
        yield False
        return
    page._active_tasks += 1
    try:
        yield True
    finally:
        page._active_tasks -= 1


async def safe_close_page(page, timeout_ms=3000):
    if page.is_closed():
        return

    # üîí signal shutdown
    page._closing = True

    # üîí stop response listeners if present
    try:
        if hasattr(page, "_on_response"):
            page.remove_listener("response", page._on_response)
    except Exception:
        pass

    # üîí wait for running tasks
    waited = 0
    while getattr(page, "_active_tasks", 0) > 0 and waited < timeout_ms:
        await asyncio.sleep(0.05)
        waited += 50

    # üîí final close
    try:
        if not page.is_closed():
            await page.close()
    except Exception:
        pass

def resolve_final_url(requested_url: str, page) -> str:
    """
    Return the final resolved URL after navigation.
    Falls back safely.
    """
    try:
        final_url = page.url
        if final_url:
            return final_url
    except Exception:
        pass

    return requested_url


def detect_terminal_document(final_url: str) -> str | None:
    """
    If the final URL is a document, return it.
    Else None.
    """
    if not final_url:
        return None

    if classify_document_url(final_url):
        return final_url

    return None
def bucket_from_content_type(ct: str) -> str | None:
    ct = ct.lower()

    if "application/pdf" in ct:
        return "pdf"

    if "application/vnd.ms-excel" in ct or \
       "application/vnd.openxmlformats-officedocument.spreadsheetml" in ct or \
       "text/csv" in ct:
        return "excel"

    if "application/msword" in ct or \
       "application/vnd.openxmlformats-officedocument.wordprocessingml" in ct:
        return "word"

    if "application/vnd.ms-powerpoint" in ct or \
       "application/vnd.openxmlformats-officedocument.presentationml" in ct:
        return "ppt"
    '''
    if "application/zip" in ct or "application/x-zip-compressed" in ct:
        return "archive"

    if ct.startswith("image/"):
        return "images"
    '''
    return None


async def visit_and_collect(page, depth, url, base_url, page_category):

    # ------------------ SAFE NAVIGATION ------------------
    ok = await goto_with_retry(
        page,
        url,
        retries=3,
        timeout=PAGE_LOAD_TIMEOUT,
    )

    if not ok:
        print(f"[ERROR] Failed after retries: {url}")

        log_soft_error(
            CURRENT_COLLEGE_CONTEXT.get("college_name"),
            url,
            "Navigation failed after all retries (non-HTML resource)"
        )

        return {c: [] for c in CATEGORY_KEYWORDS.keys()}

    await safe_wait(page, 500)
    await safe_load_wait(page)


    # =================== FINAL URL RESOLUTION ===================
    final_url = resolve_final_url(url, page)

    terminal_doc = detect_terminal_document(final_url)

    if terminal_doc:
        # üîí TERMINAL DOCUMENT (PDF / Excel / Image / Drive / etc.)
        page._seen_artifacts = {}
        page._page_seen_urls = set()

        upsert_artifact(
            page,
            normalize_url(terminal_doc, ""),
            terminal_doc,
            "Final resolved document"
        )

        category_links = {c: [] for c in CATEGORY_KEYWORDS}

        accept_and_store(
            raw_url=terminal_doc,
            enriched_text="Final resolved document",
            page_category=page_category,
            category_links=category_links,
            page=page,
        )

        return category_links


    # ------------------ PAGE STATE ------------------
    page._seen_artifacts = {}
    page._page_seen_urls = set()
    page._clicked_elements = set()
    page._local_seen_pdf = set()

    # ------------------ GLOBAL DOM POISONING (SAFE) ------------------
    await safe_evaluate(page, """
    () => {
        document.querySelectorAll('*').forEach(el => {
            try {
                const cls = (el.className || '').toString().toLowerCase();
                const aria = el.getAttribute && el.getAttribute('aria-roledescription');

                if (
                    aria === 'carousel' ||
                    cls.includes('carousel') ||
                    cls.includes('slider') ||
                    cls.includes('owl-') ||
                    cls.includes('swiper') ||
                    cls.includes('slick')
                ) {
                    el.dataset.__crawler_skip = "1";
                }
            } catch(e){}
        });
    }
    """)

    if depth>=1:
        await safe_evaluate(page, """
        () => {
            document.querySelectorAll('header, nav').forEach(el => {
                el.dataset.__crawler_skip = "1";
            });

            document.querySelectorAll('*').forEach(el => {
                try {
                    const s = getComputedStyle(el);
                    if ((s.position === 'fixed' || s.position === 'sticky')) {
                        const r = el.getBoundingClientRect();
                        if (r.top <= 80 && r.width > window.innerWidth * 0.6) {
                            el.dataset.__crawler_skip = "1";
                        }
                    }
                } catch(e){}
            });
        }
        """)

    category_links = {c: [] for c in CATEGORY_KEYWORDS}

    # ------------------ NETWORK Document(PDF) CAPTURE ------------------
    async def on_response(response):
        if page._closing or page.is_closed():
            return

        async with page_task(page) as ok:
            if not ok:
                return

            try:
                ct = (response.headers.get("content-type") or "").lower()
                bucket = bucket_from_content_type(ct)

                if not bucket:
                    return

                raw_url = response.url
                norm = normalize_url(raw_url, "")

                # track locally (for convergence)
                page._local_seen_pdf.add(norm)

                # enrich artifact store
                upsert_artifact(
                    page,
                    norm,
                    raw_url,
                    f"Network {bucket.upper()}"
                )

            except Exception:
                return

    '''
    async def on_response(response):
        if page._closing or page.is_closed():
            return

        async with page_task(page) as ok:
            if not ok:
                return

            try:
                ct = (response.headers.get("content-type") or "").lower()
                if "application/pdf" in ct or response.url.lower().endswith(".pdf"):
                    norm = normalize_url(response.url, "")
                    page._local_seen_pdf.add(norm)
                    upsert_artifact(
                        page,
                        norm,
                        response.url,
                        "Network PDF"
                    )
            except Exception:
                return
    '''
    # üîí keep reference so it can be removed safely
    page._on_response = on_response
    page.on("response", on_response)

    try:
        # ------------------ PAGE SETTLING ------------------
        await safe_wait(page, EXTRA_WAIT_MS)
        await close_popups(page)
        await opportunistic_close_popups(page)
        await auto_scroll(page)

        # ------------------ INTERACTION CONVERGENCE ------------------
        for _ in range(10):
            progress = False

            for handler in (handle_tabs, handle_accordions, handle_dropdowns):
                try:
                    _, did = await handler(page, depth)
                    if did:
                        progress = True
                        break
                except Exception:
                    continue

            if not progress:
                break

            await safe_wait(page, 300)

        # ------------------ ANCHOR EXTRACTION ------------------
        anchors = await page.query_selector_all("a[href]")

        for el in anchors:
            try:
                href = await safe_get_attr(el, "href")
                if not href:
                    continue

                full = href.strip()
                # üîí NORMALIZE BEFORE FILTERING
                if full.startswith("//"):
                    full = "https:" + full
                elif full.startswith("www."):
                    full = "https://" + full
                '''
                elif re.match(r"^[a-z0-9\-]+\.[a-z]{2,}", full):
                    full = "https://" + full
                '''
                full = urljoin(page.url, full)

                parsed = urlparse(full)
                if parsed.scheme not in ("http", "https"):
                    continue

                """
                full = urljoin(page.url, href)
                if not full.startswith(("http://", "https://")):
                    continue
                """

                text = await safe_inner_text(el)
                norm = normalize_url(full, "")

                upsert_artifact(
                    page,
                    norm,
                    full,
                    text
                )


            except Exception:
                continue

    finally:
        page.remove_listener("response", on_response)

    # ------------------ FINAL STORE ------------------
    for norm, obj in page._seen_artifacts.items():
        raw_url = obj["raw"]
        text = obj["text"]
        page_category = obj.get("page_category")

        # =====================================================
        # FINAL ROW CLASSIFICATION (single source of truth)
        # =====================================================
        doc_type = classify_document_url(raw_url)

        if doc_type:
            bucket = doc_type
        else:
            bucket = classify_url(raw_url, text)  # nonpdf_category_related / nonpdf_other

        # =====================================================
        # FINAL COLUMN DECISION (independent)
        # =====================================================
        criteria_col = infer_criteria_column(text, raw_url)
        if criteria_col:
            store_cat = criteria_col
        elif page_category:
            store_cat = page_category
        else:
            store_cat = "accreditation"

        category_links[store_cat].append(
            (bucket, f"{raw_url} || {text}")
        )


    return category_links

'''
    for frame in page.frames:
        if frame.url.lower().endswith(".pdf"):
            page._local_seen_pdf.add(normalize_url(frame.url, ""))
        frame_url = frame.url or ""
        norm = normalize_url(frame_url, "")
        old = page._seen_artifacts.get(norm)

        if should_accept_context(old, "Network PDF"):
            page._seen_artifacts[norm] = "Network PDF"

        try:
            parsed = urlparse(frame_url)
            for _, v in parse_qsl(parsed.query):
                decoded = unquote(v)
                if decoded.lower().endswith(".pdf"):
                    raw_url = decoded
                    enriched_text = "Iframe Viewer PDF"
                    bucket, cat = classify_url(raw_url, enriched_text, page_category)

                    # dedupe MUST still apply
                    norm = normalize_url(raw_url, "")
                    if norm in page._page_seen_urls:
                        continue
                    page._page_seen_urls.add(norm)

                    if not accept_url(norm):
                        continue

                    category_links[cat].append((bucket, f"{raw_url} || {enriched_text}"))

        except Exception:
            pass
    '''
    # ===================== NETWORK PDF ASSIGNMENT =====================
'''
    for item in network_pdfs:
        norm = normalize_url(item["url"], "")
        old = page._seen_artifacts.get(norm)
        new_text = "Network PDF"

        if should_accept_context(old, new_text):
            page._seen_artifacts[norm] = new_text
            assign_category(
                category_links,
                url=item["url"],
                text=new_text,
                depth=depth,
                page_category=page_category,
            )
'''








'''
def matches_category(cat: str, text: str, url: str) -> bool:
    combined = f"{text} {url}".lower()
    return any(kw in combined for kw in CATEGORY_KEYWORDS.get(cat, []))
'''

def matches_category_own(cat: str, text: str, url: str) -> bool:
    combined = normalize_token(f"_{text}_{url}_")

    for kw in CATEGORY_KEYWORDS.get(cat, []):
        norm_kw = normalize_token(kw)
        if not norm_kw:
            continue

        pattern = rf"(?:^|_){re.escape(norm_kw)}(?:_|$)"
        if re.search(pattern, combined):
            return True

    '''
    combined = f"_{text}_{url}_".lower()
    for kw in CATEGORY_KEYWORDS.get(cat, []):
        if kw and kw in combined:
            return True
    return False
'''

from urllib.parse import urlparse
import re

ACADEMIC_TLDS = {
    "ac", "edu", "org", "gov", "in"
}

def extract_domain_token(base_url: str) -> str | None:
    try:
        netloc = urlparse(base_url).netloc.lower().lstrip("www.")
        parts = netloc.split(".")

        # Example:
        # nitk.ac.in ‚Üí ["nitk","ac","in"]
        # bits-pilani.ac.in ‚Üí ["bits-pilani","ac","in"]
        # rcciit.org ‚Üí ["rcciit","org"]

        if len(parts) >= 3 and parts[-2] in ACADEMIC_TLDS:
            token = parts[-3]
        else:
            token = parts[0]

        # normalize: remove hyphens, dots
        token = re.sub(r"[^a-z0-9]", "", token)

        return token or None
    except Exception:
        return None

# üö´ NEVER useful for accreditation
HARD_BLOCK_DOMAINS = {
    "youtube", "youtu.be",
    "facebook", "fb.com",
    "instagram",
    "twitter", "x.com",
    "linkedin",
    "bing",
    "wikipedia",
    "telegram",
    "whatsapp",
    "nirfindia.org",
    "nbaind.org",
    "mhrd.gov.in",
    "shiksha.com",
    "collegedunia.com",
    "careers360.com",
    "indiastudychannel.com",
    "examresults.net",
    "entranceexam.net" ,
    "naac.gov",
    "nbaind.org",
    "nirf.gov",
    "ugc.ac",
    "aicte-india.org"
}

# ‚ö†Ô∏è External hosts that MAY contain valid docs
SOFT_EXTERNAL_HOSTS = {
    "docs.google",
    "drive.google",
    "googleusercontent",
    "onedrive.live",
    "sharepoint",
    "dropbox",
    "box.com"
}

'''
ACADEMIC_KEYWORDS = {
    "naac",
    "iqac",
    "nirf",
    "aicte",
    "nba",
    "ariia",
    "aqar",
    "accreditation",
    "approval",
    "extension of approval",
    "mandatory disclosure"
}
'''
def should_enqueue_url(url: str, text: str, institution_token: str) -> bool:
    combined = f"{url} {text}".lower()
    norm = re.sub(r"[^a-z0-9]", "", combined)

    # 1Ô∏è‚É£ Hard block (unchanged)
    for bad in HARD_BLOCK_DOMAINS:
        if bad in combined:
            return False

    '''
    # 2Ô∏è‚É£ College ownership (strongest signal)
    if institution_token and institution_token in norm:
        return True
    '''
    # 3Ô∏è‚É£ Category relevance (single source of truth)
    combined_norm = normalize_token(f"_{url}_{text}_")

    for kws in CATEGORY_KEYWORDS.values():
        for kw in kws:
            if not kw:
                continue

            norm_kw = normalize_token(kw)
            if not norm_kw:
                continue

            pattern = rf"(?:^|_){re.escape(norm_kw)}(?:_|$)"
            if re.search(pattern, combined_norm):
                return True

    # ‚ùå Otherwise skip
    return False

'''
def should_enqueue_url(
    url: str,
    text: str,
    institution_token: str
) -> bool:
    combined = f"{url} {text}".lower()

    # 1Ô∏è‚É£ HARD BLOCK (always useless)
    for bad in HARD_BLOCK_DOMAINS:
        if bad in combined:
            return False

    # 2Ô∏è‚É£ Normalize for ownership checks
    normalized = re.sub(r"[^a-z0-9]", "", combined)

    # 3Ô∏è‚É£ College ownership (strongest signal)
    if institution_token and institution_token in normalized:
        return True

    # 4Ô∏è‚É£ Academic relevance keywords
    academic_hit = any(kw in combined for kw in ACADEMIC_KEYWORDS)

    # 5Ô∏è‚É£ Soft external hosts (Google Docs, Drive, etc.)
    if any(h in combined for h in SOFT_EXTERNAL_HOSTS):
        # allow ONLY if academically relevant
        return academic_hit

    # 6Ô∏è‚É£ Non-soft external ‚Üí allow only if academic keyword exists
    if academic_hit:
        return True

    # ‚ùå Otherwise skip
    return False
'''

# ------------------------- CORE PROCESSING -------------------------
def college_to_filename(college_name: str) -> Path:
    safe = re.sub(r"[^a-zA-Z0-9]+", "_", college_name).strip("_")
    return OUTPUT_DIR / f"college_info_{safe}.xlsx"
def chunk_list(items, max_chars=30000):
    chunk, chunks = "", []
    for it in items:
        if len(chunk) + len(it) + 1 > max_chars:
            chunks.append(chunk)
            chunk = it
        else:
            chunk += ("\n" if chunk else "") + it
    if chunk:
        chunks.append(chunk)
    return chunks


def save_progress_excel(output_file):
    """
    Persist PROGRESS_ROWS to Excel.
    - Appends rows logically via PROGRESS_ROWS rebuild
    - No CSV
    - No column splitting
    - Safe for large PDF lists (handled earlier via row chunking)
    """
    if not PROGRESS_ROWS:
        return

    df = pd.DataFrame(list(PROGRESS_ROWS.values()))

    # ---- Stable column ordering ----
    base_cols = []
    for c in ("college_name", "base_url", "depth", "row_type"):
        if c in df.columns:
            base_cols.append(c)

    link_cols = sorted(
        c for c in df.columns
        if c.endswith("_links") and c not in base_cols
    )

    other_cols = [c for c in df.columns if c not in base_cols + link_cols]

    ordered_cols = base_cols + link_cols + other_cols

    try:
        df = df[ordered_cols]
    except Exception:
        pass

    # ---- Write Excel ONLY ----
    df.to_excel(output_file, index=False)

    print(
        f"[PROGRESS] Updated {output_file} "
        f"with {len(df)} row(s)."
    )

DOCUMENT_BUCKETS = {
    "pdf": "pdf",
    "excel": "excel",
    "word": "word",
    "ppt": "ppt",
    "archive": "archive",
    "google_drive": "google_drive",
    "other_document": "other_document",
    "images":"images",
}

from collections import deque

def create_empty_college_excel(college_name: str, base_url: str):
    """
    Create an empty Excel file with correct columns for a college.
    This guarantees the file exists even if no rows are ever written.
    """
    output_file = college_to_filename(college_name)

    # Build empty row structure (headers only)
    cols = ["college_name", "base_url", "depth", "row_type"]
    cols += [f"{c}_links" for c in CATEGORY_KEYWORDS.keys()]

    df = pd.DataFrame(columns=cols)

    # Optional: keep college metadata visible even if empty
    # (comment out if you prefer a fully empty sheet)
    # df.loc[0, "college_name"] = college_name
    # df.loc[0, "base_url"] = base_url

    df.to_excel(output_file, index=False)

    print(f"[INIT] Created empty Excel: {output_file}")


async def process_college(browser, college_name: str, base_url: str):
    print(f"\n[COLLEGE] Starting processing: {college_name} | {base_url}")
    
    # ‚úÖ CREATE EMPTY EXCEL IMMEDIATELY
    create_empty_college_excel(college_name, base_url)

    global PROGRESS_ROWS
    #GLOBAL_SEEN_URLS.clear()
    GLOBAL_SEEN_ARTIFACTS.clear()
    GLOBAL_SEEN_PAGES.clear()
    GLOBAL_EXCEL_URLS.clear()
    GLOBAL_BFS_URLS.clear()

    last_flushed_depth = -1
    '''
    # üîí GLOBAL URL OWNERSHIP (single source of truth)
    global GLOBAL_URL_OWNERS
    GLOBAL_URL_OWNERS.clear()
    '''

    #browser = await playwright.chromium.launch(headless=HEADLESS)
    #page = await browser.new_page( viewport={"width": 1920, "height": 1080})

    output_file = college_to_filename(college_name)
    visited_pages = set()          # pages already crawled
   
    queued_pages = set()

    start_url = base_url
    start_norm = normalize_url(start_url, "")
    #queue = [(start_url, 0, None)]
    queue = deque([(start_url, 0, None)])
    queued_pages.add(start_norm)
    #GLOBAL_SEEN_URLS.add(start_norm)
    GLOBAL_SEEN_PAGES.add(start_norm)
    start_canon = canonicalize_for_dedupe(start_norm)
    GLOBAL_BFS_URLS.add(start_canon)



    # depth -> category -> row_type -> set(entries)
    all_links_by_depth = {}

    def ensure_depth_struct(d):
        if d not in all_links_by_depth:
            all_links_by_depth[d] = {
                c: {
                    "nonpdf_category_related": set(),
                    "nonpdf_other": set(),
                    "pdf": set(),
                    "excel": set(),
                    "word": set(),
                    "ppt": set(),
                    "archive": set(),
                    "google_drive": set(),
                    "other_document": set(),
                    "images": set(),  # ‚úÖ ADD THIS
                }
                for c in CATEGORY_KEYWORDS.keys()
            }


    def build_rows():
        rows = []

        for depth, cats in sorted(all_links_by_depth.items()):
            for rt in sorted({
                rt for cats in all_links_by_depth.values()
                for cat in cats.values()
                for rt in cat.keys()
                if rt != "nonpdf_other" #can include if needed
            }): 

                # process each category independently
                for c in CATEGORY_KEYWORDS:
                    items = sorted(cats[c].get(rt, ()))
                    if not items:
                        continue

                    chunks = chunk_list(items)

                    for chunk in chunks:
                        row = {
                            "college_name": college_name,
                            "base_url": base_url,
                            "depth": depth,
                            "row_type": rt,
                        }

                        # initialize all link columns empty
                        for col in CATEGORY_KEYWORDS:
                            row[f"{col}_links"] = ""

                        # only this category gets data
                        row[f"{c}_links"] = chunk

                        rows.append(row)

        return rows

    # ===================== BFS =====================
    while queue:
        #current_url, depth, page_category = queue.pop(0)
        current_url, depth, page_category = queue.popleft()
        norm_current = normalize_url(current_url, "")

        if norm_current in visited_pages :
            continue
        # must already be globally accepted
        '''
        if norm_current not in GLOBAL_SEEN_PAGES:
            continue
        '''

        visited_pages.add(norm_current)

        print(f"[INFO] Visiting depth {depth}: {current_url}")

        # üîí UPDATE CONTEXT FOR PAGE-LEVEL ERRORS
        CURRENT_COLLEGE_CONTEXT["url"] = current_url

        #page = await browser.new_page(viewport={"width": 1920, "height": 1080})
        domain = urlparse(current_url).netloc

        async with DOMAIN_SEMAPHORES[domain]:
            page = await browser.new_page(
                viewport={"width": 1920, "height": 1080}
            )

        # üîí lifecycle guards (MANDATORY)
        page._closing = False
        page._active_tasks = 0

        try:
            # visit_and_collect returns: { category -> list[(bucket, entry)] }
            cat_links = await visit_and_collect(
                page, depth, current_url, base_url, page_category
            )
        except Exception as e:
            print(f"[WARN] Skipping page due to error: {current_url}")
            log_college_error(college_name, current_url, e)
            continue
        finally:
            await safe_close_page(page)

        ensure_depth_struct(depth)

        # ---------- INSERT INTO EXCEL STRUCTURE ----------
        for cat, items in cat_links.items():
            for bucket,entry in items:
                parts = [p.strip() for p in entry.split("||", 1)]
                url = parts[0]
                text = parts[1] if len(parts) > 1 else ""

                url = url.strip()
                text = text.strip()

                canon = canonicalize_for_dedupe(url)

                # üîí FINAL EXCEL DEDUPE
                if canon in GLOBAL_EXCEL_URLS:
                    continue

                GLOBAL_EXCEL_URLS.add(canon)

                # bucket already decided earlier ‚Äî NO re-classification
                # üîç Classify at ROW CREATION TIME
                if bucket in DOCUMENT_BUCKETS:
                    all_links_by_depth[depth][cat][bucket].add(entry)
                else:
                    # non-document logic stays unchanged
                    if bucket == "nonpdf_category_related":
                        all_links_by_depth[depth][cat]["nonpdf_category_related"].add(entry)
                    else:
                        all_links_by_depth[depth][cat]["nonpdf_other"].add(entry)

        # ================== INCREMENTAL DEPTH FLUSH ==================

        # üîí FORCE FLUSH EVEN IF NO BFS ENQUEUE
        # If this page produced ANY stored artifacts, write them immediately

        produced_any = any(
            all_links_by_depth[depth][c][rt]
            for c in CATEGORY_KEYWORDS
            for rt in all_links_by_depth[depth][c]
        )

        if produced_any and depth >= last_flushed_depth:
            new_rows = []

            for rt in (
                "pdf", "excel", "word", "ppt",
                "archive", "google_drive", "other_document",
                "images", "nonpdf_category_related"
            ):
                for c in CATEGORY_KEYWORDS:
                    items = sorted(all_links_by_depth[depth][c].get(rt, ()))
                    if not items:
                        continue

                    for chunk in chunk_list(items):
                        row = {
                            "college_name": college_name,
                            "base_url": base_url,
                            "depth": depth,
                            "row_type": rt,
                        }
                        for col in CATEGORY_KEYWORDS:
                            row[f"{col}_links"] = ""
                        row[f"{c}_links"] = chunk
                        new_rows.append(row)

            if new_rows:
                start = len(PROGRESS_ROWS)
                for i, r in enumerate(new_rows, start=start):
                    PROGRESS_ROWS[f"{college_name}___{i}"] = r

                save_progress_excel(output_file)

            last_flushed_depth = depth

        # ---------- QUEUE EXPANSION ----------
        for category in CATEGORY_KEYWORDS.keys():
            for entry in all_links_by_depth[depth][category]["nonpdf_category_related"]:
                parts = entry.split("||", 1)
                url = entry.split("||")[0].strip()
                text = parts[1].strip() if len(parts) > 1 else ""
                
                # üîí HARD STOP: never crawl documents
                doc_type = classify_document_url(url)
                if doc_type:
                    continue
                
                # üîí HARD STOP: never crawl binary / media URLs
                if is_non_crawlable_url(url):
                    continue

                if is_pdf(url):
                    continue
                
                institution_token = extract_domain_token(base_url)

                if not should_enqueue_url(
                    url=url,
                    text=text,
                    institution_token=institution_token
                ):
                    continue


                '''
                # üîí CRITICAL: enforce intrinsic category match
                if not matches_category_own(category, text, url):
                    continue
                '''
                norm = normalize_url(url, "")
                if norm in visited_pages or norm in queued_pages:
                    continue

                # üîí IQAC ADMIN PAGE ‚Äî extract but DO NOT enqueue
                if category == "iqac":
                    if is_iqac_admin_page(f"{url} {text}"):
                        continue  # ‚ùå skip BFS enqueue only

                canon = canonicalize_for_dedupe(url)
                # üîí FINAL BFS DEDUPE
                if canon in GLOBAL_BFS_URLS:
                    continue

                GLOBAL_BFS_URLS.add(canon)

                queue.append((url, depth + 1, category))
                queued_pages.add(norm)
                #GLOBAL_SEEN_PAGES.add(norm)


    #await browser.close()
    return build_rows()



# ------------------------- EXCEL HELPERS -------------------------


def build_row_dict(college_name: str, base_url: str, all_category_links: dict) -> dict:
    def join_sorted(s):
        return "\n".join(sorted(s)) if s else ""
    return {
        "college_name": college_name,
        "base_url": base_url,
        "mandatory_disclosure_links": join_sorted(all_category_links.get("mandatory_disclosure", set())),
        "naac_links": join_sorted(all_category_links.get("naac", set())),
        "nba_links": join_sorted(all_category_links.get("nba", set())),
        "nirf_links": join_sorted(all_category_links.get("nirf", set())),
        "iqac_links": join_sorted(all_category_links.get("iqac", set())),
        "aicte_links": join_sorted(all_category_links.get("aicte", set())),
        "aqar_links": join_sorted(all_category_links.get("aqar", set())),
        "ariia_links": join_sorted(all_category_links.get("ariia", set())),
        "accreditation_links": join_sorted(all_category_links.get("accreditation", set())),
        "criteria_links": join_sorted(all_category_links.get("criteria", set())),
    }



  '''


In [89]:
'''
Docstring for source_links_extractio_from_url_2_0 copy 6-final.ipynb


sudo apt update
sudo apt install python3.12-venv

python3 -m venv .venv

source .venv/bin/activate

pip install pandas playwright openpyxl

playwright install

'''


'\nDocstring for source_links_extractio_from_url_2_0 copy 6-final.ipynb\n\n\nsudo apt update\nsudo apt install python3.12-venv\n\npython3 -m venv .venv\n\nsource .venv/bin/activate\n\npip install pandas playwright openpyxl\n\nplaywright install\n\n'

In [4]:



import pandas as pd

def load_colleges_from_excel(excel_path, sheet_name="Sheet1"):
    df = pd.read_excel(excel_path, sheet_name=sheet_name)

    required_cols = {"College Name", "College Website URL"}
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Excel must contain columns: {required_cols}. Found: {df.columns}"
        )

    colleges = []

    for _, row in df.iterrows():
        name = str(row["College Name"]).strip()
        url = str(row["College Website URL"]).strip()

        if not name or not url or url.lower() == "nan":
            continue

        colleges.append({
            "college_name": name,
            "base_url": url
        })

    print(f"[INFO] Loaded {len(colleges)} colleges from Excel")
    return colleges




# ------------------------- RUNNER -------------------------
async def run_scraping(colleges):
    setup_asyncio_exception_logger()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=HEADLESS)
        number =0
        for c in colleges:
            number +=1
            global PROGRESS_ROWS
            PROGRESS_ROWS = {}

            name = f"{number}_{c.get('college_name')}"
            url = c.get("base_url")
            # üîí UPDATE GLOBAL CONTEXT
            CURRENT_COLLEGE_CONTEXT["college_name"] = name
            CURRENT_COLLEGE_CONTEXT["url"] = url
            
            if not name or not url:
                print(f"[WARN] Skipping invalid entry: {c}")
                continue

            try:
                await process_college(browser, name, url)

            except Exception as e:
                print(f"[ERROR] Failed college: {name}")
                print(f"[ERROR] URL: {url}")
                print(f"[ERROR] Logged to file.")

                log_college_error(name, url, e)
                continue

        await browser.close()

    print("[DONE] All colleges processed.")


async def run_scraping_from_excel():
    colleges = load_colleges_from_excel(
        EXCEL_INPUT_FILE,
        sheet_name=SHEET_NAME
    )
    await run_scraping(colleges)


    '''
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=HEADLESS)

            for c in colleges:
                global PROGRESS_ROWS
                PROGRESS_ROWS = {}

                name = c["college_name"]
                url = c["base_url"]

                print(f"\n[INFO] Processing: {name}")
                try:
                    await process_college(browser, name, url)

                except Exception as e:
                    print(f"[ERROR] Failed college: {name}")
                    print(f"[ERROR] URL: {url}")
                    print(f"[ERROR] Logged to file.")

                    log_college_error(name, url, e)
                    continue

            await browser.close()

        print("[DONE] All colleges processed.")

    '''


# ------------------------- USAGE EXAMPLE -------------------------
# example = [{"college_name": "Guru Nanak Institute of Technology", "base_url": "https://www.gnithyd.ac.in/"}]
# df = asyncio.run(run_scraping(example))


In [36]:
example = [
     {
        "college_name": "just_testing",
        "base_url":" https://www.dttcollege.org"
    },

]

df = await run_scraping(example)
df


[COLLEGE] Starting processing: just_testing |  https://www.dttcollege.org
[INIT] Created empty Excel: output_college_info_6_7/college_info_just_testing.xlsx
[INFO] Visiting depth 0:  https://www.dttcollege.org
[DEBUG] Scroll round 1/3 completed
nonpdf-category (keyword-regex): url:https://www.dttcollege.org/dttc.php?id=Mandatory, text:, pattern:(?:^|_)mandatory(?:_|$), combined:__https___www_dttcollege_org_dttc_php_id_mandatory_
[PROGRESS] Updated output_college_info_6_7/college_info_just_testing.xlsx with 2 row(s).
[INFO] Visiting depth 1: https://www.dttcollege.org/dttc.php?id=Mandatory
[DEBUG] Scroll round 1/3 completed
nonpdf-category (keyword-regex): url:https://www.dttcollege.org/dttc.php?id=Mandatory, text:Infrastructure, pattern:(?:^|_)mandatory(?:_|$), combined:_infrastructure_https___www_dttcollege_org_dttc_php_id_mandatory_
nonpdf-category (keyword-regex): url:https://www.dttcollege.org/dttc.php?id=mandatory_disclosure, text:Mandatory Disclosure 2022-23, pattern:(?:^|_)mand

In [14]:
example = [
     {
        "college_name": "just_testing_1",
        "base_url":" https://www.dttcollege.org"
    },

]

df = await run_scraping(example)
df


[COLLEGE] Starting processing: just_testing_1 |  https://www.dttcollege.org
[INIT] Created empty Excel: output_college_info_6_7/college_info_just_testing_1.xlsx
[INFO] Visiting depth 0:  https://www.dttcollege.org
[DEBUG] Scroll round 1/3 completed
nonpdf-other: url:https://www.dttcollege.org/index.php, text:Home, combined:home_https___www_dttcollege_org_index_php
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=About Us, text:About College, combined:about_college_https___www_dttcollege_org_dttc_php_id_about_us
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=History, text:History of College, combined:history_of_college_https___www_dttcollege_org_dttc_php_id_history
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=Vision, text:Vision, combined:vision_https___www_dttcollege_org_dttc_php_id_vision
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=Mission, text:Mission Statement, combined:mission_statement_https___www_dttcollege_org_dttc_php_id_mission
nonp

In [25]:
example = [
     {
        "college_name": "just_testing_2",
        "base_url":" https://www.dttcollege.org"
    },

]

df = await run_scraping(example)
df


[COLLEGE] Starting processing: just_testing_2 |  https://www.dttcollege.org
[INIT] Created empty Excel: output_college_info_6_7/college_info_just_testing_2.xlsx
[INFO] Visiting depth 0:  https://www.dttcollege.org
[DEBUG] Scroll round 1/3 completed
nonpdf-other: url:https://www.dttcollege.org/index.php, text:Home, combined:_home_https___www_dttcollege_org_index_php_
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=About Us, text:About College, combined:_about_college_https___www_dttcollege_org_dttc_php_id_about_us_
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=History, text:History of College, combined:_history_of_college_https___www_dttcollege_org_dttc_php_id_history_
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=Vision, text:Vision, combined:_vision_https___www_dttcollege_org_dttc_php_id_vision_
nonpdf-other: url:https://www.dttcollege.org/dttc.php?id=Mission, text:Mission Statement, combined:_mission_statement_https___www_dttcollege_org_dttc_php_id_mis

In [30]:
example = [
     {
        "college_name": "just_testing_3",
        "base_url":"http://www.aarm.ac.in"
    },

]

df = await run_scraping(example)
df


[COLLEGE] Starting processing: just_testing_3 | http://www.aarm.ac.in
[INIT] Created empty Excel: output_college_info_6_7/college_info_just_testing_3.xlsx
[INFO] Visiting depth 0: http://www.aarm.ac.in
[DEBUG] Scroll round 1/3 completed
nonpdf-category (keyword-regex): url:http://www.aarm.ac.in/mandatory-disclosure.php, text:, pattern:(?:^|_)mandatory(?:_|$), combined:__http___www_aarm_ac_in_mandatory_disclosure_php_
[PROGRESS] Updated output_college_info_6_7/college_info_just_testing_3.xlsx with 4 row(s).
[INFO] Visiting depth 1: http://www.aarm.ac.in/mandatory-disclosure.php
[DEBUG] Scroll round 1/3 completed
nonpdf-category (keyword-regex): url:http://www.aarm.ac.in/mandatory-disclosure.php#admissionForm, text:Click here, pattern:(?:^|_)mandatory(?:_|$), combined:_click_here_http___www_aarm_ac_in_mandatory_disclosure_php_admissionform_
nonpdf-category (keyword-regex): url:http://www.aarm.ac.in/aicte.php, text:AICTE EOA, pattern:(?:^|_)aicte(?:_|$), combined:_aicte_eoa_http___www_aa

In [8]:
await run_scraping_from_excel()

[INFO] Loaded 345 colleges from Excel

[COLLEGE] Starting processing: 1_DR. B.R. AMBEDKAR INSTITUTE OF TECHNOLOGY | https://drait.edu.in/
[INIT] Created empty Excel: output_college_info_6_7/college_info_1_DR_B_R_AMBEDKAR_INSTITUTE_OF_TECHNOLOGY.xlsx
[INFO] Visiting depth 0: https://drait.edu.in/
[DEBUG] Scroll round 1/3 completed
[PROGRESS] Updated output_college_info_6_7/college_info_1_DR_B_R_AMBEDKAR_INSTITUTE_OF_TECHNOLOGY.xlsx with 1 row(s).
[INFO] Visiting depth 1: https://drait.edu.in/home/NIRF
[DEBUG] Scroll round 1/3 completed
[PROGRESS] Updated output_college_info_6_7/college_info_1_DR_B_R_AMBEDKAR_INSTITUTE_OF_TECHNOLOGY.xlsx with 2 row(s).
[INFO] Visiting depth 1: https://drait.edu.in/home/Affiliation-from-VTU
[DEBUG] Scroll round 1/3 completed
[PROGRESS] Updated output_college_info_6_7/college_info_1_DR_B_R_AMBEDKAR_INSTITUTE_OF_TECHNOLOGY.xlsx with 4 row(s).
[INFO] Visiting depth 1: https://drait.edu.in/home/IQAC-Coordinators
[DEBUG] Scroll round 1/3 completed
[PROGRESS] U

  '''


CancelledError: 

In [None]:
example = [
     {
        "college_name": "Cooch Behar Government Engineering College",
        "base_url": "http://cgec.org.in/",
    },

]

df = await run_scraping(example)
df




[COLLEGE] Starting processing: Cooch Behar Government Engineering College | http://cgec.org.in/
[INIT] Created empty Excel: output_college_info_6_7/college_info_Cooch_Behar_Government_Engineering_College.xlsx
[INFO] Visiting depth 0: http://cgec.org.in/
[DEBUG] Scroll round 1/3 completed
[PROGRESS] Updated output_college_info_6_7/college_info_Cooch_Behar_Government_Engineering_College.xlsx with 4 row(s).
[INFO] Visiting depth 1: https://cgec.org.in/37_aicte_approval.php
[PROGRESS] Updated output_college_info_6_7/college_info_Cooch_Behar_Government_Engineering_College.xlsx with 5 row(s).
[DONE] All colleges processed.


In [37]:
example = [
    {"college_name": "pallavi_engineering_college", "base_url": "https://pallaviengineeringcollege.ac.in/"},
]

df = await run_scraping(example)
df




[COLLEGE] Starting processing: pallavi_engineering_college | https://pallaviengineeringcollege.ac.in/
[INIT] Created empty Excel: output_college_info_6_7/college_info_pallavi_engineering_college.xlsx
[INFO] Visiting depth 0: https://pallaviengineeringcollege.ac.in/
[DEBUG] Scroll round 1/3 completed
[DEBUG] Scroll round 2/3 completed
nonpdf-category (keyword-regex): url:https://pallaviengineeringcollege.ac.in/eligibility-criteria.php, text:Eligibility Criteria, pattern:(?:^|_)criteria(?:_|$), combined:_eligibility_criteria_https___pallaviengineeringcollege_ac_in_eligibility_criteria_php_
nonpdf-category (keyword-regex): url:https://pallaviengineeringcollege.ac.in/mandatory-disclosure.php, text:Mandatory Disclosure, pattern:(?:^|_)mandatory(?:_|$), combined:_mandatory_disclosure_https___pallaviengineeringcollege_ac_in_mandatory_disclosure_php_
nonpdf-category (keyword-regex): url:https://www.aicte-india.org/feedback/index.php, text:AICTE
                                    Feedback, pa

  '''


CancelledError: 

In [7]:
example_colleges = [
    {
        "college_name": "Siddhartha Institute of Technology and Sciences",
        "base_url": "https://siddhartha.org.in/",
    },

]

await run_scraping(example_colleges)


[INFO] Visiting depth 0: https://siddhartha.org.in/
[DEBUG] Scroll round 1/3 completed
[DEBUG] Scroll round 2/3 completed
[PROGRESS] Updated output_college_info/college_info_Siddhartha_Institute_of_Technology_and_Sciences.xlsx with 2 row(s).
[INFO] Visiting depth 1: https://siddhartha.org.in/iqac/
[DEBUG] Scroll round 1/3 completed
[DEBUG] Scroll round 2/3 completed
[PROGRESS] Updated output_college_info/college_info_Siddhartha_Institute_of_Technology_and_Sciences.xlsx with 5 row(s).
[INFO] Visiting depth 1: https://siddhartha.org.in/accreditations/
[DEBUG] Scroll round 1/3 completed
[DEBUG] Scroll round 2/3 completed
[PROGRESS] Updated output_college_info/college_info_Siddhartha_Institute_of_Technology_and_Sciences.xlsx with 5 row(s).
[DONE] All colleges processed.


In [25]:
example_colleges = [
    {
        "college_name": "VIGNAN'S INSTITUTE OF MANAGEMENT AND TECHNOLOGY FOR WOMEN",
        "base_url": "https://www.vmtw.in/",
    },
]

await run_scraping(example_colleges)


[INFO] Visiting depth 0: https://www.vmtw.in/
[DEBUG] Scroll round 1/3 completed
[INFO] Page fully converged ‚Äî stopping interactions
[PROGRESS] Updated output_college_info_5/college_info_VIGNAN_S_INSTITUTE_OF_MANAGEMENT_AND_TECHNOLOGY_FOR_WOMEN.xlsx and output_college_info_5/college_info_VIGNAN_S_INSTITUTE_OF_MANAGEMENT_AND_TECHNOLOGY_FOR_WOMEN.csv with 3 row(s).
[INFO] Visiting depth 1: https://www.vmtw.in/naac.php
[DEBUG] Scroll round 1/3 completed
[INFO] Page fully converged ‚Äî stopping interactions
[PROGRESS] Updated output_college_info_5/college_info_VIGNAN_S_INSTITUTE_OF_MANAGEMENT_AND_TECHNOLOGY_FOR_WOMEN.xlsx and output_college_info_5/college_info_VIGNAN_S_INSTITUTE_OF_MANAGEMENT_AND_TECHNOLOGY_FOR_WOMEN.csv with 6 row(s).
[INFO] Visiting depth 1: https://www.vmtw.in/nirf.php
[DEBUG] Scroll round 1/3 completed
[INFO] Page fully converged ‚Äî stopping interactions
[PROGRESS] Updated output_college_info_5/college_info_VIGNAN_S_INSTITUTE_OF_MANAGEMENT_AND_TECHNOLOGY_FOR_WOMEN.x