In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# CONFIGURATION
BASE_URL = "https://wiki.metakgp.org"
SEED_URL = "https://wiki.metakgp.org/w/Special:AllPages"

# Namespaces to strictly ignore
IGNORED_NAMESPACES = [
    "Special:", "Talk:", "User:", "User_talk:", "Metakgp:", 
    "Metakgp_talk:", "File:", "File_talk:", "MediaWiki:", 
    "Template:", "Help:", "Category:", "Category_talk:"
]

def crawl_all_urls():
    print("üï∏Ô∏è  Starting Universal Crawler (v3)...")
    current_url = SEED_URL
    all_links = []
    page_counter = 1

    while current_url:
        print(f"üìñ Reading Page {page_counter}...")
        
        try:
            response = requests.get(current_url)
            if response.status_code != 200:
                print(f"‚ùå Failed to load: {current_url}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            # --- 1. REMOVE NOISE (Sidebar & Footer) ---
            # We destroy the sidebar and footer from the soup object before searching.
            # This ensures we don't accidentally grab "Main Page" or "About" links.
            for garbage in soup.find_all(class_=['mw-panel', 'vector-menu-portal', 'footer', 'mw-footer']):
                garbage.decompose()
            for garbage in soup.find_all(id=['mw-panel', 'footer', 'mw-navigation']):
                garbage.decompose()

            # --- 2. FIND ALL REMAINING LINKS ---
            # Now the only links left should be in the content area.
            links = soup.find_all('a', href=True)
            found_on_this_page = 0
            
            for link in links:
                href = link['href']
                full_url = urljoin(BASE_URL, href)
                
                # --- 3. FILTER LOGIC ---
                # A. Must be a Wiki link
                if "/wiki/" not in href:
                    continue
                
                # B. Must NOT be an Admin/System page
                is_banned = False
                for ns in IGNORED_NAMESPACES:
                    if ns in href:
                        is_banned = True
                        break
                
                if is_banned:
                    continue

                # C. Must NOT be the "Next Page" pagination link
                if "Next page" in link.text or "Previous page" in link.text:
                    continue

                # If we passed all checks, it's a valid article!
                all_links.append(full_url)
                found_on_this_page += 1

            print(f"   -> Found {found_on_this_page} valid links on this page.")
            
            # --- DEBUG: If 0 found, print what we DID see to help debug ---
            if found_on_this_page == 0:
                print("   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):")
                for l in links[:5]:
                    print(f"      - Text: '{l.text}' | Href: '{l['href']}'")

            # --- 4. PAGINATION ---
            # We look for the "Next page" link specifically.
            next_link = None
            # Re-fetch all links including navigation (since we decomposed them earlier, 
            # we might need to check if we deleted the nav. 
            # Actually, the 'Next' link is usually in the content body or top/bottom of list.
            # If we decomposed 'mw-navigation', we might have killed it.
            # Let's check the UN-MODIFIED text for pagination link.
            
            # Strategy: Search the raw text for the 'Next page' link pattern if soup failed
            pagination_soup = BeautifulSoup(response.text, 'html.parser') # Fresh soup
            nav_links = pagination_soup.find_all("a", href=True)
            
            for link in nav_links:
                if "Next page" in link.text:
                    next_link = urljoin(BASE_URL, link['href'])
                    break
            
            if next_link:
                current_url = next_link
                page_counter += 1
                time.sleep(0.5)
            else:
                print("‚úÖ Reached end of the list (No 'Next page' link found).")
                break

        except Exception as e:
            print(f"CRITICAL ERROR: {e}")
            break

    print(f"\nüéâ Crawler Finished! Found {len(all_links)} total pages.")
    return all_links

if __name__ == "__main__":
    final_list = crawl_all_urls()

üï∏Ô∏è  Starting Universal Crawler (v3)...
üìñ Reading Page 1...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 2...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and rejected):
      - Text: 'Jump to content' | Href: '#bodyContent'
      - Text: 'Main page' | Href: '/w/Main_Page'
      - Text: 'Yellow pages' | Href: '/w/Yellow_pages'
      - Text: 'Recent changes' | Href: '/w/Special:RecentChanges'
      - Text: 'Random article' | Href: '/w/Special:Random'
üìñ Reading Page 3...
   -> Found 0 valid links on this page.
   ‚ö†Ô∏è DEBUG: Here are the first 5 raw links I saw (and 