In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import re

# Configuration
CATEGORY = "Science"
DEPTH = 1
MAX_ROWS = 1000
OUTPUT_FILE = f"wikipedia_{CATEGORY.lower()}_expanded.csv"
BASE_URL = "https://en.wikipedia.org"

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; WikipediaScraper/1.0; +https://yourdomain.com/bot)'
}

session = requests.Session()
session.headers.update(HEADERS)

def get_soup(url, retries=3, backoff=2):
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except (requests.RequestException, requests.Timeout) as e:
            print(f"Error fetching {url}: {e}")
            if attempt < retries - 1:
                sleep_time = backoff ** attempt
                print(f"Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                print("Max retries reached. Skipping this URL.")
                return None

def get_first_edit_year(article_url):
    # Use MediaWiki API as future improvement (optional)
    history_url = article_url + "?action=history&limit=1&dir=next"
    soup = get_soup(history_url)
    if not soup:
        return 'Unknown'
    history_entry = soup.find('a', class_='mw-changeslist-date')
    if history_entry:
        match = re.search(r'\d{4}', history_entry.text)
        return match.group(0) if match else 'Unknown'
    return 'Unknown'

def extract_article_data(article_url):
    soup = get_soup(article_url)
    if not soup:
        return None

    data = {}

    # Title
    title = soup.find('h1', id='firstHeading')
    data['Title'] = title.text.strip() if title else 'Unknown'

    # Summary (first non-empty paragraph)
    content = soup.find('div', class_='mw-parser-output')
    summary = None
    if content:
        for p in content.find_all('p', recursive=False):
            text = p.get_text(strip=True)
            if text:
                summary = text
                break
    data['Summary'] = summary if summary else 'Unknown'

    # Categories
    cat_div = soup.find('div', id='mw-normal-catlinks')
    if cat_div:
        categories = [a.text for a in cat_div.find_all('a')[1:]]  # skip 'Categories'
        data['Categories'] = ';'.join(categories) if categories else 'Unknown'
    else:
        data['Categories'] = 'Unknown'

    # References count
    refs = soup.find_all('ol', class_='references')
    ref_count = 0
    if refs:
        try:
            ref_count = sum(len(ol.find_all('li')) for ol in refs)
        except Exception:
            ref_count = 0
    data['References'] = ref_count

    # Internal links count in main content
    content_div = soup.find('div', id='mw-content-text')
    internal_links_count = 0
    if content_div:
        internal_links_count = len([a for a in content_div.find_all('a', href=True)
                                    if a['href'].startswith('/wiki/') and
                                    not a['href'].startswith('/wiki/Category:') and
                                    not a['href'].startswith('/wiki/File:')])
    data['Links'] = internal_links_count

    # External links count in main content
    external_links_count = 0
    if content_div:
        external_links_count = len([a for a in content_div.find_all('a', href=True)
                                    if a['href'].startswith('http') and
                                    'wikipedia.org' not in a['href']])
    data['External_Links'] = external_links_count

    # Last edited date
    footer = soup.find('li', id='footer-info-lastmod')
    if footer and footer.text:
        parts = footer.text.split('on ')
        if len(parts) > 1:
            date_part = parts[1].split(', at')[0].strip()
            data['Last_Edited'] = date_part
        else:
            data['Last_Edited'] = 'Unknown'
    else:
        data['Last_Edited'] = 'Unknown'

    # Word count
    word_count = 0
    if content:
        text = ' '.join(p.get_text() for p in content.find_all('p'))
        word_count = len(text.split())
    data['Word_Count'] = word_count

    # Image count in main content
    image_count = 0
    if content:
        image_count = len(content.find_all('img'))
    data['Image_Count'] = image_count

    # Section count (h2 and h3 headings in main content)
    section_count = 0
    if content:
        section_count = len(content.find_all(['h2', 'h3']))
    data['Section_Count'] = section_count

    # First edit year
    data['First_Edit_Year'] = get_first_edit_year(article_url)

    return data

def get_article_urls(category_url, depth, visited_categories=None):
    if visited_categories is None:
        visited_categories = set()

    article_urls = set()
    subcategories = []

    soup = get_soup(category_url)
    if not soup:
        return article_urls, subcategories

    # Articles in this category
    content_div = soup.find('div', id='mw-pages')
    if content_div:
        for link in content_div.find_all('a', href=True):
            href = link['href']
            if href.startswith('/wiki/') and not href.startswith('/wiki/Category:') and not href.startswith('/wiki/File:'):
                full_url = BASE_URL + href
                article_urls.add(full_url)

    # Pagination handling for articles
    next_page_link = content_div.find('a', string='next page') if content_div else None
    if next_page_link:
        next_page_url = BASE_URL + next_page_link['href']
        more_articles, more_subcats = get_article_urls(next_page_url, depth, visited_categories)
        article_urls.update(more_articles)
        subcategories.extend(more_subcats)

    # Subcategories
    subcat_div = soup.find('div', id='mw-subcategories')
    if subcat_div and depth > 0:
        for link in subcat_div.find_all('a', href=True):
            href = link['href']
            if href.startswith('/wiki/Category:'):
                full_url = BASE_URL + href
                if full_url not in visited_categories:
                    visited_categories.add(full_url)
                    subcategories.append(full_url)

        # For each subcategory, recurse only if depth allows
        for subcat_url in list(subcategories):
            if depth > 0:
                sub_articles, _ = get_article_urls(subcat_url, depth - 1, visited_categories)
                article_urls.update(sub_articles)

    return article_urls, subcategories

def crawl_category(category, depth):
    category_url = f"{BASE_URL}/wiki/Category:{category.replace(' ', '_')}"
    article_urls, _ = get_article_urls(category_url, depth)
    # Limit and return as list
    return list(article_urls)[:MAX_ROWS]


print(f"Starting crawl for category '{CATEGORY}' with depth {DEPTH} (max {MAX_ROWS} articles)...")
article_urls = crawl_category(CATEGORY, DEPTH)
print(f"Collected {len(article_urls)} article URLs to scrape.")

with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Title', 'Summary', 'Categories', 'References', 'Links', 'Last_Edited',
                  'Word_Count', 'Image_Count', 'Section_Count', 'External_Links', 'First_Edit_Year']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i, url in enumerate(article_urls):
        data = extract_article_data(url)
        if data:
            writer.writerow(data)
            print(f"[{i+1}/{len(article_urls)}] Scraped: {data['Title']}")
        else:
            print(f"[{i+1}/{len(article_urls)}] Failed to scrape: {url}")
        time.sleep(random.uniform(1, 2))  # Be polite to Wikipedia

print(f"Scraping complete! Data saved to {OUTPUT_FILE}")

Starting crawl for category 'Science' with depth 1 (max 1000 articles)...
Collected 1000 article URLs to scrape.
[1/1000] Scraped: List of discoveries influenced by chance circumstances
[2/1000] Scraped: ABC@Home


KeyboardInterrupt: 