# **WEB SCRAPING**

In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import xml.etree.ElementTree as ET
from tqdm import tqdm
import time
import json

**Database Setup**

In [None]:
def setup_db(db_path):
    """Open the SQLite database at ``db_path`` and make sure the tables exist.

    Three tables are created if they are missing: ``pages`` (one row per
    URL, with a UNIQUE constraint on ``url``), ``page_sections`` and
    ``page_tables``.

    Args:
        db_path: Path handed to ``sqlite3.connect``.

    Returns:
        The open connection, with the schema changes committed.
    """
    ddl_statements = (
        """
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT UNIQUE,
        title TEXT,
        content TEXT
    );
    """,
        """
    CREATE TABLE IF NOT EXISTS page_sections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT,
        section_heading TEXT,
        section_content TEXT
    );
    """,
        """
    CREATE TABLE IF NOT EXISTS page_tables (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT,
        row_content TEXT
    );
    """,
    )

    connection = sqlite3.connect(db_path)
    cur = connection.cursor()
    # Run the statements in declaration order: pages, sections, tables.
    for ddl in ddl_statements:
        cur.execute(ddl)

    connection.commit()
    return connection

**Parsing Sitemaps**

In [None]:
def parse_sitemap(sitemap_url):
    """Collect page URLs from every child sitemap listed in a sitemap index.

    Fetches ``sitemap_url``, reads each ``<sitemap><loc>`` entry in the
    sitemaps.org 0.9 namespace and passes its URL to
    ``parse_nested_sitemap``.

    Args:
        sitemap_url: URL of the sitemap index document.

    Returns:
        A list of the URLs returned by the nested sitemaps. The list is
        empty when the response status is not 200 or when fetching or
        parsing raises (the error is printed, not re-raised).
    """
    urls = []
    try:
        # requests applies no timeout by default; bound the wait so a
        # stalled server cannot hang the whole run.
        response = requests.get(sitemap_url, timeout=10)
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            for child in root.findall('ns:sitemap', namespace):
                loc = child.find('ns:loc', namespace)
                if loc is None:
                    continue
                # <loc> may be empty or padded with whitespace/newlines.
                nested_url = (loc.text or "").strip()
                if nested_url:
                    urls.extend(parse_nested_sitemap(nested_url))
    except Exception as e:
        print(f"Error parsing sitemap index: {e}")
    return urls

def parse_nested_sitemap(nested_url):
    """Return the page URLs listed in a single (non-index) sitemap.

    Fetches ``nested_url`` and reads each ``<url><loc>`` entry in the
    sitemaps.org 0.9 namespace.

    Args:
        nested_url: URL of the sitemap document to read.

    Returns:
        A list of non-empty URL strings with surrounding whitespace
        removed. The list is empty when the response status is not 200 or
        when fetching or parsing raises (the error is printed, not
        re-raised).
    """
    urls = []
    try:
        # requests applies no timeout by default; bound the wait so a
        # stalled server cannot hang the whole run.
        response = requests.get(nested_url, timeout=10)
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            for url in root.findall('ns:url', namespace):
                loc = url.find('ns:loc', namespace)
                if loc is None:
                    continue
                # Skip empty <loc/> elements (text is None) and trim the
                # whitespace that pretty-printed sitemaps leave around URLs.
                page_url = (loc.text or "").strip()
                if page_url:
                    urls.append(page_url)
    except Exception as e:
        print(f"Error parsing nested sitemap: {e}")
    return urls

def extract_all_urls(sitemap_url):
    """Return every page URL reachable from the sitemap index at ``sitemap_url``.

    Thin wrapper that hands the work to ``parse_sitemap``.
    """
    discovered = parse_sitemap(sitemap_url)
    return discovered

Check for duplicates when re-scraping

In [None]:
def is_url_in_db(conn, url):
    """Report whether ``url`` already has a row in the ``pages`` table.

    Args:
        conn: Open SQLite connection containing a ``pages`` table.
        url: URL to look up.

    Returns:
        True if a matching row exists, otherwise False.
    """
    match = conn.execute("SELECT 1 FROM pages WHERE url = ?", (url,)).fetchone()
    return match is not None

**Content Extraction**

In [None]:
def extract_main_content(soup):
    """Return all text of the parsed page as one newline-separated string.

    Args:
        soup: Parsed document exposing ``get_text`` (a BeautifulSoup object
            in this script).

    Returns:
        The result of ``soup.get_text`` with a newline separator and
        stripping enabled.
    """
    page_text = soup.get_text(strip=True, separator="\n")
    return page_text

def extract_sections(soup):
    """Split a page into (heading, text) pairs keyed by its h2/h3 headings.

    For every ``h2``/``h3`` element, the text of the following siblings is
    gathered until the next ``h2``/``h3`` sibling is reached. Sections with
    an empty heading or no text are dropped.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of ``(section_heading, section_content)`` tuples in document
        order.
    """
    heading_tags = ['h2', 'h3']
    collected = []
    for heading in soup.find_all(heading_tags):
        title = heading.get_text(strip=True)

        # Gather sibling text up to (not including) the next heading.
        pieces = []
        for node in heading.find_next_siblings():
            if node.name in heading_tags:
                break
            pieces.append(node.get_text(separator=" ", strip=True))

        body = " ".join(pieces).strip()
        if title and body:
            collected.append((title, body))
    return collected

def extract_tables(soup):
    """Flatten every HTML table on the page into a list of rows.

    Each ``tr`` of each ``table`` becomes a list holding the stripped text
    of its ``td``/``th`` cells. Rows without any cells are omitted. Rows
    from different tables are appended to the same flat list.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of rows, each a list of cell strings.
    """
    extracted = []
    for table in soup.find_all("table"):
        for tr in table.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
            if not cells:
                continue
            extracted.append(cells)
    return extracted


**Save Data to DB**

In [None]:
def save_page_data(conn, url, title, content):
    """Insert one page row, silently skipping URLs that are already stored.

    Args:
        conn: Open SQLite connection containing a ``pages`` table.
        url: Page URL.
        title: Page title text.
        content: Full page text.
    """
    row = (url, title, content)
    # INSERT OR IGNORE: a conflicting row (e.g. a repeated url under a
    # UNIQUE constraint) is skipped rather than raising.
    conn.execute("INSERT OR IGNORE INTO pages (url, title, content) VALUES (?, ?, ?)", row)
    conn.commit()

def save_section_data(conn, url, sections):
    """Store the heading/text sections of one page in ``page_sections``.

    Args:
        conn: Open SQLite connection containing a ``page_sections`` table.
        url: URL of the page the sections belong to.
        sections: Iterable of ``(heading, content)`` pairs.
    """
    # Lazy generator so rows are bound and inserted one at a time, in order.
    params = ((url, heading, content) for heading, content in sections)
    conn.cursor().executemany(
        "INSERT INTO page_sections (url, section_heading, section_content) VALUES (?, ?, ?)",
        params,
    )
    conn.commit()

def save_table_data(conn, url, tables_data):
    """Store table rows of one page in ``page_tables``, one DB row per table row.

    Args:
        conn: Open SQLite connection containing a ``page_tables`` table.
        url: URL of the page the rows were taken from.
        tables_data: Iterable of rows; each row is serialized with
            ``json.dumps`` before being stored in ``row_content``.
    """
    # Lazy generator so rows are serialized and inserted one at a time, in order.
    params = ((url, json.dumps(cells)) for cells in tables_data)
    conn.cursor().executemany(
        "INSERT INTO page_tables (url, row_content) VALUES (?, ?)",
        params,
    )
    conn.commit()

**Main Code for Execution**

In [None]:
if __name__ == "__main__":
    # Script entry point: discover URLs from the sitemap index, skip those
    # already stored, then fetch each remaining page and persist its full
    # text, heading sections and table rows.
    sitemap_url = "https://www.mahindrauniversity.edu.in/sitemap_index.xml"
    db_path = "mahindra_university_data.db"
    conn = setup_db(db_path)

    try:
        print("Parsing sitemap...")
        all_urls = extract_all_urls(sitemap_url)
        print(f"Total URLs found: {len(all_urls)}")

        # A URL can be listed in more than one nested sitemap. Scraping it
        # twice would duplicate its page_sections/page_tables rows (only
        # `pages` is protected by INSERT OR IGNORE), so de-duplicate while
        # keeping first-seen order.
        unique_urls = list(dict.fromkeys(all_urls))
        new_urls = [url for url in unique_urls if not is_url_in_db(conn, url)]
        print(f"URLs to scrape: {len(new_urls)}")

        for url in tqdm(new_urls, desc="Scraping"):
            try:
                res = requests.get(url, timeout=10)
                if res.status_code != 200:
                    continue

                soup = BeautifulSoup(res.text, "lxml")
                # `soup.title.string` is None for an empty <title> or one with
                # nested markup; fall back to "" instead of raising and
                # losing the whole page.
                title_text = soup.title.string if soup.title else None
                title = title_text.strip() if title_text else ""
                full_content = extract_main_content(soup)

                save_page_data(conn, url, title, full_content)

                sections = extract_sections(soup)
                if sections:
                    save_section_data(conn, url, sections)

                tables_data = extract_tables(soup)
                if tables_data:
                    save_table_data(conn, url, tables_data)
            except Exception as e:
                # Best-effort crawl: report the failure and move on to the
                # next URL.
                print(f"[ERROR] Scraping {url}: {e}")

        print("Scraping complete. Page, section, and table content stored in SQLite.")
    finally:
        # Release the database handle even if the run is interrupted.
        conn.close()
