In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import urllib3
import os

# Silence urllib3's InsecureRequestWarning messages for this process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Site root; note that it ends with a trailing slash.
BASE_URL = "https://www.thinkbroadband.com/"
# Listing page from which article links are collected.
NEWS_URL = f"{BASE_URL}topics/broadband"

# Substrings that mark a URL as a non-article page. is_valid_article_url()
# rejects any URL whose lower-cased path contains one of these.
EXCLUDE_PATHS = [
    '/category/', '/tag/', '/author/', '/wp-admin', '/feed',
    '/subscribe', '/contact', '/about', '/privacy', '/terms',
    '?s=', '/search', '/newsletter'
]

# ---------- Selenium Setup -----------
# A single Chrome WebDriver is created at import time and shared by every
# fetch_page() call; main() is responsible for quitting it.
chrome_options = Options()
chrome_options.add_argument("--headless")  # run without a visible window
chrome_options.add_argument("--disable-gpu")
# NOTE(review): the next two flags are typically needed in containers/CI;
# confirm they are required in the environment this runs in.
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Override the default user-agent string sent by the browser.
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
driver = webdriver.Chrome(options=chrome_options)


def fetch_page(url):
    """Navigate the shared Selenium driver to *url* and parse the result.

    Returns a BeautifulSoup tree built from the browser's current page
    source using the built-in "html.parser" backend.
    """
    driver.get(url)
    # Fixed pause so that JavaScript-rendered content has time to appear.
    time.sleep(1.5)
    return BeautifulSoup(driver.page_source, "html.parser")


def is_valid_article_url(url):
    """Return True if *url* looks like an article page on this site.

    A URL qualifies when it:
      * starts with BASE_URL,
      * has at least 10 characters after BASE_URL,
      * contains none of the EXCLUDE_PATHS substrings (case-insensitive), and
      * contains a hyphen after BASE_URL (article slugs are hyphenated).
    """
    if not url.startswith(BASE_URL):
        return False

    # BASE_URL ends with "/", so this remainder has no leading slash.
    remainder = url[len(BASE_URL):]

    # Too short to be an article slug (also covers the bare site root).
    if len(remainder) < 10:
        return False

    # Restore the leading slash before matching: the exclusion patterns are
    # written as "/tag/", "/category/", ... and would otherwise never match
    # when they are the first path segment (e.g. ".../tag/some-topic").
    path = "/" + remainder.lstrip("/").lower()
    if any(excluded in path for excluded in EXCLUDE_PATHS):
        return False

    return '-' in remainder


def get_article_links():
    """Collect article URLs from the NEWS_URL listing page.

    Looks for <article>/<div> elements that contain an <h2>/<h3> with class
    "entry-title"; if none exist, falls back to <div class="entry">. The
    first link of each container is resolved against BASE_URL and kept when
    is_valid_article_url() accepts it.

    Returns:
        A list of unique article URLs in the order they first appear on the
        page, so repeated runs crawl in a stable order.
    """
    print(f"Fetching articles from {NEWS_URL}")
    soup = fetch_page(NEWS_URL)

    def has_entry_title(tag):
        return tag.name in ("article", "div") and tag.find(["h2", "h3"], class_="entry-title")

    # Find main article containers, with a looser fallback selector.
    containers = soup.find_all(has_entry_title)
    if not containers:
        containers = soup.find_all("div", class_="entry")

    # A dict de-duplicates like a set but remembers insertion order; nested
    # containers frequently resolve to the same link.
    links = {}

    for item in containers:
        h_tag = item.find(["h2", "h3"], class_="entry-title")
        # Prefer the link inside the title heading; otherwise take the first
        # link anywhere in the container.
        a_tag = h_tag.find("a", href=True) if h_tag else item.find("a", href=True)
        if not a_tag:
            continue

        url = urljoin(BASE_URL, a_tag["href"].strip())
        if is_valid_article_url(url):
            links.setdefault(url, None)

    print(f"Found {len(links)} article URLs")
    return list(links)


def scrape_article(url):
    """Extract title, description and date from a single article page.

    Args:
        url: Absolute URL of the article to load.

    Returns:
        A dict with the keys "Nguồn", "Tiêu đề", "Mô tả", "Ngày" and "URL",
        or None if loading or parsing the page raised an exception (the
        error is printed so one bad page does not abort the whole crawl).
    """
    try:
        soup = fetch_page(url)

        # Title: first <h1>, else an entry-title <h2>, else any <h2>.
        title_tag = soup.find("h1") or soup.find("h2", class_="entry-title") or soup.find("h2")
        title = title_tag.get_text(strip=True) if title_tag else ""

        # Description: article body container, else a lead paragraph.
        desc_tag = (
            soup.find("div", class_="entry-content") or
            soup.find("div", class_="article-content") or
            soup.find("p", class_="lead")
        )
        description = desc_tag.get_text(" ", strip=True) if desc_tag else ""

        # Date: prefer the link inside <div class="posted-on"> (its title
        # attribute, then its text) ...
        date_text = ""
        date_div = soup.find("div", class_="posted-on")

        if date_div:
            a = date_div.find("a")
            if a:
                date_text = a.get("title", "").strip() or a.get_text(" ", strip=True)

        # ... otherwise fall back to the first <time> element.
        if not date_text:
            time_tag = soup.find("time")
            if time_tag:
                date_text = time_tag.get("datetime", "").strip() or time_tag.get_text(" ", strip=True)

        return {
            "Nguồn": "Tin quốc tế",
            "Tiêu đề": title,
            # Use the extracted description; previously this field was
            # filled with the title and the description was discarded.
            # Fall back to the title when no description was found.
            "Mô tả": description or title,
            "Ngày": date_text,
            "URL": url
        }

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None


def main():
    """Crawl the listing page, scrape each article and save them to Excel.

    Results are written to a timestamped .xlsx file in the output directory
    when at least one article was scraped. The shared Selenium driver is
    always shut down, even if scraping or saving fails.
    """
    try:
        articles = []

        for url in get_article_links():
            print(f"Scraping: {url}")
            data = scrape_article(url)
            if data:
                articles.append(data)
            time.sleep(1)  # pause between requests to avoid hammering the site

        df = pd.DataFrame(articles)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"
        os.makedirs(output_dir, exist_ok=True)

        if not df.empty:
            # os.path.join keeps the separator consistent with output_dir.
            excel_file = os.path.join(output_dir, f"thinkbroadband_{timestamp}.xlsx")
            df.to_excel(excel_file, index=False)
            print(f"\nSaved results to {excel_file}")
        else:
            # No keyword filtering happens in this script, so report the
            # actual situation: nothing was scraped.
            print("\nNo articles were scraped; nothing to save.")
    finally:
        # Always release the browser process, including on errors.
        driver.quit()


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Fetching articles from https://www.thinkbroadband.com/topics/broadband
Found 10 article URLs
Scraping: https://www.thinkbroadband.com/news/oopsie-for-virgin-media-website-with-404-error-on-pages-for-10-minutes
Scraping: https://www.thinkbroadband.com/news/bduk-update-to-project-gigabit-contracts
Scraping: https://www.thinkbroadband.com/news/weekly-brief-21st-november-2025
Scraping: https://www.thinkbroadband.com/news/ofcom-connected-nations-report-2025-fixed-line
Scraping: https://www.thinkbroadband.com/news/6000-homes-in-ripon-can-get-virgin-media-o2-full-fibre-service
Scraping: https://www.thinkbroadband.com/news/totsco-approaches-2-million-ots-switches
Scraping: https://www.thinkbroadband.com/news/ofcom-connection-nations-report-2025-mobile
Scraping: https://www.thinkbroadband.com/news/connexin-full-fibre-network-now-on-cityfibre-platform
Scraping: https://www.thinkbroadband.com/news/cloudflare-outage-affects-websites-globally
Scraping: https://www.thinkbroadband.com/news/weekly-bri