In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import urllib3
import os

# Silence urllib3's InsecureRequestWarning messages for the whole process.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Site root. It ends with "/" so relative paths can be appended directly.
BASE_URL = "https://www.pcmag.com/"

# Listing pages to crawl, mapped to the label recorded with each article.
NEWS_URLS = {
    BASE_URL + "news/categories/smart-home": "Smart Home News",
    BASE_URL
    + "picks/categories/smart-home?test_uuid=03iF1uOjHbmoZSTXr58OMhT&test_variant=A":
        "Smart Home Products",
}

# URL fragments that mark a link as something other than an article.
EXCLUDE_PATHS = [
    '/category/',
    '/tag/',
    '/author/',
    '/wp-admin',
    '/feed',
    '/subscribe',
    '/contact',
    '/about',
    '/privacy',
    '/terms',
    '?s=',
    '/search',
    '/newsletter',
]


# ------------------- SELENIUM SETUP ---------------------
# A single headless Chrome session is created here and shared by every
# fetch in this file.
chrome_options = Options()
for _arg in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
):
    chrome_options.add_argument(_arg)
driver = webdriver.Chrome(options=chrome_options)


def fetch_page(url):
    """Load ``url`` in the shared Selenium driver and return the parsed page.

    Returns a ``BeautifulSoup`` tree built with ``html.parser`` from the
    driver's page source, or ``None`` when anything raises (the error is
    printed rather than propagated).
    """
    try:
        driver.get(url)
        # Fixed pause before reading the source -- presumably to let
        # client-side rendering finish.
        time.sleep(1.5)
        return BeautifulSoup(driver.page_source, "html.parser")
    except Exception as exc:
        print(f"Error fetching {url}: {exc}")
    return None


def is_valid_article_url(url):
    """Return True if ``url`` looks like an article link on the target site.

    A URL qualifies when all of the following hold:

    * it starts with ``BASE_URL``;
    * the part after ``BASE_URL`` is at least 10 characters long;
    * its path (compared case-insensitively) contains none of the
      ``EXCLUDE_PATHS`` fragments;
    * the part after ``BASE_URL`` contains a hyphen.
    """
    if not url.startswith(BASE_URL):
        return False

    remainder = url[len(BASE_URL):]
    if len(remainder) < 10:
        return False

    # BASE_URL ends with "/", so slicing it off also removes the leading
    # slash of the path. Restore it before matching; otherwise entries such
    # as "/author/" or "/newsletter" can never match the first path segment.
    path = "/" + remainder.lstrip("/").lower()
    if any(excluded in path for excluded in EXCLUDE_PATHS):
        return False

    return "-" in remainder


def get_article_links():
    """Collect article URLs from every listing page in ``NEWS_URLS``.

    Each listing page is fetched with ``fetch_page``; pages that fail to
    load are skipped. For every article card, the headline link is resolved
    against ``BASE_URL`` and kept only if ``is_valid_article_url`` accepts it.

    Returns:
        dict: ``{article_url: {'source': source_name, 'date': date_text}}``.
        ``date_text`` is ``""`` when no published-date element is found.
        If the same URL appears on several listings, the values from the
        listing processed last are kept.
    """
    all_links = {}

    for news_url, source_name in NEWS_URLS.items():
        print(f"Fetching articles from {source_name} ({news_url})")

        soup = fetch_page(news_url)
        if soup is None:
            continue

        # The class strings are matched exactly as they appear in the markup.
        article_cards = soup.find_all("div", class_="flex flex-1 flex-col gap-3")

        for item in article_cards:
            h2_tag = item.find(
                "h2",
                class_="font-stretch-ultra-condensed text-lg font-semibold leading-compact md:text-xl",
            )
            a_tag = h2_tag.find("a", href=True) if h2_tag else None
            if not a_tag:
                continue

            url = urljoin(BASE_URL, a_tag["href"].strip())
            if not is_valid_article_url(url):
                continue

            # Date extraction: look in the card's parent for a span carrying
            # the published-date attribute. ``True`` means "attribute is
            # present, whatever its value"; an empty-string filter does not
            # express that and can select an unrelated span.
            parent = item.find_parent()
            date_span = (
                parent.find("span", attrs={"data-content-published-date": True})
                if parent
                else None
            )
            date_text = date_span.get_text(strip=True) if date_span else ""

            all_links[url] = {
                'source': source_name,
                'date': date_text
            }

    print(f"Found {len(all_links)} unique article URLs")
    return all_links


def scrape_article(url, source_name, date_from_listing):
    """Extract title, description and date from a single article page.

    Args:
        url: Article URL to load via ``fetch_page``.
        source_name: Label of the listing the URL came from; copied into
            the result unchanged.
        date_from_listing: Date text found on the listing page. When empty,
            the article page itself is searched for a date.

    Returns:
        dict with the keys ``"Nguồn"`` (source), ``"Tiêu đề"`` (title),
        ``"Mô tả"`` (description), ``"Ngày"`` (date) and ``"URL"``, or
        ``None`` if the page could not be fetched or parsing raised.
    """
    try:
        soup = fetch_page(url)
        if soup is None:
            return None

        # Title: prefer the <h1>, fall back to a styled <h2>.
        title = soup.find("h1")
        if not title:
            title = soup.find("h2", class_="font-stretch-ultra-condensed")

        # Description: Open Graph meta tag first, then the teaser paragraph.
        description = soup.find("meta", property="og:description")
        if description:
            description_text = description.get("content", "")
        else:
            desc_tag = soup.find("p", class_="line-clamp-2")
            description_text = desc_tag.get_text(strip=True) if desc_tag else ""

        # Date fallback: only consulted when the listing gave no date.
        date_text = date_from_listing
        if not date_text:
            # ``True`` selects spans that actually carry the attribute; an
            # empty-string filter can select an unrelated span instead.
            date_tag = soup.find("time") or soup.find(
                "span", attrs={"data-content-published-date": True}
            )
            if date_tag:
                # Prefer the machine-readable attribute over the visible text.
                date_text = date_tag.get("datetime", date_tag.get_text(strip=True))

        return {
            "Nguồn": source_name,
            "Tiêu đề": title.get_text(strip=True) if title else "",
            "Mô tả": description_text,
            "Ngày": date_text,
            "URL": url
        }

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None


def main(output_dir=None):
    """Crawl the listing pages, scrape each article and save them to Excel.

    Args:
        output_dir: Directory that receives the ``.xlsx`` file. When omitted,
            the original hard-coded location is used.

    Articles whose scrape fails or yields no title are dropped. The output
    directory is created if missing. The shared Selenium driver is shut down
    on every exit path, including when scraping or saving raises.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    articles = []

    try:
        article_links = get_article_links()
        total = len(article_links)

        print(f"\nScraping {total} articles...\n")

        for i, (url, info) in enumerate(article_links.items(), start=1):
            print(f"[{i}/{total}] Scraping: {url} [{info['source']}]")
            data = scrape_article(url, info["source"], info["date"])
            if data and data["Tiêu đề"]:
                articles.append(data)
            # Pause between article requests.
            time.sleep(1)

        # Save file
        os.makedirs(output_dir, exist_ok=True)

        if articles:
            df = pd.DataFrame(articles)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            excel_file = f"{output_dir}/pcmag_smarthome_{timestamp}.xlsx"
            df.to_excel(excel_file, index=False)

            print(f"\n✓ Successfully scraped {len(articles)} articles")
            print(f"✓ Saved results to {excel_file}")

            print("\nBreakdown by source:")
            for source, count in df["Nguồn"].value_counts().items():
                print(f"  - {source}: {count} articles")
        else:
            print("\n✗ No articles were successfully scraped.")
    finally:
        # Always release the browser, even after an error or Ctrl-C;
        # otherwise the headless Chrome process is left running.
        driver.quit()


# Run the scraper only when this module is the entry point, not on import.
if __name__ == "__main__":
    main()

Fetching articles from Smart Home News (https://www.pcmag.com/news/categories/smart-home)
Fetching articles from Smart Home Products (https://www.pcmag.com/picks/categories/smart-home?test_uuid=03iF1uOjHbmoZSTXr58OMhT&test_variant=A)
Found 40 unique article URLs

Scraping 40 articles...

[1/40] Scraping: https://www.pcmag.com/news/judge-axes-police-program-which-monitored-sacramento-residents-electricity [Smart Home News]
[2/40] Scraping: https://www.pcmag.com/news/robot-vacuum-mop-early-black-friday-2025-deals-irobot-ecovacs-nov-22 [Smart Home News]
[3/40] Scraping: https://www.pcmag.com/news/your-security-camera-may-soon-work-with-more-apps-thanks-to-matter-15 [Smart Home News]
[4/40] Scraping: https://www.pcmag.com/news/best-robot-vacuum-mop-early-black-friday-2025-deals-irobot-shark-nov-18 [Smart Home News]
[5/40] Scraping: https://www.pcmag.com/news/google-pulls-the-plug-on-older-nest-thermostats-5-alternatives-we-recommend [Smart Home News]
[6/40] Scraping: https://www.pcmag.com/