In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import urllib3
import os

# fetch_page() requests pages with verify=False; silence the
# InsecureRequestWarning category so it does not clutter the progress output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Site root: hrefs are resolved against it and article URLs must start with it.
BASE_URL = "https://www.iot-now.com/"

# Listing pages to crawl, each mapped to a source label.
NEWS_URLS = {
    BASE_URL + "smart-homes-2/": "Smart Home",
}

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}

# Substrings that mark a URL as a non-article page.
EXCLUDE_PATHS = [
    "/category/",
    "/tag/",
    "/author/",
    "/wp-admin",
    "/feed",
    "/subscribe",
    "/contact",
    "/about-us/",
    "/privacy",
    "/terms",
    "?s=",
    "/search",
    "/newsletter",
]


def fetch_page(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The request sends the module-level ``HEADERS``, gives up after 15
    seconds, and is made with ``verify=False`` (TLS certificate checks are
    turned off). Any exception raised while fetching or parsing, including
    an HTTP error status, is reported on stdout and swallowed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed document, or ``None`` when anything went wrong.
    """
    try:
        resp = requests.get(
            url,
            headers=HEADERS,
            verify=False,
            timeout=15,
        )
        # Turn 4xx/5xx answers into exceptions so they hit the handler below.
        resp.raise_for_status()
        html = resp.text
        return BeautifulSoup(html, "html.parser")
    except Exception as err:
        print(f"Error fetching {url}: {err}")
    return None


def is_valid_article_url(url):
    """Return True when *url* looks like an article page on the target site.

    A URL is rejected when it:
      * does not start with ``BASE_URL``;
      * has fewer than 10 characters after ``BASE_URL`` (site root, very
        short slugs);
      * is the Smart Home listing page itself;
      * contains any ``EXCLUDE_PATHS`` marker (compared case-insensitively).

    Args:
        url: Absolute URL to classify.

    Returns:
        bool: True if the URL passes every filter, False otherwise.
    """
    if not url.startswith(BASE_URL):
        return False

    # BASE_URL ends with "/", so this remainder has no leading slash.
    remainder = url[len(BASE_URL):]

    if len(remainder) < 10 or remainder == 'smart-homes-2/':
        return False

    # Most EXCLUDE_PATHS markers begin with "/". Put back the slash that
    # BASE_URL consumed; otherwise a marker can never match the first path
    # segment and URLs such as <BASE_URL>category/... slip through.
    path = "/" + remainder.lower()
    return not any(excluded in path for excluded in EXCLUDE_PATHS)


def get_article_links():
    """Collect article URLs from every listing page in ``NEWS_URLS``.

    Each listing page is fetched and scanned for ``div.category__post``
    containers. The link inside the container's ``h2.category__title`` is
    resolved against ``BASE_URL`` and kept only if
    ``is_valid_article_url`` accepts it. Listing pages that fail to load
    are skipped.

    Returns:
        dict: ``{article_url: {'source': label, 'date': text}}`` where
        ``date`` is the text of the container's ``<time>`` tag, or ``""``
        when there is none. A URL seen more than once keeps the last entry.
    """
    found = {}

    for listing_url, label in NEWS_URLS.items():
        print(f"Fetching articles from {label} ({listing_url})")
        page = fetch_page(listing_url)
        if not page:
            continue

        for post in page.find_all("div", class_="category__post"):
            heading = post.find("h2", class_="category__title")
            if not heading:
                continue

            anchor = heading.find("a", href=True)
            if not anchor:
                continue

            article_url = urljoin(BASE_URL, anchor["href"].strip())
            if not is_valid_article_url(article_url):
                continue

            # Try the full class string first, then the bare "entry-date".
            stamp = post.find("time", class_="entry-date published updated")
            if not stamp:
                stamp = post.find("time", class_="entry-date")

            if stamp:
                published = stamp.get_text(strip=True)
            else:
                published = ""

            found[article_url] = {'source': label, 'date': published}

    print(f"Found {len(found)} unique article URLs")
    return found


def scrape_article(url, source_name, date_from_listing):
    """Fetch one article page and extract its title, summary and date.

    Args:
        url: Absolute URL of the article.
        source_name: Label of the listing page the URL came from.
            NOTE(review): currently unused; the "Nguồn" column is a fixed
            label. Confirm whether it should carry this value instead.
        date_from_listing: Date text found on the listing page. When empty,
            the article page's own ``<time>`` tag is used instead.

    Returns:
        dict | None: A row with the keys "Nguồn", "Tiêu đề", "Mô tả",
        "Ngày" and "URL", or ``None`` when the page could not be fetched or
        an error occurred while extracting. The title is ``""`` when no
        ``<h1>`` is found.
    """
    try:
        soup = fetch_page(url)

        if not soup:
            return None

        # Title: prefer the themed heading, fall back to any <h1>.
        title = soup.find("h1", class_="entry-title")
        if not title:
            title = soup.find("h1")

        # Summary: og:description first. Fall back to the first content
        # paragraph when the meta tag is missing OR its content is blank.
        description_text = ""
        description = soup.find("meta", property="og:description")
        if description:
            description_text = (description.get("content") or "").strip()
        if not description_text:
            content_div = soup.find("div", class_="entry-content")
            first_p = content_div.find("p") if content_div else None
            if first_p:
                description_text = first_p.get_text(strip=True)

        # Date: the listing-page value wins; otherwise look on the article.
        date_text = date_from_listing
        if not date_text:
            date_tag = soup.find("time", class_="entry-date published")
            if not date_tag:
                date_tag = soup.find("time")
            if date_tag:
                # An empty datetime attribute falls through to the tag text.
                date_text = (
                    date_tag.get("datetime") or date_tag.get_text(strip=True)
                )

        return {
            "Nguồn": "Smart Home News",
            "Tiêu đề": title.get_text(strip=True) if title else "",
            "Mô tả": description_text,
            "Ngày": date_text,
            "URL": url
        }

    except Exception as e:
        # Best-effort per article: report and let the caller skip this URL.
        print(f"Error scraping {url}: {e}")
        return None


def main(output_dir=None):
    """Run the full scrape and save the results to a timestamped Excel file.

    Workflow: collect article links from the listing pages, scrape each
    article with a one-second pause between requests, keep only rows that
    have a title, then write them to
    ``iotnow_smarthome_<YYYYmmdd_HHMMSS>.xlsx`` and print a per-source count.

    Args:
        output_dir: Directory that receives the Excel file; created if it
            does not exist. Defaults to the original hard-coded location so
            existing ``main()`` calls behave as before.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    articles = []

    article_links = get_article_links()
    total = len(article_links)

    print(f"\nScraping {total} articles...")

    for i, (url, info) in enumerate(article_links.items(), 1):
        print(f"[{i}/{total}] Scraping: {url} [{info['source']}]")
        data = scrape_article(url, info['source'], info['date'])
        if data and data["Tiêu đề"]:  # Only add if title was found
            articles.append(data)
        if i < total:
            time.sleep(1)  # Be polite to the server between requests

    # Save results
    os.makedirs(output_dir, exist_ok=True)

    if articles:
        df = pd.DataFrame(articles)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # os.path.join uses the platform's separator instead of a fixed "/".
        excel_file = os.path.join(
            output_dir, f"iotnow_smarthome_{timestamp}.xlsx"
        )
        df.to_excel(excel_file, index=False)
        print(f"\n✓ Successfully scraped {len(articles)} articles")
        print(f"✓ Saved results to {excel_file}")

        # Show breakdown by source
        print("\nBreakdown by source:")
        for source, count in df["Nguồn"].value_counts().items():
            print(f"  - {source}: {count} articles")
    else:
        print("\n✗ No articles were successfully scraped.")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Fetching articles from Smart Home (https://www.iot-now.com/smart-homes-2/)
Found 16 unique article URLs

Scraping 16 articles...
[1/16] Scraping: https://www.iot-now.com/2025/11/05/153958-quectel-launches-new-lora-v2x-wi-fi-and-5g-durable-fiberglass-antennas/ [Smart Home]
[2/16] Scraping: https://www.iot-now.com/2025/11/05/153952-ai-driven-network-automation-the-top-priority-for-telcos-in-the-next-12-months-new-research-from-motive/ [Smart Home]
[3/16] Scraping: https://www.iot-now.com/2025/10/22/153652-smart-home-security-enjoys-prpl-patch-at-paris-summit/ [Smart Home]
[4/16] Scraping: https://www.iot-now.com/2025/10/22/153646-ceva-introduces-wi-fi-7-1x1-client-ip/ [Smart Home]
[5/16] Scraping: https://www.iot-now.com/2025/08/19/152654-driving-the-new-era-of-ai-and-iot-with-powercasts-one-stop-shop-for-wireless-power-solutions/ [Smart Home]
[6/16] Scraping: https://www.iot-now.com/2025/08/18/152636-the-number-of-home-energy-management-systems-in-europe-and-north-america-reached-4-5-mi