In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import random
import os

# Site root; relative article links are resolved against this URL.
BASE_URL = "https://fptsmarthome.vn/"
# Category listing page that get_articles() reads the article links from.
CATEGORY_URL = BASE_URL + "tin-tuc/"

# Request headers sent with every page fetch (desktop-Chrome-style
# User-Agent, Vietnamese preferred in Accept-Language).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7",
}


def fetch_page(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    A random 1-3 second pause precedes every request so the crawl does not
    hit the server in rapid succession.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with ``html.parser``, or ``None``
        when the request fails (connection error, timeout, ...) or the
        server answers with a 4xx/5xx status.
    """
    time.sleep(random.uniform(1, 3))
    try:
        # NOTE(review): verify=False disables TLS certificate validation and
        # exposes the crawl to man-in-the-middle tampering. Kept as-is in
        # case the site's certificate chain is the reason it was added --
        # confirm and drop it if the certificate validates.
        response = requests.get(url, headers=HEADERS, timeout=30, verify=False)
        print(f"  Status: {response.status_code}")
        # An error page is not the document that was asked for; parsing it
        # would silently yield "no articles" / "no title" further down.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"  Error: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def get_articles():
    """Collect article links and their displayed dates from the category page.

    Returns:
        A dict mapping each absolute article URL to the date text found in
        the same listing card ("" when the card has no date element). An
        empty dict is returned when the category page cannot be fetched.
    """
    print("Fetching category page...")
    soup = fetch_page(CATEGORY_URL)
    if soup is None:
        return {}

    articles = {}
    # CSS selectors test each class individually. find_all(class_="a b")
    # only matches when the class attribute is exactly the string "a b", so
    # one extra or reordered class on the card makes it find nothing.
    for item in soup.select("div.box.has-hover"):
        link = item.find("a", href=True)
        if link is None:
            continue

        url = urljoin(BASE_URL, link["href"])
        date_tag = item.select_one("div.post-date.mt-1")
        date_text = date_tag.get_text(strip=True) if date_tag else ""

        # A second card for the same URL must not wipe out a date that was
        # already captured just because that card carries none.
        if date_text or url not in articles:
            articles[url] = date_text

    return articles


def scrape_article(url, date):
    """Fetch one article page and extract its title and short description.

    Args:
        url: Absolute URL of the article.
        date: Date text taken from the listing page; copied into the result
            unchanged.

    Returns:
        A dict with the keys "Nguồn", "Tiêu đề", "Mô tả", "Ngày" and "URL",
        or ``None`` when the page cannot be fetched or has no ``<h1>``.
    """
    soup = fetch_page(url)
    if soup is None:
        return None

    # Without a heading there is nothing usable to report for this page.
    title = soup.find("h1")
    if title is None:
        return None

    # Match the summary container by its individual classes so that extra or
    # reordered classes do not defeat the lookup, then fall back to the
    # generic content container.
    desc_tag = soup.select_one("div.is-xsmall.mt-1")
    if desc_tag is None:
        desc_tag = soup.find("div", class_="content")

    desc = ""
    if desc_tag is not None:
        # Only the first paragraph of the container is used as description.
        first_p = desc_tag.find("p")
        if first_p is not None:
            desc = first_p.get_text(strip=True)

    return {
        "Nguồn": "FPT Smart Home",
        "Tiêu đề": title.get_text(strip=True),
        "Mô tả": desc,
        "Ngày": date,
        "URL": url,
    }


def main(output_dir=None):
    """Scrape every article listed on the category page and save to Excel.

    Progress is printed for each article. When at least one article was
    scraped, the rows are written to a timestamped
    ``fptsmarthome_YYYYmmdd_HHMMSS.xlsx`` file.

    Args:
        output_dir: Directory for the Excel file; created if missing.
            Defaults to the original hard-coded location when omitted.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    articles_dict = get_articles()
    print(f"Found {len(articles_dict)} articles\n")

    if not articles_dict:
        print("⚠ No articles found")
        return

    total = len(articles_dict)
    results = []
    for i, (url, date) in enumerate(articles_dict.items(), 1):
        print(f"\n[{i}/{total}] {url}")
        data = scrape_article(url, date)
        if data:
            results.append(data)
            print(f"  ✓ {data['Tiêu đề'][:60]}")
        else:
            print("  ✗ Failed")

    print(f"\nTotal: {len(results)}/{total} articles")

    if not results:
        return

    df = pd.DataFrame(results)
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # os.path.join uses the platform separator instead of a hand-written "/".
    filename = os.path.join(output_dir, f"fptsmarthome_{timestamp}.xlsx")
    df.to_excel(filename, index=False)
    print(f"✓ Saved to: {filename}")


# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Fetching category page...




  Status: 200
Found 0 articles

⚠ No articles found
