In [None]:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import pandas as pd
import os
from datetime import datetime

# Site root; category paths and relative article links are resolved against it.
BASE = "https://fpt.vn"

# Category paths (relative to BASE) whose listing pages should be crawled.
CATEGORIES = ["/tin-tuc/khuyen-mai"]

# Browser-like User-Agent header sent with every HTTP request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36"
    ),
}

# Lower-case terms looked up as substrings of an article description when
# keyword filtering is enabled.
KEYWORDS = [
    "internet",
    "cáp quang",
    "cáp đồng",
    "cáp đồng trục",
    "ethernet",
    "thiết bị đầu cuối",
    "5g",
    "4g",
    "wifi",
    "công nghệ",
    "viettel",
    "cố định băng rộng",
    "băng thông",
    "hạ tầng mạng",
    "modem",
    "gói cước",
    "tcp/ip",
    "ipv4",
    "ipv6",
    "docsis",
    "pppoe",
    "dhcp",
    "truyền hình",
    "camera",
    "viễn thông",
    "robot",
    "dữ liệu",
    "nhà mạng",
    "hợp tác",
    "dịch vụ",
]

def get_page_url(category_path, page_num):
    """Build the listing URL for one page of a category.

    Page 1 is the bare category URL; every other page number is appended as
    a ``page`` query parameter.
    """
    listing_url = f"{BASE}{category_path}"
    if page_num != 1:
        listing_url = f"{listing_url}/?page={page_num}"
    return listing_url

def crawl_listing_page(category_path, page_num):
    """Collect the article URLs linked from one listing page.

    Returns a list of absolute URLs, one per ``div.hotnews-item`` that
    contains an anchor with a non-empty ``href``. Returns an empty list when
    the request raises or the response status is not 200.
    """
    listing_url = get_page_url(category_path, page_num)
    print(f"Fetching listing page {page_num}: {listing_url}")
    try:
        # Certificate verification is disabled for this request.
        response = requests.get(
            listing_url, headers=HEADERS, verify=False, timeout=10
        )
        if response.status_code != 200:
            print("Failed to fetch", listing_url)
            return []
    except Exception as e:
        # Best effort: report the failure and treat the page as empty.
        print(f"Error fetching {listing_url}: {e}")
        return []

    page = BeautifulSoup(response.text, "html.parser")
    # First anchor of each news item (None when the item has no anchor).
    anchors = (
        item.find("a") for item in page.find_all("div", class_="hotnews-item")
    )
    return [
        urljoin(BASE, anchor["href"])
        for anchor in anchors
        if anchor and anchor.get("href")
    ]

def crawl_article(article_url, apply_keyword_filter=False):
    """Fetch one article page and extract its title, date and description.

    Args:
        article_url: Absolute URL of the article page.
        apply_keyword_filter: When true, an article whose description
            contains none of ``KEYWORDS`` (case-insensitive substring match)
            is skipped. Articles without a description are not filtered.

    Returns:
        A dict with the keys ``"Nguồn"``, ``"Tiêu đề"``, ``"Mô tả"``,
        ``"Ngày"`` and ``"URL"``; title, description and date are ``None``
        when the corresponding element is missing from the page. Returns
        ``None`` when the request raises, the status is not 200, or the
        article is rejected by the keyword filter.
    """
    print("   Crawling article:", article_url)
    try:
        # Certificate verification is disabled for this request.
        resp = requests.get(article_url, headers=HEADERS, verify=False, timeout=10)
        if resp.status_code != 200:
            print("   Failed:", resp.status_code)
            return None
    except Exception as e:
        # Best effort: a failing article is reported and skipped.
        print(f"   Error: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    def _text_of(tag):
        """Return the stripped text of *tag*, or None when the tag is absent."""
        return tag.get_text(strip=True) if tag else None

    title = _text_of(soup.find("h1", class_="toc--label-head"))
    date = _text_of(soup.find("span", class_="public-date"))
    description = _text_of(soup.find("div", class_="toc--sub-title"))

    # Filter on keywords only when requested (used for the tin-fpt category).
    if apply_keyword_filter and description:
        desc_lower = description.lower()
        if not any(keyword in desc_lower for keyword in KEYWORDS):
            print("   Skipped (no keyword match).")
            return None

    # NOTE: the article body used to be parsed here as well, but the result
    # was never returned or used, so that dead work has been removed.
    return {
        "Nguồn": "FPT Strategy",
        "Tiêu đề": title,
        "Mô tả": description,
        "Ngày": date,
        "URL": article_url,
    }

def crawl_category(category_path, n_pages=3, delay=1.0, article_delay=0.5):
    """Crawl the first ``n_pages`` listing pages of one category.

    Args:
        category_path: Category path relative to the site root.
        n_pages: Number of listing pages to fetch, starting at page 1.
        delay: Seconds to sleep after each listing page has been processed.
        article_delay: Seconds to sleep after each article request.

    Returns:
        A list of article dicts as returned by ``crawl_article``, each with
        an extra ``'category'`` key holding ``category_path``. A link that
        appears more than once across the pages is crawled only once.
    """
    print(f"\n{'='*60}")
    print(f"CRAWLING CATEGORY: {category_path}")
    print(f"{'='*60}")

    # Keyword filtering applies to the tin-fpt category only.
    apply_filter = (category_path == "/tin-tuc/tin-fpt")
    if apply_filter:
        print(">>> Keyword filtering ENABLED for this category")
    else:
        print(">>> Keyword filtering DISABLED for this category")

    articles = []
    seen = set()
    for page in range(1, n_pages + 1):
        for link in crawl_listing_page(category_path, page):
            if link in seen:
                continue
            seen.add(link)
            art = crawl_article(link, apply_keyword_filter=apply_filter)
            if art:
                # Record the originating category on each article.
                art['category'] = category_path
                articles.append(art)
            # Pause after every article request, including skipped ones.
            time.sleep(article_delay)
        time.sleep(delay)
    return articles

def main():
    """Crawl every configured category and export the articles to Excel.

    Crawls three listing pages per entry of ``CATEGORIES``, prints the total
    and per-category counts, and writes a timestamped ``.xlsx`` file into the
    output directory. The output directory is created if missing; no file is
    written when nothing was crawled.
    """
    all_articles = []

    # Crawl each category in turn.
    for category in CATEGORIES:
        all_articles.extend(crawl_category(category, n_pages=3))

    print(f"\n{'='*60}")
    print(f"TOTAL: Crawled {len(all_articles)} articles from {len(CATEGORIES)} categories")
    print(f"{'='*60}")

    df = pd.DataFrame(all_articles)

    # The timestamp keeps successive runs from overwriting each other's output.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"
    os.makedirs(output_dir, exist_ok=True)

    if df.empty:
        print("\nNo articles crawled.")
        return

    # Print statistics before saving.
    print("\nStatistics by category:")
    print(df['category'].value_counts())

    # Fix the column order and drop 'category' from the saved file.
    df_to_save = df[['Nguồn', 'Tiêu đề', 'Mô tả', 'Ngày', 'URL']]

    # os.path.join uses the platform separator, so the Windows directory
    # above is no longer combined with a forward slash.
    excel_file = os.path.join(output_dir, f"fpt_strategy_{timestamp}.xlsx")
    df_to_save.to_excel(excel_file, index=False)
    print(f"\nSaved results to {excel_file}")


# Start the crawl only when this module is the program entry point,
# not when it is imported.
if __name__ == "__main__":
    main()


CRAWLING CATEGORY: /tin-tuc/tin-fpt
>>> Keyword filtering ENABLED for this category
Fetching listing page 1: https://fpt.vn/tin-tuc/tin-fpt
   Crawling article: https://fpt.vn/tin-tuc/ai-chuan-chat-lat-mat-song-trung-fpt-camera-dua-ai-len-san-khau-gameshow-viet-13341.html
   Crawling article: https://fpt.vn/tin-tuc/giai-ma-speed-test-vi-sao-mot-phep-do-lai-tro-thanh-chuan-muc-chat-luong-internet-13339.html
   Crawling article: https://fpt.vn/tin-tuc/internet-fpt-va-fpt-play-duoc-de-cu-tai-tech-award-2025-13333.html
   Crawling article: https://fpt.vn/tin-tuc/dang-ky-internet-fpt-san-combo-du-lich-thai-lan-co-vu-sea-games-33-13330.html
   Skipped (no keyword match).
   Crawling article: https://fpt.vn/tin-tuc/fpt-play-phat-truc-tiep-cac-tran-u17-viet-nam-da-vong-loai-afc-u17-asian-cup-13304.html
   Skipped (no keyword match).
   Crawling article: https://fpt.vn/tin-tuc/fpt-play-phat-song-sea-games-thailand-2025-13302.html
   Skipped (no keyword match).
Fetching listing page 2: https://