In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3
import os
from datetime import datetime

# Silence urllib3's InsecureRequestWarning. The HTTP calls in this script pass
# verify=False (TLS certificate verification is off), which would otherwise
# emit this warning on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#  CONFIG
# Sites to crawl: site key -> site root URL and the category slugs to visit.
# Every base_url is written WITHOUT a trailing slash, because URLs are built
# as f"{base_url}/{category}" and base_url + href; a trailing slash here
# would produce "//" in the resulting URL.
SITES = {
    "vietnamnet": {
        "base_url": "https://vietnamnet.vn",
        "categories": ['cong-nghe'],
    },
    "vnexpress": {
        "base_url": "https://vnexpress.net",
        "categories": ['khoa-hoc-cong-nghe'],
    },
    "tuoitre": {
        "base_url": "https://tuoitre.vn",
        "categories": ['cong-nghe'],
    },
    "dantri": {
        "base_url": "https://dantri.com.vn",
        "categories": ['cong-nghe'],
    },
    "thanhnien": {
        "base_url": "https://thanhnien.vn",
        "categories": ['cong-nghe'],
    },
    "tienphong": {
        "base_url": "https://tienphong.vn",
        "categories": ['khoa-hoc'],
    },
    "kinhtedubao": {
        "base_url": "https://kinhtevadubao.vn",
        "categories": ['dien-dan-khoa-hoc'],
    },
    "khoahoccongnghe": {
        "base_url": "https://vjst.vn",
        "categories": ['khoa-hoc-cong-nghe'],
    },
    "vietnamplus": {
        "base_url": "https://www.vietnamplus.vn",
        "categories": ['khoahoc', 'congnghe'],
    },
}

# URL-slug keywords (Vietnamese and English) used to pick relevant articles.
# Candidate URLs are lower-cased before matching, so every entry must itself
# be lower-case; protocol names are written the way they appear in URL slugs
# (e.g. 'tcp-ip', not 'TCP/IP'), otherwise they could never match.
KEYWORDS = [
    'internet', 'cap-quang', 'cap-dong', 'cap-dong-truc', 'ethernet', 'thiet-bi-dau-cuoi',
    '5g', '4g', 'wifi', 'cong-nghe', 'viettel', 'co-dinh-bang-rong',
    'bang-thong', 'ha-tang-mang', 'modem', 'goi-cuoc',
    'tcp-ip', 'ipv4', 'ipv6', 'docsis', 'pppoe', 'dhcp',
    'truyen-hinh', 'camera', 'vien-thong',
    'robot', 'du-lieu', 'nha-mang', 'thue-bao', 'mang-di-dong',
    'fiber-optic', 'copper-cable', 'coaxial-cable', 'terminal-equipment',
    'technology', 'fixed-broadband', 'bandwidth', 'network-infrastructure',
    'subscription-plan', 'television', 'telecommunications',
    'data', 'network-operator', 'subscriber', 'mobile-network',
]

# Browser-like User-Agent sent with every request made by this script.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
)
HEADERS = {'User-Agent': _USER_AGENT}

# LINK CRAWLER
def get_article_links(site_name, category_url, base_url):
    """Collect keyword-matching article URLs from one category listing page.

    Args:
        site_name: Site key; used only in the failure message.
        category_url: URL of the category page to download and scan.
        base_url: Site root; only links starting with it are kept.

    Returns:
        A set of absolute article URLs that end with a known article suffix,
        belong to ``base_url`` and contain at least one entry of ``KEYWORDS``.
        An empty set is returned when the page cannot be fetched or parsed.
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    article_suffixes = ('.html', '.htm', '.tpo', '.vnp')
    # URLs are lower-cased before matching, so the keywords must be too;
    # otherwise a mixed-case keyword could never match.
    keywords = [keyword.lower() for keyword in KEYWORDS]

    try:
        # NOTE: verify=False disables TLS certificate verification.
        res = requests.get(category_url, headers=HEADERS, verify=False, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if not href.endswith(article_suffixes):
                continue
            # urljoin resolves absolute, root-relative, page-relative and
            # protocol-relative hrefs correctly; plain string concatenation
            # produced "//" or glued paths for some of these.
            full_url = urljoin(category_url, href)
            if not full_url.startswith(base_url):
                continue
            lowered_url = full_url.lower()
            if any(keyword in lowered_url for keyword in keywords):
                links.add(full_url)
        return links
    except Exception as e:
        # Best-effort crawl: report the failure and let the caller continue
        # with the remaining categories.
        print(f"Failed to get links from {category_url} ({site_name}): {e}")
        return set()

# ARTICLE EXTRACTORS
# (tag, class) lookups for the sites that share one extraction recipe.
# A class string containing spaces is passed to BeautifulSoup unchanged, as
# in the per-site lookups it replaces. "body" is None for sites whose article
# text is not scraped (content stays '').
_SITE_SELECTORS = {
    "vnexpress": {
        "title": ('h1', "title-detail"),
        "desc": ('p', "description"),
        "date": ('span', 'date'),
        "body": ('article', "fck_detail"),
    },
    "tuoitre": {
        "title": ('h1', 'detail-title article-title'),
        "desc": ('h2', 'detail-sapo'),
        "date": ('div', 'detail-time'),
        "body": ('div', "detail-cmain clearfix"),
    },
    "dantri": {
        "title": ('h1', 'e-magazine__title'),
        "desc": ('h2', 'e-magazine__sapo'),
        "date": ('time', 'author-time'),
        "body": ('div', "e-magazine__body dnews__body"),
    },
    "thanhnien": {
        "title": ('h1', 'detail-title'),
        "desc": ('h2', 'detail-sapo'),
        "date": ('div', 'detail-time'),
        "body": ('div', "detail-cmain"),
    },
    "tienphong": {
        "title": ('h1', 'article__title cms-title'),
        "desc": ('div', 'article__sapo cms-desc'),
        "date": ('time', 'time'),
        "body": ('div', "article__body zce-content-body cms-body"),
    },
    "kinhtedubao": {
        "title": ('h1', 'post-title'),
        "desc": ('div', 'post-desc'),
        "date": ('span', 'format_date'),
        "body": None,
    },
    "khoahoccongnghe": {
        "title": ('h1', 'sc-longform-header-title block-sc-title'),
        "desc": ('p', 'sc-longform-header-sapo block-sc-sapo'),
        "date": ('span', 'sc-longform-header-date block-sc-publish-time'),
        "body": None,
    },
    "vietnamplus": {
        "title": ('h1', 'article__title cms-title'),
        "desc": ('div', 'article__sapo cms-desc'),
        "date": ('time', 'time'),
        "body": None,
    },
}


def _node_text(node):
    """Return the stripped text of a parsed tag, or '' when the tag is missing."""
    # The " " separator matters: with strip=True and no separator, text on
    # either side of an inline tag (<a>, <b>, ...) is glued into one word.
    return node.get_text(" ", strip=True) if node else ''


def _class_text(soup, tag_name, css_class):
    """Return the text of the first ``tag_name`` tag with class ``css_class``."""
    return _node_text(soup.find(tag_name, class_=css_class))


def _body_text(container):
    """Join the non-empty paragraph/heading/list-item texts of an article body."""
    if not container:
        return ''
    pieces = (_node_text(node) for node in container.find_all(['p', 'h2', 'h3', 'li']))
    return '\n'.join(piece for piece in pieces if piece)


def _extract_vietnamnet(soup):
    """Extract (title, desc, content, date) from a vietnamnet article page."""
    title = _node_text(soup.find('h1'))

    # Prefer the on-page summary; fall back to the <meta name="description"> tag.
    desc_tag = soup.find('h2', class_='content-detail-sapo sm-sapo-mb-0')
    if desc_tag:
        desc = _node_text(desc_tag)
    else:
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        desc = meta_desc['content'].strip() if meta_desc and meta_desc.has_attr('content') else ''

    date = _class_text(soup, 'div', 'bread-crumb-detail__time')

    # Body: all paragraphs first, then all list items rendered as bullets.
    content_parts = []
    body_div = soup.find("div", class_="maincontent main-content")
    if body_div:
        for p in body_div.find_all("p"):
            text = _node_text(p)
            if text:
                content_parts.append(text)
        for li in body_div.find_all("li"):
            text = _node_text(li)
            if text:
                content_parts.append("• " + text)
    content = "\n\n".join(content_parts)

    return title, desc, content, date


def extract_article_parts(site_name, article_url):
    """Download one article and pull out its title, description, body and date.

    Args:
        site_name: Site key selecting the extraction recipe.
        article_url: URL of the article page to download.

    Returns:
        A ``(title, desc, content, date)`` tuple of strings; a field is ''
        when its element is not found, and all four are '' for a site key
        with no recipe. ``content`` is always '' for sites whose body is not
        scraped. On any failure (network error, HTTP error status, parsing
        problem) the tuple ``(None, None, None, None)`` is returned after
        printing the error.
    """
    try:
        # NOTE: verify=False disables TLS certificate verification.
        res = requests.get(article_url, headers=HEADERS, verify=False, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        if site_name == "vietnamnet":
            return _extract_vietnamnet(soup)

        selectors = _SITE_SELECTORS.get(site_name)
        if selectors is None:
            return '', '', '', ''

        title = _class_text(soup, *selectors["title"])
        desc = _class_text(soup, *selectors["desc"])
        date = _class_text(soup, *selectors["date"])

        content = ''
        if selectors["body"] is not None:
            body_tag, body_class = selectors["body"]
            content = _body_text(soup.find(body_tag, class_=body_class))

        return title, desc, content, date

    except Exception as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(f"Failed to extract article {article_url} ({site_name}): {e}")
        return None, None, None, None

# MAIN
def main(output_dir=None):
    """Crawl every configured site and save the matched articles to Excel.

    For each site/category in ``SITES`` the category page is scanned for
    keyword-matching article links; each article is then downloaded and, when
    a title could be extracted, recorded as one row. Rows are written to a
    timestamped ``general_articles_<timestamp>.xlsx`` file.

    Args:
        output_dir: Directory for the Excel file. Defaults to the original
            hard-coded output folder when omitted.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    matched_articles = []
    # URLs already processed; the same article can be listed in more than one
    # category of a site and must not be fetched or saved twice.
    seen_links = set()

    for site_name, cfg in SITES.items():
        # Drop a trailing slash so the joins below never produce "//".
        base_url = cfg["base_url"].rstrip("/")
        for category in cfg["categories"]:
            print(f"Fetching articles from {site_name} / {category}")
            # These sites serve their category pages under a ".htm" suffix.
            suffix = ".htm" if site_name in ("tuoitre", "dantri", "thanhnien") else ""
            category_url = f"{base_url}/{category}{suffix}"
            article_links = get_article_links(site_name, category_url, base_url)
            print(f"Found {len(article_links)} keyword-matched URLs in {site_name}/{category}")

            # Sorted so that row order is reproducible between runs.
            for link in sorted(article_links):
                if link in seen_links:
                    continue
                seen_links.add(link)

                # The article body is extracted but not part of the saved sheet.
                title, desc, _content, date = extract_article_parts(site_name, link)
                if title:
                    print(f"Article saved: {link}")
                    matched_articles.append({
                        'Nguồn': "Tin trong nước",
                        'Tiêu đề': title,
                        'Mô tả': desc,
                        'Ngày': date,
                        'URL': link
                    })

    df = pd.DataFrame(matched_articles)

    # Save results with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(output_dir, exist_ok=True)

    if not df.empty:
        excel_file = os.path.join(output_dir, f"general_articles_{timestamp}.xlsx")
        df.to_excel(excel_file, index=False)
        print(f"\nSaved results to {excel_file}")
    else:
        print("\nNo articles matched the keywords in URLs.")


if __name__ == "__main__":
    main()

Fetching articles from vietnamnet / cong-nghe
Found 7 keyword-matched URLs in vietnamnet/cong-nghe
Article saved: https://vietnamnet.vn/elon-musk-mot-lan-nua-khang-dinh-ai-va-robot-se-khien-tien-bac-tro-nen-vo-nghia-2464820.html
Article saved: https://vietnamnet.vn/vivo-x300-series-bo-doi-vua-camera-thiet-lap-chuan-moi-cua-nhiep-anh-di-dong-2465339.html
Article saved: https://vietnamnet.vn/nguoi-dan-vung-lu-khong-dien-khong-song-nha-mang-xuyen-dem-khac-phuc-su-co-2464896.html
Article saved: https://vietnamnet.vn/bi-mat-cong-nghe-tau-me-drone-bns-oostende-quet-sach-min-10-lan-khong-rui-ro-2461608.html
Article saved: https://vietnamnet.vn/viettel-dung-drone-vuot-lu-tiep-te-gan-3-tan-hang-cuu-tro-khan-cap-cho-nguoi-dan-2465592.html
Article saved: https://vietnamnet.vn/buc-tranh-u-am-cua-nhan-su-cong-nghe-my-2465176.html
Article saved: https://vietnamnet.vn/game-thu-combat-than-toc-nho-loat-cong-nghe-tren-man-hinh-gaming-samsung-odyssey-2465043.html
Fetching articles from vnexpress / khoa-