In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import urllib3
import os

# Silence urllib3's InsecureRequestWarning: fetch_page() issues its requests
# with verify=False (TLS certificate checks off), which triggers that warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Site root. It ends with "/" and is used both as the base for urljoin() and
# as the required prefix of every accepted article URL.
BASE_URL = "https://itwire.com/"
# Listing page that is scanned for article links. BASE_URL already ends with
# "/", so no extra separator is inserted (avoids "https://itwire.com//...").
NEWS_URL = f"{BASE_URL}it-industry-news/telecoms-and-nbn.html"
# Browser-like User-Agent sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

# Substrings that mark a URL as a non-article page; is_valid_article_url()
# rejects any URL whose lower-cased path contains one of them.
EXCLUDE_PATHS = ['/category/', '/tag/', '/author/', '/wp-admin', '/feed',
                 '/subscribe', '/contact', '/about', '/privacy', '/terms',
                 '?s=', '/search', '/newsletter']


def fetch_page(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    The GET request carries the module-level HEADERS, has TLS certificate
    verification switched off and times out after 15 seconds. A 4xx/5xx
    response is turned into an exception by ``raise_for_status()``.
    """
    resp = requests.get(url, timeout=15, verify=False, headers=HEADERS)
    resp.raise_for_status()
    html = resp.text
    return BeautifulSoup(html, "html.parser")


def is_valid_article_url(url):
    """Return True if *url* looks like an article page on the target site.

    A URL is accepted when it starts with BASE_URL, the part after BASE_URL
    is at least 10 characters long, its lower-cased path contains none of
    the EXCLUDE_PATHS substrings, and it contains a hyphen (article slugs
    are hyphenated).
    """
    if not url.startswith(BASE_URL):
        return False

    path = url[len(BASE_URL):]

    # Too short to be an article slug (this also covers "" and "/").
    if len(path) < 10:
        return False

    # BASE_URL ends with "/", so the slice above has lost the leading slash.
    # Put exactly one back before matching: otherwise patterns such as
    # "/category/" or "/about" can never match the first path segment.
    rooted_path = "/" + path.lstrip("/").lower()
    if any(excluded in rooted_path for excluded in EXCLUDE_PATHS):
        return False

    return '-' in path


def get_article_links():
    """Extract article URLs from the iTWire Telecoms & NBN page.

    Fetches NEWS_URL, looks at every ``div.catItemView`` container and takes
    the link inside its ``h3.catItemTitle`` heading. Links are resolved
    against BASE_URL and kept only if is_valid_article_url() accepts them.

    Returns:
        A list of unique article URLs in the order they appear on the page,
        so repeated runs over the same page yield the same ordering.

    Raises:
        Whatever fetch_page() raises if the listing page cannot be fetched.
    """
    print(f"Fetching articles from {NEWS_URL}")
    soup = fetch_page(NEWS_URL)

    links = []
    seen = set()  # de-duplicate without losing page order

    # Find only actual article containers
    for item in soup.find_all("div", class_="catItemView"):
        heading = item.find("h3", class_="catItemTitle")
        if heading is None:
            continue

        a_tag = heading.find("a", href=True)
        if a_tag is None:
            continue

        url = urljoin(BASE_URL, a_tag["href"].strip())
        if url not in seen and is_valid_article_url(url):
            seen.add(url)
            links.append(url)

    print(f"Found {len(links)} article URLs")
    return links


def scrape_article(url):
    """Fetch one article page and pull out its title, intro text and date.

    Returns a dict with the keys "Nguồn", "Tiêu đề", "Mô tả", "Ngày" and
    "URL"; a field whose element is missing from the page is an empty
    string. If anything raises while fetching or parsing, the error is
    printed and None is returned instead.
    """
    def text_of(node):
        # Stripped text of an element, or "" when the element was not found.
        return node.get_text(strip=True) if node else ""

    try:
        page = fetch_page(url)

        title_node = page.find("h2", class_="itemTitle")
        intro_node = page.find("div", class_="itemIntroText")
        date_node = page.find("span", class_="itemDateCreated")

        # Prefer the element's "datetime" attribute; fall back to its text.
        if date_node:
            visible_date = date_node.get_text(strip=True)
            published = date_node.get("datetime", visible_date)
        else:
            published = ""

        record = {"Nguồn": "Tin quốc tế"}
        record["Tiêu đề"] = text_of(title_node)
        record["Mô tả"] = text_of(intro_node)
        record["Ngày"] = published
        record["URL"] = url
        return record

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None


def main(output_dir=None):
    """Main scraping workflow.

    Collects the article links, scrapes each one (pausing one second between
    requests) and writes the successfully scraped articles to a timestamped
    Excel file named ``itwire_<YYYYmmdd_HHMMSS>.xlsx``.

    Args:
        output_dir: Directory to write the Excel file into. When omitted,
            the script's original hard-coded output folder is used. The
            directory is created if it does not exist.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    articles = []

    for url in get_article_links():
        print(f"Scraping: {url}")
        data = scrape_article(url)
        if data:
            articles.append(data)
        time.sleep(1)  # throttle requests to the site

    if not articles:
        # Nothing scraped: do not create the output directory or an empty file.
        print("\nNo articles were scraped; nothing to save.")
        return

    df = pd.DataFrame(articles)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    os.makedirs(output_dir, exist_ok=True)
    # os.path.join picks the platform's separator instead of mixing "/" into
    # a backslash-separated Windows path.
    excel_file = os.path.join(output_dir, f"itwire_{timestamp}.xlsx")
    df.to_excel(excel_file, index=False)
    print(f"\nSaved results to {excel_file}")


# Run the scraping workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Fetching articles from https://itwire.com//it-industry-news/telecoms-and-nbn.html
Found 10 article URLs
Scraping: https://itwire.com/telecoms-and-nbn/no-mirage-telstra-s-fun-with-puns-an-oasis-of-data-stats.html
Scraping: https://itwire.com/telecoms-and-nbn/telstra-celebrates-partner-award-2025-winners-across-australia.html
Scraping: https://itwire.com/telecoms-and-nbn/copper-thieves-cause-major-optus-outage,-disrupting-triple-zero-access-for-14,500-customers.html
Scraping: https://itwire.com/telecoms-and-nbn/telstra-is-blocking-certain-samsung-galaxy-phones-because-they-are-unable-to-reliably-make-emergency-000-calls.html
Scraping: https://itwire.com/telecoms-and-nbn/myriota-marks-a-decade-of-applied-innovation-with-new-fund-to-support-remote-stem-students.html
Scraping: https://itwire.com/telecoms-and-nbn/push-for-better-telecommunications-new-legislation-promises-stronger-coverage,-clearer-safeguards,-and-fairer-access.html
Scraping: https://itwire.com/telecoms-and-nbn/next-stop-fut