In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
import time
import urllib3
import os

# Requests in this file are made with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise print for each one.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Site root; candidate article URLs must start with this prefix.
BASE_URL = "https://www.mobileeurope.co.uk"
# Listing page from which article links are harvested.
NEWS_URL = f"{BASE_URL}/category/content-type/news/"
# Headers sent with every request (a browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

# Substrings that disqualify a URL from being treated as an article when they
# appear anywhere in the part of the URL after BASE_URL (compared lowercased).
EXCLUDE_PATHS = ['/category/', '/tag/', '/author/', '/wp-admin', '/feed', 
                 '/subscribe', '/contact', '/about', '/privacy', '/terms', 
                 '?s=', '/search', '/newsletter']


def fetch_page(url):
    """Download *url* and return its HTML as a parsed BeautifulSoup tree.

    The request sends the module-level ``HEADERS``, times out after 15
    seconds, and is made with ``verify=False`` (TLS certificates are not
    checked).

    Raises:
        requests.HTTPError: propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    page = requests.get(url, timeout=15, verify=False, headers=HEADERS)
    page.raise_for_status()
    html = page.text
    return BeautifulSoup(html, "html.parser")


def is_valid_article_url(url, *, base_url=None, exclude_paths=None):
    """Return True when *url* looks like an article page on the target site.

    A URL qualifies when all of the following hold:

    * it starts with the site root and the remainder begins with ``/``
      (so a look-alike host such as ``<site>.evil.test`` or a bare
      ``<site>#anchor`` is rejected);
    * the remainder is at least 10 characters long;
    * the lowercased remainder contains none of the excluded substrings;
    * the remainder contains a hyphen (article slugs are hyphenated).

    Args:
        url: Absolute URL to test.
        base_url: Site root to match against. Defaults to ``BASE_URL``.
        exclude_paths: Iterable of lowercase substrings that disqualify a
            URL. Defaults to ``EXCLUDE_PATHS``.

    Returns:
        bool: Whether the URL should be scraped as an article.
    """
    if base_url is None:
        base_url = BASE_URL
    if exclude_paths is None:
        exclude_paths = EXCLUDE_PATHS

    # Tolerate a root given with a trailing slash so the remainder below
    # always keeps its leading "/".
    site_root = base_url.rstrip('/')
    if not url.startswith(site_root):
        return False

    path = url[len(site_root):]

    # A plain prefix match also accepts "https://site.example.evil.test/...";
    # insisting on a path separator right after the root rules that out.
    if not path.startswith('/'):
        return False

    # Too short to be an article slug (this also covers the bare "/").
    if len(path) < 10:
        return False

    lowered = path.lower()
    if any(excluded in lowered for excluded in exclude_paths):
        return False

    return '-' in path


def get_article_links():
    """Collect the article URLs linked from the news listing page.

    Fetches ``NEWS_URL``, resolves every ``<a href>`` to an absolute URL,
    strips any ``#fragment`` and keeps those accepted by
    ``is_valid_article_url``.

    Returns:
        list[str]: Unique article URLs in sorted order, so repeated runs
        visit the same set of links in the same sequence.
    """
    # Local import: the file-level import only brings in ``urljoin``.
    from urllib.parse import urldefrag

    print(f"Fetching articles from {NEWS_URL}")
    soup = fetch_page(NEWS_URL)

    links = set()
    for anchor in soup.find_all("a", href=True):
        # Relative hrefs are relative to the page they appear on, so join
        # against NEWS_URL rather than the site root.
        absolute = urljoin(NEWS_URL, anchor["href"].strip())
        # "/post/" and "/post/#comments" are the same document; drop the
        # fragment so the article is only scraped once.
        absolute, _fragment = urldefrag(absolute)
        if is_valid_article_url(absolute):
            links.add(absolute)

    print(f"Found {len(links)} article URLs")
    return sorted(links)


def scrape_article(url):
    """Download one article page and extract its title, summary and date.

    The title is read from ``h1.tdb-title-text``, the summary from the first
    ``h2.wp-block-heading`` and the date from ``time.entry-date`` (its
    ``datetime`` attribute when present, otherwise its text). A missing
    element yields an empty string for that field.

    Returns:
        dict | None: One row for the results table, or ``None`` when
        anything goes wrong (the error is printed, not raised).
    """
    def text_of(node):
        # Stripped text of a tag, or "" when the tag was not found.
        return node.get_text(strip=True) if node else ""

    try:
        page = fetch_page(url)

        heading = page.find("h1", class_="tdb-title-text")
        summary = page.find("h2", class_="wp-block-heading")
        stamp = page.find("time", class_="entry-date")

        if stamp:
            published = stamp.get("datetime", stamp.get_text(strip=True))
        else:
            published = ""

        record = {
            "Nguồn": "Tin quốc tế",
            "Tiêu đề": text_of(heading),
            "Mô tả": text_of(summary),
            "Ngày": published,
            "URL": url
        }
    except Exception as e:
        # Best effort: one failing article must not abort the whole crawl.
        print(f" Error scraping {url}: {e}")
        return None

    return record


def main(output_dir=None):
    """Scrape every article linked from the news page and save them to Excel.

    Each URL returned by ``get_article_links`` is passed to
    ``scrape_article``; successful rows are collected into a DataFrame and
    written to ``mobileeurope_<YYYYmmdd_HHMMSS>.xlsx`` inside *output_dir*.
    Nothing is written (and no directory is created) when no article could
    be scraped.

    Args:
        output_dir: Directory that receives the workbook. When omitted, the
            original hard-coded location is used.
    """
    if output_dir is None:
        output_dir = r"C:\Users\Admin\Documents\VIETTEL\2-crawl-articles\scraper\output-scraper"

    articles = []

    for url in get_article_links():
        print(f"Scraping: {url}")
        data = scrape_article(url)
        if data:
            articles.append(data)
        # Pause between requests (presumably to rate-limit the crawl).
        time.sleep(1)

    if not articles:
        # No URL filtering by keyword happens anywhere in this workflow, so
        # report what actually occurred: nothing was scraped.
        print("\nNo articles were scraped; nothing to save.")
        return

    df = pd.DataFrame(articles)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    os.makedirs(output_dir, exist_ok=True)
    # os.path.join keeps the separator consistent with the directory part.
    excel_file = os.path.join(output_dir, f"mobileeurope_{timestamp}.xlsx")
    df.to_excel(excel_file, index=False)
    print(f"\nSaved results to {excel_file}")

# Run the scraping workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Fetching articles from https://www.mobileeurope.co.uk/category/content-type/news/
Found 21 article URLs
Scraping: https://www.mobileeurope.co.uk/germany-finally-takes-decisive-action-on-chinese-net-equipment-providers/
Scraping: https://www.mobileeurope.co.uk/network-slicing-progresses-apace-at-scale-as-5gsa-accelerates/
Scraping: https://www.mobileeurope.co.uk/here-is-the-short-list-for-mobile-europes-cto-of-the-year-2025-awards/
Scraping: https://www.mobileeurope.co.uk/the-briefing2/
Scraping: https://www.mobileeurope.co.uk/nokia-announces-strategy-restructures-and-makes-changes-at-the-top/
Scraping: https://www.mobileeurope.co.uk/eu-to-investigate-if-amazon-microsoft-are-gatekeepers-for-their-cloud-services/
Scraping: https://www.mobileeurope.co.uk/gartner-finds-european-tech-execs-opting-for-sovereign-solutions/
Scraping: https://www.mobileeurope.co.uk/bnetza-to-revisit-germanys-2019-5g-spectrum-auction-after-court-loss/
Scraping: https://www.mobileeurope.co.uk/reuters-europes-oper