# **Web Scraper for the New Straits Times**

> **by Group E**

Group Members:

1.   DANIAL HARRIZ BIN MOHD ASINEH @ MOHD ASNEH A22EC0152
2.   CHAI YU TONG A22EC0145
3.   KOH SU XUAN A22EC0060
4.   TIEW CHUAN RONG  A22EC0112





In [None]:
  # Install playwright
  !pip install -q playwright
  !playwright install

import asyncio
import csv
import random
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
    at async Registry._validateHostRequirements (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/l

In [None]:
# Scraper configuration. The two time values are in milliseconds, the unit
# Playwright uses for its timeout arguments.

# Navigation timeout passed to page.goto() (600_000 ms = 10 minutes).
MAX_PAGE_TIMEOUT = 600_000  # Increased timeout for slow loading pages
# Pause between failed load attempts of the same URL (10_000 ms = 10 seconds).
RETRY_WAIT = 10_000
# Overall record target; each worker stops after MAX_RECORDS // NUM_WORKERS rows.
MAX_RECORDS = 6_000
NUM_WORKERS = 8  # Number of concurrent tabs
# Size of the contiguous listing-page range assigned to each worker
# (worker i covers pages i * PAGES_PER_WORKER + 1 .. (i + 1) * PAGES_PER_WORKER).
PAGES_PER_WORKER = 1000  # How many pages each tab handles

# Shared by all workers; held while a batch of rows is written to the CSV.
lock = asyncio.Lock()  # for writing to CSV from multiple workers

async def scrape_page(page, url, max_retries=3):
    """Load *url* in *page* and return the rendered HTML, retrying on failure.

    Args:
        page: Playwright page (tab) used for the navigation.
        url: Address of the listing page to load.
        max_retries: Maximum number of load attempts before giving up.

    Returns:
        The page's HTML as a string, or ``None`` if every attempt failed
        (or ``max_retries`` is not positive).
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"[{url}] Attempt {attempt}")
            await page.goto(url, timeout=MAX_PAGE_TIMEOUT, wait_until="domcontentloaded")

            # Randomised settle delay (9-13 s) so late-loading content has
            # time to render and requests are not fired at a fixed cadence.
            await page.wait_for_timeout(5000 + random.randint(4000, 8000))
            return await page.content()
        except Exception as e:
            # Deliberately broad: any navigation/timeout failure is reported
            # and retried instead of aborting the whole worker.
            print(f"⚠️ Error loading {url}: {e}")

        if attempt < max_retries:
            # Sleep on the event loop rather than via the page: if the page
            # itself crashed or was closed, page.wait_for_timeout() would
            # raise here, outside the try block, and escape the retry loop.
            await asyncio.sleep(RETRY_WAIT / 1000)

    return None

def _article_record(article):
    """Build one ``[title, url, teaser, category]`` CSV row from an article anchor tag."""
    title_tag = article.find('h6', class_='field-title')
    teaser_tag = article.find('div', class_='d-block article-teaser')
    category_tag = article.find('span', class_='field-category')
    href = article.get('href')

    title = title_tag.get_text(strip=True) if title_tag else 'No Title'
    teaser = teaser_tag.get_text(strip=True) if teaser_tag else 'No Teaser'
    category = category_tag.get_text(strip=True) if category_tag else 'No Category'
    # urljoin resolves site-relative hrefs against the site root and leaves
    # already-absolute hrefs intact (plain concatenation would corrupt them).
    full_url = urljoin('https://www.nst.com.my', href) if href else 'No URL'

    return [title, full_url, teaser, category]


async def worker(browser, writer, start_page, end_page, worker_id):
    """Scrape listing pages ``start_page..end_page`` in one tab and write rows to the CSV.

    The worker stops early once it has collected its share of the overall
    target (``MAX_RECORDS // NUM_WORKERS`` records). Pages that fail to load
    or contain no matching articles are skipped.

    Args:
        browser: Playwright browser used to open this worker's tab.
        writer: ``csv.writer`` shared by all workers.
        start_page: First listing page number (inclusive).
        end_page: Last listing page number (inclusive).
        worker_id: Label used in progress messages.
    """
    quota = MAX_RECORDS // NUM_WORKERS  # per-worker record cap
    total_scraped = 0
    page = await browser.new_page()

    try:
        for page_num in range(start_page, end_page + 1):
            if total_scraped >= quota:
                break

            url = f'https://www.nst.com.my/news/nation?page={page_num}' # "nation" is changed to "crime-courts", "government-public-policy", and, "politics" for different sections inside "News"
            html = await scrape_page(page, url)
            if not html:
                continue

            soup = BeautifulSoup(html, 'html.parser')
            articles = soup.find_all('a', class_='d-flex article listing mb-3 pb-3')

            if not articles:
                print(f"🚫 Worker {worker_id}: No articles on page {page_num}")
                continue

            # Take only as many articles as the remaining quota allows.
            remaining = quota - total_scraped
            records = [_article_record(article) for article in articles[:remaining]]
            total_scraped += len(records)

            # Write the whole page's rows as one batch while holding the shared lock.
            async with lock:
                writer.writerows(records)

            print(f"✅ Worker {worker_id}: Page {page_num} done, total: {total_scraped}")
    finally:
        # Always release the tab, even if scraping or writing raised.
        await page.close()

async def run():
    """Scrape the section with ``NUM_WORKERS`` concurrent tabs into ``nst_nation.csv``.

    Opens the CSV, writes the header row, launches headless Chromium, gives
    each worker a contiguous range of ``PAGES_PER_WORKER`` listing pages and
    waits for all of them to finish.
    """
    with open('nst_nation.csv', 'w', newline='', encoding='utf-8') as csvfile: #"nation" is changed to "crime-courts", "government-public-policy", and, "politics" for different sections inside "News"
        writer = csv.writer(csvfile)
        writer.writerow(['Title', 'URL', 'Teaser', 'Category'])

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # Worker i (0-based) covers pages i*PAGES_PER_WORKER + 1 .. (i+1)*PAGES_PER_WORKER.
                tasks = [
                    worker(
                        browser,
                        writer,
                        i * PAGES_PER_WORKER + 1,
                        (i + 1) * PAGES_PER_WORKER,
                        worker_id=i + 1,
                    )
                    for i in range(NUM_WORKERS)
                ]
                await asyncio.gather(*tasks)
            finally:
                # Close the browser even when a worker raised, so the
                # Chromium process is not left to implicit cleanup.
                await browser.close()

    # NOTE(review): this prints the configured target (MAX_RECORDS), not the
    # number of rows actually written; workers do not report their counts.
    print(f"🎉 Scraping complete! {MAX_RECORDS} records across {NUM_WORKERS} workers.")

# Run the async function.
# NOTE: a bare top-level `await` is only valid where top-level await is
# supported (e.g. an IPython/Jupyter cell, which the cell markers in this file
# suggest); a plain Python script would need `asyncio.run(run())` instead.
await run()

[https://www.nst.com.my/news/nation?page=4001] Attempt 1
[https://www.nst.com.my/news/nation?page=1] Attempt 1
[https://www.nst.com.my/news/nation?page=3001] Attempt 1
[https://www.nst.com.my/news/nation?page=1001] Attempt 1
[https://www.nst.com.my/news/nation?page=2001] Attempt 1
[https://www.nst.com.my/news/nation?page=5001] Attempt 1
[https://www.nst.com.my/news/nation?page=6001] Attempt 1
[https://www.nst.com.my/news/nation?page=7001] Attempt 1
✅ Worker 5: Page 4001 done, total: 20
[https://www.nst.com.my/news/nation?page=4002] Attempt 1
✅ Worker 6: Page 5001 done, total: 20
[https://www.nst.com.my/news/nation?page=5002] Attempt 1
✅ Worker 4: Page 3001 done, total: 20
[https://www.nst.com.my/news/nation?page=3002] Attempt 1
✅ Worker 3: Page 2001 done, total: 20
[https://www.nst.com.my/news/nation?page=2002] Attempt 1
✅ Worker 2: Page 1001 done, total: 20
[https://www.nst.com.my/news/nation?page=1002] Attempt 1
✅ Worker 1: Page 1 done, total: 20
[https://www.nst.com.my/news/nation?p