## Web scraping for tender impulse

In [5]:
import os
import requests
import json
from bs4 import BeautifulSoup


In [6]:
# Browser-like User-Agent string sent with every request.
_USER_AGENT = " ".join([
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/120.0.0.0 Safari/537.36",
])
headers = {"User-Agent": _USER_AGENT}

# Asia-Pacific listing page (kept for reference; scraping the main listing
# page did not work):
#LIST_URL = "https://tenderimpulse.com/singapore-tenders"

# Detail page of a single Singapore tender.
LIST_URL = "https://tenderimpulse.com/government-tenders/singapore/invitation-to-quote-for-the-production-supply-and-delivery-of-marketing-collater-10311167"

In [7]:
def get_page(url, timeout=30):
    """Fetch a webpage and parse it into a BeautifulSoup tree.

    No delay is applied between requests; a caller that fetches many pages
    in a row should throttle itself.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. It is
            passed to ``requests.get``; without a timeout a stalled
            connection would block indefinitely.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def _attr_or_empty(tag, attr):
    """Return ``tag[attr]``, or "" when the tag or the attribute is missing."""
    if tag is not None and tag.has_attr(attr):
        return tag[attr]
    return ""


def scrape_tenders(soup):
    """Extract one tender record from a parsed tender detail page.

    The data comes from the ``<title>`` tag, the description / Open Graph
    meta tags, the canonical link, and the ``<p class="tender-details">``
    paragraphs.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            API.

    Returns:
        A list holding a single dict with the keys ``no``, ``title``,
        ``description``, ``country``, ``tender_id``, ``detail_url``,
        ``image_url``, ``publish_date`` and ``deadline_date``. Any value that
        cannot be found on the page is an empty string.
    """
    import re

    title_tag = soup.find('title')
    title_text = title_tag.get_text(strip=True) if title_tag else ""

    description = _attr_or_empty(
        soup.find('meta', attrs={"name": "description"}), "content")
    og_description = _attr_or_empty(
        soup.find('meta', attrs={"property": "og:description"}), "content")
    og_title = _attr_or_empty(
        soup.find('meta', attrs={"property": "og:title"}), "content")
    detail_url = _attr_or_empty(soup.find('link', rel="canonical"), "href")
    image_url = _attr_or_empty(
        soup.find('meta', attrs={"property": "og:image"}), "content")

    # The record falls back to the Open Graph values, so the derived fields
    # are searched in the same fallback order.
    title = title_text or og_title

    # Tender id: prefer the plain description, then the Open Graph one.
    tender_id = ""
    for text in (description, og_description):
        id_match = re.search(r'Tender Id: (\d+)', text)
        if id_match:
            tender_id = id_match.group(1)
            break

    # Country: taken from the title only. The character class includes a
    # space, so the capture may carry surrounding blanks that are stripped.
    country = ""
    for pattern in (r'Tender in ([A-Za-z ]+)', r'([A-Za-z ]+) Tenders'):
        country_match = re.search(pattern, title)
        if country_match and country_match.group(1).strip():
            country = country_match.group(1).strip()
            break

    # Dates: each matching paragraph contributes the text of its first
    # <strong> element.
    publish_date = ""
    deadline_date = ""
    for paragraph in soup.find_all('p', class_='tender-details'):
        strong = paragraph.find('strong')
        if strong is None:
            continue
        value = strong.get_text(strip=True)
        paragraph_text = paragraph.get_text()
        if 'Publish Date:' in paragraph_text:
            publish_date = value
        if 'Deadline Date:' in paragraph_text:
            deadline_date = value

    return [{
        "no": 1,
        "title": title,
        "description": description or og_description,
        "country": country,
        "tender_id": tender_id,
        "detail_url": detail_url,
        "image_url": image_url,
        "publish_date": publish_date,
        "deadline_date": deadline_date,
    }]

In [8]:
if __name__ == "__main__":
    # Make sure the destination folder exists before anything is written.
    os.makedirs("output", exist_ok=True)

    # Download the page and keep a copy of the parsed HTML for inspection.
    soup = get_page(LIST_URL)
    with open("output/listing_page.html", "w", encoding="utf-8") as html_file:
        html_file.write(str(soup))

    # Extract the tender record(s) and store them as pretty-printed JSON.
    tenders = scrape_tenders(soup)
    with open("output/scrap_output.json", "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(tenders, ensure_ascii=False, indent=2))

Future development: scrape all listing pages, not just a single tender page.