## Web scraping for bidsInfo
Example use case: Singapore tenders

In [1]:
import os
import requests
import json
from bs4 import BeautifulSoup

In [2]:
# Listing page that the cells below fetch and parse.
LIST_URL = "https://www.bidsinfo.com/country/singapore-tenders"

# Browser-like HTTP request headers (desktop Chrome on macOS).
# NOTE(review): the Selenium-based fetch functions in this notebook do not
# read this dict; it is only useful for a plain `requests` call.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive",
}

In [3]:
def get_page(url):
    """Fetch a webpage with headless Chrome and return it parsed by BeautifulSoup.

    A ChromeDriver binary is resolved through webdriver-manager, the page is
    loaded, and after a fixed 5-second pause the current page source is parsed
    with the "html.parser" backend.

    Args:
        url: Address of the page to load.

    Returns:
        A BeautifulSoup object built from the browser's page source.

    Raises:
        Any exception raised while navigating or reading the page source is
        propagated; the browser is shut down first.
    """
    # Selenium is imported locally because the import cell above this
    # function only brings in os, requests, json and BeautifulSoup.
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager
    import time

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)
        time.sleep(5)  # Wait for page to load (adjust if needed)
        html = driver.page_source
    finally:
        # Always close the browser, even when navigation fails; otherwise a
        # headless Chrome process is left running after every error.
        driver.quit()
    return BeautifulSoup(html, "html.parser")

def scrape_tenders(soup):
    """Extract one record per tender listing found in a parsed page.

    Every ``<article class="ee-post">`` element becomes a dict with the keys
    ``no`` (1-based position), ``title``, ``organization``, ``country``,
    ``publish_date``, ``deadline_date`` and ``detail_url``. Any piece that is
    not present in the markup is returned as an empty string.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find_all``.

    Returns:
        List of dicts, in document order.
    """
    def text_at(items, position):
        # Stripped text of items[position], or "" when the list is too short.
        return items[position].get_text(strip=True) if position < len(items) else ""

    records = []
    posts = soup.find_all('article', class_='ee-post')
    for number, post in enumerate(posts, start=1):
        # Title link: prefer the text of the nested <h5>, else the link's own text.
        link = post.find('a', class_='bde-text-link-111-107')
        if link:
            heading = link.find('h5')
            title = (heading if heading else link).get_text(strip=True)
            detail_url = link.get('href', "")
        else:
            title, detail_url = "", ""

        # Icon-list entries are positional: organization, country,
        # publish date, closing date.
        entries = post.find_all('span', class_='bde-icon-list__text')
        organization = text_at(entries, 0)
        country = text_at(entries, 1)
        published = text_at(entries, 2)
        closing = text_at(entries, 3)
        # The closing date may carry a 'Closing Date:' label; drop it.
        if 'Closing Date:' in closing:
            closing = closing.replace('Closing Date:', '').strip()

        records.append({
            "no": number,
            "title": title,
            "organization": organization,
            "country": country,
            "publish_date": published,
            "deadline_date": closing,
            "detail_url": detail_url
        })
    return records

In [4]:
if __name__ == "__main__":
    # Fetch the first listing page and persist both the raw markup and the
    # extracted records under ./output.
    os.makedirs("output", exist_ok=True)
    soup = get_page(LIST_URL)

    # Pretty-printed HTML, kept for manual inspection
    with open("output/listing_page.html", "w", encoding="utf-8") as html_file:
        html_file.write(soup.prettify())

    # Tender records from the first page, serialized as indented JSON
    tenders = scrape_tenders(soup)
    with open("output/scrap_output.json", "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(tenders, ensure_ascii=False, indent=2))

Blocked by Cloudflare.

### Another approach to scrape bidsInfo data

In [5]:
# Improved Selenium scraping for Cloudflare-protected pages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

def get_real_page(url, wait_selector=None, wait_time=15):
    """
    Load a page in headless Chrome and return its HTML after waiting for content.

    Intended for pages that show an interstitial (e.g. a Cloudflare check)
    before the real content; it waits, but does not guarantee the check is passed.

    url: Address of the page to load.
    wait_selector: CSS selector for an element that only appears on the real page.
        When given, wait until it is present in the DOM; when omitted, sleep
        for ``wait_time`` seconds instead.
    wait_time: Max seconds to wait for the real content.

    Returns the browser's page source as a string. If waiting for
    ``wait_selector`` fails, a message is printed and whatever page is
    currently loaded is still returned, so callers should check the result.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)

        # Wait for Cloudflare to finish and real content to appear
        if wait_selector:
            try:
                WebDriverWait(driver, wait_time).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
                )
            except Exception as e:
                # Best effort: report the failure and fall through to return
                # the page as it currently stands.
                print(f"Timeout waiting for real content: {e}")
        else:
            time.sleep(wait_time)

        html = driver.page_source
    finally:
        # Always shut the browser down, even if navigation or reading the
        # page source raised; otherwise the Chrome process is leaked.
        driver.quit()
    return html

# Example usage: wait for a real tender article to appear
# (`os`, `json` and `BeautifulSoup` come from the notebook's first import cell.)
LIST_URL = "https://www.bidsinfo.com/country/singapore-tenders"
REAL_CONTENT_SELECTOR = "article.ee-post"  # Change if needed for your target page

# Make sure the output directory exists so this cell does not rely on an
# earlier cell having created it.
os.makedirs("output", exist_ok=True)

html = get_real_page(LIST_URL, wait_selector=REAL_CONTENT_SELECTOR, wait_time=20)

# Save the fetched HTML for inspection (this may be a block/challenge page
# if the wait above timed out)
with open("output/listing_page.html", "w", encoding="utf-8") as f:
    f.write(html)

# Parse and extract tenders as before
soup = BeautifulSoup(html, "html.parser")
tenders = scrape_tenders(soup)
with open("output/scrap_output.json", "w", encoding="utf-8") as f:
    json.dump(tenders, f, ensure_ascii=False, indent=2)

print(f"Scraped {len(tenders)} tenders.")

Timeout waiting for real content: Message: 
Stacktrace:
0   chromedriver                        0x00000001008caecc cxxbridge1$str$ptr + 2941512
1   chromedriver                        0x00000001008c2b88 cxxbridge1$str$ptr + 2907908
2   chromedriver                        0x00000001003da2b0 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 74020
3   chromedriver                        0x000000010042188c _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 366336
4   chromedriver                        0x0000000100462d54 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 633800
5   chromedriver                        0x0000000100415ef0 _RNvCsgXDX2mvAJAg_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 318820
6   chromedriver                        0x000000010088e0c8 cxxbridge1$str$ptr + 2692164
7   chromedriver                        0x00000001008918dc cxxbridge1$str$ptr + 2706520
8   chromedriver                        0x000000010086e84

Still blocked by Cloudflare.