In [11]:
import os
import re
import time
import requests
import sqlite3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

In [14]:
url = 'https://subway.com.my/find-a-subway'


def _style_order(tag):
    """Return the integer sort position encoded in a tag's inline style.

    The number is read from the text between the first and second ':' of the
    ``style`` attribute with every ';' removed (shape ``"<property>: 3;"``;
    the property name itself is not inspected).

    Tags without a ``style`` attribute, or whose style does not follow that
    shape, return ``float('inf')`` so they sort after every positioned tag
    instead of aborting the whole sort with KeyError/IndexError/ValueError.
    """
    try:
        return int(tag['style'].split(':')[1].replace(';', '').strip())
    except (KeyError, IndexError, ValueError):
        return float('inf')


# A pinned chromedriver.exe passed through Service(...) used to be loaded
# here; it was disabled because that binary only supports Chrome
# 114.0.5735.90, so the driver is created with no explicit service instead.
driver = webdriver.Chrome()

try:
    driver.get(url)

    # wait for the search field to be visible and enter the search query
    search_field = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "fp_searchAddress"))
    )
    search_field.send_keys('kuala lumpur')

    # wait for the search button to be clickable, then click it
    search_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "fp_searchAddressBtn"))
    )
    search_button.click()

    # fixed 10 second pause after clicking so the page can update before the
    # DOM snapshot is taken
    time.sleep(10)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
except BaseException:
    # Nothing after this point in the cell runs when the scrape fails (or the
    # cell is interrupted), so close the browser here to avoid leaving an
    # orphaned Chrome process, then re-raise the original error.
    driver.quit()
    raise

# Keep only the location entries that are not hidden with "display: none".
# The style filter also accepts tags that have no style attribute at all
# (value is None), which is why the sort key must tolerate a missing style.
pattern = re.compile(r'fp_listitem fp_list_marker\d+')
items = soup.find_all(
    'div',
    class_ = pattern,
    style = lambda value: value is None or 'display: none' not in value,
)

# sorted() is stable, so entries without a usable position keep their
# document order at the end of the list.
sorted_items = sorted(items, key = _style_order)

print("total items found:", len(items))
print("total sorted items found:", len(sorted_items))

# Parameterised INSERT reused for every scraped location.
# NOTE(review): the `subway` table is not created in this cell; it is assumed
# to exist already in database.db — confirm before running on a fresh file.
INSERT_LOCATION_SQL = '''
    INSERT INTO subway (location_name, address, operating_hour, waze_link, latitude, longitude)
    VALUES (?, ?, ?, ?, ?, ?)
'''

# connect to SQLite database
conn = sqlite3.connect('database.db')

try:
    # create a cursor object
    c = conn.cursor()

    for item in sorted_items:
        # Every column defaults to an empty string; branches below overwrite
        # the ones they can extract (some deliberately store None).
        insert_location_name, insert_address, insert_operating_hour, \
            insert_waze_link, insert_latitude, insert_longitude = '', '', '', '', '', ''

        # check if the item is hidden just to be safe
        if 'style' in item.attrs and 'display: none' in item['style']:
            continue  # skip this item if it's hidden

        location_left_div = item.find('div', class_ = 'location_left')
        location_right_div = item.find('div', class_ = 'location_right')

        if location_left_div:
            h4_tag = location_left_div.find('h4')

            if h4_tag:
                insert_location_name = h4_tag.text.strip()
                print("location name:", insert_location_name)

            infoboxcontent_div = location_left_div.find('div', class_ = 'infoboxcontent')

            if infoboxcontent_div:
                paragraphs = infoboxcontent_div.find_all('p', class_ = lambda x: x != 'infoboxlink')
                paragraph_count = len(paragraphs)
                print(paragraph_count)

                # Which paragraph holds which field depends on how many
                # paragraphs the entry has:
                #   >= 4 -> address is [0], operating hour is [2]
                #   == 1 -> address only
                #   == 3 -> operating hour is [1], no address
                # Any other count is reported as "missed" and the defaults
                # are kept.
                if paragraph_count >= 4:
                    layout_known = True
                    address = paragraphs[0].text.strip()
                    operating_hour = paragraphs[2].text.strip()
                elif paragraph_count == 1:
                    layout_known = True
                    address = paragraphs[0].text.strip()
                    operating_hour = None
                elif paragraph_count == 3:
                    layout_known = True
                    address = None
                    operating_hour = paragraphs[1].text.strip()
                else:
                    layout_known = False
                    print("missed")

                if layout_known:
                    insert_address = address
                    insert_operating_hour = operating_hour
                    print("address:", address)
                    print("operating hour:", operating_hour)

        if location_right_div:
            a_links = location_right_div.find_all('a')

            # The second anchor is used as the Waze link.
            if len(a_links) >= 2:
                # .get() so an anchor without an href yields None instead of
                # raising KeyError; an empty href is also stored as None.
                waze_link = a_links[1].get('href') or None
                insert_waze_link = waze_link
                print("waze link:", waze_link)

        # retrieve geocoding data
        # Both attributes must be present. The previous test
        # ('data-latitude' and 'data-longitude' in item.attrs) only checked
        # for data-longitude because a non-empty string literal is truthy.
        if 'data-latitude' in item.attrs and 'data-longitude' in item.attrs:
            insert_latitude = item['data-latitude']
            insert_longitude = item['data-longitude']
            print(f"latitude: {insert_latitude}, longitude: {insert_longitude}")
            print("")

        # insert data into database
        c.execute(INSERT_LOCATION_SQL, (
            insert_location_name,
            insert_address,
            insert_operating_hour,
            insert_waze_link,
            insert_latitude,
            insert_longitude,
        ))

    # commit the transaction (only reached when every row was processed, so a
    # failed run leaves no partial insert behind once the connection closes)
    conn.commit()
finally:
    # Always release the browser and the database handle, even when the loop
    # above raised; the nested finally guarantees the connection is closed
    # even if quitting the browser fails.
    try:
        driver.quit()
    finally:
        # close the connection
        conn.close()

total items found: 135
total sorted items found: 135
location name: Subway NU Sentral
4
address: L4.13, Level Four, NU Sentral, No. 201 Jalan Tun Sambanthan, Kuala Lumpur, 50470
operating hour: Monday - Sunday, 9:00 AM - 9:00 PM
waze link: https://www.waze.com/en/live-map/directions/subway-nu-sentral-l4.13,-level-four,-nu-sentral-kuala-lumpur?place=w.66650143.666435897.27398085
latitude: 3.133075, longitude: 101.687034

location name: Subway Menara UOA Bangsar
4
address: Jalan Bangsar Utama 1, Unit 1-2-G, Menara UOA Bangsar, Kuala Lumpur, 59000
operating hour: Monday - Sunday, 8:00 AM - 8:00 PM
waze link: https://www.waze.com/en/live-map/directions/my/federal-territory-of-kuala-lumpur/kuala-lumpur/subway-@-menara-uoa-bangsar?place=ChIJPWFRH5RJzDERvHvlO1uTQpY
latitude: 3.128099, longitude: 101.678678

location name: Subway Mid Valley KL
4
address: 1st Floor, Zone 3, Kiosk FK-05, Mid Valley Megamall,  Lingkaran Syed Putra, Mid Valley City, 59200 Kuala Lumpur
operating hour: Monday - Sund