In [1]:
import os
import time
from timeit import default_timer as timer
from urllib.parse import quote

import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select

In [2]:
# initialise Chrome driver
# The chromedriver location is machine-specific: set the CHROMEDRIVER_PATH
# environment variable to override the default path below.
PATH = os.environ.get('CHROMEDRIVER_PATH', '/Users/samuelchai/chromedriver')
driver = webdriver.Chrome(PATH)
# website to scrape
driver.get('https://www.vaksincovid.gov.my/en/ppv/')

In [3]:
# function that scrapes PPV centres from website
def ppv_scraper():
    """Collect every PPV table row for each state/district combination.

    Walks the state dropdown (id ``selstate``) and, for each state, the
    district dropdown (id ``seldistrict``). After each district is selected,
    the text of every ``<tr>`` under the element with id ``ppvrow`` is recorded.

    Uses the module-level ``driver``, which must already have the page loaded.

    NOTE(review): the ``find_element_by_*`` helpers used here are deprecated
    in Selenium 4 -- confirm the installed Selenium version.

    Returns:
        list: one ``[state, district, row_text]`` list per table row.
    """
    # Dropdown entries that are placeholders rather than real choices.
    state_placeholders = ('Show All States', 'Choose your state')
    district_placeholders = ('Show All Districts',)

    ppv_list = []
    # get dropdown elements
    select_state = Select(driver.find_element_by_id('selstate'))
    select_district = Select(driver.find_element_by_id('seldistrict'))

    # Read each option label exactly once, *before* any selection is made.
    # Every ``.text`` access is a WebDriver call, so re-reading it per table
    # row is wasteful, and reading it after the page has reacted to a
    # selection risks hitting a stale element.
    state_names = [option.text for option in select_state.options]
    for state_name in state_names:
        if state_name in state_placeholders:
            continue
        select_state.select_by_visible_text(state_name)

        # The district options are read after the state has been selected.
        district_names = [option.text for option in select_district.options]
        for district_name in district_names:
            if district_name in district_placeholders:
                continue
            select_district.select_by_visible_text(district_name)

            # get the WebElement of the PPVs and record one entry per row
            ppvrow_elem = driver.find_element_by_id('ppvrow')
            for ppv in ppvrow_elem.find_elements_by_tag_name('tr'):
                ppv_list.append([state_name, district_name, ppv.text])
    return ppv_list

In [4]:
# Run the scraper once and report how long it took.
# `ppv_list` receives the [state, district, row_text] entries returned by
# ppv_scraper(); start/end are timer() readings taken around the call.
start = timer()
ppv_list = ppv_scraper()
end = timer()
print(f'Total Time Taken: {end-start} seconds')

Total Time Taken: 30.835554805999998 seconds


In [9]:
# helper: extract the coordinates embedded in a map-image URL
def _parse_center(map_url):
    """Return ``(latitude, longitude)`` strings parsed from ``map_url``.

    The URL is expected to contain ``?center=<lat>%2C<lng>&zoom=``. When the
    URL is missing/empty or does not have that shape, ``(None, None)`` is
    returned instead of raising.
    """
    if not map_url:
        return None, None
    try:
        # string formatting: isolate "<lat>%2C<lng>" and split on the encoded comma
        coord = map_url.split('?center=')[1].split('&zoom=')[0].split('%2C')
        return coord[0], coord[1]
    except IndexError:
        # URL is incomplete for locations that can't be found
        return None, None


# function that geocode location names (convert address to coordinates)
def geocode(ppv_list, delay=1):
    """Append latitude/longitude to each row by searching Google Maps.

    For every row, the third item (``ppv[2]``) is used as the search text.
    The coordinates are read from the ``content`` attribute of the page's
    ``meta[itemprop=image]`` element. Uses the module-level ``driver``.

    Args:
        ppv_list: iterable of lists whose third item is the text to search for.
        delay: seconds to sleep after each page load so Google Maps has time
            to populate the page (default 1, the original fixed delay).

    Returns:
        list: a new list with one ``row + [latitude, longitude]`` entry per
        input row. Both values are strings, or ``None`` when the location
        could not be resolved. The input rows are not modified.
    """
    ppv_full_list = []
    for ppv in ppv_list:
        # search for location on Google Maps; percent-encode the text so that
        # characters such as '/', '#' or '?' cannot alter the URL structure
        map_search_url = 'https://www.google.com/maps/search/' + quote(ppv[2], safe='')
        driver.get(map_search_url)
        # explicit delay to allow time for Google Maps to load
        time.sleep(delay)
        # find_elements_* returns an empty list when nothing matches, so a
        # page without the meta tag yields None coordinates rather than
        # aborting the whole run with an exception
        meta_elems = driver.find_elements_by_css_selector('meta[itemprop=image]')
        map_url = meta_elems[0].get_attribute('content') if meta_elems else None
        latitude, longitude = _parse_center(map_url)
        ppv_full_list.append(ppv + [latitude, longitude])
    return ppv_full_list

In [10]:
# Geocode every scraped row and report how long it took.
# NOTE: `ppv_list` is overwritten with the geocoded rows, so this cell must
# not be re-run without re-running the scrape cell first.
start = timer()
try:
    ppv_list = geocode(ppv_list)
finally:
    # quit() rather than close(): end the whole browser session, not just
    # the current window, and do so even when geocoding fails part-way.
    driver.quit()
end = timer()
print(f'Total Time Taken: {end-start} seconds')

Total Time Taken: 1448.073677901 seconds


In [11]:
# Build the final table: each row of `ppv_list` supplies the five columns in order.
# NOTE(review): latitude/longitude are whatever geocode() appended (strings
# split out of a URL, or None) -- confirm whether numeric conversion is wanted.
df = pd.DataFrame(ppv_list, columns=['state', 'district', 'ppv_name', 'latitude', 'longitude'])
# CSV export, left disabled:
# df.to_csv('ppv_full.csv')