<a href="https://colab.research.google.com/github/cslm1/computability_lecture/blob/main/selenium_energy_made_easy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Set up a headless Chrome driver: install Selenium and chromium-chromedriver, then create the driver
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import sys
# NOTE(review): sys.path only controls where Python looks for importable
# modules, so this entry presumably has no effect on how Selenium locates the
# chromedriver binary (which the setup commands copy to /usr/bin). It is kept
# so the module search path is unchanged -- confirm it can be dropped.
sys.path.insert(0,'/usr/local/lib/chromium-browser/chromedriver')
from selenium import webdriver

# Chrome flags: no visible window, no sandbox, and no use of /dev/shm.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Pass the options through the `options` keyword. The positional driver path
# and the `chrome_options` keyword were deprecated and later removed from
# Selenium 4, so the previous call raises TypeError on a freshly installed
# Selenium. Without an explicit path the driver binary is resolved by Selenium
# itself (PATH lookup).
wd = webdriver.Chrome(options=chrome_options)

In [None]:
# imports
from bs4 import BeautifulSoup as bsoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import TimeoutException as te

In [None]:
# define functions

def get_provider_from_result(result):
  """Return the provider name for a plan result.

  Looks up the first ``div`` with class ``icon-column`` found after
  ``result``, then the first ``img`` found after that div, and returns the
  image's ``aria-label`` attribute.
  """
  icon_column = result.find_next("div", class_="icon-column")
  logo = icon_column.find_next("img")
  return logo["aria-label"]


def get_plan_name_from_result(result):
  """Return the ``.string`` of the first ``a`` element found after ``result``."""
  plan_link = result.find_next("a")
  return plan_link.string


def get_price_from_result(result):
  """Return the price text for a plan result.

  Starting from ``result``, follows three successive ``find_next`` lookups
  (price column div, price wrapper div, value span) and returns the
  ``.string`` of the final span.
  """
  lookup_chain = (
      ("div", "need-to-know-price-column"),
      ("div", "price-wrapper"),
      ("span", "value"),
  )
  node = result
  for tag_name, css_class in lookup_chain:
    node = node.find_next(tag_name, class_=css_class)
  return node.string


def make_retail_elec_df(result, suburb, postcode):
    """Build a one-row DataFrame describing a single plan result.

    The row holds the given ``suburb`` and ``postcode`` plus the provider,
    plan name and yearly price extracted from ``result`` by the
    ``get_*_from_result`` helpers.
    """
    provider = get_provider_from_result(result)
    plan_name = get_plan_name_from_result(result)
    price = get_price_from_result(result)
    # The scalar suburb/postcode values are broadcast against the
    # single-element lists, giving exactly one row.
    row = {
        "suburb": suburb,
        "postcode": postcode,
        "provider": [provider],
        "plan_name": [plan_name],
        "price_per_year": [price],
    }
    return pd.DataFrame(row)


def find_available_buttons(button):
  """Return the ``id`` entry of a single ``button``."""
  button_id = button["id"]
  return button_id
# helper that clicks a form button, looked up by its name attribute


def click_btn(btn):
  """Click the page element whose ``name`` attribute equals ``btn``.

  Operates on the module-level driver ``wd``.
  """
  target = wd.find_element(By.NAME, btn)
  target.click()

# the main scraping routine: fill in the form for one suburb and scrape the resulting plans

def get_retail_prices(suburb, postcode):
  """Scrape the electricity plans listed for one suburb.

  Drives the module-level webdriver ``wd`` through the Energy Made Easy start
  form: picks electricity, enters the location, answers the household and
  property questions, accepts the terms and submits. The results page is then
  parsed with BeautifulSoup.

  Args:
    suburb: Suburb name typed into the location box.
    postcode: Postcode as a string; it is concatenated to ``suburb``.

  Returns:
    A DataFrame with the columns ``suburb``, ``postcode``, ``provider``,
    ``plan_name`` and ``price_per_year``, one row per plan tile. When the
    results page contains no plan tiles an empty DataFrame with those columns
    is returned.

  Raises:
    selenium.common.exceptions.TimeoutException: if the location suggestion
      (10 s) or the results container (30 s) does not appear in time.
  """
  wd.get("https://www.energymadeeasy.gov.au/start")
  time.sleep(1)
  query_words = suburb + " " + postcode
  wd.find_element(By.NAME, "electricity").click()
  wd.find_element(By.ID, 'autocomplete-postcode').send_keys(query_words)
  time.sleep(1)

  # click things that are constant in all forms
  # `until` returns the located element, so it can be clicked directly.
  suggestion = WebDriverWait(wd, timeout=10).until(
      ec.presence_of_element_located(
          (By.CLASS_NAME, "autocomplete__results__item")))
  suggestion.click()
  wd.find_element(By.NAME, "2to3People").click()
  wd.find_element(By.NAME, "noUsage").click()

  # because the 'Tell me about your Property' part of the form varies by suburb
  # we first detect what buttons are available in the form
  check_form = bsoup(wd.page_source, 'html.parser')
  form_buttons = check_form.find(
      "section", class_="form-section").find_all(
          "button", class_="radio-button btn")
  # collect the distinct ids of all available buttons
  avail_btns = {find_available_buttons(button) for button in form_buttons}

  # all question ids we have encountered, mapped to the button we press when
  # that question is present; iteration order is the click order
  btn_to_press = {
      'ac': 'ac-Yes',
      'esh': 'esh-Yes',
      'ewh': 'ewh-Yes',
      'cl': 'cl-No',
      'sp': 'sp-No',
      'uh': 'uh-No',
      'pv': 'pv-No',
      'smartMeter': 'smartMeter-No',
  }
  for btn_id, btn_name in btn_to_press.items():
    if btn_id in avail_btns:
      click_btn(btn_name)

  # finally we just want to check whether the form has given us the
  # electricity distributor button to press
  distrib_required = check_form.find('div', class_='distributor')

  # this choice is probably the most important for our results
  # should we choose based on their geog/suburb?
  # The div was found by its 'distributor' class, so no further class check
  # is needed before selecting the first distributor option.
  if distrib_required is not None:
    dist_ele = wd.find_element(
        By.XPATH, "//*[@id='electricity-distributor-radio-item-0']")
    # clicked through JavaScript rather than WebElement.click()
    wd.execute_script("arguments[0].click()", dist_ele)

  # this should be in each form
  retailer_drop_down = Select(wd.find_element(By.NAME, "electricity-retailer"))
  retailer_drop_down.select_by_value("notSure")

  # accept terms of the form
  ele = wd.find_element(By.XPATH, '//*[@id="acceptTerms"]')
  wd.execute_script("arguments[0].click()", ele)

  # submit the form
  wd.find_element(By.ID, "submit").click()

  # after the form has been submitted and results loaded we scrape the html
  WebDriverWait(wd, timeout=30).until(
      ec.presence_of_element_located((By.CLASS_NAME, "results-plans")))
  soup = bsoup(wd.page_source, 'html.parser')

  results = soup.find_all("div", class_="plan-results-tile")
  if not results:
    # pd.concat raises ValueError on an empty list; return an empty frame with
    # the usual columns so a suburb without plans does not abort the caller.
    return pd.DataFrame(
        columns=["suburb", "postcode", "provider", "plan_name",
                 "price_per_year"])

  dfs = [make_retail_elec_df(tile, suburb=suburb, postcode=postcode)
         for tile in results]
  return pd.concat(dfs, ignore_index=True)

In [None]:
def main():
  """Scrape plans for every suburb in the input CSV and save them.

  Reads ``electricity_areas.csv`` (all columns as strings; the ``suburb`` and
  ``postcode`` columns are used), calls ``get_retail_prices`` for each row and
  writes the combined results to ``output.csv``. A suburb whose scrape times
  out is reported and skipped rather than aborting the run.
  """
  suburbs = pd.read_csv('electricity_areas.csv', dtype = str)
  frames = []

  for _, row in suburbs.iterrows():
    try:
      cur = get_retail_prices(suburb = row['suburb'], postcode = row['postcode'])
    except te:
      # Best-effort scrape: keep going, but say which suburb was dropped
      # instead of swallowing the timeout silently.
      print(f"Timed out scraping {row['suburb']} {row['postcode']}; skipping")
      continue
    frames.append(cur)

  # Concatenate once at the end instead of re-copying the accumulated frame
  # on every iteration; fall back to an empty frame when nothing was scraped.
  final = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()
  final.to_csv('output.csv', encoding = 'utf-8', index = False)

# Run the scrape: main() reads electricity_areas.csv and writes output.csv.
main()