In [26]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date
import time
import json

In [49]:
def initialize_browser(headless=False, logless=False, iw=5):
    """Create and return a Chrome WebDriver session.

    Args:
        headless: When truthy, Chrome is started with ``--headless`` and
            ``--window-size=1920,1200``.
        logless: When truthy, Chrome is started with ``--log-level=3``.
        iw: Value handed to ``implicitly_wait`` on the new driver.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    # The driver binary path comes from webdriver_manager's installer.
    driver_service = Service(ChromeDriverManager().install())

    # Collect the command-line flags first, then apply them in order.
    chrome_flags = ['--start-maximized']
    if logless:
        chrome_flags.append('--log-level=3')
    if headless:
        chrome_flags.extend(['--headless', '--window-size=1920,1200'])

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
    driver.implicitly_wait(iw)
    return driver

In [52]:
def _load_rentals(path):
    """Return the dict stored as JSON at `path`, or an empty dict if the file does not exist."""
    try:
        with open(path, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        # First run: there is nothing to update yet.
        return {}


def _save_rentals(path, data):
    """Overwrite `path` with `data` serialized as indented JSON."""
    # Serialize before opening so a serialization error cannot truncate the file.
    content = json.dumps(data, indent=4)
    with open(path, 'w') as file:
        file.write(content)


def _offering_value(offering, css_class):
    """Return the text of the 'value' element nested under `css_class` inside one offering."""
    return offering.find_element(By.CLASS_NAME, css_class).find_element(By.CLASS_NAME, 'value').text


def _scrape_offerings(browser):
    """Return one dict (beds, baths, area, price) per offering of the open listing."""
    container = browser.find_element(By.CLASS_NAME, 'ww-listing-offerings')

    offerings = []
    for offering in container.find_elements(By.CLASS_NAME, 'offering'):
        beds = _offering_value(offering, 'beds').split(' ')[0]
        if beds == 'Bachelor':
            beds = '0'

        # A fresh dict per offering: sharing one dict would make every entry
        # in the list show the values of the last offering scraped.
        offerings.append({
            'beds': beds,
            'baths': _offering_value(offering, 'baths').split(' ')[0],
            'area': _offering_value(offering, 'sqft').removesuffix(' sqft.'),
            'price': _offering_value(offering, 'rate').removeprefix('$').replace(',', ''),
        })

    return offerings


def _scrape_utilities(browser):
    """Return the utility names of the open listing, or [] if the section cannot be read."""
    try:
        container = browser.find_element(By.CLASS_NAME, 'utilities-list')
        # The utility name is taken from the <li> class attribute, minus the 'utility ' prefix.
        return [item.get_attribute('class').removeprefix('utility ')
                for item in container.find_elements(By.TAG_NAME, 'li')]
    except Exception:
        return []


def _scrape_texts(browser, container_class, by, value):
    """Return the stripped, non-empty texts of the elements matching (`by`, `value`)
    inside the container with class `container_class`, or [] if the section cannot be read."""
    try:
        container = browser.find_element(By.CLASS_NAME, container_class)
        texts = [element.text.strip() for element in container.find_elements(by, value)]
    except Exception:
        return []
    return [text for text in texts if text]


def _scrape_location_score(browser):
    """Return the overall score ('ovr_score') plus one score per category of the open listing."""
    categories = ('health', 'grocery', 'fitness', 'park', 'high', 'transit',
                  'shop', 'food', 'coffee', 'childcare', 'elem', 'entertainment')

    container = browser.find_element(By.CLASS_NAME, 'location-map-container')
    scores = {'ovr_score': container.find_element(By.ID, 'proximitiiMap-score').text}

    for category in categories:
        category_block = container.find_element(By.CSS_SELECTOR, f'div[data-category="{category}"]')
        # The score is the text of the last <span> in the category block.
        scores[category] = category_block.find_elements(By.TAG_NAME, 'span')[-1].text

    return scores


def _scrape_listing(browser):
    """Scrape the currently open listing and return its information as a dict.

    The label, utilities, amenities, pet policy and parking sections are
    optional (missing ones yield None / an empty list). Any failure while
    reading the address, offerings or location score propagates to the caller.
    """
    # Scrolling down the page to force all information to be loaded
    more_details = browser.find_element(By.CSS_SELECTOR, 'a[title="More Details"]')
    browser.execute_script('arguments[0].scrollIntoView();', more_details)
    time.sleep(5)

    # Feature label (optional)
    try:
        label = browser.find_element(By.CLASS_NAME, 'featured-label').text
    except Exception:
        label = None

    # Address and city: the heading text is split on ', '
    heading = browser.find_element(By.CLASS_NAME, 'listing-heading')
    full_address = heading.find_element(By.CLASS_NAME, 'address').text.split(', ')
    address = full_address[0]
    city = full_address[1]

    offerings = _scrape_offerings(browser)
    utilities = _scrape_utilities(browser)
    features_amenities = _scrape_texts(browser, 'amenities-container', By.TAG_NAME, 'span')
    pet_policies = _scrape_texts(browser, 'pet-policies-container', By.CLASS_NAME, 'value')
    parkings = _scrape_texts(browser, 'parking-container', By.CLASS_NAME, 'value')
    location_score = _scrape_location_score(browser)

    return {
        'label': label,
        'address': address,
        'city': city,
        'offerings': offerings,
        'utilities': utilities,
        'features_amenities': features_amenities,
        'pet_policy': pet_policies,
        # Parking was previously scraped but never stored in the record.
        'parking': parkings,
        'location_score': location_score,
    }


def start_webscraping(province, attempts=5):
    """Scrape all rentcanada.com listings of a province into 'rentals.json'.

    Every listing on every result page is opened, scraped and stored under
    its listing id in 'rentals.json'. The file is rewritten after each
    successfully scraped listing; entries already in the file are kept
    unless the same id is scraped again.

    Args:
        province: Province name used in the URL path (lower-cased first).
        attempts: Maximum number of scraping attempts per listing. A listing
            whose attempts all fail is reported and skipped.
    """
    # Standardize the province variable
    province = province.lower()
    rentals_path = 'rentals.json'

    # Read the stored rentals before launching the browser, so a problem with
    # the file cannot leave a browser window behind.
    data = _load_rentals(rentals_path)

    # Initializing the browser and accessing the url
    browser = initialize_browser()
    try:
        browser.get(f'https://www.rentcanada.com/{province}')

        # Getting all the listings on the page
        listings = browser.find_elements(By.CLASS_NAME, 'listing-container')

        pag = 1
        while True:
            print(f'Page: {pag}')

            # Starting the webscraping for each listing
            for position, listing in enumerate(listings, start=1):
                print(f'Listing: {position}/{len(listings)}')

                listing_id = listing.get_attribute('id').split('_')[1]
                listing.click()

                # Deal with loading errors through looping attempts
                for attempt in range(1, attempts + 1):
                    print(f'Attempt: {attempt} - {listing_id}')
                    try:
                        time.sleep(2)
                        listing_info = _scrape_listing(browser)
                    except Exception as error:
                        print(str(error))
                        time.sleep(3)
                        continue

                    # Writing an updated file with the new listing info
                    data[listing_id] = listing_info
                    _save_rentals(rentals_path, data)
                    break
                else:
                    print(f'Skipped: {listing_id} after {attempts} attempts')

                # Close listing
                browser.find_element(By.CLASS_NAME, 'close').click()

            # Check next page: find_elements returns an empty list (instead of
            # raising) when there is no pagination block or no enabled next button.
            next_buttons = browser.find_elements(By.CSS_SELECTOR, '.pagination-block .next.enabled')
            if not next_buttons:
                break

            next_buttons[0].click()
            pag += 1

            time.sleep(5)
            listings = browser.find_elements(By.CLASS_NAME, 'listing-container')
    finally:
        # Always release the browser, even when scraping aborts with an error.
        browser.quit()

In [51]:
# Scrape the rentcanada.com listings for Ontario into rentals.json.
start_webscraping(province='Ontario')

Page: 1
Listing: 1/3
Attempt: 1 - 18946
