In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import date, timedelta
import time
import sqlite3
import numpy as np
import json
import re

In [2]:
# Open (or create) the SQLite file that stores the scraped rentals and
# grab a cursor for issuing statements against it.
conn = sqlite3.connect('rentals.db')
cursor = conn.cursor()

# Schema of the results table: one row per stored unit, with the listing-level
# attributes and the twelve neighbourhood-score (NS_*) columns alongside it.
# `utilities` and `features_amenities` are declared as BLOB columns.
create_rentals_table = '''
CREATE TABLE IF NOT EXISTS rentals_ontario (
    rental_id INTEGER PRIMARY KEY AUTOINCREMENT,
    property_id INTEGER,
    city TEXT,
    adress TEXT,
    Nbed INTEGER,
    Nbath INTEGER,
    pets INTEGER,
    size INTEGER,
    utilities BLOB,
    features_amenities BLOB,
    property_type TEXT,
    property_subtype TEXT,
    parking_type TEXT,
    parking_spots INTEGER,
    lease_term TEXT,
    short_term INTEGER,
    furnished INTEGER,
    year_built INTEGER,
    NS_daycares INTEGER,
    NS_quiet INTEGER,
    NS_high_schools INTEGER,
    NS_pedestrian_friendly INTEGER,
    NS_groceries_stores INTEGER,
    NS_elementary_schools INTEGER,
    NS_car_friendly INTEGER,
    NS_coffee_shops INTEGER,
    NS_restaurants INTEGER,
    NS_shopping INTEGER,
    NS_vibrancy INTEGER,
    NS_nightlife INTEGER,
    price INTEGER,
    updated TEXT
)
'''

# IF NOT EXISTS makes this cell safe to re-run against an existing database.
cursor.execute(create_rentals_table)

<sqlite3.Cursor at 0x20641dd8ab0>

In [3]:
# Human-readable names of every place the bot is meant to extract data from,
# one name per line.
city_names = '''Ailsa Craig
Alfred
Ameliasburgh
Arkona
Azilda
Barrys Bay
Blackstock
Blenheim
Bobcaygeon
Bothwell
Brechin
Brights Grove
Caledon East
Caledon Village
Camlachie
Cayuga
Cedar Valley
Cheltenham
Chesley
Claremont
Clarksburg
Cobden
Colborne
Coldwater
Corunna
Crysler
Delhi
Dundalk
Dunrobin
Dutton
East Garafraxa
Elliot Lake
Elmvale
Emeryville
Erin
Gananoque
Garson
Goderich
Goodwood
Grand Bend
Grand Valley
Greater Napanee
Grey Highlands
Hagersville
Halton Hills
Hampton
Hanover
Hawkesbury
Hawkestone
Hepworth
Ilderton
Ingleside
Jordan
Kemble
Kettleby
Kincardine
Kirkfield
Kleinburg Station
Lambton Shores
Limehouse
Lions Head
Listowel
Little Britain
Loretto
Lowbanks
Lucan
Madoc
Mallorytown
Markdale
Marmora
Merrickville
Midhurst
Millbrook
Millgrove
Minesing
Mount Brydges
Mount Forest
New Hamburg
Norfolk
North Bay
North Gower
Nottawa
Odessa
Oro
Oro Station
Pakenham
Paris
Parry Sound
Pefferlaw
Pembroke
Petawawa
Picton
Plantagenet
Pointe Aux Roches
Point Edward
Port Dover
Port Elgin
Port Severn
Prescott
Prince Edward
Puslinch
Queenston
Renfrew
Ridgeville
Rockwood
Rosemont
Saint Marys
Sauble Beach
Schreiber
Severn
Singhampton
Smooth Rock Falls
Southampton
Southwold
Springwater
Sunderland
Sutton
Tara
Terra Cotta
Vars
Wahnapitae
Wainfleet
Washago
Waterford
Wellesley
Winchester
Woodlawn
Wyoming'''

# Turn each name into its URL form (lower-case, spaces replaced by hyphens)
# and collect the results in a list, one entry per line of the text above.
cities = [name.replace(' ', '-') for name in city_names.lower().split('\n')]

In [4]:
# Download the latest version of the Chrome Driver with webdriver_manager,
# then wrap whatever install() returns in a selenium Service object.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

# Selenium Web Scraping Steps

1. Initialize the browser with an implicit wait of 30 seconds
2. Access the site https://rentals.ca/
3. Add the name of the city in the url
4. Open the browser with maximized window
5. Get the html content and check the number of total results
6. Check pagination
7. Click on the property
8. Define the rental within the loop
9. Define all variables
10. Register collected data on the database
11. Loop the iteration to get all the properties
12. Switch page and repeat the tasks from Step 5 onwards until there are no more pages

In [None]:
# Step 1 - Start Chrome through the prepared driver service and set an
# implicit wait of 30 seconds.
browser = webdriver.Chrome(service=service)
browser.implicitly_wait(30)


# Step 2 and 3 - Build the rentals.ca URL for the chosen city and load it.
city = 'halifax'
listing_url = f'https://rentals.ca/{city}'
browser.get(listing_url)


# Step 4 - Maximize the window so the bot is able to collect the rent
# information, then pause 5 seconds before reading the page.
browser.maximize_window()
time.sleep(5)


# Step 5 - Parse the current page source with bs4 and read the number of results
# (the first run of digits in the <strong> that follows the title's bottom line).
menu_info = BeautifulSoup(browser.page_source, 'html.parser')

results_banner = menu_info.find('div', attrs={'class':'page-title__bottom-line'})
results_text = results_banner.find_next('strong').text
Nresults = re.search(r'\d+', results_text).group(0)

# Property ids found on the page currently being processed.
property_ids = list()

# Step 6 - Check pagination: 1 when the page has a pagination list, 0 otherwise.
pagination_list = menu_info.find('ul', attrs={'class':'pagination'})
pagination = int(pagination_list is not None)
last_page = 0


# Step 7 - Collect the links that get clicked to open each property.
rentals = browser.find_elements(By.CLASS_NAME, 'listing-card__details-link')

def _about_value(panel, label):
    """Return the text of the <p> that follows the <h4 data-msgid=label> heading in `panel`."""
    return panel.find('h4', attrs={'data-msgid': label}).find_next('p').text


def _neighbourhood_score(listing, data_id):
    """Return the text of the last <span> inside the <li data-id=data_id> entry of `listing`."""
    return listing.find('li', attrs={'data-id': data_id}).find_all('span')[-1].text


# Table column -> `data-id` attribute of the matching neighbourhood-score entry.
NS_DATA_IDS = {
    'NS_daycares': 'daycares',
    'NS_quiet': 'quiet',
    'NS_high_schools': 'high_schools',
    'NS_pedestrian_friendly': 'pedestrian_friendly',
    'NS_groceries_stores': 'groceries',
    'NS_elementary_schools': 'primary_schools',
    'NS_car_friendly': 'car_friendly',
    'NS_coffee_shops': 'cafes',
    'NS_restaurants': 'restaurants',
    'NS_shopping': 'shopping',
    'NS_vibrancy': 'vibrant',
    'NS_nightlife': 'nightlife',
}

# Columns written by the scraper (rental_id is filled in by AUTOINCREMENT).
INSERT_COLUMNS = (
    'property_id',
    'city',
    'adress',
    'Nbed',
    'Nbath',
    'pets',
    'size',
    'utilities',
    'features_amenities',
    'property_type',
    'property_subtype',
    'parking_type',
    'parking_spots',
    'lease_term',
    'short_term',
    'furnished',
    'year_built',
    'NS_daycares',
    'NS_quiet',
    'NS_high_schools',
    'NS_pedestrian_friendly',
    'NS_groceries_stores',
    'NS_elementary_schools',
    'NS_car_friendly',
    'NS_coffee_shops',
    'NS_restaurants',
    'NS_shopping',
    'NS_vibrancy',
    'NS_nightlife',
    'price',
    'updated',
)

# Built once, outside the loops. Named placeholders keep every value tied to its
# column, so the placeholder count can never drift from the column list.
sql_insert = 'INSERT INTO rentals_ontario ({columns}) VALUES ({values})'.format(
    columns=', '.join(INSERT_COLUMNS),
    values=', '.join(f':{column}' for column in INSERT_COLUMNS),
)

try:
    while last_page == 0:

        # The current page is the last one when a disabled pagination item holds the
        # "Next" link. Checking every disabled item (instead of only the first one,
        # inside a bare except) avoids hiding unrelated errors.
        disabled_items = menu_info.find_all('li', attrs={'class':'pagination__item pagination__item--disabled'})
        if any(item.find('a', attrs={'data-msgid':'Next'}) is not None for item in disabled_items):
            last_page = 1

        # Property ids of the listing cards on the current page.
        for card in menu_info.find_all('div', attrs={'class':'listing-card-container col-12'}):
            property_ids.append(card.attrs['data-property-id'])

        for rent in range(len(rentals)):

            # Step 8 - Open the rental page
            rentals[rent].click()
            time.sleep(5)

            rent_page = BeautifulSoup(browser.page_source, 'html.parser')
            rent_information = rent_page.find('div', attrs={'class':'listing-overview'})

            # Step 9 - Define all the desired variables

            # Top Panel
            updated = rent_information.find('p', attrs={'class':'listing-card__last-updated'}).text
            adress = rent_information.find('h2', attrs={'class':'listing-card__title'}).get_text(strip=True)
            # Keep only what precedes the last '-'; leave the title untouched when it
            # has no '-' (the old slice silently dropped its last two characters).
            dash = adress.rfind('-')
            if dash != -1:
                adress = adress[:dash].rstrip()

            main_features = rent_information.find('ul', attrs={'class':'listing-card__main-features'})
            main_features = [x.get_text() for x in main_features.find_all('span')]
            pets = 1 if 'Pets' in main_features else 0

            # Floor Plans
            floor_plans = rent_information.find('div', attrs={'class':'listing-floor-plans__filter'}).find_next_siblings('div')

            # One dict per unit found in the floor-plan groups.
            units_found = []

            for plan in floor_plans:

                Nbed = plan.find('span').text
                # Raw string: the pattern is unchanged, but no longer relies on
                # invalid string escape sequences.
                Nbed = re.search(r'((?=\.\d|\d)(?:\d+)?(?:\.?\d*))', Nbed).group(0)

                group_panel = plan.find('div', attrs={'id':f'floor-plan-group{Nbed}'})

                # Read each unit from its own 'unit-details' element; searching the
                # whole group panel returned the first unit's values for every item.
                # NOTE(review): assumes the price/baths/dimensions <li> elements are
                # nested inside each 'unit-details' div - confirm against the live markup.
                for unit in group_panel.find_all('div', attrs={'class':'unit-details'}):

                    price = unit.find('li', attrs={'class':'unit-details__infos--price'}).get_text(strip=True)[1:]
                    price = price.replace(',', '')

                    Nbath = unit.find('li', attrs={'class':'unit-details__infos--baths'}).text
                    Nbath = re.search(r'\d+', Nbath).group(0)

                    # Size is optional: fall back to 0 when the entry or its digits are missing.
                    size_item = unit.find('li', attrs={'class':'unit-details__infos--dimensions'})
                    size_match = re.search(r'\d+', size_item.text) if size_item is not None else None
                    size = size_match.group(0) if size_match else 0

                    units_found.append({'bed': Nbed, 'price': price, 'bath': Nbath, 'size': size})

            # Utilities Included
            utilities_panel = rent_information.find('ul', attrs={'class':'listing-utilities'})

            if utilities_panel:
                utilities = [x.get_text(strip=True) for x in utilities_panel.find_all('li')]
            else:
                utilities = []

            utilities_json = json.dumps(utilities)

            # About Panel
            about_highlight = rent_information.find('ul', attrs={'class':'listing-highlighted-info'})

            property_type = _about_value(about_highlight, 'Property Type')
            property_subtype = _about_value(about_highlight, 'Property Sub-type')
            parking_type = _about_value(about_highlight, 'Parking Type')
            parking_spots_text = _about_value(about_highlight, 'Parking Spots')
            parking_spots = 99 if parking_spots_text == 'No Info' else parking_spots_text
            lease_term = _about_value(about_highlight, 'Lease Term')
            short_term = 1 if _about_value(about_highlight, 'Short-term') == 'Yes' else 0
            furnished = 1 if _about_value(about_highlight, 'Furnished') == 'Yes' else 0
            # `x == '' or 'No Info'` is always truthy, which forced year_built to 0;
            # compare against both placeholder values explicitly instead.
            year_built_text = _about_value(about_highlight, 'Year Built')
            year_built = 0 if year_built_text in ('', 'No Info') else year_built_text

            # Features & Amenities
            features_amenities = {}

            features_amenities_panel = rent_information.find('div', attrs={'class':'menu-panel-slider__panels'})

            if features_amenities_panel:
                feature_sections = features_amenities_panel.find_all('section', attrs={'class':'menu-panel-slider__panel'})

                for section in feature_sections:

                    features_amenities_type = section.find('h4', attrs={'class':'menu-panel-slider__panel-heading'}).get_text(strip=True)
                    features_amenities_type = re.search(r'\D+', features_amenities_type).group(0)

                    # Collect only this section's items; searching the whole panel
                    # gave every section the complete list of every other section.
                    items_panel = section.find('ul', attrs={'class':'listing-features-and-amenities__content'})
                    items_root = items_panel if items_panel is not None else section
                    features_amenities_items = [x.get_text(strip=True) for x in items_root.find_all('li')]

                    features_amenities[features_amenities_type] = features_amenities_items

            features_amenities_json = json.dumps(features_amenities)

            # Neighbourhood Score
            neighbourhood_scores = {
                column: int(_neighbourhood_score(rent_information, data_id))
                for column, data_id in NS_DATA_IDS.items()
            }

            # Get back to menu
            browser.find_element(By.XPATH, '//*[@id="app"]/div[1]/div/div[3]/div[1]/div[2]/div/div/div/div/div/div/div/div[1]/button[1]').click()

            # Step 10 - Store the data

            # Values shared by every unit of this listing.
            listing_values = {
                # Index the list of ids; `property_id` was a single id string, so
                # indexing it stored one character of the last id on the page.
                # NOTE(review): assumes listing cards and detail links appear in the
                # same order and number - confirm against the live markup.
                'property_id': property_ids[rent],
                'city': city,
                'adress': adress,
                'pets': pets,
                'utilities': utilities_json,
                'features_amenities': features_amenities_json,
                'property_type': property_type,
                'property_subtype': property_subtype,
                'parking_type': parking_type,
                'parking_spots': int(parking_spots),
                'lease_term': lease_term,
                'short_term': short_term,
                'furnished': furnished,
                'year_built': int(year_built),
                'updated': updated,
            }
            listing_values.update(neighbourhood_scores)

            # One row per unit: listing values plus the unit's own figures.
            for unit_values in units_found:
                row = dict(
                    listing_values,
                    Nbed=int(unit_values['bed']),
                    Nbath=int(unit_values['bath']),
                    size=int(unit_values['size']),
                    price=int(unit_values['price']),
                )
                cursor.execute(sql_insert, row)

            conn.commit()

        # Step 11 and 12 - Loop the iteration and switch page

        # Only move on when there is a pagination bar and this is not the last
        # page; the disabled "Next" link of the last page is no longer clicked.
        if pagination == 1 and last_page == 0:

            browser.find_element(By.LINK_TEXT, 'Next').click()
            time.sleep(5)

            menu_info = BeautifulSoup(browser.page_source, 'html.parser')
            rentals = browser.find_elements(By.CLASS_NAME, 'listing-card__details-link')

            property_ids = []

        else:
            break

finally:
    # Release the database connection and the browser even when scraping fails
    # part-way through; rows committed so far stay in the database.
    try:
        conn.close()
    finally:
        browser.quit()

In [6]:
# Re-open the database and print every row stored in rentals_ontario.
# The extra `con` alias is kept in case other cells refer to it.
conn = con = sqlite3.connect('rentals.db')

try:
    cursor = conn.cursor()

    cursor.execute('SELECT * FROM rentals_ontario')

    rows = cursor.fetchall()

    for row in rows:
        print(row)

finally:
    # Close the connection even when the query fails (e.g. the table is missing).
    conn.close()