In [1]:
# Import dependencies
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re # to work with regex
import time # to add delays between requests
import random # to make those delays a random amount of time
from datetime import date # to save today's date as 'scraped_date' in data
from pathlib import Path


## Variables that change each run

Edit only the variables in the next cell when scraping a different page range or property type. The output file name is built from them, which keeps file names consistent across runs.

In [2]:
# Run configuration: these three values are passed to get_df_from_pages and
# are also embedded in the name of the csv file written at the end.
#
# Property type code:
# 1 condos, 2 freehold townhomes, 3 condo townhomes, 5 detached homes, 4 no filter(don't use)
# (code 4 has no entry in the property type decoder inside get_df_from_pages,
# so using it ends in a KeyError)
property_type_code = 5
# Page range to scrape (inclusive on both ends)
first_page_to_scrape = 276
last_page_to_scrape = 375
# Author's note kept for reference: last_page_to_scrape = 477 was the final
# page at the time of writing, should be ~9521 records

## Define functions

In [3]:
# Lookup table mapping each area name to the cities that belong to it.
# A listing whose city appears in one of these lists is tagged with that area.
# NOTE(review): the quoted strings in the trailing comments are carried over
# from the original comments and look like the site's area codes - confirm.
area_decoder = {
    'Toronto': ['Toronto'],  # 'cy'
    'York Region': [  # '3a-3c-3e-3g-3i-3b-3d-3f-3h-3j'
        'Aurora', 'East Gwillimbury', 'Georgina', 'Georgina Islands', 'King',
        'Markham', 'Newmarket', 'Richmond Hill', 'Vaughan',
        'Whitchurch-Stouffville',
    ],
    'Peel Region': ['Brampton', 'Caledon', 'Mississauga'],  # 'x-w-v'
    'Halton Region': [  # 'i-j-l-k'
        'Burlington', 'Halton Hills', 'Milton', 'Oakville',
    ],
    'Durham Region': [  # '16-y-z-10-11-12-13-15'
        'Ajax', 'Brock', 'Clarington', 'Oshawa',
        'Pickering', 'Scugog', 'Uxbridge', 'Whitby',
    ],
}

In [4]:
def get_listing_info(html):
    """Parse one scraped search-results page into listing dictionaries.

    @arg html: scraped html of a single results page
    @return list of listing dictionaries, one per div with class 'sl', with
        the keys url, address, price, baths, beds, dens, street,
        neighbourhood, city, area and mls_id (in that order). Values are
        strings, except that mls_id is None when the url does not contain an
        id made of one capital letter followed by seven digits.
    """
    # Create a BeautifulSoup object from the scraped HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Empty list to store listing dicts
    listings_list = []
    # Each listing on the page is a div with class 'sl'
    for item in soup.find_all('div', 'sl'):
        # The address div holds both the address text and the link to the listing
        address_div = item.find('div', 'slt_address')
        address = address_div.text
        url = address_div.a['href']
        # Location tags: city first, then neighbourhood, then street when present
        location = item.find_all('na2')
        city = location[0].text
        neighbourhood = 'not_provided' if len(location) == 1 else location[1].text
        if len(location) == 3:
            street = location[2].text
        else:
            # No street tag: derive the street from the address. Only the
            # leading house/unit number and a trailing '#unit' marker are
            # removed, so road names that end in a number ("Highway 7")
            # keep that number.
            street = re.sub(' *#[ \\d]*$', '', address.lstrip(' 1234567890#')).rstrip(' ')
        # rstrip/strip take a set of characters, not a suffix; that is fine
        # here because the part that is kept ends (or starts) with a digit
        beds_text = item.find('div', 'slt_beds').text.rstrip(' beds')
        # Text such as '3+1' means 3 beds plus 1 den; without a '+' there is no den
        beds, _, dens = beds_text.partition('+')
        # add area, based on decoder, if not in decoder, stays as 'Other'
        area = next(
            (name for name, city_list in area_decoder.items() if city in city_list),
            'Other'
        )
        # Extract mls_id from url; a url without an id gives None instead of
        # aborting the whole scrape with an AttributeError
        mls_match = re.search('[A-Z]\\d{7}', url)
        # Create a dictionary containing the info for a listing
        listings_list.append({
            'url': url,
            'address': address,
            'price': item.find('div', 'slt_price').text.strip('$ CAD').replace(',', ''),
            'baths': item.find('div', 'slt_baths').text.rstrip(' baths'),
            'beds': beds,
            # if no den, put 0
            'dens': dens if dens != '' else '0',
            'street': street,
            'neighbourhood': neighbourhood,
            'city': city,
            'area': area,
            'mls_id': mls_match.group() if mls_match else None
        })
    # Return expanded listings_list
    return listings_list

In [5]:
def get_df_from_pages(start_page, end_page, property_type):
    """Scrape a range of search pages from listing.ca into a dataframe.

    @arg start_page: first page number to scrape
    @arg end_page: last page number to scrape (inclusive)
    @arg property_type: 1 is condos, 2 freehold townhomes, 3 condo townhomes,
        5 detached homes (4, no filter, is not supported)
    @return dataframe of listing data with the columns produced by
        get_listing_info plus property_type and date_scraped; price, baths,
        beds and dens are converted to int64
    @raise KeyError: if property_type has no entry in the decoder below
    """
    # Dictionary with what property type codes mean
    # NOTE(review): 'freehold _townhome' contains a stray space; it is left
    # unchanged so new files stay consistent with data already saved - confirm.
    property_type_decoder = {
        1: 'condo_apartment',
        2: 'freehold _townhome',
        3: 'condo_townhome',
        5: 'detached_home'
    }
    # Decode before launching the browser, so an unsupported code fails
    # immediately instead of after every page has been scraped
    property_type_label = property_type_decoder[property_type]
    # Empty listing info list
    listing_info_list = []
    # Launch browser
    driver = webdriver.Chrome()
    try:
        # Loop through a range corresponding to search pages and extend listing info list
        for i in range(start_page, end_page + 1):
            # NOTE(review): the runs of dots look like empty filter slots (a
            # leftover comment described '..' as "all areas") - confirm.
            url = f'https://listing.ca/mls/?{property_type}..........{i}..$'
            driver.get(url)
            listing_info_list.extend(get_listing_info(driver.page_source))
            # Wait a random amount between 1 and 2 seconds, to avoid connection errors
            time.sleep(random.random() + 1)
    finally:
        # Quit browsing session, also when a page fails, so no browser is left running
        driver.quit()
    # Make df with list of dictionaries
    df = pd.DataFrame(listing_info_list)
    # Add property type column
    df['property_type'] = property_type_label
    # Add date scraped column
    df['date_scraped'] = date.today()
    # Change type of these columns to integer
    for col_name in ['price', 'baths', 'beds', 'dens']:
        df[col_name] = df[col_name].astype('int64')
    return df

## Run scraping function and save results

In [6]:
# Run function with arguments: start and end page and property type code
scraped_df = get_df_from_pages(
    start_page=first_page_to_scrape,
    end_page=last_page_to_scrape,
    property_type=property_type_code
)
# Check result. info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
scraped_df.info()
scraped_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            993 non-null    object
 1   address        993 non-null    object
 2   price          993 non-null    int64 
 3   baths          993 non-null    int64 
 4   beds           993 non-null    int64 
 5   dens           993 non-null    int64 
 6   street         993 non-null    object
 7   neighbourhood  993 non-null    object
 8   city           993 non-null    object
 9   area           993 non-null    object
 10  mls_id         993 non-null    object
 11  property_type  993 non-null    object
 12  date_scraped   993 non-null    object
dtypes: int64(4), object(9)
memory usage: 101.0+ KB
None


Unnamed: 0,url,address,price,baths,beds,dens,street,neighbourhood,city,area,mls_id,property_type,date_scraped
0,https://niagara-on-the-lake.listing.ca/305-cen...,305 Centre St,1349000,3,3,1,Centre St,not_provided,Niagara-on-the-Lake,Other,X7365082,detached_home,2024-01-30
1,https://georgina.listing.ca/3-inlet-crt.N73651...,3 Inlet Crt,599900,2,3,0,Inlet Crt,Sutton & Jackson's Point,Georgina,York Region,N7365194,detached_home,2024-01-30
2,https://brock.listing.ca/1090-concession-11-rd...,1090 Concession 11 Rd,1399000,1,2,0,Concession 11 Rd,Cannington,Brock,Durham Region,N7364980,detached_home,2024-01-30
3,https://caledon.listing.ca/16218-airport-rd.W7...,16218 Airport Rd,969500,1,2,1,Airport Rd,Caledon East,Caledon,Peel Region,W7365416,detached_home,2024-01-30
4,https://milton.listing.ca/269-martin-st.W73649...,269 Martin St,3298000,6,4,0,Martin St,Old Milton,Milton,Halton Region,W7364976,detached_home,2024-01-30


In [7]:
# Show the last rows as well, to check the end of the scraped page range
scraped_df.tail()

Unnamed: 0,url,address,price,baths,beds,dens,street,neighbourhood,city,area,mls_id,property_type,date_scraped
988,https://quinte-west.listing.ca/59-summer-breez...,59 Summer Breeze Dr,1279000,4,5,0,Summer Breeze Dr,not_provided,Quinte West,Other,X7335010,detached_home,2024-01-30
989,https://mississauga.listing.ca/1953-hindhead-r...,1953 Hindhead Rd,2950000,5,4,2,Hindhead Rd,Clarkson,Mississauga,Peel Region,W7334170,detached_home,2024-01-30
990,https://kawartha-lakes.listing.ca/86-cedartree...,86 Cedartree Lane,916000,2,2,0,Cedartree Lane,Bobcaygeon,Kawartha Lakes,Other,X7334894,detached_home,2024-01-30
991,https://shelburne.listing.ca/634-hammond-st.X7...,634 Hammond St,949900,3,4,0,Hammond St,Shelburne,Shelburne,Other,X7334858,detached_home,2024-01-30
992,https://wainfleet.listing.ca/20816-graybiel-rd...,20816 Graybiel Rd,1399999,4,4,4,Graybiel Rd,not_provided,Wainfleet,Other,X7334828,detached_home,2024-01-30


In [8]:
# Save to csv, using args in file name
output_path = Path(
    'data',
    f'listings_all_type{property_type_code}_' +
    f'pages{first_page_to_scrape:03}-{last_page_to_scrape:03}_{date.today()}.csv')
# Create the output folder if needed, so a fresh checkout does not lose the
# scraped data to a missing-directory error at the very last step
output_path.parent.mkdir(parents=True, exist_ok=True)
scraped_df.to_csv(output_path, header=True, index=False)