In [1]:
# Import dependencies
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re # to work with regex
import time # to add delays between requests
from datetime import date # to save today's date as 'scraped_date' in data
from pathlib import Path


## Variables that change each run

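Edit only the variables in the next cell when scraping a different property type or page range. The output file name is built from them, so leaving everything else unchanged keeps file names consistent.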

In [2]:
# Property type code; it is substituted into the search URL and the output file name:
# 1 condos, 2 freehold townhomes, 3 condo townhomes, 5 detached homes, 4 no filter(don't use)
# NOTE: code 4 has no entry in property_type_decoder inside get_df_from_pages,
# so passing it raises KeyError.
property_type_code = 2
# Page range to scrape (inclusive at both ends); both bounds also appear,
# zero-padded to three digits, in the output file name.
first_page_to_scrape = 1
last_page_to_scrape = 16

## Define functions

In [3]:
def get_listing_info(html):
    """Parse one scraped search-results page into a list of listing dicts.

    Args:
        html: Page HTML as a string.

    Returns:
        A list with one dict per ``div`` of class ``sl`` found in the page.
        Each dict has the keys ``url``, ``address``, ``price``, ``baths``,
        ``beds``, ``dens``, ``street``, ``neighbourhood`` and ``mls_id``.
        All values are strings, except that ``mls_id`` is ``None`` when the
        listing URL contains no MLS-style id (one capital letter followed by
        seven digits). ``dens`` is ``'0'`` when the beds text has no ``+`` part.

    Raises:
        AttributeError, TypeError, IndexError: if a listing element is missing
            one of the expected child elements (address link, price, baths,
            beds, or fewer than three ``na2`` tags). Such listings are not
            skipped.
    """
    # Create a BeautifulSoup object from the scraped HTML
    soup = BeautifulSoup(html, 'html.parser')
    # MLS id embedded in the listing url: one capital letter + seven digits
    mls_id_pattern = re.compile(r'[A-Z]\d{7}')
    # Empty list to store listing dicts
    listings_list = []
    # Each listing lives in a div with class sl
    for item in soup.find_all('div', 'sl'):
        # Look up each sub-element once instead of once per field
        address_div = item.find('div', 'slt_address')
        url = address_div.a['href']
        location_tags = item.find_all('na2')
        # NOTE: str.strip/rstrip take a *set of characters*, not a suffix.
        # That is safe here only because digits and '+' are not in the set.
        # Presumably the text looks like "2+1 beds" -- verify against the site.
        beds_text = item.find('div', 'slt_beds').text.rstrip(' beds')
        # Text before '+' is the bed count, text after it is the den count
        beds, _, dens = beds_text.partition('+')
        # A url without an MLS id yields None rather than crashing the scrape
        mls_match = mls_id_pattern.search(url)
        listings_list.append({
            'url': url,
            'address': address_div.text,
            'price': item.find('div', 'slt_price').text.strip('$ CAD').replace(',', ''),
            'baths': item.find('div', 'slt_baths').text.rstrip(' baths'),
            'beds': beds,
            # if no den, put 0
            'dens': dens or '0',
            'street': location_tags[2].text,
            'neighbourhood': location_tags[1].text,
            'mls_id': mls_match.group() if mls_match else None,
        })
    # Return expanded listings_list
    return listings_list

In [4]:
def get_df_from_pages(start_page, end_page, property_type):
    """Scrape a range of listing.ca Toronto search pages into a DataFrame.

    Args:
        start_page: First search-results page to scrape (inclusive).
        end_page: Last search-results page to scrape (inclusive).
        property_type: 1 is condos, 2 freehold townhomes, 3 condo townhomes,
            5 detached homes. Code 4 (no filter) is not supported.

    Returns:
        A DataFrame with one row per scraped listing: the columns produced by
        ``get_listing_info`` plus ``city``, ``property_type`` and
        ``date_scraped``. ``price``, ``baths``, ``beds`` and ``dens`` are cast
        to ``int64``. If no listings were scraped, the frame is empty and only
        has the three added columns.

    Raises:
        KeyError: if ``property_type`` has no entry in the decoder table.
            Raised before the browser is launched.
    """
    # Define city code (Toronto is cy)
    city = 'cy'
    # Dictionary with what city codes mean
    city_code_decoder = {
        'cy': 'Toronto'
    }
    # Dictionary with what property type codes mean
    # NOTE: 'freehold_townhome' used to be 'freehold _townhome' (stray space);
    # files saved before this fix still contain the old label.
    property_type_decoder = {
        1: 'condo_apartment',
        2: 'freehold_townhome',
        3: 'condo_townhome',
        5: 'detached_home'
    }
    # Resolve both labels up front so an unsupported code fails immediately,
    # not after every page has already been scraped
    city_name = city_code_decoder[city]
    property_type_name = property_type_decoder[property_type]
    # Empty listing info list
    listing_info_list = []
    # Launch browser
    driver = webdriver.Chrome()
    try:
        # Loop through a range corresponding to search pages and extend listing info list
        for i in range(start_page, end_page + 1):
            url = f'https://listing.ca/mls/?{property_type}.{city}.........{i}..$'
            driver.get(url)
            listing_info_list.extend(get_listing_info(driver.page_source))
            # Wait 1 second, to avoid connection errors
            time.sleep(1)
    finally:
        # Quit browsing session, even if a page load or parse failed
        driver.quit()
    # Make df with list of dictionaries
    df = pd.DataFrame(listing_info_list)
    # Add city column
    df['city'] = city_name
    # Add property type column
    df['property_type'] = property_type_name
    # Add date scraped column
    df['date_scraped'] = date.today()
    # Change type of these columns to integer; they are absent when nothing
    # was scraped, in which case there is nothing to cast
    for col_name in ['price', 'baths', 'beds', 'dens']:
        if col_name in df.columns:
            df[col_name] = df[col_name].astype('int64')
    return df

## Run scraping function and save results

In [5]:
# Run function with arguments: start and end page and property type code
scraped_df = get_df_from_pages(
    start_page=first_page_to_scrape,
    end_page=last_page_to_scrape,
    property_type=property_type_code
)
# Check result. DataFrame.info() prints its summary itself and returns None,
# so it is called bare; wrapping it in print() adds a stray "None" line.
scraped_df.info()
# Last expression of the cell: rendered as a rich table preview
scraped_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3226 entries, 0 to 3225
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            3226 non-null   object
 1   address        3226 non-null   object
 2   price          3226 non-null   int64 
 3   baths          3226 non-null   int64 
 4   beds           3226 non-null   int64 
 5   dens           3226 non-null   int64 
 6   street         3226 non-null   object
 7   neighbourhood  3226 non-null   object
 8   mls_id         3226 non-null   object
 9   city           3226 non-null   object
 10  property_type  3226 non-null   object
 11  date_scraped   3226 non-null   object
dtypes: int64(4), object(8)
memory usage: 302.6+ KB
None


Unnamed: 0,url,address,price,baths,beds,dens,street,neighbourhood,mls_id,city,property_type,date_scraped
0,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,E8018446,Toronto,condo_apartment,2024-01-30
1,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,C7266728,Toronto,condo_apartment,2024-01-30
2,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,W7239426,Toronto,condo_apartment,2024-01-30
3,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,E8030950,Toronto,condo_apartment,2024-01-30
4,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,E8030860,Toronto,condo_apartment,2024-01-30


In [6]:
# Save to csv, using args in file name
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without a data/ folder.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)
# File name encodes property type, zero-padded page range and today's date
output_path = output_dir / (
    f'listings_type{property_type_code}_' +
    f'pages{first_page_to_scrape:03}-{last_page_to_scrape:03}_{date.today()}.csv'
)
scraped_df.to_csv(output_path, header=True, index=False)