In [5]:
# Import dependencies
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re # to work with regex
import time # to add delays between requests


## Scraping one page

In [2]:
# Address of the Toronto landing page on listing.ca
url = "https://toronto.listing.ca/"
# Start a Chrome session through Selenium and load that page
driver = webdriver.Chrome()
driver.get(url)

In [3]:
# Parse the HTML currently rendered in the browser with Python's built-in parser
soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

In [4]:
# Every listing card on the page is a <div> carrying the CSS class "sl"
listings_html = soup.find_all('div', class_='sl')
# Show the first card to inspect its structure
listings_html[0]


<div class="sl">
<div class="slt">
<div class="slt_address"><a href="https://toronto.listing.ca/286-main-st-911.E8018446.htm#15-1" style="font-size:16px;">286 Main St 911</a></div>
<div class="slt_price">$619,900 CAD</div>
<div class="slt_baths"><span class="slt_bb_value">1</span> baths</div>
<div class="slt_beds"><span class="slt_bb_value">1+1</span> beds</div>
</div>
<div class="sl_loc">
<div>
<na2><a class="na2" href="https://toronto.listing.ca">Toronto</a></na2>
<na2><a class="na2" href="https://toronto.listing.ca/east-end-danforth.htm">East End-Danforth</a></na2>
<na2><a class="na2" href="https://toronto.listing.ca/main-st.htm">Main St</a></na2>
</div>
<div>
<div>( <script type="text/javascript">document.write('<a href="https://listing.ca/buy-real-estate-with-bitcoin.htm" target="_blank">?</a>');</script><a href="https://listing.ca/buy-real-estate-with-bitcoin.htm" target="_blank">?</a> )</div>
<div>- BTC</div>
</div>
</div>
<div style="font-size:12px;">Assignment sale at the BRAN

In [5]:
# Loop through listings and append the clean info to a list of dictionaries
listings_list = []
for item in listings_html:
    address_div = item.find('div', 'slt_address')
    # Remove the currency/unit words explicitly: str.strip/rstrip with a
    # multi-character argument removes any of those characters, not the word
    price_text = re.sub(r'^\$|\s*CAD$|,', '', item.find('div', 'slt_price').text.strip())
    baths_text = re.sub(r'\s*baths?\s*$', '', item.find('div', 'slt_baths').text.strip())
    # Beds text looks like '1+1 beds': bedrooms, then an optional den count after '+'
    beds_text = re.sub(r'\s*beds?\s*$', '', item.find('div', 'slt_beds').text.strip())
    # Breadcrumb links are ordered city, neighbourhood, street
    breadcrumbs = [tag.text for tag in item.find_all('na2')]
    # Create a dictionary containing the info for a listing
    listing = {
        'url': address_div.a['href'],
        'address': address_div.text.strip(),
        'price': price_text,
        'baths': baths_text,
        'beds': beds_text.partition('+')[0],
        'dens': beds_text.partition('+')[2],
        # None instead of an IndexError when a card has a shorter breadcrumb
        'street': breadcrumbs[2] if len(breadcrumbs) > 2 else None,
        'neighbourhood': breadcrumbs[1] if len(breadcrumbs) > 1 else None
    }
    # Extract mls_id from url (one capital letter followed by seven digits);
    # None when the url does not contain one
    mls_match = re.search(r'[A-Z]\d{7}', listing['url'])
    listing['mls_id'] = mls_match.group() if mls_match else None
    # Append to the list
    listings_list.append(listing)
# Check results
listings_list[:3]

[{'url': 'https://toronto.listing.ca/286-main-st-911.E8018446.htm#15-1',
  'address': '286 Main St 911',
  'price': '$619,900 CAD',
  'baths': '1 baths',
  'beds': '1+1 beds',
  'neighbourhood': 'East End-Danforth',
  'mls_id': 'E8018446'},
 {'url': 'https://toronto.listing.ca/18-willowlea-dr.E7353624.htm#15-2',
  'address': '18 Willowlea Dr',
  'price': '$1,699,900 CAD',
  'baths': '0 baths',
  'beds': '0 beds',
  'neighbourhood': 'Highland Creek',
  'mls_id': 'E7353624'},
 {'url': 'https://toronto.listing.ca/5-kingsbury-cres.E7292902.htm#15-3',
  'address': '5 Kingsbury Cres',
  'price': '$15,000,000 CAD',
  'baths': '1 baths',
  'beds': '2 beds',
  'neighbourhood': 'Birchcliffe-Cliffside',
  'mls_id': 'E7292902'}]

In [6]:
# Turn the list of listing dictionaries from the first page into a table
df = pd.DataFrame(data=listings_list)
# Preview the first five rows
df.head(5)

Unnamed: 0,url,address,price,baths,beds,neighbourhood,mls_id
0,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,"$619,900 CAD",1 baths,1+1 beds,East End-Danforth,E8018446
1,https://toronto.listing.ca/18-willowlea-dr.E73...,18 Willowlea Dr,"$1,699,900 CAD",0 baths,0 beds,Highland Creek,E7353624
2,https://toronto.listing.ca/5-kingsbury-cres.E7...,5 Kingsbury Cres,"$15,000,000 CAD",1 baths,2 beds,Birchcliffe-Cliffside,E7292902
3,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,"$529,000 CAD",1 baths,1 beds,Waterfront Communities C1,C7266728
4,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,"$624,900 CAD",1 baths,1+1 beds,Mimico,W7239426


In [8]:
# Quit browsing session: the page HTML has already been parsed into soup,
# so the driver created with webdriver.Chrome() is no longer needed
driver.quit()

## Function and page loops

In [9]:
# Define function to scrape a single page
def get_listing_info(html):
    """Parse one listing.ca results page into a list of listing dictionaries.

    @arg html: scraped html of a results page
    @return list of listing dictionaries, one per <div class="sl"> card, with
        the keys url, address, price, baths, beds, dens, street, neighbourhood
        and mls_id. street, neighbourhood and mls_id are None when the card
        does not contain them; all other values are strings.
    """
    # Create a BeautifulSoup object from the scraped HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Listings in the html soup are found using class sl
    return [_parse_listing(item) for item in soup.find_all('div', 'sl')]


def _clean_price(text):
    """Turn a price label such as '$619,900 CAD' into the bare string '619900'."""
    # Remove the leading '$', the trailing 'CAD' and thousands separators
    # explicitly; str.strip('$ CAD') would strip a character set instead
    return re.sub(r'^\$|\s*CAD$|,', '', text.strip())


def _strip_unit(text, unit):
    """Drop a trailing unit word (singular or plural), e.g. '1 baths' -> '1'."""
    return re.sub(rf'\s*{re.escape(unit)}s?\s*$', '', text.strip())


def _parse_listing(item):
    """Build the cleaned info dictionary for one listing card (a div.sl tag)."""
    address_div = item.find('div', 'slt_address')
    url = address_div.a['href']
    # Beds text looks like '1+1 beds': bedrooms, then an optional den count after '+'
    beds, _, dens = _strip_unit(item.find('div', 'slt_beds').text, 'bed').partition('+')
    # Breadcrumb links are ordered city, neighbourhood, street
    breadcrumbs = [tag.text for tag in item.find_all('na2')]
    # The mls_id is one capital letter followed by seven digits inside the url
    mls_match = re.search(r'[A-Z]\d{7}', url)
    return {
        'url': url,
        'address': address_div.text.strip(),
        'price': _clean_price(item.find('div', 'slt_price').text),
        'baths': _strip_unit(item.find('div', 'slt_baths').text, 'bath'),
        'beds': beds,
        'dens': dens,
        # None instead of an IndexError when a card has a shorter breadcrumb
        'street': breadcrumbs[2] if len(breadcrumbs) > 2 else None,
        'neighbourhood': breadcrumbs[1] if len(breadcrumbs) > 1 else None,
        # None instead of an AttributeError when the url carries no MLS id
        'mls_id': mls_match.group() if mls_match else None,
    }

In [12]:
# Define function that scrapes a range of pages from listing.ca toronto section
def get_df_from_pages(start_page, end_page, delay=1):
    """Scrape search-result pages start_page..end_page (inclusive) into a DataFrame.

    @arg start_page: number of the first results page to scrape
    @arg end_page: number of the last results page to scrape (inclusive)
    @arg delay: seconds to wait between two page requests, to avoid
        connection errors (default 1)
    @return DataFrame with one row per listing found on the scraped pages
    """
    # Empty listing info list
    listing_info_list = []
    # Launch browser
    driver = webdriver.Chrome()
    try:
        # Loop through a range corresponding to search pages and extend listing info list
        for i in range(start_page, end_page + 1):
            url = f'https://listing.ca/mls/?.cy.........{i}..$'
            driver.get(url)
            listing_info_list.extend(get_listing_info(driver.page_source))
            # Pause only when another request follows
            if i < end_page:
                time.sleep(delay)
    finally:
        # Quit browsing session even if a page failed to load or parse,
        # so no browser process is left running
        driver.quit()
    return pd.DataFrame(listing_info_list)

In [13]:
# Test function using pages 2 to 5, should result in 80 records (20 per page)
test_df = get_df_from_pages(2, 5)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            80 non-null     object
 1   address        80 non-null     object
 2   price          80 non-null     object
 3   baths          80 non-null     object
 4   beds           80 non-null     object
 5   neighbourhood  80 non-null     object
 6   mls_id         80 non-null     object
dtypes: object(7)
memory usage: 4.5+ KB
None


Unnamed: 0,url,address,price,baths,beds,neighbourhood,mls_id
0,https://toronto.listing.ca/20-murmouth-rd.E802...,20 Murmouth Rd,"$974,888 CAD",2 baths,3 beds,Tam O'Shanter-Sullivan,E8028440
1,https://toronto.listing.ca/37-pheasant-rd.C802...,37 Pheasant Rd,"$2,488,000 CAD",5 baths,4+2 beds,Willowdale East,C8028956
2,https://toronto.listing.ca/34-van-horne-ave.C8...,34 Van Horne Ave,"$2,399,000 CAD",4 baths,4 beds,Don Valley Village,C8028806
3,https://toronto.listing.ca/532-fairlawn-ave.C8...,532 Fairlawn Ave,"$3,690,000 CAD",5 baths,4+1 beds,Bedford Park-Nortown,C8028524
4,https://toronto.listing.ca/221-holmes-ave.C802...,221 Holmes Ave,"$1,795,000 CAD",2 baths,3+2 beds,Willowdale East,C8028416
