In [1]:
# Import dependencies
import chromedriver_autoinstaller
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
chromedriver_autoinstaller.install()
# Launch browser
driver = webdriver.Chrome()

In [3]:
#Visit listing.ca
url = "https://toronto.listing.ca/"
driver.get(url)

In [4]:
#Scrape the website
html = driver.page_source

#Create a BeautifulSoup object from the scraped HTML
soup = BeautifulSoup(html, 'html.parser')

## Initial scraping

In [5]:
# List of listings in the html soup, found using class sl
listings_html = soup.find_all('div', 'sl')
# Print first element of the list
listings_html[0]


<div class="sl">
<div class="slt">
<div class="slt_address"><a href="https://toronto.listing.ca/286-main-st-911.E8018446.htm#15-1" style="font-size:16px;">286 Main St 911</a></div>
<div class="slt_price">$619,900 CAD</div>
<div class="slt_baths"><span class="slt_bb_value">1</span> baths</div>
<div class="slt_beds"><span class="slt_bb_value">1+1</span> beds</div>
</div>
<div class="sl_loc">
<div>
<na2><a class="na2" href="https://toronto.listing.ca">Toronto</a></na2>
<na2><a class="na2" href="https://toronto.listing.ca/east-end-danforth.htm">East End-Danforth</a></na2>
<na2><a class="na2" href="https://toronto.listing.ca/main-st.htm">Main St</a></na2>
</div>
<div>
<div>( <script type="text/javascript">document.write('<a href="https://listing.ca/buy-real-estate-with-bitcoin.htm" target="_blank">?</a>');</script><a href="https://listing.ca/buy-real-estate-with-bitcoin.htm" target="_blank">?</a> )</div>
<div>- BTC</div>
</div>
</div>
<div style="font-size:12px;">Assignment sale at the BRAN

In [6]:
# Loop through listings and append the clean info a list of dictionaries
listings_list = []
for item in listings_html:
    # Create a dictionary containing the info for a listing
    listing = {
        'url': item.find('div', 'slt_address').a['href'],
        'address': item.find('div', 'slt_address').text,
        'price': item.find('div', 'slt_price').text,
        'baths': item.find('div', 'slt_baths').text,
        'beds': item.find('div', 'slt_beds').text,
        'neighbourhood': item.find_all('na2')[1].text
    }
    # Append to the list
    listings_list.append(listing)
# Check results
listings_list[0:3]

[{'url': 'https://toronto.listing.ca/286-main-st-911.E8018446.htm#15-1',
  'address': '286 Main St 911',
  'price': '$619,900 CAD',
  'baths': '1 baths',
  'beds': '1+1 beds',
  'neighbourhood': 'East End-Danforth'},
 {'url': 'https://toronto.listing.ca/18-willowlea-dr.E7353624.htm#15-2',
  'address': '18 Willowlea Dr',
  'price': '$1,699,900 CAD',
  'baths': '0 baths',
  'beds': '0 beds',
  'neighbourhood': 'Highland Creek'},
 {'url': 'https://toronto.listing.ca/5-kingsbury-cres.E7292902.htm#15-3',
  'address': '5 Kingsbury Cres',
  'price': '$15,000,000 CAD',
  'baths': '1 baths',
  'beds': '2 beds',
  'neighbourhood': 'Birchcliffe-Cliffside'}]

In [7]:
# Create dataframe with info from first page's listings
df = pd.DataFrame(listings_list)
df.head()

Unnamed: 0,url,address,price,baths,beds,neighbourhood
0,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,"$619,900 CAD",1 baths,1+1 beds,East End-Danforth
1,https://toronto.listing.ca/18-willowlea-dr.E73...,18 Willowlea Dr,"$1,699,900 CAD",0 baths,0 beds,Highland Creek
2,https://toronto.listing.ca/5-kingsbury-cres.E7...,5 Kingsbury Cres,"$15,000,000 CAD",1 baths,2 beds,Birchcliffe-Cliffside
3,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,"$529,000 CAD",1 baths,1 beds,Waterfront Communities C1
4,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,"$624,900 CAD",1 baths,1+1 beds,Mimico


In [11]:
# Extract mls id from url column
df['mls_id'] = [re.search('[A-Z]\\d{7}', url).group() for url in df['url']]
df.head()

Unnamed: 0,url,address,price,baths,beds,neighbourhood,mls_id
0,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,"$619,900 CAD",1 baths,1+1 beds,East End-Danforth,E8018446
1,https://toronto.listing.ca/18-willowlea-dr.E73...,18 Willowlea Dr,"$1,699,900 CAD",0 baths,0 beds,Highland Creek,E7353624
2,https://toronto.listing.ca/5-kingsbury-cres.E7...,5 Kingsbury Cres,"$15,000,000 CAD",1 baths,2 beds,Birchcliffe-Cliffside,E7292902
3,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,"$529,000 CAD",1 baths,1 beds,Waterfront Communities C1,C7266728
4,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,"$624,900 CAD",1 baths,1+1 beds,Mimico,W7239426


In [10]:
# Quit browsing session
driver.quit()

## Functions and page loops

In [15]:
#Define function to scrape listing info
def get_listing_info():
    #Empty list to store the listing info
    listing_info = []

    #Collect HTML from browser
    html = driver.page_source
    #Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    #Save main page of the website to a variable
    main_page = soup.select_one('div', id_ = 'div_all')
    #Find all the listing addresses
    listing_addresses = main_page.find_all('div', class_ = 'slt_address')
    #Find all the listing prices
    listing_prices = main_page.find_all('div', class_ = 'slt_price')
    #Find all the beds
    listing_beds = main_page.find_all('div', class_ = 'slt_beds')
    #Find all the baths
    listing_baths = main_page.find_all('div', class_ = 'slt_baths')
    #Find all the listing locations
    listing_locations = main_page.find_all('a', class_ = 'na2')

    #Loop through the addresses and append the text to a new list
    addresses = []
    for address in listing_addresses:
        addresses.append(address.get_text())
    #Loop through the prices and append the text to a new list
    prices = []
    for price in listing_prices:
        prices.append(price.get_text())
    #Loop through the beds and append the text to a new list
    beds = []
    for bed in listing_beds:
        beds.append(bed.get_text())
    #Loop through the baths and append the text to a new list
    baths = []
    for bath in listing_baths:
        baths.append(bath.get_text())
    #Loop through the locations and append the text to a new list
    locations_list = []
    for location in listing_locations:
        locations_list.append(location.get_text())
    #Loop through the locations again and concatenate every 2nd and 3rd element (neighbourhood + street)
    locations = []
    for i in range(0, len(locations_list), 3):
        if i + 2 < len(locations_list):
            concat_location = locations_list[i + 1] + ', ' + locations_list[i + 2]
            locations.append(concat_location)
    #Create a dictionary with the listing info
    listing_info_dict = {'address': addresses, 'price': prices, 'beds': beds, 'baths': baths, 'location': locations}
    #Append the dictionary to the listing_info list
    listing_info.append(listing_info_dict)
    #Return the listing_info list
    return listing_info


In [16]:
#Import By function from selenium so driver can click on 'Next' page
from selenium.webdriver.common.by import By

In [17]:
#Loop to collect the listing info and click to the next page for 5 total pages
listing_info_list = []

for _ in range(5):
    listing_info_list.extend(get_listing_info())
    driver.find_element(By.PARTIAL_LINK_TEXT, 'Next').click()

In [18]:
#Check the output of the above loop 
listing_info_list

[{'address': ['286 Main St 911',
   '55 Harrison Garden Blvd 808',
   '18 Willowlea Dr',
   '5 Kingsbury Cres',
   '215 Queen St 606',
   '10 Park Lawn Rd 1408',
   '1240 Lansdowne Ave',
   '8 Ivy Ave',
   '370 Rosewell Ave',
   '54 Granlea Rd',
   '1 Rowntree Rd 612',
   '6 Buddleswood Crt',
   '12 Burdock Lane',
   '195 Redpath Ave Th01',
   '10 Guildwood Pkwy 424',
   '150 Heath St 703',
   '38 Grenville St Ph4506',
   '25 Carlton St 1401',
   '151 Dan Leckie Way 352',
   '205 Manning Ave 309'],
  'price': ['$619,900 CAD',
   '$725,000 CAD',
   '$1,699,900 CAD',
   '$15,000,000 CAD',
   '$529,000 CAD',
   '$624,900 CAD',
   '$995,000 CAD',
   '$1,399,900 CAD',
   '$2,288,900 CAD',
   '$2,990,000 CAD',
   '$624,999 CAD',
   '$799,000 CAD',
   '$848,000 CAD',
   '$1,098,000 CAD',
   '$1,050,000 CAD',
   '$2,299,000 CAD',
   '$1,650,000 CAD',
   '$399,000 CAD',
   '$1,160,000 CAD',
   '$849,000 CAD'],
  'beds': ['1+1 beds',
   '2 beds',
   '0 beds',
   '2 beds',
   '1 beds',
   '1+1 be

In [19]:
#Convert listing_info_list to a pandas DF
df_all = pd.DataFrame(listing_info_list)
df_all = df_all.apply(lambda col: col.explode(), axis=0).reset_index(drop=True)
df_all

#output of listing_info_list is a list of dictionaries, each containing 5 lists (address, price, beds, baths, location)
#one dictionary for each page that was scraped (5)
#might need to change function so output is formatted differently and easier to convert to DF

Unnamed: 0,address,price,beds,baths,location
0,286 Main St 911,"$619,900 CAD",1+1 beds,1 baths,"East End-Danforth, Main St"
1,55 Harrison Garden Blvd 808,"$725,000 CAD",2 beds,2 baths,"Willowdale East, Harrison Garden Blvd"
2,18 Willowlea Dr,"$1,699,900 CAD",0 beds,0 baths,"Highland Creek, Willowlea Dr"
3,5 Kingsbury Cres,"$15,000,000 CAD",2 beds,1 baths,"Birchcliffe-Cliffside, Kingsbury Cres"
4,215 Queen St 606,"$529,000 CAD",1 beds,1 baths,"Waterfront Communities C1, Queen St"
...,...,...,...,...,...
95,460 Adelaide St 1131,"$617,999 CAD",1 beds,1 baths,"Moss Park, Adelaide St"
96,44 Loney Ave,"$1,050,000 CAD",3+1 beds,2 baths,"Downsview-Roding-CFB, Loney Ave"
97,120 Redwater Dr,"$1,199,000 CAD",3+3 beds,3 baths,"Rexdale-Kipling, Redwater Dr"
98,39 Drewry Ave 36,"$799,000 CAD",2+1 beds,2 baths,"Newtonbrook West, Drewry Ave"
