In [1]:
# Import dependencies
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re # to work with regex
import time # to add delays between requests


## Scraping one page

In [2]:
# # Launch browser
# driver = webdriver.Chrome()
# # Visit listing.ca
# url = "https://toronto.listing.ca/"
# driver.get(url)

In [3]:
# # Create a BeautifulSoup object from the scraped HTML
# soup = BeautifulSoup(driver.page_source, 'html.parser')

In [4]:
# # List of listings in the html soup, found using class sl
# listings_html = soup.find_all('div', 'sl')
# # Print first element of the list
# listings_html[0]


In [5]:
# # Loop through listings and append the clean info to a list of dictionaries
# listings_list = []
# for item in listings_html:
#     # Create a dictionary containing the info for a listing
#     listing = {
#         'url': item.find('div', 'slt_address').a['href'],
#         'price': item.find('div', 'slt_price').text.strip('$ CAD').replace(',', ''),
#         'baths': item.find('div', 'slt_baths').text.rstrip(' baths'),
#         'beds': item.find('div', 'slt_beds').text.rstrip(' beds').partition('+')[0],
#         'dens': item.find('div', 'slt_beds').text.rstrip(' beds').partition('+')[2],
#         'street': item.find_all('na2')[2].text,
#         'neighbourhood': item.find_all('na2')[1].text
#     }
#     # Extract mls_id from url
#     listing['mls_id'] = re.search('[A-Z]\\d{7}', listing['url']).group()
#     # Append to the list
#     listings_list.append(listing)
# # Check results
# listings_list[0:3]

In [6]:
# # Create dataframe with info from first page's listings
# df = pd.DataFrame(listings_list)
# df.head()

In [7]:
# # Quit browsing session
# driver.quit()

## Function and page loops

In [8]:
# Define function to scrape a single page
# @arg html: scraped html
# @return list of listing dictionaries
def get_listing_info(html):
    """Parse one scraped search-results page into a list of listing dicts.

    Every ``<div class="sl">`` in ``html`` is treated as one listing. Each
    returned dict has the keys ``url``, ``address``, ``price``, ``baths``,
    ``beds``, ``dens``, ``street``, ``neighbourhood`` and ``mls_id``, in that
    order. All values are strings exactly as cleaned here (no numeric
    conversion happens in this function), except ``mls_id`` which is ``None``
    when the listing url contains no MLS id.

    Args:
        html: HTML source of a results page.

    Returns:
        A list with one dict per listing found; empty if the page has none.
    """
    # Create a BeautifulSoup object from the scraped HTML
    soup = BeautifulSoup(html, 'html.parser')
    # Empty list to store listing dicts
    listings_list = []
    # Listings are found in the html soup using class sl
    for item in soup.find_all('div', 'sl'):
        # Look up each element once instead of once per field
        address_div = item.find('div', 'slt_address')
        na2_tags = item.find_all('na2')
        url = address_div.a['href']
        # NOTE: strip()/rstrip() take a *set of characters*, not a substring.
        # That is safe below only because the part we keep starts/ends with
        # characters outside the set (digits, '+').
        beds_text = item.find('div', 'slt_beds').text.rstrip(' beds')
        # Text before '+' is the bedroom count, text after it the den count
        beds, _, dens = beds_text.partition('+')
        # mls_id: one capital letter followed by seven digits inside the url.
        # A url without one used to crash the whole scrape with an
        # AttributeError on None; store None for that listing instead.
        mls_match = re.search(r'[A-Z]\d{7}', url)
        # Create a dictionary containing the info for a listing
        listing = {
            'url': url,
            'address': address_div.text,
            'price': item.find('div', 'slt_price').text.strip('$ CAD').replace(',', ''),
            'baths': item.find('div', 'slt_baths').text.rstrip(' baths'),
            'beds': beds,
            # if no den, put 0
            'dens': dens or '0',
            'street': na2_tags[2].text,
            'neighbourhood': na2_tags[1].text,
            'mls_id': mls_match.group() if mls_match else None,
        }
        # Append to the list
        listings_list.append(listing)
    # Return expanded listings_list
    return listings_list

In [9]:
# Define function that scrapes a range of pages from listing.ca toronto section
# @arg start_page: first results page to scrape (inclusive)
# @arg end_page: last results page to scrape (inclusive)
# @arg property_type: 1 is condos, 2 freehold townhomes, 3 condo townhomes, 5 detached homes
#      (4, no filter, has no label in the decoder below and is rejected with a KeyError)
# @arg delay_seconds: pause after each page request, 1 second by default
# @return dataframe of listing data
def get_df_from_pages(start_page, end_page, property_type, delay_seconds=1):
    """Scrape result pages ``start_page``..``end_page`` into one DataFrame.

    A Chrome session is opened, each page url is loaded in turn, its source is
    parsed with ``get_listing_info`` and the resulting dicts are collected.
    The session is always closed, even when a page load or parse fails.

    Args:
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
        property_type: Numeric filter code placed in the url; must be a key of
            the decoder below (1, 2, 3 or 5).
        delay_seconds: Seconds to sleep after each page request.

    Returns:
        DataFrame with one row per listing plus a ``property_type`` label
        column; ``price``, ``baths``, ``beds`` and ``dens`` are int64. When
        nothing was scraped the frame is empty and only has the
        ``property_type`` column.

    Raises:
        KeyError: If ``property_type`` has no label in the decoder. This is
            checked before the browser is launched.
    """
    # Dictionary with property type meanings
    # NOTE(review): the space in 'freehold _townhome' looks like a typo; it is
    # left untouched because code or data elsewhere may match this exact
    # value -- confirm before correcting.
    property_type_decoder = {
        1: 'condo_apartment',
        2: 'freehold _townhome',
        3: 'condo_townhome',
        5: 'detached_home'
    }
    # Fail fast: previously an unlabelled type only failed after the browser
    # had been launched and every page had been scraped.
    if property_type not in property_type_decoder:
        raise KeyError(
            f'property_type {property_type!r} has no label; '
            f'expected one of {sorted(property_type_decoder)}'
        )
    # Empty listing info list
    listing_info_list = []
    # Launch browser
    driver = webdriver.Chrome()
    try:
        # Loop through a range corresponding to search pages and extend listing info list
        for i in range(start_page, end_page + 1):
            url = f'https://listing.ca/mls/?{property_type}.cy.........{i}..$'
            driver.get(url)
            listing_info_list.extend(get_listing_info(driver.page_source))
            # Wait between requests, to avoid connection errors
            time.sleep(delay_seconds)
    finally:
        # Quit browsing session, also when a page fails, so no Chrome process is leaked
        driver.quit()
    # Make df with list of dictionaries
    df = pd.DataFrame(listing_info_list)
    # Put property type in column
    df['property_type'] = property_type_decoder[property_type]
    # Change type of these columns to integer; an empty result has no such
    # columns, so skip the conversion instead of failing on a missing 'price'
    if listing_info_list:
        df = df.astype({col_name: 'int64' for col_name in ['price', 'baths', 'beds', 'dens']})
    return df

In [10]:
# Test function using pages 2 to 3, should result in 40 records (20 per page)
test_df = get_df_from_pages(2, 3, 1)
# info() prints its own report and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            40 non-null     object
 1   address        40 non-null     object
 2   price          40 non-null     int64 
 3   baths          40 non-null     int64 
 4   beds           40 non-null     int64 
 5   dens           40 non-null     int64 
 6   street         40 non-null     object
 7   neighbourhood  40 non-null     object
 8   mls_id         40 non-null     object
 9   property_type  40 non-null     object
dtypes: int64(4), object(6)
memory usage: 3.2+ KB
None


Unnamed: 0,url,address,price,baths,beds,dens,street,neighbourhood,mls_id,property_type
0,https://toronto.listing.ca/251-manitoba-st-280...,251 Manitoba St 2804,824990,2,2,1,Manitoba St,Mimico,W8030310,condo_apartment
1,https://toronto.listing.ca/35-mercer-st-3711.C...,35 Mercer St 3711,1037000,2,2,0,Mercer St,Waterfront Communities C1,C8030300,condo_apartment
2,https://toronto.listing.ca/15-queens-quay-east...,15 Queens Quay East Quay 2202,999900,1,2,1,Queens Quay East Quay,Waterfront Communities C8,C8030224,condo_apartment
3,https://toronto.listing.ca/2720-dundas-st-520....,2720 Dundas St 520,950000,1,2,0,Dundas St,Junction Area,W8030156,condo_apartment
4,https://toronto.listing.ca/5-mabelle-ave-3329....,5 Mabelle Ave 3329,788000,2,2,0,Mabelle Ave,Islington-City Centre West,W8030106,condo_apartment
