In [1]:
# Import dependencies
from bs4 import BeautifulSoup
import pandas as pd
import re # to work with regex
from datetime import date # to save today's date as 'scraped_date' in data
from pathlib import Path

# Function to collect data


In [9]:
def _first_group(pattern, text):
    """Return group 1 of the first match of `pattern` in `text`, or '' if nothing matches."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


# Define function to parse one saved html source
# @arg subtype: property subtype code; also the stem of the html file that is read
# @arg property_type: property type label stored on every listing as-is
# @return list of listing dictionaries
def get_listing_info(subtype, property_type):
    """Parse ``../data/sqft/<subtype>.html`` into one dictionary per ``<article>``.

    Fields that are absent from a listing's description no longer raise:
    'beds', 'dens' and 'baths' default to '0', 'sqft' to 'not_provided' and
    'status' to ''.
    """
    # Path to the saved html file for this subtype
    html_file = Path('..', 'data', 'sqft', subtype + '.html')
    # Create a BeautifulSoup object from the scraped HTML
    soup = BeautifulSoup(html_file.read_text(), 'html.parser')
    # Empty list to store listing dicts
    listings_list = []
    # Every <article> element in the page is treated as one listing
    for article in soup.find_all('article'):
        # To find mls_id and neighbourhood
        p_desc = article.find('div', class_="pDesc").contents
        # To find beds, dens, baths, sqft and status (content of the article's last <meta>)
        description = article.find_all('meta')[-1]['content']
        # To find price and address
        str_elements = list(article.stripped_strings)
        # get address, removing the line break + padding embedded in it
        address = str_elements[1].replace('\n                                       ', '')
        # "Bed:" is written as "<beds>+<dens>"; search once and split on the first '+'
        beds, _, dens = _first_group('Bed: ?([0-9+]*)', description).partition('+')
        # Strip the captured area so a whitespace-only value counts as empty below
        sqft = _first_group('Area:(.*)Sq Ft', description).strip()
        # Create a dictionary containing the info for a listing
        listing = {
            'url': article.find('meta')['content'],
            'price': str_elements[0].lstrip('$').replace(',', ''),
            'address': address,
            'beds': beds,
            'dens': dens,
            'baths': _first_group('Bath: ?([0-9+]*)', description),
            'sqft': sqft,
            'mls_id': p_desc[1].contents[-1],
            'status': _first_group('Status: ?([A-Za-z ]*),', description),
            'subtype': subtype,
            'property_type': property_type,
            # NOTE(review): strip() trims digits, '#', '-' and spaces from BOTH ends,
            # so a street name that ends in a digit would be shortened too
            'street': address.strip(' 1234567890#-'),
            'neighbourhood': p_desc[10].contents[-1].partition(' -')[0].strip(' '),
            'city': 'Toronto'
        }
        # if no den/beds/baths, put 0
        for k in ['beds', 'dens', 'baths']:
            listing[k] = '0' if listing[k] == '' else listing[k]
        # if empty sqft, write 'not_provided'
        listing['sqft'] = 'not_provided' if listing['sqft'] == '' else listing['sqft']
        # Append to the list
        listings_list.append(listing)
    # Return the collected listings
    return listings_list

In [3]:
# Maps each property type label to the list of subtype codes it groups.
# Iterating over it yields the (property_type, subtypes) pairs, and each label
# ends up as a value written into the data, so the labels must not contain
# stray whitespace.
property_type_decoder = {
    'condo_apartment': [
        'con_apartment', 'con_loft'
    ],
    'freehold_townhome': [
        'res_semidetached', 'res_townhomes'
    ],
    'condo_townhome': [
        'con_townhome'
    ],
    'detached_home': [
        'res_bungalow', 'res_detached'
    ]
}


## Run scraping function and save results

In [10]:
# Gather the listings of every subtype, tagged with its parent property type
listing_data_list = []
for property_type, subtypes in property_type_decoder.items():
    for subtype in subtypes:
        listing_data_list += get_listing_info(subtype, property_type)
# Build the frame, then keep only the first row seen for each MLS id
df = pd.DataFrame(listing_data_list)
df = df.drop_duplicates(subset='mls_id', ignore_index=True)
# Add date scraped column
df['date_scraped'] = date.today()


In [11]:
# Cast the count and price columns to 64-bit integers in a single call
df = df.astype({col_name: 'int64' for col_name in ['price', 'baths', 'beds', 'dens']})

In [12]:
# Check result
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the summary.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1544 entries, 0 to 1543
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            1544 non-null   object
 1   price          1544 non-null   int64 
 2   address        1544 non-null   object
 3   beds           1544 non-null   int64 
 4   dens           1544 non-null   int64 
 5   baths          1544 non-null   int64 
 6   sqft           1544 non-null   object
 7   mls_id         1544 non-null   object
 8   status         1544 non-null   object
 9   subtype        1544 non-null   object
 10  property_type  1544 non-null   object
 11  street         1544 non-null   object
 12  neighbourhood  1544 non-null   object
 13  city           1544 non-null   object
 14  date_scraped   1544 non-null   object
dtypes: int64(4), object(11)
memory usage: 181.1+ KB
None


Unnamed: 0,url,price,address,beds,dens,baths,sqft,mls_id,status,subtype,property_type,street,neighbourhood,city,date_scraped
0,https://torontocondoteam.ca/5106-1-bloor-st-e-...,1699000,5106 - 1 Bloor St E,2,1,3,1000-1199,C7326020,For Sale,con_apartment,condo_apartment,Bloor St E,Church-Yonge Corridor,Toronto,2024-02-05
1,https://torontocondoteam.ca/1604-181-huron-st-...,743000,1604 - 181 Huron St,1,0,1,0-499,C7362210,For Sale,con_apartment,condo_apartment,Huron St,Kensington-Chinatown,Toronto,2024-02-05
2,https://torontocondoteam.ca/5109-14-york-st-c7...,698888,5109 - 14 York St,1,0,1,500-599,C7251458,For Sale,con_apartment,condo_apartment,York St,Waterfront Communities C1,Toronto,2024-02-05
3,https://torontocondoteam.ca/3404-77-harbour-sq...,669900,3404 - 77 Harbour Sq,1,1,1,600-699,C6792974,For Sale,con_apartment,condo_apartment,Harbour Sq,Waterfront Communities C1,Toronto,2024-02-05
4,https://torontocondoteam.ca/906-51-trolley-cre...,575000,906 - 51 Trolley Cres,1,0,1,500-599,C6802076,For Sale,con_apartment,condo_apartment,Trolley Cres,Moss Park,Toronto,2024-02-05


In [13]:
# Show the last five rows of the frame
df.tail(n=5)

Unnamed: 0,url,price,address,beds,dens,baths,sqft,mls_id,status,subtype,property_type,street,neighbourhood,city,date_scraped
1539,https://torontocondoteam.ca/103-15-brunel-crt-...,649900,103 - 15 Brunel Crt,1,0,1,700-799,C7014914,For Sale,con_loft,condo_apartment,Brunel Crt,Waterfront Communities C1,Toronto,2024-02-05
1540,https://torontocondoteam.ca/314-380-macpherson...,979000,314 - 380 Macpherson Ave,1,1,1,800-899,C7297528,For Sale,con_loft,condo_apartment,Macpherson Ave,Casa Loma,Toronto,2024-02-05
1541,https://torontocondoteam.ca/4-200-clinton-st-c...,1995000,4 - 200 Clinton St,2,0,2,1600-1799,C7296196,For Sale,con_loft,condo_apartment,Clinton St,Palmerston-Little Italy,Toronto,2024-02-05
1542,https://torontocondoteam.ca/408-637-lake-shore...,1199000,408 - 637 Lake Shore Blvd W,1,0,1,1000-1199,C7302798,For Sale,con_loft,condo_apartment,Lake Shore Blvd W,Niagara,Toronto,2024-02-05
1543,https://torontocondoteam.ca/405-150-logan-ave-...,1252990,405 - 150 Logan Ave,2,0,2,900-999,E6787680,For Sale,con_loft,condo_apartment,Logan Ave,South Riverdale,Toronto,2024-02-05


In [10]:
# Save to csv; the file name carries the date on which this cell is run
csv_name = 'all_listings' + f'_{date.today()}.csv'
df.to_csv(Path('..', 'data', 'sqft', csv_name), header=True, index=False)