In [1]:
import pandas as pd
import numpy as np
import re
import json
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import time

In [None]:
def split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box):
    
    if if_big_box:
        [min_latitude, max_latitude, min_longitude, max_longitude] = [float(x) for x in if_big_box.split(':')]
    else:
         [min_latitude, max_latitude, min_longitude, max_longitude] = four_coords

    longitude_step = (max_longitude - min_longitude) / divisions_longs
    latitude_step = (max_latitude - min_latitude) / devision_lats  # Typo: should be "divisions_lats"

    coord_boxes = []
    
    # Generate bounding boxes for each grid cell
    for i in range(divisions_longs):
        for j in range(devision_lats):
            box_min_lat = round(min_latitude + j * latitude_step, 5)
            box_max_lat = round(min_latitude + (j + 1) * latitude_step, 5)
            box_min_lon = round(min_longitude + i * longitude_step, 5)
            box_max_lon = round(min_longitude + (i + 1) * longitude_step, 5)

            # Store bounding box as a string in the format "min_lat:max_lat:min_lon:max_lon"
            box_str = f"{box_min_lat}:{box_max_lat}:{box_min_lon}:{box_max_lon}"
            coord_boxes.append(box_str)
    
    return coord_boxes

In [3]:
def vancouver_grid(head, divisions_longs, devision_lats):
    """
    Generates a grid of latitude-longitude bounding boxes within Vancouver's city boundary.

    Parameters:
    head (dict): Headers for the API request.
    divisions_longs (int): Number of divisions along the longitude (default is 15).
    devision_lats (int): Number of divisions along the latitude (default is 15).

    Returns:
    list: A list of strings representing bounding boxes in the format "min_lat:max_lat:min_lon:max_lon".
    """
    
    # API endpoint for Vancouver city boundary geo-coordinates
    van_geo_info_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/city-boundary/records?limit=20'
    
    # Fetch geographical data from the API
    response = requests.get(van_geo_info_url, headers=head)
    geo_data = response.json()
    
    # Extract the city boundary coordinates
    contour = geo_data['results'][0]['geom']['geometry']['coordinates']

    # Flatten the list of coordinates and extract longitude and latitude values separately
    longitudes = [coord[0] for sublist in contour for coord in sublist]
    latitudes = [coord[1] for sublist in contour for coord in sublist]

    # Determine the minimum and maximum longitude and latitude values
    max_longitude = max(longitudes)
    min_longitude = min(longitudes)
    max_latitude = max(latitudes)
    min_latitude = min(latitudes)
    four_coords = [min_latitude, max_latitude, min_longitude, max_longitude]

    
    coord_boxes = split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box = 0)
 
    
    return coord_boxes


In [4]:
def listing_count(head, coord_box):
    """
    Fetches the number of real estate listings within a specified coordinate box from Redfin.

    Parameters:
    head (dict): Headers for the HTTP request.
    coord_box (str): A string representing the bounding box in the format "min_lat:max_lat:min_lon:max_lon".

    Returns:
    tuple: (viewport_url, select_listing_count, total_listing_count)
        - viewport_url (str): The URL used for the request.
        - select_listing_count (int): The number of listings shown in the current viewport.
        - total_listing_count (int): The total number of listings available.
        - If no listings are found, returns 'no_listing'.
    """
    
    # Construct the URL for the given coordinate box
    viewport_url = f"https://www.redfin.ca/bc/vancouver/filter/viewport={coord_box}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(viewport_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Check if the page indicates no listings are available
    if soup.find('div', {'class': 'HomeViews reversePosition'}).find('h2'):
        return 'no_listing'
    
    # Extract the listing summary section
    listing_summary = soup.find('div', {'class': "homes summary reversePosition"})

    # Use regex to extract numeric values from the listing summary
    select_listing_count, total_listing_count = re.findall(r'\d{1,10}(?:,\d{1,10})*', listing_summary.text)
    
    # Convert extracted strings into integers, handling comma formatting
    select_listing_count, total_listing_count = int(select_listing_count), int(total_listing_count.replace(',', ''))
    
    return viewport_url, select_listing_count, total_listing_count


In [5]:
def crawling_redfin(head, viewport_url, page):
    """
    Crawls a specific page of real estate listings from Redfin within a given viewport.

    Parameters:
    head (dict): Headers for the HTTP request.
    viewport_url (str): Base URL for the listings search.
    page (int): Page number to crawl.

    Returns:
    list: A list of BeautifulSoup objects representing individual property listings.
    """
    
    # Construct the URL for the specified page number
    target_url = f"{viewport_url}/page-{page}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(target_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    return soup

In [6]:
def key_metric_extraction(soup, real_estate_info):
    """
    Extracts key real estate metrics from Redfin listing elements.

    Parameters:
    soup_boxes (list): A list of BeautifulSoup objects representing property listings.
    real_estate_info (dict): A dictionary to store extracted real estate information. 
                             The dictionary should have keys: 'address', 'zip_code', 'price', 
                             'bed', 'bath', 'sqr_footage', and 'property_link'.

    Returns:
    list: A list of indices where data extraction was incomplete.
    """
    
    incomplete_idx = []  # Stores indices of listings with missing data

    soup_boxes = soup.find_all("div", {"class": "HomeCardContainer"})

    for i, box in enumerate(soup_boxes):
        try:
            # Extract address (excluding last 23 characters, likely city/state info)
            address = box.find('address').text[:(-23)]
            real_estate_info['address'].append(address)
        except: 
            real_estate_info['address'].append(np.nan)
            incomplete_idx.append(i)

        try:
            # Extract ZIP code (last 7 characters of address text)
            zip_code = box.find('address').text[-7:]
            real_estate_info['zip_code'].append(zip_code)
        except: 
            real_estate_info['zip_code'].append(np.nan)
            incomplete_idx.append(i)        

        try:
            # Extract price
            price = box.find('span', {'class': 'bp-Homecard__Price--value'}).text
            real_estate_info['price'].append(price)
        except: 
            real_estate_info['price'].append(np.nan)
            incomplete_idx.append(i)

        try:
            # Extract number of bedrooms
            bed = box.find('span', {'class': 'bp-Homecard__Stats--beds text-nowrap'}).text
            real_estate_info['bed'].append(bed)
        except: 
            real_estate_info['bed'].append(np.nan)
            incomplete_idx.append(i)   

        try:
            # Extract number of bathrooms
            bath = box.find('span', {'class': 'bp-Homecard__Stats--baths text-nowrap'}).text
            real_estate_info['bath'].append(bath)
        except: 
            real_estate_info['bath'].append(np.nan)
            incomplete_idx.append(i)   

        try:
            # Extract square footage (locked stats section)
            sqr_footage = box.find('span', {'class': 'bp-Homecard__LockedStat--value'}).text
            real_estate_info['sqr_footage'].append(sqr_footage)
        except:
            real_estate_info['sqr_footage'].append(np.nan)
            incomplete_idx.append(i)   

        try:
            # Extract property link (prepend base URL)
            property_link = "https://www.redfin.com" + box.find("a").get('href')
            real_estate_info['property_link'].append(property_link)
        except:
            real_estate_info['property_link'].append(np.nan)
            incomplete_idx.append(i)

    return incomplete_idx


In [7]:
def calculate_min_pages(total_count, items_per_page):
    """
    Calculates the minimum number of pages required to display all items.

    Parameters:
    total_count (int): The total number of items to be displayed.
    items_per_page (int): The maximum number of items that can be displayed per page.

    Returns:
    int: The minimum number of pages required.
    """
    
    # Use integer division to determine the number of pages needed
    # Adding (items_per_page - 1) ensures proper rounding up
    return (total_count + items_per_page - 1) // items_per_page

In [8]:
def extracting_by_batch(head, batch_num, divisions_longs=15, devision_lats=15, splitted_big_box = 0):
    """
    Extracts real estate listing data from Redfin in batches using predefined coordinate grids.

    Parameters:
    head (dict): Headers for the HTTP requests.
    batch_num (int): The number of batches to divide the coordinate boxes into.
    test_batch_index (list, optional): The range of batch indices to process. Defaults to [0,1].

    Returns:
    tuple: (real_estate_info, missing_entries, big_coord_boxes)
        - real_estate_info (dict): A dictionary storing extracted real estate information.
        - missing_entries (dict): A dictionary tracking missing entries for incomplete pages.
        - big_coord_boxes (list): A list of coordinate boxes requiring further subdivision.
    """
    
    big_coord_boxes = []  # Stores coordinate boxes where select listing count < total listing count
    real_estate_info = defaultdict(list)  # Dictionary to store extracted real estate information
    missing_entries = defaultdict(list)  # Dictionary to track missing data entries

    # Generate the coordinate grid for Vancouver and split into batches
    if splitted_big_box:
        coord_boxes = splitted_big_box

    else: coord_boxes = vancouver_grid(head, divisions_longs, devision_lats)
    
    coord_box_batch = np.array_split(coord_boxes, batch_num)

    # Iterate over the specified batch range
    for i in range(len(coord_box_batch)):
        batch = coord_box_batch[i]

        # Process each coordinate box in the batch
        for coord_box in batch:
            listing_info = listing_count(head, coord_box)
            time.sleep(1)  # Prevent overwhelming the server

            # Skip if there are no listings in the area
            if listing_info == 'no_listing':
                print(f"Batch {i}-{coord_box} has no listings.")
                continue
            else:
                viewport_url, select_listing_count, total_listing_count = listing_info

                # If the selected listing count is less than the total, store the coordinate box for further subdivision
                if select_listing_count != total_listing_count:
                    big_coord_boxes.append(coord_box)
                    continue
                else:
                    # Calculate the number of pages to crawl based on listings per page (assumed 9 per page)
                    max_page = calculate_min_pages(select_listing_count, items_per_page=9)
                    missing = defaultdict(list)  # Tracks missing indices for this coordinate box

                    # Crawl and extract data for each page
                    for page in range(1, max_page):
                        soup_boxes = crawling_redfin(head, viewport_url, page)                        
                        incomplete_idx = key_metric_extraction(soup_boxes, real_estate_info)

                        # Store any missing data indices
                        if incomplete_idx:
                            missing[f'page_{page}'].append(incomplete_idx)

                        time.sleep(1)  # Prevent overwhelming the server

                    # Store missing entries for this coordinate box
                    missing_entries[coord_box].append(missing)

    return real_estate_info, missing_entries, big_coord_boxes

In [9]:
head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
real_estate_info, missing_entries, big_coord_boxes = extracting_by_batch(head, batch_num=5, divisions_longs=6, devision_lats=6)

Batch 0-49.19835:49.21799:-123.22479:-123.19118 has no listings.
Batch 0-49.21799:49.23762:-123.22479:-123.19118 has no listings.
Batch 0-49.2769:49.29654:-123.22479:-123.19118 has no listings.
Batch 0-49.29654:49.31617:-123.22479:-123.19118 has no listings.
Batch 0-49.19835:49.21799:-123.19118:-123.15758 has no listings.
Batch 1-49.2769:49.29654:-123.19118:-123.15758 has no listings.
Batch 1-49.29654:49.31617:-123.19118:-123.15758 has no listings.
Batch 2-49.29654:49.31617:-123.15758:-123.12397 has no listings.
Batch 3-49.29654:49.31617:-123.12397:-123.09036 has no listings.
Batch 4-49.29654:49.31617:-123.09036:-123.05676 has no listings.
Batch 4-49.29654:49.31617:-123.05676:-123.02315 has no listings.


In [10]:
result = pd.DataFrame(real_estate_info)
result

Unnamed: 0,address,zip_code,price,bed,bath,sqr_footage,property_link
0,4091 W 34th Ave,V6N 2L6,"$2,788,000",5 beds,2 baths,1653,https://www.redfin.com/bc/vancouver/4091-W-34t...
1,4067 W 37th Ave,V6N 2W6,"$4,800,000",6 beds,4 baths,4189,https://www.redfin.com/bc/vancouver/4067-W-37t...
2,4056 W 36th Ave,V6N 2S9,"$6,750,000",4 beds,3 baths,3837,https://www.redfin.com/bc/vancouver/4056-W-36t...
3,3968 W 23rd Ave,V6S 1L2,"$3,388,000",4 beds,3.5 baths,2359,https://www.redfin.com/bc/vancouver/3968-W-23r...
4,4022 W 30th Ave,V6S 1X5,"$6,590,000",5 beds,5.5 baths,4190,https://www.redfin.com/bc/vancouver/4022-W-30t...
...,...,...,...,...,...,...,...
8339,2658 Dundas St #2,V5K 1P9,"$1,675,000",3 beds,4 baths,1407,https://www.redfin.com/bc/vancouver/2658-Dunda...
8340,3617 Adanac St,V5K 2P7,"$1,725,000",4 beds,3 baths,1922,https://www.redfin.com/bc/vancouver/3617-Adana...
8341,16 N Kaslo St,V5K 3M8,"$1,695,000",3 beds,2 baths,1623,https://www.redfin.com/bc/vancouver/16-N-Kaslo...
8342,2818 Adanac St,V5K 2N3,"$3,299,000",7 beds,7 baths,2952,https://www.redfin.com/bc/vancouver/2818-Adana...


In [12]:
big_coord_boxes

[np.str_('49.25726:49.2769:-123.15758:-123.12397'),
 np.str_('49.2769:49.29654:-123.15758:-123.12397'),
 np.str_('49.25726:49.2769:-123.12397:-123.09036'),
 np.str_('49.2769:49.29654:-123.12397:-123.09036')]

In [17]:
big_coord_boxes[0]

np.str_('49.25726:49.2769:-123.15758:-123.12397')

In [23]:
splitted_boxes = []
for big_box in big_coord_boxes:
    splitted_box = split_coordinate(four_coords=1, divisions_longs=3, devision_lats=3, if_big_box=big_box)
    splitted_boxes.append(splitted_box)
splitted_boxes
    

[['49.25726:49.26381:-123.15758:-123.14638',
  '49.26381:49.27035:-123.15758:-123.14638',
  '49.27035:49.2769:-123.15758:-123.14638',
  '49.25726:49.26381:-123.14638:-123.13517',
  '49.26381:49.27035:-123.14638:-123.13517',
  '49.27035:49.2769:-123.14638:-123.13517',
  '49.25726:49.26381:-123.13517:-123.12397',
  '49.26381:49.27035:-123.13517:-123.12397',
  '49.27035:49.2769:-123.13517:-123.12397'],
 ['49.2769:49.28345:-123.15758:-123.14638',
  '49.28345:49.28999:-123.15758:-123.14638',
  '49.28999:49.29654:-123.15758:-123.14638',
  '49.2769:49.28345:-123.14638:-123.13517',
  '49.28345:49.28999:-123.14638:-123.13517',
  '49.28999:49.29654:-123.14638:-123.13517',
  '49.2769:49.28345:-123.13517:-123.12397',
  '49.28345:49.28999:-123.13517:-123.12397',
  '49.28999:49.29654:-123.13517:-123.12397'],
 ['49.25726:49.26381:-123.12397:-123.11277',
  '49.26381:49.27035:-123.12397:-123.11277',
  '49.27035:49.2769:-123.12397:-123.11277',
  '49.25726:49.26381:-123.11277:-123.10156',
  '49.26381:49.

In [14]:
result.to_csv("../data/vancouver_real_estate.csv")