In [2]:
import pandas as pd
import numpy as np
import re
import json
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import time

In [3]:
def split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box):
    
    if if_big_box:
        [min_latitude, max_latitude, min_longitude, max_longitude] = [float(x) for x in four_coords.split(':')]
    else:
         [min_latitude, max_latitude, min_longitude, max_longitude] = four_coords

    longitude_step = (max_longitude - min_longitude) / divisions_longs
    latitude_step = (max_latitude - min_latitude) / devision_lats  # Typo: should be "divisions_lats"

    coord_boxes = []
    
    # Generate bounding boxes for each grid cell
    for i in range(divisions_longs):
        for j in range(devision_lats):
            box_min_lat = round(min_latitude + j * latitude_step, 5)
            box_max_lat = round(min_latitude + (j + 1) * latitude_step, 5)
            box_min_lon = round(min_longitude + i * longitude_step, 5)
            box_max_lon = round(min_longitude + (i + 1) * longitude_step, 5)

            # Store bounding box as a string in the format "min_lat:max_lat:min_lon:max_lon"
            box_str = f"{box_min_lat}:{box_max_lat}:{box_min_lon}:{box_max_lon}"
            coord_boxes.append(box_str)
    
    return coord_boxes

In [4]:
def vancouver_grid(head, divisions_longs, devision_lats):
    """
    Generates a grid of latitude-longitude bounding boxes within Vancouver's city boundary.

    Parameters:
    head (dict): Headers for the API request.
    divisions_longs (int): Number of divisions along the longitude (default is 15).
    devision_lats (int): Number of divisions along the latitude (default is 15).

    Returns:
    list: A list of strings representing bounding boxes in the format "min_lat:max_lat:min_lon:max_lon".
    """
    
    # API endpoint for Vancouver city boundary geo-coordinates
    van_geo_info_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/city-boundary/records?limit=20'
    
    # Fetch geographical data from the API
    response = requests.get(van_geo_info_url, headers=head)
    geo_data = response.json()
    
    # Extract the city boundary coordinates
    contour = geo_data['results'][0]['geom']['geometry']['coordinates']

    # Flatten the list of coordinates and extract longitude and latitude values separately
    longitudes = [coord[0] for sublist in contour for coord in sublist]
    latitudes = [coord[1] for sublist in contour for coord in sublist]

    # Determine the minimum and maximum longitude and latitude values
    max_longitude = max(longitudes)
    min_longitude = min(longitudes)
    max_latitude = max(latitudes)
    min_latitude = min(latitudes)
    four_coords = [min_latitude, max_latitude, min_longitude, max_longitude]

    
    coord_boxes = split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box = 0)
 
    
    return coord_boxes


In [5]:
def listing_count(head, coord_box):
    """
    Fetches the number of real estate listings within a specified coordinate box from Redfin.

    Parameters:
    head (dict): Headers for the HTTP request.
    coord_box (str): A string representing the bounding box in the format "min_lat:max_lat:min_lon:max_lon".

    Returns:
    tuple: (viewport_url, select_listing_count, total_listing_count)
        - viewport_url (str): The URL used for the request.
        - select_listing_count (int): The number of listings shown in the current viewport.
        - total_listing_count (int): The total number of listings available.
        - If no listings are found, returns 'no_listing'.
    """
    
    # Construct the URL for the given coordinate box
    viewport_url = f"https://www.redfin.ca/bc/vancouver/filter/viewport={coord_box}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(viewport_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Check if the page indicates no listings are available
    if soup.find('div', {'class': 'HomeViews reversePosition'}).find('h2'):
        return 'no_listing'
    
    # Extract the listing summary section
    listing_summary = soup.find('div', {'class': "homes summary reversePosition"})

    # Use regex to extract numeric values from the listing summary
    select_listing_count, total_listing_count = re.findall(r'\d{1,10}(?:,\d{1,10})*', listing_summary.text)
    
    # Convert extracted strings into integers, handling comma formatting
    select_listing_count, total_listing_count = int(select_listing_count), int(total_listing_count.replace(',', ''))
    
    return viewport_url, select_listing_count, total_listing_count


In [6]:
def crawling_redfin(head, viewport_url, page):
    """
    Crawls a specific page of real estate listings from Redfin within a given viewport.

    Parameters:
    head (dict): Headers for the HTTP request.
    viewport_url (str): Base URL for the listings search.
    page (int): Page number to crawl.

    Returns:
    list: A list of BeautifulSoup objects representing individual property listings.
    """
    
    # Construct the URL for the specified page number
    target_url = f"{viewport_url}/page-{page}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(target_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    return soup

In [7]:
header =  {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
geo_grid = vancouver_grid(head = header, divisions_longs=1, devision_lats=1)
geo_grid

['49.19835:49.31617:-123.22479:-123.02315']

In [8]:
viewport_url, select_listing_count, total_listing_count = listing_count(head=header, coord_box=geo_grid)
print(viewport_url, select_listing_count, total_listing_count)

https://www.redfin.ca/bc/vancouver/filter/viewport=['49.19835:49.31617:-123.22479:-123.02315'] 350 4647


In [9]:
soup1 = crawling_redfin(head=header, viewport_url=viewport_url, page=1)
info1 = soup1.find_all('script', {'type': 'application/ld+json'})
info1 =  [json.loads(i.string) for i in info1]
time.sleep(1)
soup2 = crawling_redfin(head=header, viewport_url=viewport_url, page=2)
info2 = soup2.find_all('script', {'type': 'application/ld+json'})
info2 =  [json.loads(i.string) for i in info2]
print(len(info1), len(info2))

63 69


In [25]:
info1[3]

[{'@context': 'http://schema.org',
  'name': '1317 E 18th Ave, Vancouver, BC V5V 1H5',
  'url': 'https://www.redfin.ca/bc/vancouver/1317-E-18th-Ave-V5V-1H5/home/155587781',
  'address': {'@type': 'PostalAddress',
   'streetAddress': '1317 E 18th Ave',
   'addressLocality': 'Vancouver',
   'addressRegion': 'BC',
   'postalCode': 'V5V 1H5',
   'addressCountry': 'CA'},
  'geo': {'@type': 'GeoCoordinates',
   'latitude': 49.2546031,
   'longitude': -123.0774025},
  'numberOfRooms': 3,
  'floorSize': {'@type': 'QuantitativeValue',
   'value': 1259,
   'unitCode': 'FTK'},
  '@type': 'SingleFamilyResidence'},
 {'@context': 'http://schema.org',
  '@type': 'Product',
  'name': '1317 E 18th Ave, Vancouver, BC V5V 1H5',
  'offers': {'@type': 'Offer', 'price': 1599000, 'priceCurrency': 'CAD'},
  'url': 'https://www.redfin.ca/bc/vancouver/1317-E-18th-Ave-V5V-1H5/home/155587781'}]

In [26]:
further_invest[0]

{'@context': 'http://schema.org',
 '@type': 'Event',
 'name': '3D Walkthrough - 3346 E 8th Ave, Vancouver, BC V5M 1X9',
 'startDate': '2025-04-08',
 'url': 'https://www.redfin.ca/bc/vancouver/3346-E-8th-Ave-V5M-1X9/home/155337964',
 'eventAttendanceMode': 'http://schema.org/OnlineEventAttendanceMode',
 'eventStatus': 'http://schema.org/EventMovedOnline',
 'location': [{'@type': 'VirtualLocation',
   'url': 'https://www.redfin.ca/bc/vancouver/3346-E-8th-Ave-V5M-1X9/home/155337964'},
  {'@type': 'Place',
   'name': '3346 E 8th Ave, Vancouver, BC V5M 1X9',
   'geo': {'@type': 'GeoCoordinates',
    'latitude': 49.2627399,
    'longitude': -123.0322041},
   'address': {'@type': 'PostalAddress',
    'streetAddress': '3346 E 8th Ave',
    'addressLocality': 'Vancouver',
    'postalCode': 'V5M 1X9',
    'addressRegion': 'BC',
    'addressCountry': 'CA'}}]}

In [37]:
further_invest[0]

{'@context': 'http://schema.org',
 '@type': 'Event',
 'name': '3D Walkthrough - 3346 E 8th Ave, Vancouver, BC V5M 1X9',
 'startDate': '2025-04-08',
 'url': 'https://www.redfin.ca/bc/vancouver/3346-E-8th-Ave-V5M-1X9/home/155337964',
 'eventAttendanceMode': 'http://schema.org/OnlineEventAttendanceMode',
 'eventStatus': 'http://schema.org/EventMovedOnline',
 'location': [{'@type': 'VirtualLocation',
   'url': 'https://www.redfin.ca/bc/vancouver/3346-E-8th-Ave-V5M-1X9/home/155337964'},
  {'@type': 'Place',
   'name': '3346 E 8th Ave, Vancouver, BC V5M 1X9',
   'geo': {'@type': 'GeoCoordinates',
    'latitude': 49.2627399,
    'longitude': -123.0322041},
   'address': {'@type': 'PostalAddress',
    'streetAddress': '3346 E 8th Ave',
    'addressLocality': 'Vancouver',
    'postalCode': 'V5M 1X9',
    'addressRegion': 'BC',
    'addressCountry': 'CA'}}]}

In [50]:
result_event = defaultdict(list)
result_event_list = defaultdict(list)
result = defaultdict(list)

further_invest = []

for j, i in enumerate(info1):
    if isinstance(i, dict):
        
        type_i = i.get('@type')
        
        if type_i == 'Organization' or type_i == 'BreadcrumbList':
            continue
        
        elif type_i == 'Event':
            
            location = i.get('location')
        
            if isinstance(location, list):
                try:
                    address = i.get('location')[1].get('address').get('streetAddress')
                    postalCode = i.get('location')[1].get('address').get('postalCode')
                    latitude = i.get('location')[1].get('geo').get('latitude')
                    longitude = i.get('location')[1].get('geo').get('longitude')
                    url = i.get('url')

                    result_event_list['address'].append(address)
                    result_event_list['postalCode'].append(postalCode)
                    result_event_list['latitude'].append(latitude)
                    result_event_list['longitude'].append(longitude)
                    result_event_list['url'].append(url)
                except:
                    further_invest.append(i)
                
            else:
                try:
                    address = i.get('location').get('name')
                    postalCode = i.get('location').get('address').get('postalCode') 
                    latitude = i.get('location').get('geo').get('latitude')
                    longitude = i.get('location').get('geo').get('longitude')
                    price = i.get('offers').get('price')
                    url = i.get('url')
                    
                    result_event['address'].append(address)
                    result_event['postalCode'].append(postalCode)
                    result_event['latitude'].append(latitude)
                    result_event['longitude'].append(longitude)
                    result_event['price'].append(price)
                    result_event['url'].append(url)
            
                    
                except:
                    further_invest.append(i)


    elif isinstance(i, list):

        i_1 = i[0]
        address = i_1.get('address').get('streetAddress')
        postalCode = i_1.get('address').get('postalCode')
        latitude = i_1.get('geo').get('latitude')
        longitude = i_1.get('geo').get('latitude')
        sqr_footage = i_1.get('floorSize').get('value')
        bedrooms = i_1.get('numberOfRooms')
        url = i_1.get('url')
        
        i_2 = i[1]
        price = i_2.get('offers').get('price')

        result['address'].append(address)
        result['postalCode'].append(postalCode)
        result['latitude'].append(latitude)
        result['longitude'].append(longitude)
        result['price'].append(price)
        result['square_footage'].append(sqr_footage)
        result['bedroom'].append(bedrooms)
        result['url'].append(url)
    

In [51]:
result

defaultdict(list,
            {'address': ['683 W 18th Ave',
              '1317 E 18th Ave',
              '2028 W 11th Ave #402',
              '2935 East 29th Ave',
              '1915 E 7th Ave',
              '3113 E Kent Avenue North',
              '1618 Quebec St #2002',
              '181 W 1st Ave #510',
              '293 Smithe St #104',
              '2323 Fir St #302',
              '1470 Pennyfarthing Dr #408',
              '3020 Knight St',
              '3018 Knight St',
              '1005 Beach Ave #805',
              '2070 W 14th Ave',
              '3533 W 20th Ave',
              '8570 Osler St',
              '718 Main St #704',
              '856 W 15th Ave #302',
              '1345 W 4th Ave #205',
              '75 E 50th Ave',
              '63 Keefer Pl #1905',
              '1171 Jervis St #1201',
              '1812 Tasmania Cres',
              '645 E 27 Ave',
              '2550 Parker St',
              '3346 E 8th Ave',
              '1445 Marpole A

In [52]:
pd.DataFrame(result)

Unnamed: 0,address,postalCode,latitude,longitude,price,square_footage,bedroom,url
0,683 W 18th Ave,V5Z 1V9,49.255407,49.255407,2399900,1618,4,https://www.redfin.ca/bc/vancouver/683-W-18th-...
1,1317 E 18th Ave,V5V 1H5,49.254603,49.254603,1599000,1259,3,https://www.redfin.ca/bc/vancouver/1317-E-18th...
2,2028 W 11th Ave #402,V6J 2C9,49.261736,49.261736,649000,590,1,https://www.redfin.ca/bc/vancouver/2028-W-11th...
3,2935 East 29th Ave,V5R 1V8,49.244789,49.244789,4000000,1887,3,https://www.redfin.ca/bc/vancouver/2935-E-29th...
4,1915 E 7th Ave,V5N 1S3,49.264295,49.264295,1589000,1457,3,https://www.redfin.ca/bc/vancouver/1915-E-7th-...
5,3113 E Kent Avenue North,V5S 4Y1,49.206984,49.206984,1559000,2055,4,https://www.redfin.ca/bc/vancouver/3113-E-Kent...
6,1618 Quebec St #2002,V6A 0C5,49.271435,49.271435,1338000,1007,2,https://www.redfin.ca/bc/vancouver/1618-Quebec...
7,181 W 1st Ave #510,V5Y 0E3,49.27068,49.27068,1188000,1016,2,https://www.redfin.ca/bc/vancouver/181-W-1st-A...
8,293 Smithe St #104,V6B 5W4,49.277783,49.277783,1549000,1570,2,https://www.redfin.ca/bc/vancouver/293-Smithe-...
9,2323 Fir St #302,V6J 5J9,49.26528,49.26528,871900,498,1,https://www.redfin.ca/bc/vancouver/2323-Fir-St...


In [53]:
pd.DataFrame(result_event)

Unnamed: 0,address,postalCode,latitude,longitude,price,url
0,683 W 18th Ave,V5Z 1V9,49.255407,-123.119682,2399900,https://www.redfin.ca/bc/vancouver/683-W-18th-...
1,1317 E 18th Ave,V5V 1H5,49.254603,-123.077403,1599000,https://www.redfin.ca/bc/vancouver/1317-E-18th...
2,2028 W 11th Ave #402,V6J 2C9,49.261736,-123.151216,649000,https://www.redfin.ca/bc/vancouver/2028-W-11th...
3,1915 E 7th Ave,V5N 1S3,49.264295,-123.0655,1589000,https://www.redfin.ca/bc/vancouver/1915-E-7th-...
4,1618 Quebec St #2002,V6A 0C5,49.271435,-123.101873,1338000,https://www.redfin.ca/bc/vancouver/1618-Quebec...
5,1005 Beach Ave #805,V6E 3W2,49.276915,-123.134257,1145000,https://www.redfin.ca/bc/vancouver/1005-Beach-...
6,3533 W 20th Ave,V6S 1E6,49.254913,-123.182826,3899800,https://www.redfin.ca/bc/vancouver/3533-W-20th...
7,8570 Osler St,V6P 0G4,49.208984,-123.13131,1649900,https://www.redfin.ca/bc/vancouver/8570-Osler-...
8,856 W 15th Ave #302,V5Z 1R8,49.257595,-123.123295,999000,https://www.redfin.ca/bc/vancouver/856-W-15th-...
9,1345 W 4th Ave #205,V6H 3Y8,49.26671,-123.134162,796000,https://www.redfin.ca/bc/vancouver/1345-W-4th-...


In [54]:
pd.DataFrame(result_event_list)

Unnamed: 0,address,postalCode,latitude,longitude,url
0,3346 E 8th Ave,V5M 1X9,49.26274,-123.032204,https://www.redfin.ca/bc/vancouver/3346-E-8th-...
1,2168 W 18th Ave,V6L 1A4,49.256006,-123.154684,https://www.redfin.ca/bc/vancouver/2168-W-18th...
2,6999 Cambie St #205,V6P 0J1,49.222075,-123.117099,https://www.redfin.ca/bc/vancouver/6999-Cambie...
