In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import time

In [3]:
def vancouver_grid(head, divisions=15, timeout=30):
    """Split the bounding box of Vancouver's city boundary into a square grid.

    The boundary contour is downloaded from the City of Vancouver open-data
    API, its min/max longitude and latitude are taken, and that bounding
    rectangle is cut into ``divisions`` x ``divisions`` equally sized cells.

    Parameters
    ----------
    head : dict
        HTTP headers sent with the request.
    divisions : int, default 15
        Number of cells along each axis; the result has ``divisions ** 2``
        entries.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the kernel indefinitely.

    Returns
    -------
    list of str
        One ``"min_lat:max_lat:min_lon:max_lon"`` string per cell, every
        value rounded to 5 decimals. Cells are ordered by increasing
        longitude first, then by increasing latitude within each column.

    Raises
    ------
    ValueError
        If ``divisions`` is smaller than 1.
    Exception
        If the open-data API does not answer with HTTP 200.
    """
    if divisions < 1:
        raise ValueError("divisions must be a positive integer")

    van_geo_info_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/city-boundary/records?limit=20'
    # NOTE(review): verify=False turns off TLS certificate verification; it is
    # kept because every request in this notebook does the same -- confirm it
    # is really required in the target environment.
    response = requests.get(van_geo_info_url, headers=head, verify=False, timeout=timeout)
    # Same failure contract as the other fetch helpers in this notebook.
    if response.status_code != 200:
        raise Exception("Failing in webpage requests")

    contour = response.json()['results'][0]['geom']['geometry']['coordinates']

    # The contour is a list of rings, each ring a list of [lon, lat] pairs.
    points = [coord for ring in contour for coord in ring]
    longitudes = [point[0] for point in points]
    latitudes = [point[1] for point in points]

    min_longitude, max_longitude = min(longitudes), max(longitudes)
    min_latitude, max_latitude = min(latitudes), max(latitudes)

    longitude_step = (max_longitude - min_longitude) / divisions
    latitude_step = (max_latitude - min_latitude) / divisions

    # Cell edges are computed once so neighbouring cells share identical
    # rounded boundaries.
    lat_edges = [round(min_latitude + k * latitude_step, 5) for k in range(divisions + 1)]
    lon_edges = [round(min_longitude + k * longitude_step, 5) for k in range(divisions + 1)]

    # NOTE(review): the sample viewport strings noted elsewhere in this
    # notebook are ordered max_lat:min_lat:max_lon:min_lon, while this
    # function emits min:max:min:max -- confirm which order Redfin expects.
    return [
        f"{lat_edges[j]}:{lat_edges[j + 1]}:{lon_edges[i]}:{lon_edges[i + 1]}"
        for i in range(divisions)
        for j in range(divisions)
    ]

In [4]:
def key_metric_extraction(soup_boxes, real_estate_info, incomplete_idx,
                          city="Vancouver", state="BC",
                          base_url="https://www.redfin.ca"):
    """Extract the key listing fields from each home card and accumulate them.

    For every card, seven fields are appended to ``real_estate_info``
    (``address``, ``zip_code``, ``price``, ``bed``, ``bath``, ``sqr_footage``,
    ``property_link``). A field that cannot be read is stored as ``np.nan``
    so all columns keep the same length.

    Parameters
    ----------
    soup_boxes : iterable
        Home-card elements exposing a BeautifulSoup-style ``find``.
    real_estate_info : mapping of str -> list
        Column accumulator (e.g. ``defaultdict(list)``); mutated in place.
    incomplete_idx : list
        Receives the card's position within ``soup_boxes`` once for every
        field that was missing, so an index may appear several times.
        Mutated in place.
    city, state : str
        City and province text that trail the street address. They were
        previously read from undefined globals, which made every address
        NaN on a fresh kernel.
    base_url : str
        Origin prepended to each card's relative link. Defaults to the
        redfin.ca host that the pages in this notebook are fetched from.

    Returns
    -------
    tuple
        The same ``real_estate_info`` and ``incomplete_idx`` objects.
    """
    postal_len = 7
    # Assumes the address text looks like "<street>, <city>, <state> <postal>"
    # (TODO confirm against live markup): the suffix to cut is the postal
    # code, 5 separator characters, and the city and state names.
    suffix_len = postal_len + 5 + len(state + city)

    def span_text(box, css_class):
        return box.find('span', {'class': css_class}).text

    # (column name, extractor) pairs, applied in this order to every card.
    extractors = (
        ('address', lambda box: box.find('address').text[:-suffix_len]),
        ('zip_code', lambda box: box.find('address').text[-postal_len:]),
        ('price', lambda box: span_text(box, 'bp-Homecard__Price--value')),
        ('bed', lambda box: span_text(box, 'bp-Homecard__Stats--beds text-nowrap')),
        ('bath', lambda box: span_text(box, 'bp-Homecard__Stats--baths text-nowrap')),
        ('sqr_footage', lambda box: span_text(box, 'bp-Homecard__LockedStat--value')),
        ('property_link', lambda box: base_url + box.find("a").get('href')),
    )

    for i, box in enumerate(soup_boxes):
        for column, extract in extractors:
            try:
                value = extract(box)
            except (AttributeError, TypeError):
                # find() returned None (element absent) or the href was
                # missing; record the gap rather than aborting the scrape.
                value = np.nan
                incomplete_idx.append(i)
            real_estate_info[column].append(value)

    return real_estate_info, incomplete_idx


In [None]:
def listing_count(head, coord_box, timeout=30):
    """Fetch one viewport's result page and read its listing counts.

    Parameters
    ----------
    head : dict
        HTTP headers sent with the request.
    coord_box : str
        Viewport string appended to the Redfin filter URL.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str or tuple
        ``'no_listing'`` when the ``HomeViews`` container holds an ``<h2>``
        (presumably the empty-result heading -- confirm against live
        markup); otherwise ``(viewport_url, select_listing_count,
        total_listing_count)`` with both counts as ``int``.

    Raises
    ------
    Exception
        If the page does not answer with HTTP 200, or the listing summary
        element is absent.
    ValueError
        If the summary text does not contain exactly two numbers.
    """
    viewport_url = f"https://www.redfin.ca/bc/vancouver/filter/viewport={coord_box}"
    resp = requests.get(viewport_url, headers=head, verify=False, timeout=timeout)

    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")

    soup = BeautifulSoup(resp.text, 'html.parser')

    # find() yields None when the container is absent; only inspect it for
    # the heading when it actually exists.
    home_views = soup.find('div', {'class': 'HomeViews reversePosition'})
    if home_views is not None and home_views.find('h2'):
        return 'no_listing'

    listing_summary = soup.find('div', {'class': "homes summary reversePosition"})
    if listing_summary is None:
        raise Exception("Listing summary not found on the results page")

    counts = re.findall(r'\d{1,10}(?:,\d{1,10})*', listing_summary.text)
    if len(counts) != 2:
        raise ValueError(
            f"Expected two listing counts in the summary, found {len(counts)}"
        )

    # Either number may carry thousands separators, so strip commas from both.
    select_listing_count, total_listing_count = (
        int(count.replace(',', '')) for count in counts
    )

    return viewport_url, select_listing_count, total_listing_count

In [None]:
def crawling_redfin(head, viewport_url, page, timeout=30):
    """Download one result page of a viewport and return its home cards.

    Parameters
    ----------
    head : dict
        HTTP headers sent with the request.
    viewport_url : str
        Viewport filter URL; ``"/page-<page>"`` is appended to it.
    page : int
        Result page number to fetch.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    list
        Every ``<div class="HomeCardContainer">`` element found on the page
        (empty when there is none).

    Raises
    ------
    Exception
        If the page does not answer with HTTP 200.
    """
    target_url = f"{viewport_url}/page-{page}"
    resp = requests.get(target_url, headers=head, verify=False, timeout=timeout)

    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")

    soup = BeautifulSoup(resp.text, 'html.parser')

    # The listing-summary counts used to be parsed here but were never used,
    # and that parsing could crash on a missing summary or a comma-formatted
    # number; only the cards themselves are needed.
    return soup.find_all("div", {"class": "HomeCardContainer"})

In [None]:
# Browser-like User-Agent header sent with every request in this notebook.
head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
# Viewport strings for a 15 x 15 grid over the city's bounding box.
coord_boxes = vancouver_grid(head=head, divisions=15)
# Column-oriented accumulator: field name -> list of scraped values.
real_estate_info = defaultdict(list)
# Positions of listings for which at least one field could not be read.
incomplete_idx = []



# Need to get the page number before the for loop to minimize calculation and avoid errors

In [None]:
top_square: 49.3:49.28:-123.12:-123.14
left/top_square: 49.28514:49.25232:-123.1678:-123.22548
bottom_square: 49.22024:49.20381:-123.12346:-123.1523
top_right: 49.29408:49.26126:-123.00855:-123.06623
bottom_right: 49.23109:49.19824:-123.01579:-123.07347
bottom_left: 49.23167:49.19881:-123.11761:-123.17529