In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import time

In [2]:
def vancouver_grid(head, divisions=15):
    van_geo_info_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/city-boundary/records?limit=20'
    response = requests.get(van_geo_info_url,headers=head,verify=False)
    geo_data = response.json()
    contour = geo_data['results'][0]['geom']['geometry']['coordinates']

    longitudes = [coord[0] for sublist in contour for coord in sublist]
    latitudes = [coord[1] for sublist in contour for coord in sublist]

    max_longitude = max(longitudes)
    min_longitude = min(longitudes)
    max_latitude = max(latitudes)
    min_latitude = min(latitudes)

    longitude_step = (max_longitude - min_longitude) / divisions
    latitude_step = (max_latitude - min_latitude) / divisions

    coord_boxes = []
    for i in range(divisions):
        for j in range(divisions):
            box_min_lat = round(min_latitude + j * latitude_step, 5)
            box_max_lat = round(min_latitude + (j + 1) * latitude_step, 5)
            box_min_lon = round(min_longitude + i * longitude_step, 5)
            box_max_lon = round(min_longitude + (i + 1) * longitude_step, 5)

            box_str = f"{box_min_lat}:{box_max_lat}:{box_min_lon}:{box_max_lon}"
            coord_boxes.append(box_str)
    
    return  coord_boxes

In [3]:
def listing_count(head, coord_box):

    viewport_url = f"https://www.redfin.ca/bc/vancouver/filter/viewport={coord_box}"
    resp = requests.get(viewport_url, headers=head, verify=False)

    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    soup = BeautifulSoup(resp.text, 'html.parser')
    if soup.find('div', {'class': 'HomeViews reversePosition'}).find('h2'):
        return 'no_listing'
    
    listing_summary = soup.find('div', {'class':"homes summary reversePosition"})
    select_listing_count, total_listing_count = re.findall(r'\d{1,10}(?:,\d{1,10})*', listing_summary.text)
    select_listing_count, total_listing_count = int(select_listing_count), int(total_listing_count.replace(',',''))
    
    return viewport_url, select_listing_count, total_listing_count 

In [4]:
def crawling_redfin(head, viewport_url, page):
    
    target_url = viewport_url + "/page-" + str(page)
    resp = requests.get(target_url,headers=head,verify=False)
    
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    soup=BeautifulSoup(resp.text,'html.parser')
    soup_boxes = soup.find_all("div",{"class":"HomeCardContainer"})

    return  soup_boxes

In [25]:
def key_metric_extraction(soup_boxes, real_estate_info):
    
    incomplete_idx = []

    for i, box in enumerate(soup_boxes):
        try:
            address = box.find('address').text[:(-23)]
            real_estate_info['address'].append(address)
        except: 
            real_estate_info['address'].append(np.nan)
            incomplete_idx.append(i)
        try:
            zip_code = box.find('address').text[-7:]
            real_estate_info['zip_code'].append(zip_code)
        except: 
            real_estate_info['zip_code'].append(np.nan)
            incomplete_idx.append(i)        
        try:
            price = box.find('span', {'class':'bp-Homecard__Price--value'}).text
            real_estate_info['price'].append(price)
        except: 
            real_estate_info['price'].append(np.nan)
            incomplete_idx.append(i)
        try:
            bed = box.find('span', {'class':'bp-Homecard__Stats--beds text-nowrap'}).text
            real_estate_info['bed'].append(bed)
        except: 
            real_estate_info['bed'].append(np.nan)
            incomplete_idx.append(i)   
        try:
            bath = box.find('span', {'class':'bp-Homecard__Stats--baths text-nowrap'}).text
            real_estate_info['bath'].append(bath)
        except: 
            real_estate_info['bath'].append(np.nan)
            incomplete_idx.append(i)   
        try:
            sqr_footage = box.find('span', {'class':'bp-Homecard__LockedStat--value'}).text
            real_estate_info['sqr_footage'].append(sqr_footage)
        except:
            real_estate_info['sqr_footage'].append(np.nan)
            incomplete_idx.append(i)   
        try:
            property_link = "https://www.redfin.com"+box.find("a").get('href')
            real_estate_info['property_link'].append(property_link)
        except:
            real_estate_info['property_link'].append(np.nan)
            incomplete_idx.append(i)

    return incomplete_idx


In [6]:
def calculate_min_pages(total_count, items_per_page):
    return (total_count + items_per_page - 1) // items_per_page

In [26]:
head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
coord_boxes = vancouver_grid(head)
real_estate_info = defaultdict(list)
big_coord_boxes = []
missing_entries = defaultdict(list)

for i in range(100,101):
    coord_box = coord_boxes[i]
    listing_info = listing_count(head, coord_box)
    time.sleep(1)
    if listing_info == 'no_listing':
        continue
    else:
        viewport_url, select_listing_count, total_listing_count = listing_info
        if select_listing_count != total_listing_count:
            big_coord_boxes.append(coord_box)
            continue
        else:
            max_page = calculate_min_pages(select_listing_count, items_per_page=9)
            missing = defaultdict(list)
            for page in range(1, max_page):
                soup_boxes = crawling_redfin(head, viewport_url, page)
                incomplete_idx = key_metric_extraction(soup_boxes, real_estate_info)
                missing[f'page_{page}'].append(incomplete_idx)
                time.sleep(1)
            missing_entries[f'coord_box_{i}'].append(missing)





In [27]:
pd.DataFrame(real_estate_info)

Unnamed: 0,address,zip_code,price,bed,bath,sqr_footage,property_link
0,1534 Harwood St #304,V6G 1X9,"$310,000",1 bed,1 bath,534,https://www.redfin.com/bc/vancouver/1534-Harwo...
1,1171 Jervis St #1901,V6E 0C9,"$3,688,000",2 beds,2.5 baths,2342,https://www.redfin.com/bc/vancouver/1171-Jervi...
2,1315 Cardero St #604,V6G 2J2,"$1,100,000",1 bed,1.5 baths,1027,https://www.redfin.com/bc/vancouver/1315-Carde...
3,1009 Harwood St #905,V6E 0C2,"$875,000",1 bed,1 bath,715,https://www.redfin.com/bc/vancouver/1009-Harwo...
4,1565 Burnaby St #201,V6G 1X1,"$725,000",2 beds,1 bath,823,https://www.redfin.com/bc/vancouver/1565-Burna...
...,...,...,...,...,...,...,...
164,1003 Pacific St #1601,V6E 4P2,"$1,099,000",2 beds,1 bath,926,https://www.redfin.com/bc/vancouver/1003-Pacif...
165,1171 Jervis St #1101,V6E 0C9,"$1,700,000",2 beds,2 baths,1018,https://www.redfin.com/bc/vancouver/1171-Jervi...
166,1009 Harwood St #604,V6E 0C2,"$645,000",1 bed,1 bath,527,https://www.redfin.com/bc/vancouver/1009-Harwo...
167,1403 Beach Ave Unit 2B,V6G 1Y3,"$2,880,000",3 beds,3 baths,1880,https://www.redfin.com/bc/vancouver/1403-Beach...
