In [1]:
import pandas as pd
import numpy as np
import re
import json
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import time

In [2]:
def split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box):
    
    if if_big_box:
        [min_latitude, max_latitude, min_longitude, max_longitude] = [float(x) for x in four_coords.split(':')]
    else:
         [min_latitude, max_latitude, min_longitude, max_longitude] = four_coords

    longitude_step = (max_longitude - min_longitude) / divisions_longs
    latitude_step = (max_latitude - min_latitude) / devision_lats  # Typo: should be "divisions_lats"

    coord_boxes = []
    
    # Generate bounding boxes for each grid cell
    for i in range(divisions_longs):
        for j in range(devision_lats):
            box_min_lat = round(min_latitude + j * latitude_step, 5)
            box_max_lat = round(min_latitude + (j + 1) * latitude_step, 5)
            box_min_lon = round(min_longitude + i * longitude_step, 5)
            box_max_lon = round(min_longitude + (i + 1) * longitude_step, 5)

            # Store bounding box as a string in the format "min_lat:max_lat:min_lon:max_lon"
            box_str = f"{box_min_lat}:{box_max_lat}:{box_min_lon}:{box_max_lon}"
            coord_boxes.append(box_str)
    
    return coord_boxes

In [3]:
def vancouver_grid(head, divisions_longs, devision_lats):
    """
    Generates a grid of latitude-longitude bounding boxes within Vancouver's city boundary.

    Parameters:
    head (dict): Headers for the API request.
    divisions_longs (int): Number of divisions along the longitude (default is 15).
    devision_lats (int): Number of divisions along the latitude (default is 15).

    Returns:
    list: A list of strings representing bounding boxes in the format "min_lat:max_lat:min_lon:max_lon".
    """
    
    # API endpoint for Vancouver city boundary geo-coordinates
    van_geo_info_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/city-boundary/records?limit=20'
    
    # Fetch geographical data from the API
    response = requests.get(van_geo_info_url, headers=head)
    geo_data = response.json()
    
    # Extract the city boundary coordinates
    contour = geo_data['results'][0]['geom']['geometry']['coordinates']

    # Flatten the list of coordinates and extract longitude and latitude values separately
    longitudes = [coord[0] for sublist in contour for coord in sublist]
    latitudes = [coord[1] for sublist in contour for coord in sublist]

    # Determine the minimum and maximum longitude and latitude values
    max_longitude = max(longitudes)
    min_longitude = min(longitudes)
    max_latitude = max(latitudes)
    min_latitude = min(latitudes)
    four_coords = [min_latitude, max_latitude, min_longitude, max_longitude]

    
    coord_boxes = split_coordinate(four_coords, divisions_longs, devision_lats, if_big_box = 0)
 
    
    return coord_boxes


In [21]:
def listing_count(head, coord_box):
    """
    Fetches the number of real estate listings within a specified coordinate box from Redfin.

    Parameters:
    head (dict): Headers for the HTTP request.
    coord_box (str): A string representing the bounding box in the format "min_lat:max_lat:min_lon:max_lon".

    Returns:
    tuple: (viewport_url, select_listing_count, total_listing_count)
        - viewport_url (str): The URL used for the request.
        - select_listing_count (int): The number of listings shown in the current viewport.
        - total_listing_count (int): The total number of listings available.
        - If no listings are found, returns 'no_listing'.
    """
    
    # Construct the URL for the given coordinate box
    viewport_url = f"https://www.redfin.ca/bc/vancouver/filter/viewport={coord_box}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(viewport_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Check if the page indicates no listings are available
    if soup.find('div', {'class': 'HomeViews reversePosition'}).find('h2'):
        return 'no_listing'
    
    # Extract the listing summary section
    listing_summary = soup.find('div', {'class': "homes summary reversePosition"})

    # Use regex to extract numeric values from the listing summary
    select_listing_count, total_listing_count = re.findall(r'\d{1,10}(?:,\d{1,10})*', listing_summary.text)
    
    # Convert extracted strings into integers, handling comma formatting
    select_listing_count, total_listing_count = int(select_listing_count), int(total_listing_count.replace(',', ''))
    
    return viewport_url, select_listing_count, total_listing_count


In [22]:
def crawling_redfin(head, viewport_url, page):
    """
    Crawls a specific page of real estate listings from Redfin within a given viewport.

    Parameters:
    head (dict): Headers for the HTTP request.
    viewport_url (str): Base URL for the listings search.
    page (int): Page number to crawl.

    Returns:
    list: A list of BeautifulSoup objects representing individual property listings.
    """
    
    # Construct the URL for the specified page number
    target_url = f"{viewport_url}/page-{page}"
    
    # Send a GET request to fetch the webpage
    resp = requests.get(target_url, headers=head)

    # Raise an error if the request fails (non-200 status code)
    if resp.status_code != 200:
        raise Exception("Failing in webpage requests")
    
    # Parse the HTML response using BeautifulSoup
    soup = BeautifulSoup(resp.text, 'html.parser')

    return soup

In [30]:
header =  {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
geo_grid = vancouver_grid(head = header, divisions_longs=6, devision_lats=6)

In [7]:
viewport_url, select_listing_count, total_listing_count = listing_count(head=header, coord_box=geo_grid)
print(viewport_url, select_listing_count, total_listing_count)

https://www.redfin.ca/bc/vancouver/filter/viewport=['49.19835:49.21799:-123.22479:-123.19118', '49.21799:49.23762:-123.22479:-123.19118', '49.23762:49.25726:-123.22479:-123.19118', '49.25726:49.2769:-123.22479:-123.19118', '49.2769:49.29654:-123.22479:-123.19118', '49.29654:49.31617:-123.22479:-123.19118', '49.19835:49.21799:-123.19118:-123.15758', '49.21799:49.23762:-123.19118:-123.15758', '49.23762:49.25726:-123.19118:-123.15758', '49.25726:49.2769:-123.19118:-123.15758', '49.2769:49.29654:-123.19118:-123.15758', '49.29654:49.31617:-123.19118:-123.15758', '49.19835:49.21799:-123.15758:-123.12397', '49.21799:49.23762:-123.15758:-123.12397', '49.23762:49.25726:-123.15758:-123.12397', '49.25726:49.2769:-123.15758:-123.12397', '49.2769:49.29654:-123.15758:-123.12397', '49.29654:49.31617:-123.15758:-123.12397', '49.19835:49.21799:-123.12397:-123.09036', '49.21799:49.23762:-123.12397:-123.09036', '49.23762:49.25726:-123.12397:-123.09036', '49.25726:49.2769:-123.12397:-123.09036', '49.2769:

In [9]:
soup1 = crawling_redfin(head=header, viewport_url=viewport_url, page=1)
info1 = soup1.find_all('script', {'type': 'application/ld+json'})
info1 =  [json.loads(i.string) for i in info1]
time.sleep(1)
soup2 = crawling_redfin(head=header, viewport_url=viewport_url, page=2)
info2 = soup2.find_all('script', {'type': 'application/ld+json'})
info2 =  [json.loads(i.string) for i in info2]
print(len(info1), len(info2))

63 69


In [47]:
def metrics_extraction(result, result_event, result_event_list, further_invest, soup):
    
    info = soup.find_all('script', {'type':'application/ld+json'})
    info =  [json.loads(i.string) for i in info]

    for j, i in enumerate(info):
        if isinstance(i, dict):
            
            type_i = i.get('@type')
            
            if type_i == 'Organization' or type_i == 'BreadcrumbList':
                continue
            
            elif type_i == 'Event':
                
                location = i.get('location')
            
                if isinstance(location, list):
                    try:
                        address = i.get('location')[1].get('address').get('streetAddress')
                        postalCode = i.get('location')[1].get('address').get('postalCode')
                        latitude = i.get('location')[1].get('geo').get('latitude')
                        longitude = i.get('location')[1].get('geo').get('longitude')
                        url = i.get('url')

                        result_event_list['address'].append(address)
                        result_event_list['postalCode'].append(postalCode)
                        result_event_list['latitude'].append(latitude)
                        result_event_list['longitude'].append(longitude)
                        result_event_list['url'].append(url)
                    except:
                        further_invest.append((j, i))
                    
                else:
                    try:
                        address = i.get('location').get('name')
                        postalCode = i.get('location').get('address').get('postalCode') 
                        latitude = i.get('location').get('geo').get('latitude')
                        longitude = i.get('location').get('geo').get('longitude')
                        price = i.get('offers').get('price')
                        url = i.get('url')
                        
                        result_event['address'].append(address)
                        result_event['postalCode'].append(postalCode)
                        result_event['latitude'].append(latitude)
                        result_event['longitude'].append(longitude)
                        result_event['price'].append(price)
                        result_event['url'].append(url)
                
                        
                    except:
                        further_invest.append(i)


        elif isinstance(i, list):
            try: 
                i_1 = i[0]
                address = i_1.get('address').get('streetAddress')
                postalCode = i_1.get('address').get('postalCode')
                latitude = i_1.get('geo').get('latitude')
                longitude = i_1.get('geo').get('latitude')
                sqr_footage = i_1.get('floorSize').get('value')
                bedrooms = i_1.get('numberOfRooms')
                url = i_1.get('url')
                
                i_2 = i[1]
                price = i_2.get('offers').get('price')

                result['address'].append(address)
                result['postalCode'].append(postalCode)
                result['latitude'].append(latitude)
                result['longitude'].append(longitude)
                result['price'].append(price)
                result['square_footage'].append(sqr_footage)
                result['bedroom'].append(bedrooms)
                result['url'].append(url)
            
            except:
                further_invest.append((j,i))
        

In [33]:
def calculate_min_pages(total_count, items_per_page):
    """
    Calculates the minimum number of pages required to display all items.

    Parameters:
    total_count (int): The total number of items to be displayed.
    items_per_page (int): The maximum number of items that can be displayed per page.

    Returns:
    int: The minimum number of pages required.
    """
    
    # Use integer division to determine the number of pages needed
    # Adding (items_per_page - 1) ensures proper rounding up
    return (total_count + items_per_page - 1) // items_per_page

In [None]:
def extracting_by_batch_method2(head, batch_num, divisions_longs=15, devision_lats=15, splitted_big_box = 0):   
    
    big_coord_boxes = []  
    result_event = defaultdict(list)
    result_event_list = defaultdict(list)
    result = defaultdict(list)
    url_with_issue = []

    
    if splitted_big_box:
        coord_boxes = splitted_big_box

    else: coord_boxes = vancouver_grid(head, divisions_longs, devision_lats)
    coord_box_batch = np.array_split(coord_boxes, batch_num)
    
    for i in range(len(coord_box_batch)):
        batch = coord_box_batch[i]
        for coord_box in batch:
            listing_info = listing_count(head, coord_box)
            time.sleep(1)  
            if listing_info == 'no_listing':
                print(f"Batch {i}-{coord_box} has no listings.")
                continue
            else:
                viewport_url, select_listing_count, total_listing_count = listing_info
                if select_listing_count != total_listing_count:
                    big_coord_boxes.append(coord_box)
                    continue            
                else:
                    # Calculate the number of pages to crawl based on listings per page (assumed 9 per page)
                    max_page = calculate_min_pages(select_listing_count, items_per_page=9)
                    for page in range(1, max_page):
                        soup = crawling_redfin(head, viewport_url, page)                        
                        metrics_extraction(result, result_event, result_event_list, url_with_issue, soup)
    
    return result, result_event, result_event_list, big_coord_boxes, url_with_issue

SyntaxError: expected ':' (3954926735.py, line 1)