### StreetEasy Mott Haven Buildings, Attempt 2

Previously, we attempted to scrape the [list of buildings in Mott Haven](https://streeteasy.com/buildings/mott-haven) from the StreetEasy website with `BeautifulSoup`. However, the page seems to use JavaScript to load its content dynamically, so the HTML we fetched did not contain the data. This time, we'll try scraping with `Playwright`, a browser-automation library that can drive a (headless) browser and let the page render first.

The details we need are the following:

* building name
* address
* coordinates (for mapping)
* year it was built
* number of stories
* number of units
* link to individual pages (which we will use to get more details)

In [1]:
# importing libraries
import pandas as pd
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio
from random import randrange
import time

In [2]:
# snoozer

def snoozer(start_time, end_time):
    '''
    Pause the program for a randomly picked whole number of seconds.
    It relies on `randrange` (from `random`) and on the `time` module.

    Parameters:
    start_time (int) = shortest possible pause, in seconds
    end_time (int) = upper limit of the pause, in seconds (never picked itself)
    '''
    delay = randrange(start_time, end_time)
    message = f"Snoozing for {delay} seconds..."
    print(message)
    time.sleep(delay)
    print() # blank line so the output is easier to read

In [None]:
## MAIN CODE
# adapted from the earlier bs4 version of this scraper

# containers filled while the scraper runs
main_df = [] # collects everything that gets captured
errors_list = [] # URLs of the pages that raised an error

# what to scrape
end_page = 2 # how many listing pages to go through
base_url = "https://streeteasy.com/buildings/mott-haven"

# starting playwright
async def scraper():
    '''
    This function scrapes the building listing pages asynchronously.

    It opens a visible Firefox window through Playwright, loads pages 1 to
    `end_page` of `base_url` one after another, and waits for each page to
    reach the 'networkidle' load state. The data extraction itself is still
    disabled (see the TODO block below), so nothing is added to `main_df` yet.

    A page that raises an error is recorded in `errors_list` and skipped, so
    one bad page does not stop the remaining pages from being tried. Between
    two page loads the scraper pauses through `snoozer`.

    Returns:
    None
    '''
    async with async_playwright() as playwright:
        browser = await playwright.firefox.launch(headless=False) # False because I want to see it load the page

        try:
            context = await browser.new_context()
            page = await context.new_page()

            for page_num in range(1, end_page + 1):
                print(f"Attempting to scrape page {page_num}...")

                ## requesting URLs
                # the first page has no `page` query parameter
                url = base_url if page_num == 1 else f"{base_url}?page={page_num}"

                # the try/except sits INSIDE the loop so that a failure on one
                # page really does "move to the next scrape" instead of ending the run
                try:
                    await page.goto(url)
                    await page.wait_for_load_state('networkidle')

                    ## extracting data
                    # TODO: extraction is still disabled; draft kept for reference.
                    # NOTE(review): when enabling, `await` the query_selector call first
                    # and call inner_text()/get_attribute() on its result.
                    # target_items = await page.query_selector_all("li.item.building")

                    # building_names = [ await target.query_selector("h2.details-title").inner_text() for target in target_items ]
                    # building_links = [ "https://streeteasy.com" + await query_selector("a").get_attribute("href") for target in target_items]
                    # building_addresses = [ await target.query_selector("ul li").inner_text() for target in target_items ]
                    # building_latlng = [ await target.get_attribute("se:map:point") for target in target_items]

                    # ## other data held separately
                    # for target in target_items:
                    #     other_details = await target.query_selector("ul.details_info li.detail_cell")
                    #     building_units = [ await other_details[0].inner_text().replace(" units", "").strip() ]
                    #     building_stories = [ await other_details[1].inner_text().replace(" stories", "").strip() ]
                    #     building_year = [ await other_details[2].inner_text().replace("built in", "").strip() ]

                    # ## create df to hold all data
                    # main_df.append(pd.DataFrame({ "building_name": building_names,
                    #                              "link": building_links,
                    #                              "address": building_addresses,
                    #                              "coordinates": building_latlng,
                    #                              "year_built": building_year,
                    #                              "total_stories": building_stories,
                    #                              "total_units": building_units
                    #                             }))

                    print(f"Successfuly scraped page {page_num}!")

                except Exception as e:
                    # broad on purpose: any failure on a page is logged and skipped
                    errors_list.append(url)
                    print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")

                # pause BETWEEN page loads (not after the last one) to go easy on the site
                # NOTE: snoozer uses time.sleep, which blocks the event loop; that is
                # acceptable here only as long as this coroutine is the sole task running
                if page_num < end_page:
                    snoozer(21, 56)

            # only reached when the loop ran to the end
            print(f"Done scraping {end_page} pages!")

        finally:
            # closing playwright's browser, even if setting up the page failed
            await browser.close()

# nest_asyncio is applied before the scraper is started; presumably this is needed
# because the notebook already has an event loop running -- TODO confirm
nest_asyncio.apply()
# run the scraper coroutine until it finishes
asyncio.run(scraper())