### StreetEasy Mott Haven Buildings, Attempt 2

Previously, we attempted to scrape the [list of buildings in Mott Haven](https://streeteasy.com/buildings/mott-haven) from the StreetEasy website with `BeautifulSoup`. However, the page seems to use JS to load its content dynamically, so this time we'll try scraping it with `Playwright`, a library that drives a real browser.

The details we need are the following:

* building name
* address
* coordinates (for mapping)
* year it was built
* number of stories
* number of units
* link to individual pages (which we will use to get more details)

In [104]:
# importing libraries
import pandas as pd
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from random import randrange
import time
import numpy as np

In [2]:
# snoozer

def snoozer(start_time, end_time):
    '''
    Pauses the scraper for a randomly picked whole number of seconds.
    It requires `from random import randrange` and `import time`.

    Parameters:
    start_time (int) = shortest possible pause, in seconds
    end_time (int) = upper bound of the pause, in seconds
                     (`randrange` never picks `end_time` itself)
    '''
    pause_seconds = randrange(start_time, end_time)
    print(f"Snoozing for {pause_seconds} seconds...")
    time.sleep(pause_seconds)
    print() # blank line so the next log message is easier to read

### Apparently, it's more convenient to `soup`-ify Playwright `page.content()`

This worked... until I was denied access to the webpage. (I used a VPN!)

In [None]:
## SOMA VERSION

base_url = "https://streeteasy.com/buildings/mott-haven"
end_page = 1 # number of pages we want to scrape
### for now, `end_page` is set to 1... but when this code works, it should be changed back to 45.
errors_list = [] # holds pages with errors
all_data = [] # holds all captured lists
    
for page_num in range(1, end_page + 1):
    # starting playwright
    playwright = await async_playwright().start()
    browser = await playwright.firefox.launch(headless=False) # False because I want to see it load the page
    page = await browser.new_page()
    
    print(f"Attempting to scrape page {page_num}...")
        
    try:
        ## requesting URLs
        if page_num != 1:
            url = f"{base_url}?page={page_num}"
        else:
            url = base_url
        await page.goto(url)
        soup = BeautifulSoup(await page.content()) # Playwright -> BeautifulSoup is easier?
    
        await playwright.stop() # closes the browser? in a way it skips the access check-in parts
        
        # extracting data
        target_items = soup.find_all("li", class_="item building")
        
        building_names = [ target.find("h2", class_="details-title").get_text(strip=True).replace("SAVE", "") for target in target_items ]
        building_links = [ "https://streeteasy.com" + target.find("a").get("href") for target in target_items]
        building_addresses = [ target.find("ul").find("li").replace("At ", "").get_text(strip=True) if "At " in target.find("ul").find("li").get_text()\
                              else np.nan for target in target_items ]
        building_latlng = [ target.get("se:map:point") for target in target_items]
    
        ## other data held separately

        # initializing lists, so they can exist outside the for loop below
        building_units = []
        building_stories = [] 
        building_year = []

        for target in target_items:
            other_details = target.find("ul", class_="details_info")
            if other_details: # this temporarily places NaN, will be replaced if `detail` is found
                units = np.nan
                stories = np.nan
                year = np.nan
            for detail in other_details.find_all("li", class_="detail_cell"):
                text = detail.text.strip()
                if "units" in text:
                    units = int(text.split()[0]) 
                elif "stories" in text:
                    stories = int(text.split()[0]) 
                elif "built in" in text:
                    year = int(text.split()[-1]) 
            # appending lists        
            building_units.append(units)
            building_stories.append(stories)
            building_year.append(year)
        
        print(f"Successfuly scraped page {page_num}!")
        
    except Exception as e:
        errors_list.append(url)
        print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")
    
    finally:
        # checking to see if they all have the same range
        print(len(building_names))
        print(len(building_links))
        print(len(building_addresses))
        print(len(building_latlng))
        print(len(building_year))
        print(len(building_stories))
        print(len(building_units))

        # create df to hold all data
        all_data.append(pd.DataFrame({ "building_name": building_names,
                                     "link": building_links,
                                     "address": building_addresses,
                                     "coordinates": building_latlng,
                                     "year_built": building_year,
                                     "total_stories": building_stories,
                                     "total_units": building_units
                                    }))
        
        if page_num <= end_page - 1:
            snoozer(21, 56)
    
print(f"Done scraping {page_num} of {end_page} pages!")

In [None]:
# converting this to our final df
# `all_data` holds one DataFrame per scraped page; stack them into a single table.

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
else:
    # `pd.concat` raises a ValueError on an empty list, so fall back to an
    # empty table that still has the expected columns
    print("No pages were scraped, so `final_df` is empty.")
    final_df = pd.DataFrame(columns=["building_name", "link", "address", "coordinates",
                                     "year_built", "total_stories", "total_units"])
final_df

In [105]:
# saving the file to csv
# writes `final_df` as UTF-8 text, leaving out the DataFrame index column

final_df.to_csv(
    "mott-haven-streeteasy-buildings.csv",
    index=False,
    encoding="UTF-8",
)

### This was the original Playwright code I've been working on... 

A recurring problem: when we try to jump to the next page, the site shows an access check-in and then denies access. I haven't run the scrapers yet.

In [None]:
## MAIN CODE
# this is tweaked from the bs4 scraper version
# Settings and result holders read by `scraper()`.
# NOTE: running this cell rebinds `base_url`, `end_page` and `errors_list`,
# which the SOMA VERSION cell also defines (so `errors_list` starts empty again).

base_url = "https://streeteasy.com/buildings/mott-haven"
end_page = 2 # number of pages we want to scrape
### for now, `end_page` is set to 2... but when this code works, it should be changed back to 45.
errors_list = [] # holds pages with errors (the URL that was being scraped when the error happened)
main_df = [] # holds all captured lists -- despite the name this is a plain list, meant to collect DataFrames

# page-by-page Playwright scraper (one browser session for all pages)
async def scraper():
    '''
    This function visits listing pages 1 through `end_page` asynchronously,
    using a single Firefox window for the whole run.

    Each page is handled in its own try/except, so a page that fails is
    recorded in `errors_list` and the loop really does move on to the next
    page. The scraper snoozes between page requests and always closes the
    browser at the end.

    It reads the globals `base_url` and `end_page`, appends failing URLs to
    `errors_list`, and needs `snoozer()` to be defined.
    The data extraction itself is still a commented-out draft (see the TODOs).
    '''
    async with async_playwright() as playwright:
        browser = await playwright.firefox.launch(headless=False) # False because I want to see it load the page
        try:
            context = await browser.new_context()
            page = await context.new_page()

            for page_num in range(1, end_page + 1):
                print(f"Attempting to scrape page {page_num}...")

                ## requesting URLs (the first page has no `?page=` query string)
                url = base_url if page_num == 1 else f"{base_url}?page={page_num}"

                try:
                    await page.goto(url)
                    await page.wait_for_load_state('networkidle')

                    ## extracting data
                    # TODO: draft below is not working yet. Known problems:
                    #   * `await x.query_selector(...).inner_text()` calls `.inner_text()` on the
                    #     un-awaited coroutine; it needs `await (await x.query_selector(...)).inner_text()`
                    #   * `building_links` is missing `target.` before `query_selector`
                    #   * `other_details[0]` needs `query_selector_all`, since `query_selector` returns one element
                    #   * the units/stories/year lists are overwritten on every loop instead of appended to
                    # target_items = await page.query_selector_all("li.item.building")

                    # building_names = [ await target.query_selector("h2.details-title").inner_text() for target in target_items ]
                    # building_links = [ "https://streeteasy.com" + await query_selector("a").get_attribute("href") for target in target_items]
                    # building_addresses = [ await target.query_selector("ul li").inner_text() for target in target_items ]
                    # building_latlng = [ await target.get_attribute("se:map:point") for target in target_items]

                    # ## other data held separately
                    # for target in target_items:
                    #     other_details = await target.query_selector("ul.details_info li.detail_cell")
                    #     building_units = [ await other_details[0].inner_text().replace(" units", "").strip() ]
                    #     building_stories = [ await other_details[1].inner_text().replace(" stories", "").strip() ]
                    #     building_year = [ await other_details[2].inner_text().replace("built in", "").strip() ]

                    # TODO: once the extraction above works, store each page's data:
                    # ## create df to hold all data
                    # main_df.append(pd.DataFrame({ "building_name": building_names,
                                                 # "link": building_links,
                                                 # "address": building_addresses,
                                                 # "coordinates": building_latlng,
                                                 # "year_built": building_year,
                                                 # "total_stories": building_stories,
                                                 # "total_units": building_units
                                                # }))

                    print(f"Successfuly scraped page {page_num}!")

                except Exception as e:
                    # log the failing page and keep going with the next one
                    errors_list.append(url)
                    print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")

                # pause between page requests, but not after the last page
                # (`snoozer` blocks with `time.sleep`; nothing else runs on the loop meanwhile)
                if page_num < end_page:
                    snoozer(21, 56)

            print(f"Done scraping {end_page} pages!")

        finally:
            # closing playwright's browser, even if something unexpected went wrong
            await browser.close()

# Runs the scraper to completion.
# NOTE(review): `nest_asyncio.apply()` is presumably here because the notebook
# kernel already has an event loop running, which `asyncio.run()` would
# otherwise refuse to nest inside -- TODO confirm (a plain `await scraper()`
# in the cell may be enough).
nest_asyncio.apply()
asyncio.run(scraper())