### StreetEasy Mott Haven Buildings, Attempt 2

Previously, we attempted to scrape the [list of buildings in Mott Haven](https://streeteasy.com/buildings/mott-haven) from the StreetEasy website with `BeautifulSoup`. However, the page seems to use JavaScript to load its content dynamically, so this time we'll try scraping it with `Playwright`, a browser-automation library that renders the page in a real (optionally headless) browser first.

The details we need are the following:

* building name
* address
* coordinates (for mapping)
* year it was built
* number of stories
* number of units
* link to individual pages (which we will use to get more details)

In [1]:
# importing libraries
import pandas as pd
from playwright.async_api import async_playwright
import asyncio
from bs4 import BeautifulSoup
from random import randrange
import time
import numpy as np

In [2]:
# snoozer

def snoozer(start_time, end_time):
    '''
    Pauses the scraper for a randomly chosen whole number of seconds
    and announces how long the pause will be.
    It relies on `from random import randrange` and `import time`.

    Parameters:
    start_time (int) = lowest pause that can be picked, in seconds
    end_time (int) = upper limit of the pause, in seconds (`randrange` never picks this value itself)
    '''
    pause_seconds = randrange(start_time, end_time)
    print(f"Snoozing for {pause_seconds} seconds...")
    time.sleep(pause_seconds)
    print() # emits an empty line so consecutive log entries are easier to read

### Apparently, it's more convenient to `soup`-ify Playwright `page.content()`

This worked... until I was denied access to the webpage. (I used a VPN!)

As possible solutions, I found an article about making `Playwright` [undetectable](https://scrapingant.com/blog/playwright-scraping-undetectable) and a list of [free proxies](https://free-proxy-list.net) that could be set up.

In [3]:
## SOMA VERSION

base_url = "https://streeteasy.com/buildings/mott-haven"
end_page = 45 # number of pages we want to scrape
### when this code works, this should be changed back to 45.
errors_list = [] # holds pages with errors
all_data = [] # holds all captured lists

# initializing lists
building_names = []
building_links = []
building_addresses = []
building_latlng = []
building_year = []
building_stories = []
building_units = []

# starting playwright   
playwright = await async_playwright().start()
browser = await playwright.firefox.launch(headless=False) # False because I want to see it load the page
context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
page = await context.new_page()

for page_num in range(1, end_page + 1):     
    print(f"Attempting to scrape page {page_num}...")
        
    try:
        ## requesting URLs
        if page_num != 1:
            url = f"{base_url}?page={page_num}"
        else:
            url = base_url

        await page.goto(url, timeout=120000)
        
        soup = BeautifulSoup(await page.content(), "html.parser") # Playwright -> BeautifulSoup is easier?
        
        # extracting data
        target_items = soup.find_all("li", class_="item building")
        
        building_names = [ target.find("h2", class_="details-title").get_text(strip=True).replace("SAVE", "") for target in target_items ]
        building_links = [ "https://streeteasy.com" + target.find("a").get("href") for target in target_items ]
        building_latlng = [ target.get("se:map:point") for target in target_items ]
    
        ## other data held separately

        for target in target_items:
            # initializing placeholder variables
            address_tag = target.find("ul").find("li")
            if "At " in address_tag.get_text():
                addresses = address_tag.get_text(strip=True).replace("At ", "")
            else:
                addresses = np.nan # this temporarily places NaN, will be replaced if `detail` is found
            
            other_details = target.find("ul", class_="details_info")
            units = stories = year = np.nan # this temporarily places NaN, will be replaced if `detail` is found
            
            if other_details: 
                for detail in other_details.find_all("li", class_="detail_cell"):
                    text = detail.get_text(strip=True)
                    if "units" in text:
                        units = int(text.split()[0]) 
                    elif "stories" in text:
                        stories = int(text.split()[0]) 
                    elif "built in" in text:
                        year = int(text.split()[-1]) 
                        
            # appending lists        
            building_addresses.append(addresses)
            building_units.append(units)
            building_stories.append(stories)
            building_year.append(year)
        
        print(f"Successfuly scraped page {page_num}!")
        
    except Exception as e:
        errors_list.append(url)
        print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")
    
    finally:
        # checking to see if they all have the same range
        print(len(building_names))
        print(len(building_links))
        print(len(building_addresses))
        print(len(building_latlng))
        print(len(building_year))
        print(len(building_stories))
        print(len(building_units))

        # create df to hold all data
        all_data.append(pd.DataFrame({ "building_name": building_names,
                                     "link": building_links,
                                     "address": building_addresses,
                                     "coordinates": building_latlng,
                                     "year_built": building_year,
                                     "total_stories": building_stories,
                                     "total_units": building_units
                                    }))
        
        if page_num <= end_page - 1:
            snoozer(32, 132)

    # closing playwright
    await browser.close()

print(f"Done scraping {page_num} of {end_page} pages!")

Attempting to scrape page 1...
Successfuly scraped page 1!
11
11
11
11
11
11
11
Snoozing for 34 seconds...

Attempting to scrape page 2...
Error 'Page.goto: Target page, context or browser has been closed' was found on https://streeteasy.com/buildings/mott-haven?page=2, page 2 of 45 pages. Moving to next scrape...
11
11
11
11
11
11
11
Snoozing for 95 seconds...

Attempting to scrape page 3...
Error 'Page.goto: Target page, context or browser has been closed' was found on https://streeteasy.com/buildings/mott-haven?page=3, page 3 of 45 pages. Moving to next scrape...
11
11
11
11
11
11
11
Snoozing for 80 seconds...

Attempting to scrape page 4...
Error 'Page.goto: Target page, context or browser has been closed' was found on https://streeteasy.com/buildings/mott-haven?page=4, page 4 of 45 pages. Moving to next scrape...
11
11
11
11
11
11
11
Snoozing for 117 seconds...

Attempting to scrape page 5...
Error 'Page.goto: Target page, context or browser has been closed' was found on https://s

In [4]:
# stacking the per-page DataFrames held in `all_data` into one final df
# (rows are renumbered from 0 instead of keeping each page's own index)

final_df = pd.concat(objs=all_data, axis=0, ignore_index=True)
final_df

Unnamed: 0,building_name,link,address,coordinates,year_built,total_stories,total_units
0,The Arches +NYC,https://streeteasy.com/building/the-arches-nyc,224 East 135th Street,"40.8100955,-73.9305333",2021,25.0,190
1,The Arches,https://streeteasy.com/building/the-arches,228 East 135th Street,"40.80996311,-73.93100657",2020,25.0,156
2,One38,https://streeteasy.com/building/one38-138-bruc...,138 Bruckner Boulevard,"40.80353387,-73.9207622",2024,12.0,447
3,Bruckner House,https://streeteasy.com/building/bruckner-house,40 Bruckner Boulevard,"40.80629384,-73.92722838",2023,12.0,365
4,Maven Mott Haven,https://streeteasy.com/building/maven-mott-haven,2413 Third Avenue,"40.80875296,-73.93145214",2023,27.0,200
...,...,...,...,...,...,...,...
490,445 Gerard Avenue,https://streeteasy.com/building/445-gerard-ave...,,"40.81754349,-73.9300955",2023,11.0,338
491,Third at Bankside,https://streeteasy.com/building/third-at-bankside,2401 Third Avenue,"40.80867363,-73.9319146",2021,,458
492,101 Bruckner Boulevard,https://streeteasy.com/building/101-bruckner-b...,,"40.8054816,-73.925894",2021,7.0,55
493,The Motto,https://streeteasy.com/building/the-motto,2455 Third Avenue,"40.8093456,-73.9296974",2023,23.0,264


In [5]:
# writing the final df out as a csv file (the row index is left out)

final_df.to_csv(
    path_or_buf="mott-haven-streeteasy-buildings.csv",
    index=False,
    encoding="UTF-8",
)

### This was the original Playwright code I've been working on... 

A recurring problem is that the site shows an access check and then denies access when we try to jump to the next page. I haven't run the scrapers yet.

In [None]:
# ## MAIN CODE
# # this is tweaked from the bs4 scraper version

# base_url = "https://streeteasy.com/buildings/mott-haven"
# end_page = 2 # number of pages we want to scrape
# ### for now, `end_page` is set to 2... but when this code works, it should be changed back to 45.
# errors_list = [] # holds pages with errors
# main_df = [] # holds all captured lists

# # starting playwright
# async def scraper():
#     '''
#     This function scrapes a page asynchronously.
#     '''
#     async with async_playwright() as playwright:
#         browser = await playwright.firefox.launch(headless=False) # False because I want to see it load the page
#         context = await browser.new_context()
#         page = await context.new_page()
    
#         try:
#             for page_num in range(1, end_page + 1):
#                 print(f"Attempting to scrape page {page_num}...")
                
#                 ## requesting URLs
#                 if page_num != 1:
#                     url = f"{base_url}?page={page_num}"
#                 else:
#                     url = base_url
#                 await page.goto(url)
#                 await page.wait_for_load_state('networkidle')
        
#                 ## extracting data
#                 # target_items = await page.query_selector_all("li.item.building")
            
#                 # building_names = [ await target.query_selector("h2.details-title").inner_text() for target in target_items ]
#                 # building_links = [ "https://streeteasy.com" + await query_selector("a").get_attribute("href") for target in target_items]
#                 # building_addresses = [ await target.query_selector("ul li").inner_text() for target in target_items ]
#                 # building_latlng = [ await target.get_attribute("se:map:point") for target in target_items]
            
#                 # ## other data held separately
#                 # for target in target_items:        
#                 #     other_details = await target.query_selector("ul.details_info li.detail_cell")        
#                 #     building_units = [ await other_details[0].inner_text().replace(" units", "").strip() ]
#                 #     building_stories = [ await other_details[1].inner_text().replace(" stories", "").strip() ]
#                 #     building_year = [ await other_details[2].inner_text().replace("built in", "").strip() ]
    
#                 print(f"Successfuly scraped page {page_num}!")
                
#         except Exception as e:
#             errors_list.append(url)
#             print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")
            
#         finally:
#             # ## create df to hold all data
#             # main_df.append(pd.DataFrame({ "building_name": building_names,
#                                          # "link": building_links,
#                                          # "address": building_addresses,
#                                          # "coordinates": building_latlng,
#                                          # "year_built": building_year,
#                                          # "total_stories": building_stories,
#                                          # "total_units": building_units
#                                         # }))
            
#             snoozer(21, 56)
    
#             # closing playwright
#             await browser.close()
#             print(f"Done scraping {end_page} pages!") 

# nest_asyncio.apply()
# asyncio.run(scraper())