### StreetEasy Mott Haven Buildings, Attempt 1

In this notebook, we will be scraping the [list of buildings in Mott Haven](https://streeteasy.com/buildings/mott-haven) from the StreetEasy website. The details we need are the following:

* building name
* address
* coordinates (for mapping)
* year it was built
* number of stories
* number of units
* link to individual pages (which we will use to get more details)

In [1]:
# importing libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from random import randrange
import time
import numpy as np

In [2]:
# trial: one plain request with the default `requests` headers, to see how the site responds

url = "https://streeteasy.com/buildings/mott-haven"
# timeout matches the 30 seconds used by the main scrape, so this cell cannot hang indefinitely
response = requests.get(url, timeout=30)

In [3]:
# status of the plain trial request; a 403 here is why the scrape below sends browser-like headers
response.status_code

403

### Excerpt of the page source that we want to target:

```
<li id="building_6257250" class="item building" data-id="6257250" se:behavior='selectable hoverable clickable rememberable mappable' se:map='map'se:map:point='40.8078,-73.9291' data-blockindex='0'>
  <div class="photo" se:behavior='selectable hoverable clickable rememberable mappable' se:map='map'se:map:point='40.8078,-73.9291'  se:map='map'se:map:point='40.8078,-73.9291'>
      <span id="saved_banner_6257250"></span>
    <a href="/building/the-crescendo"><img alt="The Crescendo  at 25 Bruckner Boulevard in Mott Haven" class="performance-marked" data-performance-mark="search.Buildings.listingImageVisible" src="https://photos.zillowstatic.com/fp/014287801b4057e345148514ce7e04da-se_medium_500_250.webp" /></a>
  </div>

  <div class="details row">
    <h2 class="details-title">
      <a se:clickable:target="true" href="/building/the-crescendo">The Crescendo </a>

        <span id="buttons_6257250"></span>

            <div class="se_embed_react" data-se-entry="userAuth" data-se-component="UserAuthModal" data-se-id="se_embed_react_eb19c5f5-880f-429b-b611-3ab94295952c" data-react-component=""></div>
    <script>
    window["se_embed_react_eb19c5f5-880f-429b-b611-3ab94295952c"] = ["UserAuthModal",{}]
    </script>

    </h2>

    <ul>
        <li>At 25 Bruckner Boulevard</li>
        <li class="price-info">
          <span class='price'>4 active rentals</span>
        </li>

        <li>
          <ul class="details_info"><li class="detail_cell">130 units</li><li class="detail_cell">6 stories</li><li class="detail_cell">built in 2017</li></ul>
        </li>
        <li>
          <div class="details_info"><span class="detail_cell">Rental Building in Mott Haven</span></div>
        </li>
        <li id="saved_section_6257250">
          <span class="u-visuallyHidden">saved_section</span>
        </li>
    </ul>
  </div>
</li>
```

## Defining functions

In [4]:
# function to check status of pages

def status_check(response):
    '''
    Tells us whether a requested page came back with a success status.

    Parameter:
    response = the object returned by requests.get(url); only its
               status_code attribute is read

    Returns:
    True when status_code is between 200 and 299 (inclusive), False otherwise.
    A short message is printed in both cases.
    '''
    code = response.status_code

    # anything outside the 2xx range means we stop here
    if code < 200 or code > 299:
        print(f"Error {code}. We can't proceed.")
        return False

    print("Page is accessible. Scraping begins...")
    return True

In [5]:
# snoozer

def snoozer(start_time, end_time):
    '''
    Pauses the notebook for a randomly picked whole number of seconds,
    so that consecutive requests are not evenly spaced.
    It requires `from random import randrange` and `import time`.

    Parameters:
    start_time (int) = shortest possible pause, in seconds (inclusive)
    end_time (int) = upper bound of the pause, in seconds (exclusive,
                     because randrange never returns its stop value)
    '''
    pause_seconds = randrange(start_time, end_time)

    # announce first, so the wait is visible while it is happening
    print(f"Snoozing for {pause_seconds} seconds...")
    time.sleep(pause_seconds)

    # blank line keeps the log of each page visually separate
    print()

In [6]:
# headers to get through that 403 error

def get_session():
    '''
    Builds a requests.Session whose default headers imitate a desktop
    Chrome browser, and returns it. Every request made through the
    returned session carries these headers.
    '''
    # NOTE(review): "br" is advertised below; requests can only decode a Brotli
    # response when a Brotli package is installed -- confirm for this environment.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    browser_session = requests.Session()
    browser_session.headers.update(browser_headers)
    return browser_session

### Let's try this!

In [7]:
# initializing variables
base_url = "https://streeteasy.com/buildings/mott-haven"
end_page = 45 # number of result pages to request
max_retries = 3 # attempts per page before the page is logged as an error
errors_list = [] # holds pages with errors
all_data = [] # holds all captured data

session = get_session()

# initializing data dictionary (one list per column; all lists must stay the same length)
data_dict = {
    "building_name": [],
    "link": [],
    "address": [],
    "coordinates": [],
    "total_units": [],
    "total_stories": [],
    "year_built": []
}


def _parse_building(target):
    '''
    Extracts the fields of one building from its <li class="item building"> tag.

    Parameter:
    target = a BeautifulSoup tag for a single building entry

    Returns:
    A dict with exactly the same keys as data_dict. Any field that is
    missing or cannot be parsed is np.nan, so this function does not
    raise just because part of an entry is absent.
    '''
    title_tag = target.find("h2", class_="details-title")

    # the first <a> may be missing, so look it up before asking for its href
    link_tag = target.find("a")
    href = link_tag.get("href") if link_tag else None

    # the address is expected in the first <li> of the first <ul>, prefixed with "At "
    first_list = target.find("ul")
    address_tag = first_list.find("li") if first_list else None
    if address_tag and "At " in address_tag.get_text():
        address = address_tag.get_text(strip=True).replace("At ", "")
    else:
        address = np.nan

    record = {
        "building_name": title_tag.get_text(strip=True).replace("SAVE", "") if title_tag else np.nan,
        "link": "https://streeteasy.com" + href if href else np.nan,
        "address": address,
        "coordinates": target.get("se:map:point") or np.nan,
        "total_units": np.nan,
        "total_stories": np.nan,
        "year_built": np.nan,
    }

    # units / stories / year live in separate cells whose order is not assumed
    other_details = target.find("ul", class_="details_info")
    detail_cells = other_details.find_all("li", class_="detail_cell") if other_details else []
    for detail in detail_cells:
        text = detail.get_text(strip=True)
        words = text.split()
        if "unit" in text: # matches both "1 unit" and "130 units"
            column, token = "total_units", words[0]
        elif "stories" in text or "story" in text:
            column, token = "total_stories", words[0]
        elif "built in" in text:
            column, token = "year_built", words[-1]
        else:
            continue

        try:
            # drop thousands separators so a value like "1,200" still parses
            record[column] = int(token.replace(",", ""))
        except ValueError:
            # not a whole number: keep whatever the column already holds (np.nan by default)
            continue

    return record


for page_num in range(1, end_page + 1):
    print(f"Attempting to scrape page {page_num} of {end_page} pages...")

    # request URL (page 1 has no ?page= parameter); built outside the try
    # so the error handler below can always refer to it
    url = base_url if page_num == 1 else f"{base_url}?page={page_num}"

    try:
        # retry mechanisms
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=30)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise # out of retries: let the outer handler log this page
                snoozer(10, 15)  # longer wait between retries

        soup = BeautifulSoup(response.content, "html.parser")

        # extracting data
        target_items = soup.find_all("li", class_="item building")
        print(f"Items found on this page: {len(target_items)}")

        # in case of errors
        if not target_items:
            print(f"Warning: No building data found on page {page_num}.")
            errors_list.append(url)
            continue

        # parse the whole page first and only then store it, so a page that fails
        # halfway never leaves partial rows or columns of different lengths behind
        page_records = [_parse_building(target) for target in target_items]
        for record in page_records:
            for column, value in record.items():
                data_dict[column].append(value)

        print(f"Successfuly scraped page {page_num}!")

    except Exception as e:
        errors_list.append(url)
        print(f"Error '{e}' was found on {url}, page {page_num} of {end_page} pages. Moving to next scrape...")

    finally:
        # pause between pages (also after a failed or empty page), but not after the last one
        if page_num <= end_page - 1:
            snoozer(53, 132)

# saving to df
df = pd.DataFrame(data_dict)
all_data.append(df)

Attempting to scrape page 1 of 45 pages...
Items found on this page: 11
Successfuly scraped page 1!
Snoozing for 107 seconds...

Attempting to scrape page 2 of 45 pages...
Items found on this page: 11
Successfuly scraped page 2!
Snoozing for 58 seconds...

Attempting to scrape page 3 of 45 pages...
Items found on this page: 11
Successfuly scraped page 3!
Snoozing for 77 seconds...

Attempting to scrape page 4 of 45 pages...
Items found on this page: 11
Successfuly scraped page 4!
Snoozing for 88 seconds...

Attempting to scrape page 5 of 45 pages...
Items found on this page: 11
Successfuly scraped page 5!
Snoozing for 122 seconds...

Attempting to scrape page 6 of 45 pages...
Items found on this page: 11
Successfuly scraped page 6!
Snoozing for 76 seconds...

Attempting to scrape page 7 of 45 pages...
Items found on this page: 11
Successfuly scraped page 7!
Snoozing for 125 seconds...

Attempting to scrape page 8 of 45 pages...
Items found on this page: 11
Successfuly scraped page 8!
S

In [8]:
# stacking every captured DataFrame into our final df, with a fresh 0..n-1 index

final_df = pd.concat(objs=all_data, ignore_index=True)
final_df.shape[0]

488

In [9]:
# writing the final df to a UTF-8 CSV file, leaving out the index column
final_df.to_csv(
    path_or_buf="mott-haven-streeteasy-buildings.csv",
    index=False,
    encoding="UTF-8",
)

## Failed attempt

In [None]:
# ## MAIN CODE, DON'T RUN 

# base_url = "https://streeteasy.com/buildings/mott-haven"
# end_page = 45 # number of pages we want to scrape
# errors_list = [] # holds pages with errors
# main_df = [] # holds all captured lists

# for page_num in range(1, end_page + 1):
#     print(f"Page {page_num}:")
    
#     ## requesting URLs
#     if page_num != 1: # not the first page
#         response = requests.get(f"{base_url}?page={page_num}")
#     else: # this is the first page
#         response = requests.get(base_url)
#     if not status_check(response):
#         errors_list.append(f"{base_url}?page={page_num}")
        
#     ## soupifying the response
#     soup = BeautifulSoup(response.text, "html.parser")
    
#     # ## extracting data
#     # target_items = soup.find_all("li", class_="item building")

#     # building_names = [ target.find("h2", class_="details-title").get_text(strip=True) for target in target_items ]
#     # building_links = [ "https://streeteasy.com" + target.find("a").get("href") for target in target_items]
#     # building_addresses = [ target.find("ul").find("li").get_text(strip=True) for target in target_items ]
#     # building_latlng = [ target.get("se:map:point") for target in target_items]

#     # ## other data held separately

#     # for target in target_items:        
#     #     other_details = target.find("ul", class_="details_info").find_all("li", class_="detail_cell")        
#     #     building_units = [ other_details[0].replace(" units", "") ]
#     #     building_stories = [ other_details[1].replace(" stories", "") ]
#     #     building_year = [ other_details[2].replace("built in", "") ]

#     # ## create df to hold all data

#     # main_df.append(pd.DataFrame({ "building_name": building_names,
#     #                              "link": building_links,
#     #                              "address": building_addresses,
#     #                              "coordinates": building_latlng,
#     #                              "year_built": building_year,
#     #                              "total_stories": building_stories,
#     #                              "total_units": building_units
#     #                             }))

#     snoozer(21, 56)
        
# print(f"Done scraping {end_page} pages!")  