## MNS rental market reports

In this notebook, we will be downloading monthly and year-end rental market reports for Bronx from [MNS](https://www.mns.com/bronx_rental_market_report). The files are in PDF format.

In [1]:
# importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import wget
from random import randrange
import time

In [2]:
# requesting URL
url = "https://www.mns.com/bronx_rental_market_report"
response = requests.get(url)
response

<Response [200]>

In [3]:
# soupifying HTML
soup = BeautifulSoup(response.text, "html.parser")

### Scraping, saving links to a `csv`

In [4]:
# getting all the links we want
# checking the source code, we find that they're all under the <select> tag with id="report_month"
# target_links = soup.find_all("select", id="report_month")

# but, we also saw that only the links we need are under the <option> tag
target_links = soup.find_all("option")
target_links

[<option value="select">Select report month</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_sep_24.pdf">September 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_aug_24.pdf">August 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_jul_24.pdf">July 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_jun_24.pdf">June 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_may_24.pdf">May 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_apr_24.pdf">April 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_mar_24.pdf">March 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_feb_24.pdf">February 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_market_report_jan_24.pdf">January 2024</option>,
 <option value="http://www.mns.com/pdf/bronx_year_end_market_report_2023.pdf">Year End 2023</option>,
 <option v

In [5]:
# we don't need the first item on the list
# so we're updating the list by slicing it
target_links = target_links[1:]

In [6]:
# getting only the links, which are inside the "value" attribute
# we're making two datasets: monthly reports and yearend reports

monthly_reports = [] # will hold our download links for monthly reports
monthly_list = [] # will hold our download links for monthly reports
yearend_reports = [] # will hold our download links for yearend reports
yearend_list = [] # will hold our download links for yearend reports

for link in target_links:
    if "Year End" in link.text:
        yearend_reports.append(link["value"])
        yearend_list.append(link.text)
    else:
        monthly_reports.append(link["value"])
        monthly_list.append(link.text)
print("Done!")

Done!


In [7]:
# printing to check if it worked
print(yearend_reports[0:2])
print(yearend_list[0:2])
print(monthly_reports[0:2])
print(monthly_list[0:2])

['http://www.mns.com/pdf/bronx_year_end_market_report_2023.pdf', 'http://www.mns.com/pdf/bronx_year_end_market_report_2022.pdf']
['Year End 2023', 'Year End 2022']
['http://www.mns.com/pdf/bronx_market_report_sep_24.pdf', 'http://www.mns.com/pdf/bronx_market_report_aug_24.pdf']
['September 2024', 'August 2024']


In [8]:
# saving the yearend reports into df, then csv for safekeeping

yearend_data = [ {"period": year, "link": link} for (year, link) in zip(yearend_list, yearend_reports) ]
bx_yearend = pd.DataFrame(yearend_data)
bx_yearend.to_csv("bx_yearend_reports.csv", encoding="UTF-8", index=False)

In [9]:
# saving the monthly reports into df, then csv for safekeeping

monthly_data = [ {"period": year, "link": link} for (year, link) in zip(monthly_list, monthly_reports) ]
bx_monthly = pd.DataFrame(monthly_data)
bx_monthly.to_csv("bx_monthly_reports.csv", encoding="UTF-8", index=False)

### Defining functions to make downloading easier for us

In [10]:
# downloading the files

def downloader(links_list, folder_name):
    '''
    This function downloads each of the files in a list of links and saves it to the folder identified. 
    It requires `import os` to run.
    Initialize `counter = 0` and `errors_list = []` before running function.
    
    Parameters:
    links_list(list) = list of URLs for download
    folder_name (str) = name of folder where files will be saved; this will be created if it does not exist
    '''
    os.makedirs(folder_name, exist_ok=True)

    print(f"Downloading link {counter} of {len(links_list)}...")
    try:
        wget.download(item, out=folder_name)
    except Exception as e:
        errors_list.append((item, e))
        print(f"Something went wrong with link {counter} due to {e}.")

In [11]:
# snoozer

def snoozer(start_time, end_time):
    '''
    This function creates a snoozer that can be used when scraping.
    It requires `from random import randrange` and `import time`. 
    
    Parameters: 
    start_time (int) = start time of range, in seconds
    end_time (int) = end time of range, in seconds
    '''
    timer = randrange(start_time, end_time)
    print("") # adds a line break for readability
    print(f"Snoozing for {timer} seconds...")
    time.sleep(timer)

### Now, the downloading actually happens!

In [12]:
# downloading actual PDFs of yearend reports

counter = 0
errors_list = []

for item in yearend_reports:
    counter += 1
    downloader(yearend_reports, "yearend-reports")
    if counter <= len(yearend_reports) - 1: # so it does not snooze when the downloads are all done
        snoozer(12, 34)
    print("") # adds a line break per item downloaded    
print("Downloads done!")

Downloading link 1 of 4...
100% [......................................................] 7731223 / 7731223
Snoozing for 22 seconds...

Downloading link 2 of 4...
100% [......................................................] 7724989 / 7724989
Snoozing for 14 seconds...

Downloading link 3 of 4...
100% [......................................................] 7721986 / 7721986
Snoozing for 32 seconds...

Downloading link 4 of 4...
100% [......................................................] 8759323 / 8759323
Downloads done!


In [13]:
# downloading actual PDFs of monthly reports

counter = 0
errors_list = []

for item in monthly_reports:
    counter += 1
    downloader(monthly_reports, "monthly-reports")
    if counter <= len(monthly_reports) - 1: # so it does not snooze when the downloads are all done
        snoozer(12, 34)
    print("") # adds a line break per item downloaded   
print("Downloads done!")

Downloading link 1 of 65...
100% [......................................................] 6110141 / 6110141
Snoozing for 21 seconds...

Downloading link 2 of 65...
100% [......................................................] 6073250 / 6073250
Snoozing for 29 seconds...

Downloading link 3 of 65...
100% [......................................................] 5987964 / 5987964
Snoozing for 28 seconds...

Downloading link 4 of 65...
100% [......................................................] 5990148 / 5990148
Snoozing for 22 seconds...

Downloading link 5 of 65...
100% [......................................................] 5933668 / 5933668
Snoozing for 24 seconds...

Downloading link 6 of 65...
100% [......................................................] 5951542 / 5951542
Snoozing for 18 seconds...

Downloading link 7 of 65...
100% [......................................................] 5993589 / 5993589
Snoozing for 16 seconds...

Downloading link 8 of 65...
100% [..............

#### Longer version of the code that I used a basis for my functions:

In [None]:
# # downloading actual PDFs of yearend reports

# # creating a new folder to hold our yearend reports
# downloads_folder = "yearend-reports"
# os.makedirs(downloads_folder, exist_ok=True)

# # other variables we need
# counter = 0
# errors_list = [] # will hold any problematic link
# start_range, end_range = 12, 31

# # downloading
# for report in yearend_reports:
#     counter += 1
#     print(f"Downloading link {counter} of {len(yearend_reports)}")
#     try:
#         wget.download(report, out=downloads_folder)
#     except Exception as e:
#         errors_list.append((report, e))
#         print(f"Something went wrong in link {counter} due to {e}.")
#     snoozer(start_range, end_range)
# print("Downloads done!")