# NYC federal funding, fiscal year 2025

From the list of [NYC preliminary budget reports](https://council.nyc.gov/budget/fy2025/) we scraped, we will now be downloading each of the PDFs.

In [1]:
# importing libraries
import requests
import pandas as pd
import os
from random import randrange
import time

In [2]:
# headers to get past the 403 error

def get_session():
    """
    This function creates a session with common headers.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    })
    return session

In [3]:
# snoozer

def snoozer(start_time, end_time):
    '''
    This function creates a snoozer that can be used when scraping.
    It requires `from random import randrange` and `import time`. 
    
    Parameters: 
    start_time (int) = start time of range, in seconds
    end_time (int) = end time of range, in seconds
    '''
    timer = randrange(start_time, end_time)
    print(f"Snoozing for {timer} seconds...")
    print("") # adds a line break for readability
    time.sleep(timer)

### Downloading the links saved on our `csv`

In [4]:
# reading the csv
df = pd.read_csv("preliminary-budget-reports-links.csv")
df.head(3)

Unnamed: 0,agency,link
0,February 2024 Economic and Tax Revenue Forecast,https://council.nyc.gov/budget/wp-content/uplo...
1,FY25 Financial Plan Overview,https://council.nyc.gov/budget/wp-content/uplo...
2,Financial Plan Overview Dashboard,https://council.nyc.gov/budget/wp-content/uplo...


In [5]:
# initializing
counter = 0
os.makedirs("pdfs", exist_ok=True)
errors_list = [] # holds errors
session = get_session()

# iterating each link for downloading
for pdf_link in df["link"]:
    counter += 1
    print(f"Downloading {counter} of {len(df['link'])} PDFs...")
    response = session.get(pdf_link)
    if 200 <= response.status_code < 400:
        try:
            filename = f"{counter}_{pdf_link.split('/')[-1]}"
            output_path = os.path.join("pdfs", filename)
            with open(output_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            errors_list.append((pdf_link, e))
            print(f"Something went wrong with PDF #{counter} due to {e}.")
    else:
        print(f"Failed to download: {pdf_link}")
    if counter <= len(df["link"]) - 1: # so it does not snooze when the downloads are all done
        snoozer(8, 24)

print("Done!")


Downloading 1 of 53 PDFs...
Snoozing for 15 seconds...

Downloading 2 of 53 PDFs...
Snoozing for 18 seconds...

Downloading 3 of 53 PDFs...
Snoozing for 14 seconds...

Downloading 4 of 53 PDFs...
Snoozing for 21 seconds...

Downloading 5 of 53 PDFs...
Snoozing for 8 seconds...

Downloading 6 of 53 PDFs...
Snoozing for 11 seconds...

Downloading 7 of 53 PDFs...
Snoozing for 17 seconds...

Downloading 8 of 53 PDFs...
Snoozing for 22 seconds...

Downloading 9 of 53 PDFs...
Snoozing for 14 seconds...

Downloading 10 of 53 PDFs...
Snoozing for 15 seconds...

Downloading 11 of 53 PDFs...
Snoozing for 10 seconds...

Downloading 12 of 53 PDFs...
Snoozing for 23 seconds...

Downloading 13 of 53 PDFs...
Snoozing for 11 seconds...

Downloading 14 of 53 PDFs...
Snoozing for 19 seconds...

Downloading 15 of 53 PDFs...
Snoozing for 17 seconds...

Downloading 16 of 53 PDFs...
Snoozing for 23 seconds...

Downloading 17 of 53 PDFs...
Snoozing for 21 seconds...

Downloading 18 of 53 PDFs...
Snoozing for