# Community full board scraper

All community boards have separate sites for their minutes.

> I watched Soma's youtube tutorial https://www.youtube.com/watch?v=QNKxzkNpsko


I chatted with Soma, and he said he imagined each community board having its own scraper. He also said it's better to have a minimum viable product, since that looks more impressive on your CV than attempting something and learning from it, even if it fails.

## Setup: Import what you'll need to scrape the page

The original plan was to use Playwright for this, *not* requests. Note that the code cells below currently use `requests` together with BeautifulSoup.

## Starting your search

Starting from [here](https://www.nyc.gov/site/bronxcb1/calendar/board-meeting-minutes.page), search for all community board meeting minutes from Bronx Community Board 1. (Note: the code cell below currently points at the Manhattan Community Board 4 archive instead.)

# Finding all minutes

In [16]:
import os
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

# Root of the archive section; every page to scrape lives directly beneath it.
base_url = "https://cbmanhattan.cityofnewyork.us/cb4/archives/"

# One path segment ("slug") per archive page that should be visited.
years_to_scrape = [
    "full-board-minutes-audio",
    "2024-full-board-minutes-audio",
    "2023-full-board-minutes-audio",
    "2022-full-board-minutes-audio",
    "2021-full-board-minutes-audio",
    "2019-full-board-minutes-audio",
    "2018-full-board-minutes-audio",
]

# Full page addresses: archive root + slug + trailing slash.
urls_to_scrape = [base_url + slug + "/" for slug in years_to_scrape]

# Make sure the PDF download directory exists (no error if it already does).
folder_name = "Manhattan_CB4_PDFs"
os.makedirs(folder_name, exist_ok=True)

# One shared HTTP session that sends a desktop-browser User-Agent on every
# request, plus the list that collects one dict per scraped link.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
scraped_data = []

# Function to scrape webpage
def scrape_webpage(url):
    """Fetch one page and record every hyperlink found on it.

    For each ``<a>`` tag that has an ``href`` attribute, one dict of the form
    ``{"Date": <stripped link text>, "URL": <absolute link>}`` is appended to
    the module-level ``scraped_data`` list.

    Relative links are resolved against ``url`` (the page they were found
    on) with ``urljoin``. Plain string concatenation onto ``base_url``
    produced broken addresses for root-relative (``/x``), fragment-only
    (``#x``) and ``mailto:`` links, and resolved page-relative links against
    the wrong directory.

    Args:
        url: Address of the page to fetch with the shared ``session``.

    Returns:
        None. Timeouts, other request errors and non-200 responses are
        reported with ``print`` and otherwise ignored, so one bad page does
        not abort the whole run.
    """
    from urllib.parse import urljoin

    try:
        response = session.get(url, timeout=10)
    except requests.exceptions.Timeout:
        print(f"Timeout occurred for URL: {url}")
        return
    except requests.exceptions.RequestException as e:
        print(f"Error occurred for URL {url}: {e}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch webpage {url}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a", href=True):
        # urljoin leaves absolute links untouched and resolves relative ones
        # the way a browser would for a link found on `url`.
        href = urljoin(url, link["href"].strip())
        text = link.text.strip()
        scraped_data.append({"Date": text, "URL": href})

# Scrape all URLs
for url in urls_to_scrape:
    scrape_webpage(url)
    time.sleep(2)  # Delay between requests

# Save to CSV
# Naming the columns explicitly means the "Date,URL" header is written even
# when nothing was scraped; a header-less empty file would make a later
# pd.read_csv() of this CSV fail.
df = pd.DataFrame(scraped_data, columns=["Date", "URL"])
# The same link can be collected from more than one archive page (presumably
# shared header/navigation links); keep a single copy of identical rows so
# later steps do not process them repeatedly.
df = df.drop_duplicates(ignore_index=True)
df.to_csv("Manhattan_CB4.csv", index=False)
print("Scraped data saved to 'Manhattan_CB4.csv'.")


Scraped data saved to 'Manhattan_CB4.csv'.


In [18]:
import pandas as pd
# Rebuild the table from the in-memory scrape results (not from the saved
# CSV) and display its first rows as a quick sanity check.
df = pd.DataFrame(scraped_data)
df.head()


Unnamed: 0,Date,URL
0,,http://www1.nyc.gov/
1,311,http://www1.nyc.gov/311/index.page
2,Search all NYC.gov websites,http://www1.nyc.gov/home/search/index.page
3,,https://cbmanhattan.cityofnewyork.us/cb4/
4,Skip to content,https://cbmanhattan.cityofnewyork.us/cb4/archi...


# Export the df 

I took this code from the internet and fixed it up; the original tutorial is [here](https://www.geeksforgeeks.org/downloading-pdfs-with-python-using-requests-and-beautifulsoup/).

In [9]:
import os
import requests
import pandas as pd
import pdfplumber
import time

# Download every document listed in the link table, extract its text with
# pdfplumber, and save the table with an added "Content" column.
from urllib.parse import urlparse  # used only by this cell

# Load the DataFrame
# NOTE(review): the scraping cell in this notebook writes "Manhattan_CB4.csv";
# confirm that "Manhattan_CB1.csv" is really the intended input here.
input_file = "Manhattan_CB1.csv"
df = pd.read_csv(input_file)

# Folder to store downloaded PDFs
folder_name = "Manhattan_CB1_PDFs"
os.makedirs(folder_name, exist_ok=True)

# One entry per DataFrame row (extracted text or a sentinel message), plus the
# names of all files that need a manual look.
content_list = []
failed_files = []


def _download_pdf(url, file_path):
    """Download ``url`` to ``file_path``; raise on timeouts and HTTP errors."""
    response = requests.get(url, timeout=30)
    # Without this check an HTTP error page would be saved as if it were the
    # PDF and then be treated as "already downloaded" on every later run.
    response.raise_for_status()
    with open(file_path, "wb") as pdf_file:
        pdf_file.write(response.content)


def _extract_pdf_text(file_path):
    """Return the text of every page of the PDF, pages separated by newlines."""
    with pdfplumber.open(file_path) as pdf:
        # Joining with "\n" keeps the last word of one page from fusing with
        # the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['URL']
    date = row['Date']
    file_name = ""  # only known once the URL has been parsed successfully

    try:
        print(f"Processing: {date} ({url})")
        # Take the name from the URL *path* so query strings and fragments
        # ("file.pdf?ver=2") do not end up in the local file name.
        file_name = os.path.basename(urlparse(url).path)
        if not file_name:
            raise ValueError("URL does not point to a file")
        file_path = os.path.join(folder_name, file_name)

        # Download the PDF if not already downloaded
        if not os.path.exists(file_path):
            _download_pdf(url, file_path)
            print(f"Downloaded: {file_path}")
            # Delay to prevent scraping too quickly; only needed when a
            # request was actually made.
            time.sleep(2)

        # Extract text using pdfplumber
        try:
            pdf_text = _extract_pdf_text(file_path)
        except Exception as e:
            # Broad on purpose: any parser failure should mark this one file
            # as failed instead of stopping the whole run.
            print(f"pdfplumber failed for {file_name}: {e}")
            failed_files.append(file_name)
            pdf_text = "Error extracting content"

        # Log files that could not be processed
        if not pdf_text.strip():
            failed_files.append(file_name)
            pdf_text = "Manual review required"

        content_list.append(pdf_text.strip())

    except Exception as e:
        # Per-row boundary: record the failure and keep content_list aligned
        # with the DataFrame rows.
        print(f"Error processing {date}: {e}")
        failed_files.append(file_name or str(url))
        content_list.append("Error extracting content")

# Add the content as a new column in the DataFrame
df['Content'] = content_list
df.to_csv("Manhattan_CB1_with_content.csv", index=False)

# Save failed files for manual review
with open("failed_files.log", "w", encoding="utf-8") as log_file:
    log_file.write("\n".join(failed_files))

print("Processing complete. Check 'Manhattan_CB1_with_content.csv' and 'failed_files.log'.")


MissingSchema: Invalid URL 'URL_OF_THE_WEBPAGE': No schema supplied. Perhaps you meant http://URL_OF_THE_WEBPAGE?

# Print PDF 

I used [this page](https://jonathansoma.com/everything/pdfs/ocr-tools/) from Soma's website.

In [5]:
import pandas as pd

# Load the CSV that holds the extracted PDF text.
# NOTE(review): this is an absolute path on one user's machine; presumably it
# is the "Manhattan_CB1_with_content.csv" written by the extraction cell —
# confirm, and consider a path relative to the notebook.
csv_path = "/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB1_with_content.csv"
data = pd.read_csv(csv_path)

# Print the column labels to check that the file loaded with the expected columns.
print(data.columns)

Index(['Date', 'URL', 'Content'], dtype='object')
