# Community full board scrapper 

All community boards have separate sites for their minutes 

> I watched Soma's youtube tutorial https://www.youtube.com/watch?v=QNKxzkNpsko


I chatted with Soma, and he said he imagined each community board to have one scrapper, and it's better to have a minimum viable product since it looks more impressive on your CV than attempting and learning something even if it fails. 

## Setup: Import what you'll need to scrape the page

We'll be using either Playwrightfor this, *not* requests.

## Starting your search

Starting from [here](https://www.nyc.gov/site/bronxcb1/calendar/board-meeting-minutes.page), search for all community board meeting minutes from Community Board 1 Bronx

# Finding all minutes

In [12]:
import os
import requests
import pandas as pd
import pdfplumber
import time
from bs4 import BeautifulSoup

# Base URL and webpage URL
base_url = "https://www.nyc.gov"
webpage_url = f"{base_url}/site/manhattancb1/archives/monthly-full-board-meeting-minutes.page"

# Folder to store downloaded PDFs
folder_name = "Manhattan_CB1_PDFs"
os.makedirs(folder_name, exist_ok=True)

# Initialize a list to store extracted data
scraped_data = []

# Scrape the webpage
response = requests.get(webpage_url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all divs with class "span4"
    divs = soup.find_all("div", class_="span4")
    
    for div in divs:
        # Extract the year from the <strong> tag
        year_tag = div.find("strong")
        year = year_tag.text.strip() if year_tag else "Unknown Year"

        # Extract all <a> tags
        links = div.find_all("a", href=True)
        for link in links:
            href = link["href"]
            text = link.text.strip()

            # Convert relative URLs to absolute URLs
            if not href.startswith("http"):
                href = f"{base_url}{href}"
            
            # Append the year before the existing text
            formatted_text = f"{year} {text}"
            scraped_data.append({"Date": formatted_text, "URL": href})
else:
    print(f"Failed to fetch webpage. Status code: {response.status_code}")

# Convert scraped data to a DataFrame
df = pd.DataFrame(scraped_data)

# Save the scraped data to CSV
df.to_csv("Manhattan_CB1.csv", index=False)
print("Scraped data saved to 'Manhattan_CB1.csv'.")

# Process each PDF in the DataFrame
content_list = []
failed_files = []

for index, row in df.iterrows():
    url = row["URL"]
    date = row["Date"]

    try:
        print(f"Processing: {date} ({url})")
        file_name = os.path.basename(url)
        file_path = os.path.join(folder_name, file_name)

        # Download the PDF if not already downloaded
        if not os.path.exists(file_path):
            response = requests.get(url)
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded: {file_path}")
        
        # Add a delay to prevent scraping too quickly
        time.sleep(2)

        # Extract text using pdfplumber
        pdf_text = ""
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    pdf_text += page.extract_text() or ""
        except Exception as e:
            print(f"pdfplumber failed for {file_name}: {e}")
            pdf_text = "Error extracting content"

        # Log files that could not be processed
        if not pdf_text.strip():
            failed_files.append(file_name)
            pdf_text = "Manual review required"

        content_list.append(pdf_text.strip())

    except Exception as e:
        print(f"Error processing {date}: {e}")
        content_list.append("Error extracting content")

# Add the content as a new column in the DataFrame
df["Content"] = content_list
df.to_csv("Manhattan_CB1_with_content.csv", index=False)

# Save failed files for manual review
with open("failed_files.log", "w") as log_file:
    log_file.write("\n".join(failed_files))

print("Processing complete. Check 'Manhattan_CB1_with_content.csv' and 'failed_files.log'.")


Scraped data saved to 'Manhattan_CB1.csv'.
Processing: 2024 January 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-01-23.pdf)
Downloaded: Manhattan_CB1_PDFs/24-01-23.pdf
Processing: 2024 February 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-02-27.pdf)
Downloaded: Manhattan_CB1_PDFs/24-02-27.pdf
Processing: 2024 March 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-03-26.pdf)
Downloaded: Manhattan_CB1_PDFs/24-03-26.pdf
Processing: 2024 April 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-04-30.pdf)
Downloaded: Manhattan_CB1_PDFs/24-04-30.pdf
Processing: 2024 May 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-05-28.pdf)
Downloaded: Manhattan_CB1_PDFs/24-05-28.pdf
Processing: 2024 June 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-06

In [13]:
import pandas as pd
df.head()


Unnamed: 0,Date,URL,Content
0,2024 January 2024,https://www.nyc.gov/assets/manhattancb1/downlo...,COMMUNITY BOARD 1-MANHATTAN\nMINUTES\nDATE: Tu...
1,2024 February 2024,https://www.nyc.gov/assets/manhattancb1/downlo...,COMMUNITY BOARD 1-MANHATTAN\nMINUTES\nDATE: Tu...
2,2024 March 2024,https://www.nyc.gov/assets/manhattancb1/downlo...,COMMUNITY BOARD 1-MANHATTAN\nMINUTES\nDATE: Tu...
3,2024 April 2024,https://www.nyc.gov/assets/manhattancb1/downlo...,COMMUNITY BOARD 1-MANHATTAN\nMINUTES\nDATE: Tu...
4,2024 May 2024,https://www.nyc.gov/assets/manhattancb1/downlo...,COMMUNITY BOARD 1-MANHATTAN\nMINUTES\nDATE: Tu...


# Export the df 

I stole this code from the internet and fixed it up, link [here](https://www.geeksforgeeks.org/downloading-pdfs-with-python-using-requests-and-beautifulsoup/)

In [1]:
import os
import requests
import pandas as pd
import pdfplumber
import time

# Load the DataFrame
input_file = "Manhattan_CB1.csv"
df = pd.read_csv(input_file)

# Folder to store downloaded PDFs
folder_name = "Manhattan_CB1_PDFs"
os.makedirs(folder_name, exist_ok=True)

# Initialize a list to store content
content_list = []
failed_files = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['URL']
    date = row['Date']

    try:
        print(f"Processing: {date} ({url})")
        file_name = os.path.basename(url)
        file_path = os.path.join(folder_name, file_name)

        # Download the PDF if not already downloaded
        if not os.path.exists(file_path):
            response = requests.get(url)
            with open(file_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded: {file_path}")
        
        # Add a delay to prevent scraping too quickly
        time.sleep(2)

        # Extract text using pdfplumber
        pdf_text = ""
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    pdf_text += page.extract_text() or ""
        except Exception as e:
            print(f"pdfplumber failed for {file_name}: {e}")
            pdf_text = "Error extracting content"

        # Log files that could not be processed
        if not pdf_text.strip():
            failed_files.append(file_name)
            pdf_text = "Manual review required"

        content_list.append(pdf_text.strip())

    except Exception as e:
        print(f"Error processing {date}: {e}")
        content_list.append("Error extracting content")

# Add the content as a new column in the DataFrame
df['Content'] = content_list
df.to_csv("Manhattan_CB1_with_content.csv", index=False)

# Save failed files for manual review
with open("failed_files.log", "w") as log_file:
    log_file.write("\n".join(failed_files))

print("Processing complete. Check 'Manhattan_CB1_with_content.csv' and 'failed_files.log'.")


Processing: 2024 January 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-01-23.pdf)
Processing: 2024 February 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-02-27.pdf)
Processing: 2024 March 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-03-26.pdf)
Processing: 2024 April 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-04-30.pdf)
Processing: 2024 May 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-05-28.pdf)
Processing: 2024 June 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-06-25.pdf)
Processing: 2024 July 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-07-23.pdf)
Processing: 2024 September 2024 (https://www.nyc.gov/assets/manhattancb1/downloads/pdf/full-board-meeting-minutes/24-09-24.pdf)
Pro

# Print PDF 

I used this link from Soma's website [here](https://jonathansoma.com/everything/pdfs/ocr-tools/) 

In [5]:
import pandas as pd

csv_path = "Manhattan_CB1_with_content.csv"
data = pd.read_csv(csv_path)

print(data.columns)

Index(['Date', 'URL', 'Content'], dtype='object')
