# Community full board scrapper 

All community boards have separate sites for their minutes 

> I watched Soma's youtube tutorial https://www.youtube.com/watch?v=QNKxzkNpsko


I chatted with Soma, and he said he imagined each community board to have one scrapper, and it's better to have a minimum viable product since it looks more impressive on your CV than attempting and learning something even if it fails. 

## Setup: Import what you'll need to scrape the page

We'll be using either Playwrightfor this, *not* requests.

## Starting your search

Starting from [here](https://www.nyc.gov/site/bronxcb1/calendar/board-meeting-minutes.page), search for all community board meeting minutes from Community Board 1 Bronx

# Finding all minutes

In [6]:
import os
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

# Base URLs to scrape
urls_to_scrape = [
    "https://cbmanhattan.cityofnewyork.us/cb2/minutes/",
    "https://cbmanhattan.cityofnewyork.us/cb2/minutes/archives/"
]

# Folder to store downloaded PDFs
folder_name = "Manhattan_CB2_PDFs"
os.makedirs(folder_name, exist_ok=True)

# Initialize session and data
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})
scraped_data = []

# Function to scrape webpage
def scrape_webpage(url, base_url):
    try:
        response = session.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            # Find all 'div' elements with the class 'entry-content' and within those, look for 'a' tags with an 'href' attribute
            links = soup.find_all("div", class_="entry-content")  # Fix syntax for class_
            for div in links:
                anchor_tags = div.find_all("a", href=True)  # Find 'a' tags inside each 'div'
                for link in anchor_tags:
                    href = link["href"]
                    text = link.text.strip()
                    # Filter for PDF links or meeting minutes
                    if ".pdf" in href.lower() or "minutes" in href.lower():
                        scraped_data.append({"Date": text, "URL": href})

        else:
            print(f"Failed to fetch webpage {url}. Status code: {response.status_code}")
    except requests.exceptions.Timeout:
        print(f"Timeout occurred for URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error occurred for URL {url}: {e}")

# Scrape both URLs
for url in urls_to_scrape:
    scrape_webpage(url, url)  # Pass the base URL for relative link resolution
    time.sleep(2)  # Delay between requests

# Save to CSV
df = pd.DataFrame(scraped_data)
df.to_csv("Manhattan_CB2.csv", index=False)
print("Scraped data saved to 'Manhattan_CB2.csv'.")

Scraped data saved to 'Manhattan_CB2.csv'.


In [7]:
import pandas as pd
df = pd.DataFrame(scraped_data)
df.head()


Unnamed: 0,Date,URL
0,October 2024,https://cbmanhattan.cityofnewyork.us/cb2/wp-co...
1,September 2024,https://cbmanhattan.cityofnewyork.us/cb2/wp-co...
2,July 2024,https://cbmanhattan.cityofnewyork.us/cb2/wp-co...
3,June 2024,https://cbmanhattan.cityofnewyork.us/cb2/wp-co...
4,May 2024,https://cbmanhattan.cityofnewyork.us/cb2/wp-co...


# Export the df 

I stole this code from the internet and fixed it up, link [here](https://www.geeksforgeeks.org/downloading-pdfs-with-python-using-requests-and-beautifulsoup/)

In [9]:
import os
import requests
import pandas as pd
import pdfplumber
import ocrmypdf
import time
from PyPDF2 import PdfReader

# Load the DataFrame
input_file = "/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2.csv"
df = pd.read_csv(input_file)

# Folders to store downloaded and processed PDFs
folder_name = "Manhattan_CB2_PDFs"
processed_folder = "Manhattan_CB2_PDFs_Processed"
os.makedirs(folder_name, exist_ok=True)
os.makedirs(processed_folder, exist_ok=True)

# Initialize a list to store content
content_list = []
failed_files = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['URL']
    date = row['Date']

    try:
        print(f"Processing: {date} ({url})")
        file_name = os.path.basename(url)
        file_path = os.path.join(folder_name, file_name)
        processed_file_path = os.path.join(processed_folder, file_name)

        # Skip downloading if the file already exists
        if not os.path.exists(file_path):
            response = requests.get(url)
            with open(file_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded: {file_path}")
        else:
            print(f"File already exists: {file_path}")

        # Add a delay to prevent scraping too quickly
        time.sleep(2)

        # Skip OCR if processed file already exists
        if os.path.exists(processed_file_path):
            print(f"OCR-processed file already exists: {processed_file_path}")
            reader = PdfReader(processed_file_path)
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)
        else:
            # Extract text using pdfplumber
            pdf_text = ""
            try:
                with pdfplumber.open(file_path) as pdf:
                    for page in pdf.pages:
                        pdf_text += page.extract_text() or ""
            except Exception as e:
                print(f"pdfplumber failed for {file_name}: {e}")
                pdf_text = ""

            # If pdfplumber fails or no text is extracted, use OCR
            if not pdf_text.strip():
                print(f"Falling back to OCR for {file_name}")
                try:
                    # Apply OCR to the PDF
                    ocrmypdf.ocr(file_path, processed_file_path, deskew=True, force_ocr=True)
                    
                    # Extract text from the OCR-processed PDF
                    reader = PdfReader(processed_file_path)
                    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

                    # If OCR also fails, log the file
                    if not pdf_text.strip():
                        raise ValueError("OCR failed to extract text")
                except Exception as ocr_error:
                    print(f"OCR failed for {file_name}: {ocr_error}")
                    failed_files.append(file_name)
                    pdf_text = "Manual review required"

        content_list.append(pdf_text.strip())

    except Exception as e:
        print(f"Error processing {date}: {e}")
        content_list.append("Error extracting content")
        failed_files.append(file_name)

# Add the content as a new column in the DataFrame
df['Content'] = content_list
df.to_csv("Manhattan_CB2.csv", index=False)

# Save failed files for manual review
with open("failed_files.log", "w") as log_file:
    log_file.write("\n".join(failed_files))

print("Processing complete. Check 'Manhattan_CB2.csv' and 'failed_files.log'")


Processing: October 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/11/10-October-2024-Full-Board-Minutes-MND-rev_EGS.pdf)
File already exists: Manhattan_CB2_PDFs/10-October-2024-Full-Board-Minutes-MND-rev_EGS.pdf
Processing: September 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/11/09-September-2024-Full-Board-Minutes-MND-rev.pdf)
File already exists: Manhattan_CB2_PDFs/09-September-2024-Full-Board-Minutes-MND-rev.pdf
Processing: July 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/11/07-July-2024-Full-Board-Minutes-Final-2.pdf)
File already exists: Manhattan_CB2_PDFs/07-July-2024-Full-Board-Minutes-Final-2.pdf
Processing: June 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/07/06-June-2024-Full-Board-Minutes-Final-for-Board-Package.pdf)
File already exists: Manhattan_CB2_PDFs/06-June-2024-Full-Board-Minutes-Final-for-Board-Package.pdf
Processing: May 2024 (ht

cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/04-April-2024-Full-Board-Minutes_EGS-edits.pdf'


OCR failed for 04-April-2024-Full-Board-Minutes_EGS-edits.pdf: 
Processing: March 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/07/03-March-2024-Full-Board-Minutes-proposed-final.pdf)
File already exists: Manhattan_CB2_PDFs/03-March-2024-Full-Board-Minutes-proposed-final.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/03-March-2024-Full-Board-Minutes-proposed-final.pdf'


pdfplumber failed for 03-March-2024-Full-Board-Minutes-proposed-final.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 03-March-2024-Full-Board-Minutes-proposed-final.pdf
OCR failed for 03-March-2024-Full-Board-Minutes-proposed-final.pdf: 
Processing: February 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/07/02-February-2024-Full-Board-Minutes-1.pdf)
File already exists: Manhattan_CB2_PDFs/02-February-2024-Full-Board-Minutes-1.pdf
Processing: January 2024 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2024/07/01-January-2024-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/01-January-2024-Full-Board-Minutes.pdf
Processing: November 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/12/11-November-2023-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/11-November-2023-Full-Board-Minutes.pdf
Processing: October 2023 (https://cbmanhattan.cityofnewyork.us/cb2

cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/10-October-2023-Full-Board-Minutes_R1-1.17.23.pdf'


pdfplumber failed for 10-October-2023-Full-Board-Minutes_R1-1.17.23.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 10-October-2023-Full-Board-Minutes_R1-1.17.23.pdf
OCR failed for 10-October-2023-Full-Board-Minutes_R1-1.17.23.pdf: 
Processing: September 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/12/09-August-and-September-2023-Full-Board-Minutes-2.pdf)
File already exists: Manhattan_CB2_PDFs/09-August-and-September-2023-Full-Board-Minutes-2.pdf
Processing: July 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/12/July-2023-Full-Board-Minutes-2.pdf)
File already exists: Manhattan_CB2_PDFs/July-2023-Full-Board-Minutes-2.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/July-2023-Full-Board-Minutes-2.pdf'


pdfplumber failed for July-2023-Full-Board-Minutes-2.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for July-2023-Full-Board-Minutes-2.pdf
OCR failed for July-2023-Full-Board-Minutes-2.pdf: 
Processing: June 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/12/Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf)
File already exists: Manhattan_CB2_PDFs/Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf'


pdfplumber failed for Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf
OCR failed for Final-Draft-06-June_2023-Full-Board-Minutes-MND-FLJul-19-Edits.pdf: 
Processing: May 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/07/05-May_2023-Full-Board-Minutes-1.pdf)
File already exists: Manhattan_CB2_PDFs/05-May_2023-Full-Board-Minutes-1.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/05-May_2023-Full-Board-Minutes-1.pdf'


pdfplumber failed for 05-May_2023-Full-Board-Minutes-1.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 05-May_2023-Full-Board-Minutes-1.pdf
OCR failed for 05-May_2023-Full-Board-Minutes-1.pdf: 
Processing: April 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/07/04-April_2023-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/04-April_2023-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/04-April_2023-Full-Board-Minutes.pdf'


pdfplumber failed for 04-April_2023-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 04-April_2023-Full-Board-Minutes.pdf
OCR failed for 04-April_2023-Full-Board-Minutes.pdf: 
Processing: March 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/07/03-March_2023-Full-Board-Minutes-1.pdf)
File already exists: Manhattan_CB2_PDFs/03-March_2023-Full-Board-Minutes-1.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/03-March_2023-Full-Board-Minutes-1.pdf'


pdfplumber failed for 03-March_2023-Full-Board-Minutes-1.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 03-March_2023-Full-Board-Minutes-1.pdf
OCR failed for 03-March_2023-Full-Board-Minutes-1.pdf: 
Processing: February 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/07/02-February-2023-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/02-February-2023-Full-Board-Minutes.pdf
Processing: January 2023 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/02/01-January_2023-Full-Board-Minutes-1.pdf)
File already exists: Manhattan_CB2_PDFs/01-January_2023-Full-Board-Minutes-1.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/01-January_2023-Full-Board-Minutes-1.pdf'


pdfplumber failed for 01-January_2023-Full-Board-Minutes-1.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 01-January_2023-Full-Board-Minutes-1.pdf
OCR failed for 01-January_2023-Full-Board-Minutes-1.pdf: 
Processing: December 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2023/01/12-December-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/12-December-2022-Full-Board-Minutes.pdf
Processing: November 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/12/11-November-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/11-November-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/11-November-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 11-November-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 11-November-2022-Full-Board-Minutes.pdf
OCR failed for 11-November-2022-Full-Board-Minutes.pdf: 
Processing: October 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/12/10-October_2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/10-October_2022-Full-Board-Minutes.pdf
Processing: September 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/12/09-September-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/09-September-2022-Full-Board-Minutes.pdf
Processing: July 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/12/07-July-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/07-July-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/07-July-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 07-July-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 07-July-2022-Full-Board-Minutes.pdf
OCR failed for 07-July-2022-Full-Board-Minutes.pdf: 
Processing: June 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/12/06-June-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/06-June-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/06-June-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 06-June-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 06-June-2022-Full-Board-Minutes.pdf
OCR failed for 06-June-2022-Full-Board-Minutes.pdf: 
Processing: May 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/07/05-May-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/05-May-2022-Full-Board-Minutes.pdf
Processing: April 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/05/04-April-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/04-April-2022-Full-Board-Minutes.pdf
Processing: March 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/05/03-March-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/03-March-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/03-March-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 03-March-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 03-March-2022-Full-Board-Minutes.pdf
OCR failed for 03-March-2022-Full-Board-Minutes.pdf: 
Processing: February 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/04/02-February-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/02-February-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/02-February-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 02-February-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 02-February-2022-Full-Board-Minutes.pdf
OCR failed for 02-February-2022-Full-Board-Minutes.pdf: 
Processing: January 2022 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/02/1-January-2022-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/1-January-2022-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/1-January-2022-Full-Board-Minutes.pdf'


pdfplumber failed for 1-January-2022-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 1-January-2022-Full-Board-Minutes.pdf
OCR failed for 1-January-2022-Full-Board-Minutes.pdf: 
Processing: December 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/05/12-December-2021-FB.pdf)
File already exists: Manhattan_CB2_PDFs/12-December-2021-FB.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/12-December-2021-FB.pdf'


pdfplumber failed for 12-December-2021-FB.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 12-December-2021-FB.pdf
OCR failed for 12-December-2021-FB.pdf: 
Processing: November 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/01/11-November-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/11-November-2021-FB-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/11-November-2021-FB-Minutes.pdf'


pdfplumber failed for 11-November-2021-FB-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 11-November-2021-FB-Minutes.pdf
OCR failed for 11-November-2021-FB-Minutes.pdf: 
Processing: October 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/01/10-October-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/10-October-2021-FB-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/10-October-2021-FB-Minutes.pdf'


pdfplumber failed for 10-October-2021-FB-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 10-October-2021-FB-Minutes.pdf
OCR failed for 10-October-2021-FB-Minutes.pdf: 
Processing: September 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/10/09-September-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/09-September-2021-FB-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/09-September-2021-FB-Minutes.pdf'


pdfplumber failed for 09-September-2021-FB-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 09-September-2021-FB-Minutes.pdf
OCR failed for 09-September-2021-FB-Minutes.pdf: 
Processing: July 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/09/07-July-2021.pdf)
File already exists: Manhattan_CB2_PDFs/07-July-2021.pdf
Processing: June 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/07/06-June-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/06-June-2021-FB-Minutes.pdf
Processing: May 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/07/05-May-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/05-May-2021-FB-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/05-May-2021-FB-Minutes.pdf'


pdfplumber failed for 05-May-2021-FB-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 05-May-2021-FB-Minutes.pdf
OCR failed for 05-May-2021-FB-Minutes.pdf: 
Processing: April 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/06/04-April-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/04-April-2021-FB-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/04-April-2021-FB-Minutes.pdf'


pdfplumber failed for 04-April-2021-FB-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 04-April-2021-FB-Minutes.pdf
OCR failed for 04-April-2021-FB-Minutes.pdf: 
Processing: March 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/04/03-March-2021-FB-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/03-March-2021-FB-Minutes.pdf
Processing: February 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/03/02-February-2021-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/02-February-2021-Minutes.pdf
Processing: January 2021 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2022/11/01-January-2021-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/01-January-2021-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/01-January-2021-Full-Board-Minutes.pdf'


pdfplumber failed for 01-January-2021-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 01-January-2021-Full-Board-Minutes.pdf
OCR failed for 01-January-2021-Full-Board-Minutes.pdf: 
Processing: December 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/02/12-December-2020.pdf)
File already exists: Manhattan_CB2_PDFs/12-December-2020.pdf
Processing: November 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/12/11-November-2020.pdf)
File already exists: Manhattan_CB2_PDFs/11-November-2020.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/11-November-2020.pdf'


pdfplumber failed for 11-November-2020.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 11-November-2020.pdf
OCR failed for 11-November-2020.pdf: 
Processing: October 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/02/10-October-2020_updated.pdf)
File already exists: Manhattan_CB2_PDFs/10-October-2020_updated.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/10-October-2020_updated.pdf'


pdfplumber failed for 10-October-2020_updated.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 10-October-2020_updated.pdf
OCR failed for 10-October-2020_updated.pdf: 
Processing: September 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/10/09-September-2020.pdf)
File already exists: Manhattan_CB2_PDFs/09-September-2020.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/09-September-2020.pdf'


pdfplumber failed for 09-September-2020.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 09-September-2020.pdf
OCR failed for 09-September-2020.pdf: 
Processing: August 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/09/08-August-2020.pdf)
File already exists: Manhattan_CB2_PDFs/08-August-2020.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/08-August-2020.pdf'


pdfplumber failed for 08-August-2020.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 08-August-2020.pdf
OCR failed for 08-August-2020.pdf: 
Processing: July 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/08/07-July-2020-Full-Board-Minutes.pdf)
File already exists: Manhattan_CB2_PDFs/07-July-2020-Full-Board-Minutes.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/07-July-2020-Full-Board-Minutes.pdf'


pdfplumber failed for 07-July-2020-Full-Board-Minutes.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 07-July-2020-Full-Board-Minutes.pdf
OCR failed for 07-July-2020-Full-Board-Minutes.pdf: 
Processing: June 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/07/06-June-2020.pdf)
File already exists: Manhattan_CB2_PDFs/06-June-2020.pdf
Processing: May 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/07/05-May-2020-FB.pdf)
File already exists: Manhattan_CB2_PDFs/05-May-2020-FB.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/05-May-2020-FB.pdf'


pdfplumber failed for 05-May-2020-FB.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 05-May-2020-FB.pdf
OCR failed for 05-May-2020-FB.pdf: 
Processing: April 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/07/04-April-2020-FB.pdf)
File already exists: Manhattan_CB2_PDFs/04-April-2020-FB.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/04-April-2020-FB.pdf'


pdfplumber failed for 04-April-2020-FB.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 04-April-2020-FB.pdf
OCR failed for 04-April-2020-FB.pdf: 
Processing: March 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/05/03-March-2020-FB.pdf)
File already exists: Manhattan_CB2_PDFs/03-March-2020-FB.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/03-March-2020-FB.pdf'


pdfplumber failed for 03-March-2020-FB.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 03-March-2020-FB.pdf
OCR failed for 03-March-2020-FB.pdf: 
Processing: February 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2021/02/02-February-2020.pdf)
File already exists: Manhattan_CB2_PDFs/02-February-2020.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/02-February-2020.pdf'


pdfplumber failed for 02-February-2020.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 02-February-2020.pdf
OCR failed for 02-February-2020.pdf: 
Processing: January 2020 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/04/01-January-2020.pdf)
File already exists: Manhattan_CB2_PDFs/01-January-2020.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/01-January-2020.pdf'


pdfplumber failed for 01-January-2020.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 01-January-2020.pdf
OCR failed for 01-January-2020.pdf: 
Processing: December 2019 (https://cbmanhattan.cityofnewyork.us/cb2/wp-content/uploads/sites/9/2020/04/12-December-2019.pdf)
File already exists: Manhattan_CB2_PDFs/12-December-2019.pdf


cannot identify image file '/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB2/Manhattan_CB2_PDFs/12-December-2019.pdf'


pdfplumber failed for 12-December-2019.pdf: No /Root object! - Is this really a PDF?
Falling back to OCR for 12-December-2019.pdf
OCR failed for 12-December-2019.pdf: 
Processing: November 2019 (/cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/11 November 2019.pdf)
Error processing November 2019: Invalid URL '/cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/11 November 2019.pdf': No schema supplied. Perhaps you meant http:///cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/11 November 2019.pdf?
Processing: October 2019 (/cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/10 October 2019.pdf)
Error processing October 2019: Invalid URL '/cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/10 October 2019.pdf': No schema supplied. Perhaps you meant http:///cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/10 October 2019.pdf?
Processing: September 2019 (/cb2/wp-content/uploads/sites/9/downloads/pdf/fullboard_2019/09 September 2

# Print PDF 

I used this link from Soma's website [here](https://jonathansoma.com/everything/pdfs/ocr-tools/) 

In [5]:
import pandas as pd

csv_path = "/Users/chivo/Downloads/Foundations/HW/auto_update_web/Manhattan_CB1_with_content.csv"
data = pd.read_csv(csv_path)

print(data.columns)

Index(['Date', 'URL', 'Content'], dtype='object')
