In [18]:
import pandas as pd
from utils import *
import os
import fitz
from pdf2image import convert_from_path
import pytesseract
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Step 1: Change Working Directory

In [19]:
# Change the working directory
os.chdir('/Users/chelseajimenez/Desktop/Policy-Language-Analysis/data/BP_AR_3511')

#### Step 2: Load Data

In [20]:
policy_pdf_collection_doc_path = "/Users/chelseajimenez/Desktop/Policy-Language-Analysis/data/District Level Master Spreadsheet-UCB Policy Analysis - BP 3511.xlsx" # insert pdf collection doc path here

policy_pdfs = pd.read_excel(policy_pdf_collection_doc_path)

In [21]:
import fitz
import pytesseract
from pdf2image import convert_from_path

def extract_text_from_pdf(pdf_path):
    '''Function to extract text from PDF'''
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        if text.strip() == "":
            text = extract_text_with_ocr(pdf_path)
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None
    
def extract_text_with_ocr(pdf_path):
    '''Function to extract text from an image-based PDF using OCR'''
    try:
        images = convert_from_path(pdf_path)
        text = ""
        for image in images:
            text += pytesseract.image_to_string(image)
        return text
    except Exception as e:
        print(f"Error processing {pdf_path} with OCR: {e}")
        return None
    
def extract_and_update_pdf_texts(policy_pdfs, path_column, text_column):
    """
    Extracts text from PDFs whose paths are specified and updates the 'PDF Text' column 
    if the text is missing and the path to PDF is not null.
    
    Parameters:
    - policy_pdfs (pd.DataFrame): DataFrame containing 'Path to PDF' and 'PDF Text' columns.
    - path_column (str): name of the column that contains pdfs to read.
    - text_column (str): name of the column to extract text into.
    
    Returns:
    - pd.DataFrame: Updated DataFrame with extracted text in 'PDF Text' column where applicable.
    """
    # Filter rows where the respective 'Path to PDF' is not null and 'PDF Text' is null
    filtered_policy_pdfs = policy_pdfs[policy_pdfs[path_column].notna() & policy_pdfs[text_column].isna()]
    
    # Iteratte through the filtered DataFrame and extract text
    for index, row in filtered_policy_pdfs.iterrows():
        pdf_path = row[path_column]
        text = extract_text_from_pdf(pdf_path)
        if text:
            policy_pdfs.at[index, text_column] = text
    
    return policy_pdfs

def check_and_report_missing_texts(policy_pdfs, policy_name_column, path_column, text_column):
    """
    Checks if all rows with a specific column value and non-null PDF paths have text extracted,
    and reports any rows that are missing text.
    
    Parameters:
    - policy_pdfs (pd.DataFrame): DataFrame containing 'Path to PDF' and 'PDF Text' columns.
    - policy_name_column (str): The column name to check for a specific value (e.g., 'BP: 6142.5 Environmental Education').
    - path_column (str): name of the column that contains pdfs to read.
    - text_column (str): name of the column to extract text into.
    
    """
    # Filter rows where the specified column has a value of 1 and 'Path to PDF' is not null
    check_df = policy_pdfs[(policy_pdfs[policy_name_column] == 1) & (policy_pdfs[path_column].notna())]
    
    # Check for rows where 'PDF Text' is null
    missing_texts = check_df[check_df[text_column].isna()][path_column]
    
    if missing_texts.empty:
        print("All relevant rows have text extracted.")
    else:
        print("Some rows are missing text:")
        print(missing_texts)

#### Step 3: Extract Text from PDFs

In [22]:
policy_pdfs = extract_and_update_pdf_texts(policy_pdfs, 'BP3511: Path to PDF', 'BP3511: PDF Text')

# Check all relevant columns and pdfs have been extracted
check_and_report_missing_texts(policy_pdfs, 'District Name', 'BP3511: Path to PDF', 'BP3511: PDF Text')

Status: ADOPTED
Original Adopted Date: 08/25/2009 | Last Revised Date: 10/17/2019 | Last Reviewed Date: 10/17/2019
The Governing Board recognizes the environmental and financial benefits that can be derived from conserving
energy, water, and other natural resources, preparing for extreme weather and other natural events, and providing an
environment that promotes the health and well-being of students and staff. To support district goals for energy and
water management, the Superintendent or designee shall develop a resource management program which may
include strategies for implementing effective and sustainable resource use practices, exploring the use of renewable
and clean energy technology and/or sources, reducing energy and water consumption, and promoting conservation
principles in the educational program.
The Superintendent or designee may solicit input from staff, students, and parents/guardians regarding the district's
resource management program. The Superintendent or design

All relevant rows have text extracted.


### Step 4: Clean up Text

In [23]:
def clean_text(text): 
    """Cleans up textual data by removing specific patterns, tokenizing, lemmatizing, removing stopwords, and normalizing spaces."""
    try:
        if isinstance(text, str):
            # Remove "legal reference" and everything after
            text = re.sub(r'legal reference.*$', '', text, flags=re.IGNORECASE)
            text = re.sub(r'board policy manual.*$', '', text, flags=re.IGNORECASE)
            # Replace multiple spaces and newline characters with a single space
            text = " ".join(text.split())
            # Tokenize text
            tokens = word_tokenize(text)
            # Initialize stopwords and lemmatizer
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            # Clean tokens: remove punctuation, stopwords, and lemmatize
            tokens = [
                lemmatizer.lemmatize(re.sub(r'[^\w\s]', '', token.lower()))
                for token in tokens
                if re.sub(r'[^\w\s]', '', token) and token.lower() not in stop_words
            ]
            # Join cleaned tokens into a single string with proper spaces
            cleaned_text = " ".join(tokens)
            return cleaned_text
        return ""
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ""# Apply cleaning to the extracted text column

In [24]:
# Apply cleaning to the extracted text column
policy_pdfs['BP3511: PDF Text'] = policy_pdfs['BP3511: PDF Text'].apply(clean_text)

# Print cleaned text
for index, row in policy_pdfs.iterrows():
    print(f"District Name: {row['District Name']}")
    print(f"Extracted Text:\n{row['BP3511: PDF Text']}")
    print("="*80)

District Name: Alameda Unified
Extracted Text:
policy 3511 energy water management status adopted original adopted date 08252009 last revised date 10172019 last reviewed date 10172019 governing board recognizes environmental financial benefit derived conserving energy water natural resource preparing extreme weather natural event providing environment promotes health wellbeing student staff support district goal energy water management superintendent designee shall develop resource management program may include strategy implementing effective sustainable resource use practice exploring use renewable clean energy technology andor source reducing energy water consumption promoting conservation principle educational program superintendent designee may solicit input staff student parentsguardians regarding district s resource management program superintendent designee shall provide staff student training guidance best practice achieve district s goal may establish reward program recognize

### Step 5: Load cleaned Text into Data folder for Analysis

In [25]:
policy_pdfs.to_csv('/Users/chelseajimenez/Desktop/Policy-Language-Analysis/cleaned_data/BP3511_cleaned_policy_pdfs.csv', index=False)