In [1]:
import pandas as pd
from utils import *
import os
import fitz
from pdf2image import convert_from_path
import pytesseract
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chelseajimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Step 1: Change Working Directory

In [14]:
# Change the working directory
os.chdir('/Users/chelseajimenez/Desktop/Policy-Language-Analysis/data/AR_7111')

#### Step 2: Load Data

In [15]:
policy_pdf_collection_doc_path = "/Users/chelseajimenez/Desktop/Policy-Language-Analysis/data/District Level Master Spreadsheet-UCB Policy Analysis - BP_AR 7111.xlsx" # insert pdf collection doc path here

policy_pdfs = pd.read_excel(policy_pdf_collection_doc_path)

In [16]:
import fitz
import pytesseract
from pdf2image import convert_from_path

def extract_text_from_pdf(pdf_path):
    '''Function to extract text from PDF'''
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        if text.strip() == "":
            text = extract_text_with_ocr(pdf_path)
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None
    
def extract_text_with_ocr(pdf_path):
    '''Function to extract text from an image-based PDF using OCR'''
    try:
        images = convert_from_path(pdf_path)
        text = ""
        for image in images:
            text += pytesseract.image_to_string(image)
        return text
    except Exception as e:
        print(f"Error processing {pdf_path} with OCR: {e}")
        return None
    
def extract_and_update_pdf_texts(policy_pdfs, path_column, text_column):
    """
    Extracts text from PDFs whose paths are specified and updates the 'PDF Text' column 
    if the text is missing and the path to PDF is not null.
    
    Parameters:
    - policy_pdfs (pd.DataFrame): DataFrame containing 'Path to PDF' and 'PDF Text' columns.
    - path_column (str): name of the column that contains pdfs to read.
    - text_column (str): name of the column to extract text into.
    
    Returns:
    - pd.DataFrame: Updated DataFrame with extracted text in 'PDF Text' column where applicable.
    """
    # Filter rows where the respective 'Path to PDF' is not null and 'PDF Text' is null
    filtered_policy_pdfs = policy_pdfs[policy_pdfs[path_column].notna() & policy_pdfs[text_column].isna()]
    
    # Iteratte through the filtered DataFrame and extract text
    for index, row in filtered_policy_pdfs.iterrows():
        pdf_path = row[path_column]
        text = extract_text_from_pdf(pdf_path)
        if text:
            policy_pdfs.at[index, text_column] = text
    
    return policy_pdfs

def check_and_report_missing_texts(policy_pdfs, policy_name_column, path_column, text_column):
    """
    Checks if all rows with a specific column value and non-null PDF paths have text extracted,
    and reports any rows that are missing text.
    
    Parameters:
    - policy_pdfs (pd.DataFrame): DataFrame containing 'Path to PDF' and 'PDF Text' columns.
    - policy_name_column (str): The column name to check for a specific value (e.g., 'BP: 6142.5 Environmental Education').
    - path_column (str): name of the column that contains pdfs to read.
    - text_column (str): name of the column to extract text into.
    
    """
    # Filter rows where the specified column has a value of 1 and 'Path to PDF' is not null
    check_df = policy_pdfs[(policy_pdfs[policy_name_column] == 1) & (policy_pdfs[path_column].notna())]
    
    # Check for rows where 'PDF Text' is null
    missing_texts = check_df[check_df[text_column].isna()][path_column]
    
    if missing_texts.empty:
        print("All relevant rows have text extracted.")
    else:
        print("Some rows are missing text:")
        print(missing_texts)

#### Step 3: Extract Text from PDFs

In [17]:
policy_pdfs = extract_and_update_pdf_texts(policy_pdfs, 'AR7111: Path to PDF', 'AR7111: PDF Text')

# Check all relevant columns and pdfs have been extracted
check_and_report_missing_texts(policy_pdfs, 'District Name', 'AR7111: Path to PDF', 'AR7111: PDF Text')

Status: ADOPTED
Original Adopted Date: 08/25/2009 | Last Reviewed Date: 08/25/2009
The Superintendent or designee shall periodically evaluate the adequacy, design, and conditions of existing district
facilities to determine whether they meet the needs of the instructional program and provide a healthful and pleasing
environment for students and staff. He/she also shall determine whether district facilities fulfill legal requirements
for safety and structural soundness, access for the disabled, and energy conservation.
In addition, the Superintendent or designee shall regularly calculate the capacity of existing school buildings to
adequately house the district's current students and projected enrollments.
Any identified needs for repair, modernization, or construction shall be incorporated into the district's facilities
planning process.
Structural Safety
In the event that the Department of General Services or any licensed structural engineer or licensed architect finds
and reports to 

All relevant rows have text extracted.


### Step 4: Clean up Text

In [18]:
def clean_text(text): 
    """Cleans up textual data by removing specific patterns, tokenizing, lemmatizing, removing stopwords, and normalizing spaces."""
    try:
        if isinstance(text, str):
            # Remove "legal reference" and everything after
            text = re.sub(r'legal reference.*$', '', text, flags=re.IGNORECASE)
            text = re.sub(r'board policy manual.*$', '', text, flags=re.IGNORECASE)
            # Replace multiple spaces and newline characters with a single space
            text = " ".join(text.split())
            # Tokenize text
            tokens = word_tokenize(text)
            # Initialize stopwords and lemmatizer
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            # Clean tokens: remove punctuation, stopwords, and lemmatize
            tokens = [
                lemmatizer.lemmatize(re.sub(r'[^\w\s]', '', token.lower()))
                for token in tokens
                if re.sub(r'[^\w\s]', '', token) and token.lower() not in stop_words
            ]
            # Join cleaned tokens into a single string with proper spaces
            cleaned_text = " ".join(tokens)
            return cleaned_text
        return ""
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ""# Apply cleaning to the extracted text column

In [19]:
# Apply cleaning to the extracted text column
policy_pdfs['AR7111: PDF Text'] = policy_pdfs['AR7111: PDF Text'].apply(clean_text)

# Print cleaned text
for index, row in policy_pdfs.iterrows():
    print(f"District Name: {row['District Name']}")
    print(f"Extracted Text:\n{row['AR7111: PDF Text']}")
    print("="*80)

District Name: Alameda Unified
Extracted Text:
regulation 7111 evaluating existing building status adopted original adopted date 08252009 last reviewed date 08252009 superintendent designee shall periodically evaluate adequacy design condition existing district facility determine whether meet need instructional program provide healthful pleasing environment student staff heshe also shall determine whether district facility fulfill legal requirement safety structural soundness access disabled energy conservation addition superintendent designee shall regularly calculate capacity existing school building adequately house district s current student projected enrollment identified need repair modernization construction shall incorporated district s facility planning process structural safety event department general service licensed structural engineer licensed architect find report board education district building unsafe use superintendent designee shall immediately obtain estimate cost 

### Step 5: Load cleaned Text into Data folder for Analysis

In [20]:
policy_pdfs.to_csv('/Users/chelseajimenez/Desktop/Policy-Language-Analysis/cleaned_data/AR7111_cleaned_policy_pdfs.csv', index=False)