In [1]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade bottleneck

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import re
import pandas as pd
import pdfplumber

In [4]:
#Function to extract text
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

In [5]:
#Function to Preprocess Text
def preprocess_text(text):
    '''
    strip whitespace, "/n"
    '''
    normalized_text = re.sub(r'\s+', ' ', text)
    normalized_text = ''.join(char for char in normalized_text if char.isprintable())
    return normalized_text

In [6]:
#function to apply regex
def extract_data_using_regex(text, patterns):
    extracted_data = {}
    for label, pattern in patterns.items():
        match = re.search(pattern, normalized_text)
        if match:
            # Check if the match has any capture groups
            if match.lastindex and match.lastindex >= 1:  # Ensure there's at least one capture group
                extracted_data[label] = match.group(1)
            else:
                extracted_data[label] = None  # No capture groups in the match
        else:
            extracted_data[label] = None  # If no match, store None
    return extracted_data

In [None]:
# Folder containing the PDFs
folder_path = "downloaded_pdfs/press_releases"

In [None]:
# List all PDF files in the folder
pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]

In [7]:
# Print the list of PDF file names
for pdf in pdf_files:
    print(pdf)

colorado-county.pdf
alaska.pdf
smith-county.pdf
upton-county.pdf
runnels-county.pdf


In [64]:
#define regex patterns
patterns = {
    'defendant': r'Investigation\s(?:of|into|regarding)\s([A-Za-z]+(?:\s[A-Za-z]+)*(?:\s(?:County|City|District))?)',
    'case_number': r'DJ\s*(?:#|No\.)\s*(\d{3}-\d{1,2}-\d{2})',
    'complaint': r'alleging\s+that\s+([^\s].*?)\sin\sviolation\sof\sTitle\sII',
    'outcome': r'IV\.\sRemedial Measures[\s\S](.*)\.'
}

In [65]:
#create list of things I want to extract
data_list = []

# Iterate through each PDF and extract data
for pdf in pdf_files:
    # Construct the full path to the PDF file
    pdf_path = os.path.join(folder_path, pdf)

    # Check if the file exists (just to avoid potential issues)
    if os.path.exists(pdf_path):
        text = extract_text_from_pdf(pdf_path)  # Pass the full path to the function
        normalized_text = preprocess_text(text)
        extracted_data = extract_data_using_regex(normalized_text, patterns)
        data_list.append(extracted_data)
    else:
        print(f"File not found: {pdf_path}")
        

In [66]:
# Convert the list of extracted data into a DataFrame
df = pd.DataFrame(data_list)

In [67]:
# Display the DataFrame
df

Unnamed: 0,defendant,case_number,complaint,outcome
0,Colorado County,204-74-38,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
1,Alaska,204-6-54,"several of the State’s voting services, progra...",To remedy the deficiencies discussed above and...
2,Smith County,204-75-15,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
3,Upton County,204-76-24,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
4,Runnels County Election Website Under Title II...,204-76-22,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."


In [68]:
# Do the same thing with a different folder
folder_path = "downloaded_pdfs/agreements"
pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
for pdf in pdf_files:
    print(pdf)

colorado-county.pdf
newton-county.pdf
jefferson-county.pdf
smith-county.pdf
upton_county.pdf
st-louis.pdf
runnels-county.pdf
travis-county.pdf


In [80]:
# Define new regex patterns for data extraction
agreements_re = {
    'defendant': r'AGREEMENT[\s\S]*?BETWEEN THE UNITED STATES OF AMERICA[\s\S]*?AND\s(.*)\sDJ',
    'case_number': r'DJ\s*(?:#|No\.)\s*(\d{3}-\d{1,2}-\d{2})',
    'complaint': r'alleging\sthat(.*)',
    'outcome': r'(?:shall|agrees|will)(.*)$'     
}

In [81]:
#Apply new patterns
data_list_2 = []

# Iterate through each PDF and extract data
for pdf in pdf_files:
    # Construct the full path to the PDF file
    pdf_path = os.path.join(folder_path, pdf)

    # Check if the file exists (just to avoid potential issues)
    if os.path.exists(pdf_path):
        text = extract_text_from_pdf(pdf_path)  # Pass the full path to the function
        normalized_text = preprocess_text(text)
        extracted_data = extract_data_using_regex(normalized_text, agreements_re)
        data_list_2.append(extracted_data)
    else:
        print(f"File not found: {pdf_path}")
        

In [82]:
df_2 = pd.DataFrame(data_list_2)

In [83]:
df_2

Unnamed: 0,defendant,case_number,complaint,outcome
0,"COLORADO COUNTY, TEXAS",204-74-38,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
1,"THE NEWTON COUNTY, ARKANSAS BOARD OF ELECTION ...",204-10-40,the Board’s polling places contain architectu...,include the Board and all Page 1 of 12of the ...
2,"THE JEFFERSON COUNTY, KENTUCKY BOARD OF ELECTI...",204-31-98,the Board’s polling places contain architectu...,"include the Board and all of its members, off..."
3,"SMITH COUNTY, TEXAS",204-75-15,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
4,"UPTON COUNTY, TEXAS",204-76-24,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
5,THE BOARD OF ELECTION COMMISSIONERS FOR THE CI...,204-42-15,in some instances (1) the Board’s polling pla...,"include the Board and all of its members, off..."
6,"RUNNELS COUNTY, TEXAS",204-76-22,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
7,THE TRAVIS COUNTY CLERK’S OFFICE DJ No. 204-76...,204-76-22,Travis County’s voting program discriminates ...,include the Travis County Clerk’s Office and ...


In [84]:
#merge datasets
complaint_information = pd.concat([df, df_2], axis = 0)

In [85]:
complaint_information

Unnamed: 0,defendant,case_number,complaint,outcome
0,Colorado County,204-74-38,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
1,Alaska,204-6-54,"several of the State’s voting services, progra...",To remedy the deficiencies discussed above and...
2,Smith County,204-75-15,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
3,Upton County,204-76-24,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
4,Runnels County Election Website Under Title II...,204-76-22,the County’s election website (the Website) is...,"To remedy these violations, the County must ta..."
0,"COLORADO COUNTY, TEXAS",204-74-38,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
1,"THE NEWTON COUNTY, ARKANSAS BOARD OF ELECTION ...",204-10-40,the Board’s polling places contain architectu...,include the Board and all Page 1 of 12of the ...
2,"THE JEFFERSON COUNTY, KENTUCKY BOARD OF ELECTI...",204-31-98,the Board’s polling places contain architectu...,"include the Board and all of its members, off..."
3,"SMITH COUNTY, TEXAS",204-75-15,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...
4,"UPTON COUNTY, TEXAS",204-76-24,the County’s Election Website was inaccessibl...,to refrain from filing a civil suit in this m...


In [87]:
#export
complaint_information.to_csv('complaint_information.csv')