### Preprocessing: OCR and preparation of City of Seattle homeless encampment sweep PDF data, for use in data visualization and NLP alongside Find It, Fix It requests to the City of Seattle (obtained from a FOI request)

PDFs downloaded from: https://www.seattle.gov/homelessness/unauthorized-encampments/encampment-removals

In [1]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import sys 
import datetime as dt
import string
#OCR related
from PIL import Image 
import pytesseract 
from pdf2image import convert_from_path 
# Location of the Tesseract executable that pytesseract shells out to.
# Defaults to the usual Windows install path; set the TESSERACT_CMD environment
# variable to point at a different binary without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract.exe')

# Widen pandas' display limits so long OCR strings and wide frames print in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "do not truncate cell contents". The old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.simplefilter('ignore')
# print floats in plain decimal notation rather than scientific notation
np.set_printoptions(suppress=True)

#### General plan:

1. convert each pdf to a jpeg (using pdf2image convert_from_path)
2. extract the text from each jpeg with OCR (using pytesseract and PIL)

3. save the following info from the extracted text:
    * site_address
    * date_insp
    * date_cleanup
    * referred_by

In [11]:
#paths
# One root directory for all OCR artefacts, so relocating the project needs a
# single change. Set the FIFI_OCR_DIR environment variable to override the default.
base_dir = os.environ.get('FIFI_OCR_DIR',
                          'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR')
pdf_path = base_dir + '/sweep_PDFs'    # downloaded site-journal PDFs
jpeg_path = base_dir + '/sweep_JPEGs'  # first-page images rendered from the PDFs
text_path = base_dir + '/sweep_TXTs'   # folder for text output

In [3]:
#list pdfs in path
# os.listdir returns entries in arbitrary order and includes every file type,
# so keep only PDFs and sort them for a reproducible order across runs.
pdfs_in_path = sorted(f for f in os.listdir(pdf_path) if f.lower().endswith('.pdf'))
print(len(pdfs_in_path), '\n', pdfs_in_path[0])

#create path names for pdfs in path
pdf_path_names = [pdf_path + '/' + pdf for pdf in pdfs_in_path]

print(len(pdf_path_names))
pdf_path_names[0:3]

104 
 01-30-19 6th Ave and Yesler.pdf
104


['C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_PDFs/01-30-19 6th Ave and Yesler.pdf',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_PDFs/02-26-19 SW Florida St from 13th Ave SW to 11th Ave SW.pdf',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_PDFs/03.05.2019-Aurora-Ave-N-N-128th-St.pdf']

In [16]:
#convert pdf to jpeg images
#right now we only care about data contained on the first page of the pdf

for pdf in pdf_path_names:
    try:
        convert_from_path(pdf,
                          dpi=200,
                          # output_file is a file-name prefix used inside output_folder,
                          # so pass only the PDF's file name, not its full path
                          # (gives names like '<name>.pdf0001-01.jpg')
                          output_file=os.path.basename(pdf),
                          output_folder=jpeg_path,
                          first_page=1, last_page=1,
                          fmt='jpeg')
    except Exception as err:
        # Keep converting the remaining PDFs, but report which file failed and
        # why. Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print(pdf)
        print('    failed:', repr(err))

C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_PDFs/06-26-19 3rd _amp_ Walker OBSTRUCTION.pdf


In [18]:
#list jpegs in path
# os.listdir returns entries in arbitrary order and includes every file type,
# so keep only JPEG images and sort them for a reproducible order across runs.
jpegs_in_path = sorted(f for f in os.listdir(jpeg_path)
                       if f.lower().endswith(('.jpg', '.jpeg')))
print(len(jpegs_in_path), '\n', jpegs_in_path[0])

#create path names for jpegs in path (one jpeg per pdf: only the first page was converted)
jpeg_path_names = [jpeg_path + '/' + jpeg for jpeg in jpegs_in_path]

print(len(jpeg_path_names))
jpeg_path_names[0]

103 
 01-30-19 6th Ave and Yesler.pdf0001-01.jpg
103


'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/01-30-19 6th Ave and Yesler.pdf0001-01.jpg'

In [82]:
# OCR the first page image. The context manager closes the image file as soon
# as Tesseract has read it, instead of leaving the handle open.
with Image.open(jpeg_path_names[0]) as page_img:
    text = pytesseract.image_to_string(page_img)
# one list entry per OCR'd line, so each labelled field can be located by its line
text_split = text.split("\n")

In [21]:
# display the OCR'd lines to see which ones carry the fields we want to extract
text_split

['G)} city of Seattle SITE JOURNAL',
 '',
 'Encampment Response Team',
 '',
 'SITE JOURNAL CONTENTS',
 '',
 '. Completed Site Journal . Exhibit C: Outreach Report',
 '. Exhibit A: Site Inspection Photos ° Exhibit D: Clean Up & Storage Photos',
 '. Exhibit B: Site Posting Photos . Exhibit E: Storage Detail',
 '',
 'A. SITE INSPECTION',
 '',
 'To be filled out by the Field Coordinator prior to any Full Encampment Clean Up and as part of any Obstruction or Hazard Removal. Site Journals',
 '',
 'and photos should be saved in the appropriate folder in the G:\\FAC\\Encampments\\Encampment clean ups directory.',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 'Site: 6" Ave and Yesler Date of First Inspection: 01-14-19',
 '',
 'Site Address: 6" Ave and Yesler Date of Clean-Up: 01/30/2019',
 'Inspection By: — Laura Beck CSR # Not Applicable',
 '',
 'Referred By: SERIS, Community Photos to FAS? (1 Yes O No',
 '',
 ' ',
 '',
 'SITE OCCUPANCY DATA',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '

In [83]:
def _first_capture(pattern, lines):
    """Return capture group 1 of `pattern` searched in the first of `lines`.

    Returns None when `lines` is empty or the pattern does not match, so a page
    where OCR missed a label gives a missing value instead of an IndexError.
    """
    if not lines:
        return None
    found = re.search(pattern, lines[0])
    return found.group(1) if found else None


# date of first inspection: the space-delimited token right after "...Inspection:"
site_insp = [s for s in text_split if "Site:" in s]
insp_date = _first_capture(r"Inspection[^ ]* ([^ ]*)", site_insp)
print(insp_date)

# the "Site Address" line holds both the address and the clean-up date
add_cleanup = [s for s in text_split if "Site Address" in s]
site_add = _first_capture(r"Address: (.*) Date", add_cleanup)
print(site_add)
date = _first_capture(r"Clean-Up: (.*)", add_cleanup)
print(date)

# referral source: the text between "Referred By:" and the "Photos" label on that line
ref = [s for s in text_split if "Referred By:" in s]
ref_by = _first_capture(r"Referred By: (.*) Photos", ref)
print(ref_by)

01-14-19
6" Ave and Yesler
01/30/2019
SERIS, Community
