### Preprocessing OCR and preparation of City of Seattle homeless encampment sweep pdf data for use with data viz and NLP of Find it Fix It requests to the City of Seattle (from a FOI request)

PDFs downloaded from: https://www.seattle.gov/homelessness/unauthorized-encampments/encampment-removals

In [129]:
#getting and working with data
import pandas as pd
import numpy as np
import re
import os
import sys 
import datetime as dt
import string
#OCR related
from PIL import Image 
import pytesseract 
from pdf2image import convert_from_path 
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

#visualizing results
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('poster', rc={'font.size':35,
                              'axes.titlesize':50,
                              'axes.labelsize':35})

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

#### General plan:

1. convert each pdf to a jpeg (using pdf2image convert_from_path)
2. convert text in each jpeg to text (using pytesseract and PIL)

3. save the following info from the extracted text:
    * site_address
    * date_insp
    * date_cleanup
    * referred_by

In [3]:
#paths 
pdf_path = 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_PDFs'
jpeg_path = 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs'

In [None]:
#list pdfs in path
pdfs_in_path = os.listdir(pdf_path)
print(len(pdfs_in_path), '\n', pdfs_in_path[0])

#create path names for pdfs in path
pdf_path_names = []
for pdf in pdfs_in_path:
    path_name = pdf_path + '/' + pdf
    pdf_path_names.append(path_name)

print(len(pdf_path_names))
pdf_path_names[0:3]

In [None]:
#convert pdf to jpeg images
#right now we only care about data contained on the fist page of the pdf

for pdf in pdf_path_names:
    try:
        convert_from_path(pdf, 
                      dpi=200, 
                      output_file=pdf,
                      output_folder=jpeg_path, 
                      first_page=1, last_page=1, 
                      fmt='jpeg')
    except:
        print(pdf)

In [4]:
#list jpegs in path
jpegs_in_path = os.listdir(jpeg_path)
print(len(jpegs_in_path), '\n', jpegs_in_path[0])

#create path names for jpegs in path, jpegs have more than one page, save each page separately
jpeg_path_names = []
for jpeg in jpegs_in_path:
    path_name = jpeg_path + '/' + jpeg
    jpeg_path_names.append(path_name)

print(len(jpeg_path_names))
jpeg_path_names[0:10]

103 
 01-30-19 6th Ave and Yesler.pdf0001-01.jpg
103


['C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/01-30-19 6th Ave and Yesler.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/02-26-19 SW Florida St from 13th Ave SW to 11th Ave SW.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/03.05.2019-Aurora-Ave-N-N-128th-St.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/03.12.2019-8th-Ave-S-S-Holgate-St.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/03.18-21.2019 Fremont Troll Area.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/05-28-19 Klictitat Ave SW under the SW Spokane St and West Seattle Bridge.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/06-21-19 Corson Ave S and S Carstens Pl OBSTRUCTION.pdf0001-01.jpg',
 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/06-29-19 Hillside West of Sound Tr

In [5]:
def OCR_from_jpeg(path_name):
    ### takes in path of image, performs OCR, saves split text ###
    
    #OCR with pytesseract
    text = str(((pytesseract.image_to_string(Image.open(path_name))))) 
    text_split = text.split("\n")
    
    return text_split

In [10]:
def info_from_OCR(text_split, site_words, name):
    ### takes in path of image, performs OCR, saves specific info as dic ###
    
    sweep_OCR_dic = {}
    sweep_OCR_dic['name'] = name.split("/")[-1]
    
    #OCR with pytesseract
    text = str(((pytesseract.image_to_string(Image.open(path_name))))) 
    text_split = text.split("\n")
    
    #date of inspection
    for key_word in site_words:
        try:
            site_insp = [s for s in text_split if key_word in s]
            sweep_OCR_dic['date_insp'] = re.findall(r"Inspection:.*", site_insp[0])[0].split(" ")[-1]
        except:
            pass
            
    #address of site clean up
    for key_word in address_words:
        try:
            add_cleanup = [s for s in text_split if key_word in s]
            sweep_OCR_dic['site_address'] = re.findall(r"Address: (.*) Date", add_cleanup[0])[0]
        except:
            pass
    
    #date of clean up
    try:
        sweep_OCR_dic['date_cleanup'] = re.findall(r"(Clean-Up: |Date of clean: |Date of Clean-Up: )(.*)", add_cleanup[0])[0][-1]
    except:
        date_cleanup = [s for s in text_split if 'Date of Clean-Up: ' in s]
        sweep_OCR_dic['date_cleanup'] = re.findall(r"(Clean-Up: |Date of clean: |Date of Clean-Up: )(.*)", date_cleanup[0])[0][-1]
        
    #who referred the site for clean up
    ref = [s for s in text_split if "Referred By:" in s]
    sweep_OCR_dic['ref_by'] = re.findall(r"Referred By: (.*) (Photos|CSR|SPD)", ref[0])[0]

    return sweep_OCR_dic

In [12]:
sweep_info_df = pd.DataFrame()
except_paths = []
i=0
site_words = ['Site:', 'CSR Listing:', 'CRS Listing:', 'CSR Address:']
address_words = ['CSR Address:', 'Site Address:']

for path_name in jpeg_path_names:
    
    print(i)
    
    #OCR
    try:
        text_split = OCR_from_jpeg(path_name)
    except:
        print(path_name)
        print('cannot perform OCR')
    
    #get desired sweep metadata
    try:
        dic = info_from_OCR(text_split, site_words, name=path_name)
        df = pd.DataFrame.from_dict(dic, orient='index').T
    
        if sweep_info_df.shape[0] == 0:
            sweep_info_df = df
        else:
            sweep_info_df = sweep_info_df.append(df)

    except:
        print(path_name)
        print('cannot get info from text_split')
        except_paths.append(path_name)
        
    i+=1

print(sweep_info_df.shape)
print(len(except_paths), '\n')
sweep_info_df.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/1.8.19-1st-Ave-S-and-Denver-Ave-S.pdf0001-01.jpg
cannot get info from text_split
30
31
32
33
34
35
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/10-24-2018-WSBridge-Harbor-Ave-Spokane-St.pdf0001-01.jpg
cannot get info from text_split
36
37
38
39
40
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/11.01.18-Yesler-Way-6th-Ave-I-5-under-Yesler-Way-overpass.pdf0001-01.jpg
cannot get info from text_split
41
42
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/11.07.18-Airport-Way-S-from-S-Massachu.pdf0001-01.jpg
cannot get info from text_split
43
44
45
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/11.14.18-Georgetown-Pump-Station-along-8th-Ave-S-to-Myrtle-St.pdf0001-01.jpg
cannot get info from text_split
46
C:/Users/Schindler/Documents/ProgrammingFun/FIFI/OCR/sweep_JPEGs/11.20.18-

Unnamed: 0,date_cleanup,date_insp,name,ref_by,site_address
0,01/30/2019,01-14-19,01-30-19 6th Ave and Yesler.pdf0001-01.jpg,"(SERIS, Community, Photos)","6"" Ave and Yesler"
0,2/26/19,2/20/19,02-26-19 SW Florida St from 13th Ave SW to 11th Ave SW.pdf0001-01.jpg,"(SPU, Photos)",SW Florida St from 13th Ave SW to 11th Ave SW
0,03-05-19,01-14-19,03.05.2019-Aurora-Ave-N-N-128th-St.pdf0001-01.jpg,"(SPU, Photos)",Stone Ave N to Aurora Ave N from N 125th St to N 130th St
0,3/12/19,2/7/19,03.12.2019-8th-Ave-S-S-Holgate-St.pdf0001-01.jpg,"(SPU, Photos)",I-5 SB to the SODO Trail from S Massachusetts St toS Walker St
0,3/18-21/19,11/30/18,03.18-21.2019 Fremont Troll Area.pdf0001-01.jpg,"(CSR, Community, Photos)","N 36"" St to N 38* St from Linden Ave N to Winslow PLN"


In [127]:
#clean up data frame
sweep_info_df_final = sweep_info_df.reset_index(drop=True)
sweep_info_df_final.replace(np.nan, 'empty', inplace=True)
#remove 'photos' from ref column
sweep_info_df_final['ref_by'] = sweep_info_df_final['ref_by'].apply(lambda x: x[0] if x[-1] == 'Photos' else x)

#fix dates - multiple formats and some with a date span
sweep_info_df_final['date_insp'] = sweep_info_df_final['date_insp'].apply(lambda x: re.split('/|-', x))
sweep_info_df_final['date_insp'] = sweep_info_df_final['date_insp'].apply(lambda x: x[0]+'-'+x[1]+'-'+x[-1] if len(x) > 1 else np.nan)
sweep_info_df_final['date_insp'] = pd.to_datetime(sweep_info_df_final['date_insp'], infer_datetime_format=True, errors='ignore')

sweep_info_df_final['date_cleanup'] = sweep_info_df_final['date_cleanup'].apply(lambda x: re.split('/|-', x))
sweep_info_df_final['date_cleanup'] = sweep_info_df_final['date_cleanup'].apply(lambda x: x[0]+'-'+x[1]+'-'+x[-1] if len(x) > 1 else np.nan)
sweep_info_df_final['date_cleanup'] = pd.to_datetime(sweep_info_df_final['date_cleanup'], infer_datetime_format=True, errors='ignore')

sweep_info_df_final.head()


Unnamed: 0,date_cleanup,date_insp,name,ref_by,site_address
0,2019-01-30,01-14-19,01-30-19 6th Ave and Yesler.pdf0001-01.jpg,"SERIS, Community","6"" Ave and Yesler"
1,2019-02-26,2-20-19,02-26-19 SW Florida St from 13th Ave SW to 11th Ave SW.pdf0001-01.jpg,SPU,SW Florida St from 13th Ave SW to 11th Ave SW
2,2019-03-05,01-14-19,03.05.2019-Aurora-Ave-N-N-128th-St.pdf0001-01.jpg,SPU,Stone Ave N to Aurora Ave N from N 125th St to N 130th St
3,2019-03-12,2-7-19,03.12.2019-8th-Ave-S-S-Holgate-St.pdf0001-01.jpg,SPU,I-5 SB to the SODO Trail from S Massachusetts St toS Walker St
4,2019-03-18,11-30-18,03.18-21.2019 Fremont Troll Area.pdf0001-01.jpg,"CSR, Community","N 36"" St to N 38* St from Linden Ave N to Winslow PLN"


In [131]:
sweep_info_df_final.to_csv('sweep_info_df_final.csv')
except_paths = pd.DataFrame(data=except_paths)
except_paths.to_csv('except_paths.csv')

In [133]:
sweep_info_df_final['ref_by'].value_counts()

CSR, Community                   18
SPU                              8 
SERIS, Community                 8 
Community, CSR                   8 
SPU, Community, CSR              5 
SPU, SERIS, Community            4 
(Community,, CSR)                4 
SPU, CSR, Community              4 
(SPU,, CSR)                      3 
SPU, Community                   3 
WSDOT, SERIS, Community          2 
WSDOT, CSR, Community            2 
(WSDOT, Community,, CSR)         2 
WSDOT, Community, CSR            2 
Sound Transit, Community, CSR    2 
WSDOT                            2 
SPD, CSR, Community              2 
CSR, SPU, Community              1 
(SERIS, CRS, Community,, SPD)    1 
BNSF, Community, CSR             1 
SPU, CSR                         1 
CSR, WSDOT, Community            1 
SDOT, Sound Transit              1 
CSR, EPA, SPU                    1 
© CSR, Community                 1 
(WDSOT, Community,, CSR)         1 
Community/CSR                    1 
Citizen, SERIS              