## OO Tax rolls

E. Quinn 2/14/2020

This notebook uses pdfminer to extract the information from EG tax rolls

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

## Import standard python datascience packages

In [None]:
import math
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
!pwd

## Read the pdf and create a dictionary with the contents of each text box

### Function read_pdf() reads a PDF and returns a dictionary containing the contents

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the left edge of the text box
- x1 horizontal coordinate of the right edge of the text box
- y0 vertical coordinate of the bottom edge of the text box (PDF coordinates are measured upward from the bottom of the page)
- y1 vertical coordinate of the top edge of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all characters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object
    

In [None]:
def read_pdf(path):
    """Read a PDF and return the contents of its horizontal text boxes.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file to read.

    Returns
    -------
    dict
        ``{page_number: {text_box_number: box}}`` where page numbers start
        at 0, text box numbers start at 1 within each page, and each ``box``
        is a dict with the keys ``'x0'``, ``'x1'``, ``'y0'``, ``'y1'``
        (bounding-box coordinates rounded to 2 decimals) and ``'text'``
        (the box text with all non-ASCII characters dropped).
    """
    rsrcmgr = PDFResourceManager()                              # shared resource manager
    laparams = LAParams()                                       # default layout-analysis parameters
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)      # collects the layout of each page
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pdf = {}                                                    # results, keyed by page number

    # The context manager guarantees the file handle is released even if
    # pdfminer raises part-way through the document.
    with open(path, 'rb') as document:
        for pageno, page in enumerate(PDFPage.get_pages(document)):   # pages are numbered from 0
            boxes = pdf[pageno] = {}                            # text boxes found on this page
            interpreter.process_page(page)
            layout = device.get_result()                        # layout object for the page just processed
            tbox_no = 0                                         # text boxes are numbered from 1
            for element in layout:
                # Matched by exact class name, so subclasses are deliberately
                # not picked up (same selection as before).
                if type(element).__name__ != 'LTTextBoxHorizontal':
                    continue
                tbox_no += 1
                # Drop anything that is not ASCII, then go straight back to str.
                txt = element.get_text().encode('ascii', 'ignore').decode('ascii')
                boxes[tbox_no] = {
                    'x0': round(element.x0, 2),
                    'x1': round(element.x1, 2),
                    'y0': round(element.y0, 2),
                    'y1': round(element.y1, 2),
                    'text': txt,
                }
    return pdf

### Read the tax roll pdf

In [None]:
def read_taxroll(tr):
    """Build ``account`` objects from the text boxes returned by ``read_pdf``.

    Every line longer than 10 characters is examined.  A line whose first
    9 characters are all digits starts (or returns to) that account; the
    account then stays current for the following lines, text boxes and
    pages until another account number is seen.  Each examined line is
    attached to the current account as:

    * a bill, when ``find_plat`` locates a plat string past column 0;
    * the totals, when it contains ``'TOTALS'``;
    * an exemption, when it contains ``' EX '``.

    Lines that appear before any account number has been seen (for example
    page headers) are skipped, because there is no account to attach them to.

    Parameters
    ----------
    tr : dict
        ``{page: {text_box: {'text': str, ...}}}`` as produced by ``read_pdf``.

    Returns
    -------
    dict
        ``account`` objects keyed by their 9-character account number.
    """
    accts = {}
    acct_no = None                                  # no account seen yet

    for page in tr.values():                        # loop through pages
        for box in page.values():                   # loop through text boxes in page
            if 'text' not in box:
                continue
            for line in box['text'].split('\n'):
                if len(line) <= 10:                 # only look at lines longer than 10 chars
                    continue
                if line[0:9].isdigit():             # first 9 chars all digits: account number
                    acct_no = line[0:9]
                    if acct_no not in accts:
                        accts[acct_no] = account(acct_no)
                if acct_no is None:
                    # Previously this case raised UnboundLocalError on the
                    # first bill/TOTALS/EX line of a roll with leading text.
                    continue
                if find_plat(line) > 0:             # plat string present: this is a bill line
                    accts[acct_no].add_bill(line)
                elif 'TOTALS' in line:              # totals line: pass text from 'TOTALS' onward
                    accts[acct_no].set_totals(line[line.find('TOTALS'):])
                elif ' EX ' in line:                # exemption line
                    # Slice from the same ' EX ' marker that was tested for,
                    # so an earlier word that merely starts with "EX" cannot
                    # shift the fixed-column parsing in add_exemption().
                    accts[acct_no].add_exemption(line[line.find(' EX '):])
    return accts

### Parse the tax roll document

In [None]:
def find_plat(tx):
    """Return the position of the plat string in ``tx``, or -1 if there is none.

    A plat string is a whitespace-delimited word of exactly 16 characters
    with hyphens at offsets 3, 7 and 11 and digits at offsets 0-1, 4-5,
    8-9 and 12-14 (offsets 2, 6, 10 and 15 are not checked).  When several
    words qualify, the last one in the line is used.  The value returned is
    ``tx.find(word)``, i.e. the first place that word occurs as a substring.
    """
    hyphen_offsets = (3, 7, 11)
    digit_slices = ((0, 2), (4, 6), (8, 10), (12, 15))

    def looks_like_plat(token):
        return (len(token) == 16
                and all(token[pos] == '-' for pos in hyphen_offsets)
                and all(token[lo:hi].isdigit() for lo, hi in digit_slices))

    # Scanning from the right finds the last qualifying word first.
    for token in reversed(tx.split()):
        if looks_like_plat(token):
            return tx.find(token)
    return -1

def get_float(st):
    """Convert a numeric string to float, ignoring any comma separators.

    Raises ValueError when what remains after removing commas is not a
    valid float literal.
    """
    return float(st.replace(',', ''))
        
class account():
    """Everything collected from the tax roll for one account number.

    Attributes
    ----------
    acct_no : str
        Account number as given to the constructor.
    bills : dict
        ``bill`` objects keyed 1, 2, 3, ... in the order they were added.
    totals_text : str or None
        Text most recently passed to ``set_totals``.
    totals_valuation, totals_tax : float
        Values parsed from the totals text; NaN until parsed.
    exemptions : dict
        Exemption amount keyed by exemption type.
    """

    def __init__(self, acct_no):
        """Create an empty account for ``acct_no``."""
        self.acct_no = acct_no
        self.bills = {}
        self.totals_text = None
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        self.totals_valuation = np.nan
        self.totals_tax = np.nan
        self.exemptions = {}

    def get_acct_no(self):
        """Return the account number."""
        return self.acct_no

    def get_bills(self):
        """Return the bills dictionary."""
        return self.bills

    def get_exemptions(self):
        """Return the exemptions dictionary."""
        return self.exemptions

    def get_bill_count(self):
        """Return the number of bills added so far."""
        return len(self.bills)

    def set_totals(self, text):
        """Store the totals text and parse valuation and tax from it.

        ``text`` is split on whitespace; word 1 is taken as the valuation
        and word 2 as the tax (word 0 is expected to be the ``TOTALS``
        marker).  If either word is missing, whatever could be parsed is
        kept and the account is reported on stdout unless a tax figure is
        already stored.

        Raises
        ------
        ValueError
            If word 1 or word 2 is present but not numeric.
        """
        self.totals_text = text
        words = text.split()
        try:
            self.totals_valuation = get_float(words[1])     # word 1 is valuation
            self.totals_tax = get_float(words[2])           # word 2 is tax
        except IndexError:
            if np.isnan(self.totals_tax):                   # nothing saved from an earlier totals line
                print("Index Error: ",self.get_acct_no(),text)   # flag this account

    def get_totals_text(self):
        """Return the stored totals text (None if never set)."""
        return self.totals_text

    def get_totals_valuation(self):
        """Return the valuation parsed from the totals text (NaN if none)."""
        return self.totals_valuation

    def get_totals_tax(self):
        """Return the tax parsed from the totals text (NaN if none)."""
        return self.totals_tax

    def get_bill(self, i):
        """Return bill number ``i`` (1-based), or None if there is no such bill."""
        return self.bills.get(i)

    def add_exemption(self, exemp):
        """Record an exemption from a fixed-column exemption string.

        The first 3 characters of ``exemp`` are skipped (the ``' EX'``
        marker), characters 3-39 give the exemption type and everything
        from character 40 on gives the amount.  Amounts for a type that
        was already recorded are added together.

        Raises
        ------
        ValueError
            If the amount portion is empty or not numeric.
        """
        xm = exemp[3:40].strip()                    # exemption type
        examt = get_float(exemp[40:].strip())       # exemption amount
        if xm in self.exemptions:
            self.exemptions[xm] += examt            # duplicate type: accumulate
        else:
            self.exemptions[xm] = examt

    def add_bill(self, bil):
        """Parse the bill line ``bil`` and store it under the next bill index."""
        ix = 1 + len(self.bills)                    # bills are numbered from 1
        self.bills[ix] = bill(bil)
    
class bill():
    """One bill line of the tax roll, split into its fields.

    Attributes
    ----------
    text : str
        The unmodified bill line.
    platt : str
        The plat string: the last 16-character word whose 4th character is
        a hyphen ('' if the line has none).
    re_class : str
        Real estate class: the text from 7 characters past the ``' RE '``
        marker up to the plat string.
    bill_no : str
        The word immediately before ``' RE '`` when it is all digits.
    state_code : str
        The first word after ``' RE '``.
    tax, valuation : float
        Amounts taken from the end of the line; NaN when not found.
    address : str
        The text between the plat string and the valuation.

    Fields that cannot be located are left at their "missing" value
    ('' or NaN) rather than raising.
    """

    def __init__(self, text):
        """Parse the bill line ``text``."""
        self.text = text
        self.platt = ''
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        self.tax = np.nan
        self.valuation = np.nan
        self.re_class = ''
        self.bill_no = ''
        self.address = ''
        self.state_code = ''

        words = text.split()

        # --- plat string: last word of length 16 with a hyphen at offset 3
        platt_ix = -1                               # -1 means "no plat on this line"
        for word in words:
            if len(word) == 16 and word[3] == '-':
                self.platt = word
                platt_ix = text.find(word)          # start position, used for slicing below

        # --- fields positioned relative to the ' RE ' marker
        reix = text.find(' RE ')
        if reix >= 0:                               # never slice with find()'s -1
            if platt_ix >= 0:
                self.re_class = text[reix + 7:platt_ix].strip()
            before = text[:reix].split()
            if before and before[-1].isdigit():     # bill number directly precedes ' RE '
                self.bill_no = before[-1]
            after = text[reix + 3:].split()
            if after:
                self.state_code = after[0]

        # --- tax / valuation from the end of the line
        last_word = words[-1] if words else ''
        # rfind, not find: the same digits may occur earlier in the line
        # (account number, house number), and the token wanted is the final one.
        end_ix = text.rfind(last_word)
        number = last_word.replace(',', '')         # commas stripped, so plain float() suffices
        if '.' in number:                           # a decimal point marks the tax amount
            try:
                self.tax = float(number)
            except ValueError:
                # Not a number after all (e.g. a trailing "ST."): treat it
                # like any other non-numeric last word and leave tax as NaN.
                self.tax = np.nan
        elif number.isdigit():                      # whole number: valuation only, no tax due
            self.valuation = float(number)
            self.tax = 0.0

        # --- valuation is the word before the last one, if not found yet
        if np.isnan(self.valuation):
            head = text[:end_ix]                    # everything before the last word
            head_words = head.split()
            prev_word = head_words[-1] if head_words else ''
            number = prev_word.replace(',', '')
            if number.isdigit():
                self.valuation = float(number)
            end_ix = head.rfind(prev_word)          # address stops where this word starts

        # --- address: between the end of the plat string and the valuation
        if platt_ix >= 0:
            self.address = text[platt_ix + 16:end_ix].strip()

    def get_platt(self):
        """Return the plat string ('' if none was found)."""
        return self.platt

    def get_text(self):
        """Return the original bill line."""
        return self.text

    def get_class(self):
        """Return the real estate class."""
        return self.re_class

    def get_bill_no(self):
        """Return the bill number ('' if none was found)."""
        return self.bill_no

    def get_state_code(self):
        """Return the state code ('' if none was found)."""
        return self.state_code

    def get_address(self):
        """Return the address ('' if none was found)."""
        return self.address

    def get_tax(self):
        """Return the tax amount (NaN if none was found)."""
        return self.tax

    def get_valuation(self):
        """Return the valuation (NaN if none was found)."""
        return self.valuation