## OO Tax rolls

E. Quinn 2/14/2020

This notebook uses pdfminer to extract the information from EG tax rolls

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

## Import standard python datascience packages

In [None]:
import math
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
!pwd

### set path for tax roll files

In [None]:
# Directory holding the tax roll PDFs, relative to the notebook's working
# directory.  Keep the trailing slash: the file name is appended to this value
# by plain string concatenation when the rolls are read.
taxroll_path = '../../../re_tax_rolls/EG_taxrolls/'

## Read the pdf and create a dictionary with the contents of each text box

### Function read_pdf() reads a PDF and returns a dictionary containing the contents

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the left edge of the text box
- x1 horizontal coordinate of the right edge of the text box
- y0 vertical coordinate of the bottom edge of the text box (PDF coordinates are measured upward from the bottom of the page)
- y1 vertical coordinate of the top edge of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all characters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object
    

In [None]:
def read_pdf(path):
    """Extract the horizontal text boxes of a PDF, page by page.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    dict
        ``{page_number: {box_number: box}}``.  Pages are numbered from 0 in
        document order and boxes from 1 in layout order.  Each ``box`` is a
        dictionary with the bounding-box values ``'x0'``, ``'x1'``, ``'y0'``
        and ``'y1'`` (rounded to two decimals) and ``'text'`` (the text of the
        box with every non-ASCII character dropped).
    """
    rsrcmgr = PDFResourceManager()                                  #create a resource manager
    laparams = LAParams()                                           #default parameters for layout analysis
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)          #device that collects the layout of a page
    interpreter = PDFPageInterpreter(rsrcmgr, device)               #interpreter that feeds pages to the device

    pdf = {}                                                        #dictionary to hold the results

    # The context manager closes the file even if parsing fails.  The page
    # generator reads lazily, so the loop has to stay inside the block.
    with open(path, 'rb') as document:
        for pageno, page in enumerate(PDFPage.get_pages(document)): #page numbers start at zero
            interpreter.process_page(page)                          #analyze the page ...
            layout = device.get_result()                            #... and fetch its layout object
            boxes = {}                                              #text boxes found on this page
            tbox_no = 0                                             #text box counter, first box is 1
            for element in layout:
                if not isinstance(element, LTTextBoxHorizontal):    #only horizontal text boxes are kept
                    continue
                tbox_no += 1
                # encode/decode round trip drops anything that is not ASCII
                text = element.get_text().encode('ascii', 'ignore').decode('ascii')
                boxes[tbox_no] = {
                    'x0': round(element.x0, 2),
                    'x1': round(element.x1, 2),
                    'y0': round(element.y0, 2),
                    'y1': round(element.y1, 2),
                    'text': text,
                }
            pdf[pageno] = boxes
    return pdf

### Parse the tax roll document

In [None]:
def find_plat(tx):
    """Return the position of a plat identifier within a line of text.

    A whitespace-delimited word counts as a plat identifier when it is exactly
    16 characters long, has hyphens at offsets 3, 7 and 11, and has digits at
    offsets 0-1, 4-5, 8-9 and 12-14.  The characters at offsets 2, 6, 10 and
    15 are not inspected.

    Returns the index in ``tx`` of the first occurrence of the last matching
    word, or -1 when no word matches.
    """
    digit_spans = ((0, 2), (4, 6), (8, 10), (12, 15))   #slices that must be all digits
    hyphen_slots = (3, 7, 11)                           #offsets that must hold a hyphen

    found_at = -1
    for token in tx.split():
        if len(token) != 16:
            continue
        if not all(token[pos] == '-' for pos in hyphen_slots):
            continue
        if not all(token[lo:hi].isdigit() for lo, hi in digit_spans):
            continue
        found_at = tx.find(token)                       #later matches replace earlier ones
    return found_at

def get_float(st):
    """Convert a numeric string to a float, ignoring thousands separators.

    Parameters
    ----------
    st : str
        Text such as ``'1,234.50'``.

    Returns
    -------
    float
        The parsed value, or NaN when the text (with commas removed) is not a
        valid number.
    """
    try:
        return float(st.replace(',', ''))
    except ValueError:
        return np.nan                   #np.nan: the np.NaN alias no longer exists in NumPy 2

def get_last3_numeric(txt):
    """Return the magnitudes of the last three words of a line, last word first.

    Parameters
    ----------
    txt : str
        A line of text.

    Returns
    -------
    list of float
        Always three entries: index 0 is the final word, index 1 the word
        before it, index 2 the one before that.  Each entry is the absolute
        value of the word parsed by ``get_float``; it is NaN when the word is
        not numeric or when the line has too few words.
    """
    last3 = [np.nan, np.nan, np.nan]                    #defaults for missing or non-numeric words
    trailing = txt.split()[-3:]                         #at most the last three words
    for i, word in enumerate(reversed(trailing)):       #walk them from the end of the line
        last3[i] = abs(get_float(word))                 #get_float already strips the commas
    return last3
        
class account():                                        #account class
    """One account from the tax roll: its bills, exemptions and totals line.

    Attributes are filled in as the lines that follow the account number are
    parsed: ``bills`` maps a 1-based sequence number to a ``bill`` object,
    ``exemptions`` maps an exemption description to its accumulated amount,
    and the ``totals_*`` fields hold what was read from the TOTALS line.
    """

    def __init__(self,acct_no):                         #account class constructor
        """Create an empty account identified by ``acct_no``."""
        self.acct_no = acct_no                          #  acct_no
        self.bills = {}                                 #  bills dictionary
        self.totals_text = None                         #  totals line, None until set_totals() is called
        self.totals_valuation = np.nan                  #  valuation from the totals line
        self.totals_tax = np.nan                        #  tax from the totals line
        self.exemptions = {}                            #  exemptions dictionary

    def get_acct_no(self):                              #return acct_no
        """Return the account number."""
        return self.acct_no

    def get_bills(self):                                #return bills dictionary
        """Return the dictionary of bills keyed by 1-based sequence number."""
        return self.bills

    def get_exemptions(self):                           #return exemptions dictionary
        """Return the dictionary of exemption amounts keyed by description."""
        return self.exemptions

    def get_bill_count(self):                           #return bill count
        """Return the number of bills recorded for this account."""
        return len(self.bills)

    def set_totals(self,text):                          #set totals text and values
        """Record the totals line and the valuation and tax found in it.

        ``text`` is kept verbatim as the totals text.  Of the words that
        follow ``'TOTALS'``, the first is taken as the valuation when it
        consists of digits and commas only, and the last is taken as the tax
        when it contains a decimal point.  Values that do not qualify are
        left unchanged.
        """
        self.totals_text = text                         #keep the line so get_totals_text() can return it
        words = text[text.find('TOTALS') + 6:].split()  #words after 'TOTALS'
        if not words:
            return
        first, last = words[0], words[-1]
        if first.replace(',','').isdigit():
            self.totals_valuation = get_float(first)
        if '.' in last:
            self.totals_tax = get_float(last)

    def get_totals_text(self):                          #return totals text
        """Return the totals line passed to ``set_totals``, or None."""
        return self.totals_text

    def get_totals_valuation(self):                     #return valuation from totals
        """Return the valuation read from the totals line (NaN if unknown)."""
        return self.totals_valuation

    def get_totals_tax(self):                           #return tax from totals
        """Return the tax read from the totals line (NaN if unknown)."""
        return self.totals_tax

    def get_bill(self,i):                               #return a specific bill
        """Return bill number ``i``, or None when there is no such bill."""
        return self.bills.get(i)

    def add_exemption(self,exemp):                      #add exemption to dictionary
        """Add one exemption line to the exemptions dictionary.

        The first three characters of ``exemp`` (the exemption marker) are
        discarded.  The last word of what remains is the amount, stored as an
        absolute value; the text before it, stripped, is the description.
        Amounts for a description that is already present are added together.
        """
        xmp = exemp[3:]                                 #drop the marker at the start of the line
        words = xmp.split()
        last = words[-1] if words else ''               #amount is the last word
        # rfind: the amount is the final word, so search from the right in
        # case the same digits also occur inside the description
        xm = xmp[:xmp.rfind(last)].strip()
        examt = abs(get_float(last))
        self.exemptions[xm] = self.exemptions.get(xm, 0.0) + examt

    def add_bill(self,bil):                             #add bill to dictionary
        """Parse the line ``bil`` into a ``bill`` and store it under the next number."""
        ix = 1+len(self.bills)                          #bills are numbered from 1
        self.bills[ix] = bill(bil)
    
class bill():                                           #bill class
    """One bill line of the tax roll, split into its fields.

    The constructor parses the raw line; the getters return the pieces.
    Fields that cannot be located keep their "missing" value: an empty string
    for text fields and NaN for the tax and the valuation.
    """

    def __init__(self,text):                            #constructor
        """Parse the bill line ``text``.

        Parsing rules, all taken relative to the raw line:

        - platt: the last word that is 16 characters long with a hyphen as
          its fourth character.
        - class: the text between seven characters past the start of
          ``' RE '`` and the start of the platt.
        - bill number: the last word before ``' RE '``, when it is all digits.
        - state code: the first word after ``' RE'``.
        - tax: the last word of the line, as a number.
        - valuation: the third word from the end when it is numeric and larger
          than the tax, otherwise the second word from the end.
        - address: the text between the end of the platt and the trailing
          numeric words.
        """
        self.text = text                                #set text
        self.platt = ''                                 #set platt missing
        self.tax = np.nan                               #set tax missing
        self.valuation = np.nan                         #set valuation missing
        self.re_class = ''                              #set real estate class missing
        self.bill_no = ''                               #set bill number missing
        self.address = ''                               #set address missing
        self.state_code = ''                            #set state code missing

        platt_ix = -1                                   #stays -1 when the line holds no platt
        for word in text.split():
            if len(word) == 16 and word[3] == '-':
                self.platt = word                       #save the platt string
                platt_ix = text.find(word)              #and where it starts

        # NOTE(review): when ' RE ' is absent reix is -1 and the slices below
        # are taken relative to that; confirm every bill line carries ' RE '.
        reix = text.find(' RE ')                        #look for the ' RE ' substring
        if platt_ix >= 0:
            self.re_class = text[reix+7:platt_ix].strip()

        before_re = text[:reix].split()                 #words up to ' RE '
        if before_re and before_re[-1].isdigit():
            self.bill_no = before_re[-1]

        after_re = text[reix+3:].split()                #words beyond ' RE'
        if after_re:
            self.state_code = after_re[0]

        nums = get_last3_numeric(text)                  #numeric values of the last 3 words, last first
        self.tax = nums[0]                              #last number is the tax
        # A NaN third value compares False, so this one test covers both the
        # "only two numbers" case and the "third number is not larger" case.
        if nums[2] > nums[0]:
            self.valuation = nums[2]
            numeric_values = 3                          #three trailing words are numbers
        else:
            self.valuation = nums[1]
            numeric_values = 2                          #two trailing words are numbers

        if platt_ix >= 0:
            # Cut the trailing numeric words off from the right, so digits
            # that also occur earlier in the line cannot shorten the address.
            head = text.rsplit(None, numeric_values)[0]
            self.address = head[platt_ix+16:].strip()

    def get_platt(self):                                #return plat
        """Return the platt string ('' if none was found)."""
        return self.platt

    def get_text(self):                                 #return text
        """Return the raw bill line."""
        return self.text

    def get_class(self):
        """Return the real estate class text."""
        return self.re_class

    def get_bill_no(self):
        """Return the bill number ('' if none was found)."""
        return self.bill_no

    def get_state_code(self):
        """Return the state code ('' if none was found)."""
        return self.state_code

    def get_address(self):
        """Return the property address ('' if none was found)."""
        return self.address

    def get_tax(self):
        """Return the tax amount (NaN if missing)."""
        return self.tax

    def get_valuation(self):
        """Return the valuation (NaN if missing)."""
        return self.valuation

### Function reads the tax roll pdf

In [None]:
def read_taxroll(tr):
    """Decode the text boxes of a tax roll into account objects.

    Parameters
    ----------
    tr : dict
        ``{page: {box: {'text': str, ...}}}`` as produced by ``read_pdf``.

    Returns
    -------
    dict
        Account number (the nine-digit string that starts an account line)
        mapped to its ``account`` object.

    Only lines longer than 10 characters are examined.  A line that starts
    with nine digits opens (or re-selects) an account.  Every examined line is
    then attached to the current account as a bill when it contains a plat
    identifier, as the totals when it contains ``'TOTALS'``, or as an
    exemption when it contains ``' EX '``.  The current account carries over
    from box to box and page to page.  Lines seen before the first account
    line are skipped because there is nothing to attach them to.
    """
    accts = {}
    acct_no = None                                      #no account seen yet

    for page in tr.values():                            #loop through pages
        for box in page.values():                       #loop through text boxes in page
            if 'text' not in box:                       #look at 'text' elements only
                continue
            for line in box['text'].split('\n'):        #loop through lines
                if len(line) <= 10:                     #only look at lines longer than 10 chars
                    continue
                if line[0:9].isdigit():                 #first nine chars are digits: account number
                    acct_no = line[0:9]
                    if acct_no not in accts:            #add account object if it's new
                        accts[acct_no] = account(acct_no)
                if acct_no is None:                     #detail line before any account line
                    continue
                current = accts[acct_no]
                # NOTE(review): '> 0' ignores a plat at the very start of a
                # line; kept as in the original, confirm that is intended.
                if find_plat(line) > 0:                 #plat present: this is a bill
                    current.add_bill(line)
                elif 'TOTALS' in line:                  #totals line, passed from 'TOTALS' onward
                    current.set_totals(line[line.find('TOTALS'):])
                elif ' EX ' in line:                    #exemption, passed from ' EX ' onward
                    current.add_exemption(line[line.find(' EX '):])
    return accts

### Read the tax roll documents and save the decoded contents

In [None]:
# Read one tax roll PDF per fiscal year.  `pdfs` keeps the raw text boxes and
# `taxrolls` the decoded accounts, both keyed by fiscal year.
pdfs = {}
taxrolls = {}

# The file name carries the fiscal year and the assessment date, which is
# 31 December two years earlier.
name_pattern = '{}EG_RE_Tax_Roll_FY{}_assessed_12_31_{}.pdf'

#fyears = np.arange(1994,2021)                          #full range, currently disabled
fyears = np.arange(2010,2021)
for fy in fyears:
    fn = name_pattern.format(taxroll_path, fy, fy-2)
    print(fn)                                           #show which file is being read
    pdfs[fy] = read_pdf(fn)
    taxrolls[fy] = read_taxroll(pdfs[fy])

### Show the fiscal years read

In [None]:
taxrolls.keys()

In [None]:
# Regroup the bills by property.  The result has the shape
#   platts[platt] = {'address': str,
#                    'classes': {class: {fiscal year: {'tax': float,
#                                                      'valuation': float}}}}
# The address is the one on the first bill seen for the platt.  When a platt
# has several bills of the same class in one year, both the tax and the
# valuation are added up, so that tax/valuation remains a meaningful rate.
platts = {}

for fyear, accts in taxrolls.items():                           #loop through fiscal years
    for acct in accts.values():                                 #loop through accounts
        for b in acct.get_bills().values():                     #loop through the account's bills
            p = b.get_platt()
            entry = platts.setdefault(p, {'address': b.get_address(), 'classes': {}})
            totals = entry['classes'].setdefault(b.get_class(), {}).setdefault(fyear, {})
            totals['tax'] = totals.get('tax', 0.0) + round(b.get_tax(),2)       #tax rounded to the penny
            totals['valuation'] = totals.get('valuation', 0.0) + b.get_valuation()

platts

### Show number of platts

In [None]:
len(platts)

### Compute median tax change percentages and directions

In [None]:
from collections import Counter                                 #stdlib tally for the rate frequencies

# Builds `rates`, keyed by fiscal year.  Every year gets
#   'rate'                 most frequent tax/valuation rate among single family homes
# and every year whose previous year is also present additionally gets
#   'prev_rate'            the previous year's rate
#   'rate_pct_chg'         percent change of the rate, rounded to 2 places
#   'gt' / 'le'            homes whose tax went up / did not go up
#   'veq' / 'vneq'         homes whose valuation stayed the same / changed
#   'rate_pct_chg_gt_tax'  homes whose tax changed by less than the rate did (in magnitude)
#   'rate_pct_chg_le_tax'  homes whose tax changed by at least as much as the rate did
#   'median_tax_pct_chg'   median percent change of the tax bills, rounded to 2 places

#--- 1. tally the rate of every single family home, by fiscal year
rf = {}                                                         #fiscal year -> Counter of rates

for p in platts:                                                #loop through platts
    by_year = platts[p]['classes'].get('ONE FAMI')              #select only single family homes
    if by_year is None:
        continue
    for fy, amounts in by_year.items():                         #loop through fiscal years
        counts = rf.setdefault(fy, Counter())                   #register the year even if no rate results
        t = amounts['tax']
        v = amounts['valuation']
        if not (v > 0) or t != t:                               #zero/missing valuation or missing tax:
            continue                                            #no rate can be computed
        counts[round(t/(v/1000),2)] += 1                        #rate per 1,000 of valuation

#--- 2. the most frequent rate of each year
rates = {}

for fy, counts in rf.items():
    rates[fy] = {'rate': np.nan}                                #NaN when the year produced no rate at all
    if counts:
        rates[fy]['rate'] = max(counts, key=counts.get)         #ties go to the rate tallied first

#--- 3. change of the rate against the previous year, and zeroed counters
rate_chg = {}                                                   #fiscal year -> unrounded rate change
tax_chg = {}                                                    #fiscal year -> list of tax changes per home

for fy in rates:
    if fy-1 not in rates:
        continue
    prev = rates[fy-1]['rate']
    # a zero previous rate has no percentage change; report NaN instead of crashing
    rate_chg[fy] = 100*((rates[fy]['rate']/prev) - 1.0) if prev else np.nan
    tax_chg[fy] = []
    rates[fy].update({
        'prev_rate': prev,
        'rate_pct_chg': round(rate_chg[fy],2),
        'gt': 0,
        'le': 0,
        'veq': 0,
        'vneq': 0,
        'rate_pct_chg_gt_tax': 0,
        'rate_pct_chg_le_tax': 0,
    })

#--- 4. compare every home's bill with its bill of the previous year
for p in platts:
    by_year = platts[p]['classes'].get('ONE FAMI')
    if by_year is None:
        continue
    for fy in sorted(by_year):
        if fy-1 not in by_year:
            continue
        # Step 1 registered every year of every home, so the previous year is
        # in `rates` and step 3 has initialised the counters for `fy`.
        stats = rates[fy]
        now, before = by_year[fy], by_year[fy-1]
        t1, t2 = now['tax'], before['tax']
        try:
            tax_pct_chg = 100*((t1/t2)-1.0)
        except ZeroDivisionError:
            pass                                                #no tax last year: change undefined, leave it out
        else:
            tax_chg[fy].append(tax_pct_chg)
            if abs(rate_chg[fy]) > abs(tax_pct_chg):
                stats['rate_pct_chg_gt_tax'] += 1
            else:
                stats['rate_pct_chg_le_tax'] += 1
        if t1 > t2:
            stats['gt'] += 1
        else:
            stats['le'] += 1
        if now['valuation'] == before['valuation']:
            stats['veq'] += 1
        else:
            stats['vneq'] += 1

#--- 5. median change of the tax bills
for fy, changes in tax_chg.items():
    rates[fy]['median_tax_pct_chg'] = round(np.median(changes),2)

### Compute change components

In [None]:
# Split each single family home's year-over-year tax change into three parts
# (rates are per 1,000 of valuation, hence the division):
#   previous valuation x change in rate
#   previous rate      x change in valuation
#   change in rate     x change in valuation
# and print the parts followed by their sum for comparison with the actual change.
for p, info in platts.items():
    homes = info['classes'].get('ONE FAMI')
    if homes is None:
        continue
    for fy in sorted(homes):
        fy2 = fy-1
        if fy2 not in homes:
            continue
        current, previous = homes[fy], homes[fy2]
        r2 = rates[fy2]['rate']
        v2 = previous['valuation']
        delta_r = round(rates[fy]['rate']-r2,2)
        delta_v = round(current['valuation']-v2,2)
        delta_t = round(current['tax']-previous['tax'],2)
        delta_t_dr = round(v2*delta_r/1000.,2)
        delta_t_dv = round(r2*delta_v/1000.,2)
        delta_t_drdv = round(delta_r*delta_v/1000,2)
        print(p,fy,delta_t,delta_r,delta_v)
        print(delta_t_dr,delta_t_dv,delta_t_drdv)
        print(round(delta_t_dr+delta_t_dv+delta_t_drdv,2))
                    


### Display raw results dictionary

In [None]:
rates

### Standard deviation and correlation - revaluation years

In [None]:
# Revaluation years: fiscal years in which more homes changed valuation than kept it.
selected = [fy for fy in rates
            if 'rate_pct_chg' in rates[fy] and rates[fy]['vneq'] > rates[fy]['veq']]

rate_chg_list = np.array([rates[fy]['rate_pct_chg'] for fy in selected], dtype=float)
median_chg_list = np.array([rates[fy]['median_tax_pct_chg'] for fy in selected], dtype=float)

rate_sd = round(np.std(rate_chg_list),2)
median_sd = round(np.std(median_chg_list),2)
corr = round(np.corrcoef(rate_chg_list,median_chg_list)[0,1],2)

print('standard deviation of percentage changes in the tax rate',rate_sd)
print('standard deviation of the median percentage change in the tax bills',median_sd)
print('correlation of percent changes - rates and bills',corr)

### Standard deviation and correlation - nonrevaluation years

In [None]:
# Non-revaluation years: fiscal years in which fewer homes changed valuation than kept it.
selected = [fy for fy in rates
            if 'rate_pct_chg' in rates[fy] and rates[fy]['vneq'] < rates[fy]['veq']]

rate_chg_list = np.array([rates[fy]['rate_pct_chg'] for fy in selected], dtype=float)
median_chg_list = np.array([rates[fy]['median_tax_pct_chg'] for fy in selected], dtype=float)

rate_sd = round(np.std(rate_chg_list),2)
median_sd = round(np.std(median_chg_list),2)
corr = np.corrcoef(rate_chg_list,median_chg_list)[0,1]         #printed unrounded

print('standard deviation of percentage changes in the tax rate',rate_sd)
print('standard deviation of the median percentage change in the tax bills',median_sd)
print('correlation of percent changes - rates and bills',corr)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# data to plot
rate_chg_list = []
median_chg_list = []

for fy in rates.keys():
    if ('rate_pct_chg' in rates[fy].keys()):
        rate_chg_list.append(rates[fy]['rate_pct_chg'])
        median_chg_list.append(rates[fy]['median_tax_pct_chg'])
        
n_groups = len(rate_chg_list)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.5
opacity = 0.8

rects1 = plt.bar(index, rate_chg_list, bar_width,
alpha=opacity,
color='b',
label='Percent Change in Tax Rate')

rects2 = plt.bar(index + bar_width, median_chg_list, bar_width,
alpha=opacity,
color='g',
label='Median Percent Change in Tax Bills')

plt.xlabel('Fiscal Year')
plt.ylabel('Percent Change')
plt.rc('axes', titlesize=22)     # fontsize of the axes title
plt.rc('axes', labelsize=16)    # fontsize of the x and y labels
plt.title('Percent Change in Tax Rate and Median Percent Change in Tax Bills')
plt.xticks(index + bar_width, (np.arange(1995,2021)))
plt.legend()
plt.rcParams.update({'figure.autolayout': True})
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(12.5, 5.5)
fig.savefig('../../finance_subcommittee/Tax_rate_vs_tax_bill.png', dpi=100)

In [None]:
# Write one CSV row per property that has single family home bills:
#   "platt","address", then for each fiscal year from 2020 down to 1994 the
#   tax and the valuation, left empty for years without a bill.
# There is no header row.
with open('../../rate_vs_tax.csv', 'w') as cvsout:

    for p in platts:
        homes = platts[p]['classes'].get('ONE FAMI')
        if homes is None:
            continue
        # double any embedded quote so the quoted fields stay valid CSV
        fields = ['"' + p.replace('"', '""') + '"',
                  '"' + platts[p]['address'].replace('"', '""') + '"']
        for fy in range(2020, 1993, -1):
            if fy in homes:
                fields.append(str(round(homes[fy]['tax'],2)))
                fields.append(str(homes[fy]['valuation']))
            else:
                fields.extend(['', ''])                 #two empty columns keep the years aligned
        cvsout.write(','.join(fields) + '\n')