## Read earnings reports (OO version)

E. Quinn 12/22/2019

This notebook uses pdfminer to extract the information from the individual earnings report

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

Maintenance:

* 3/6/2020  
  * Add check date and number
* 3/7/2020  
  * Align personnel classes with support professionals structure
  * Implement salary step capture for support professionals
* 4/8/2020
  * Rewrite logic to base data structure on check number and check date
  * Simplify payment decoding logic to take advantage of having check date
  * Data corrections for check dates and numbers:
    * Adjust 5 check dates to align with nearest payday
    * Generate 4 artificial check numbers for zero earnings lines
    
To do:
* Replace computed salary matrix for FY2020 with numbers from contract (correct small rounding errors)

## Import standard python datascience packages

In [None]:
import math
import re
import copy
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from datetime import datetime, timedelta, date
from datascience import *
import uuid

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
!pwd

### EG accounting codes class

provides descriptions for EG accounting codes and mapping to UCOA codes

In [None]:
class EG_acct_codes():
    """Lookup tables for EG MUNIS accounting codes.

    Attributes:
        EG_account_codes: maps an eight-character account code string to a
            short description (subject/role plus school name).
        local_to_ucoa: maps a six-character code string to a dictionary of
            UCOA fields ('Fund', 'Prog', 'Func', 'Sub', 'JC', 'Sub Desc').
            The keys match the first six characters of the account codes in
            EG_account_codes; callers are presumably expected to pass that
            prefix -- TODO confirm against the call sites.
    """

    def __init__(self):
        self.EG_account_codes ={
            '71100105': 'K Frenchtown',
            '71100107': 'K MDBK',
            '71109105': 'Title 1  Frenchtown',
            '71109107': 'Title 1  MDBK',
            '71110105': 'Grade 1 Frenchtown',
            '71110107': 'Grade 1 MDBK',
            '71120105': 'Grade 2 Frenchtown',
            '71120107': 'Grade 2 MDBK',
            '71121102': 'Art Eldredge',
            '71121103': 'Art Cole',
            '71121105': 'Art Frenchtown',
            '71121106': 'Art EGHS',
            '71121107': 'Art MDBK',
            '71121108': 'Art Hanaford',
            '71123103': 'ELA Cole',
            '71123106': 'ELA EGHS',
            '71123108': 'ELA Hanaford',
            '71124103': 'Foreign Language Cole',
            '71124106': 'Foreign Language EGHS',
            '71125102': 'PE/health Eldredge',
            '71125103': 'PE/health Cole',
            '71125105': 'PE/health Frenchtown',
            '71125106': 'PE/health EGHS',
            '71125107': 'PE/health MDBK',
            '71125108': 'PE/health Hanaford',
            '71126103': 'Tech Cole',
            '71126306': 'Tech EGHS',
            '71127103': 'Math Cole',
            '71127106': 'Math EGHS',
            '71128102': 'Music Eldredge',
            '71128103': 'Music Cole',
            '71128105': 'Music Frenchtown',
            '71128106': 'Music EGHS',
            '71128107': 'Music MDBK',
            '71128108': 'Music Hanaford',
            '71129103': 'Science Cole',
            '71129106': 'Science EGHS',
            '71130102': 'Grade 3 Eldredge',
            '71130105': 'Grade 3 Frenchtown',
            '71130108': 'Grade 3 Hanaford',
            '71130406': 'Business/Computer  EGHS',
            '71131103': 'Social Studies Cole',
            '71131106': 'Social Studies EGHS',
            '71140102': 'Grade 4 Eldredge',
            '71140108': 'Grade 4 Hanaford',
            '71140403': 'Computer Cole',
            '71140406': 'Computer EGHS',
            '71141102': 'Reading Eldredge',
            '71141103': 'Reading Cole',
            '71141105': 'Reading Frenchtown',
            '71141106': 'Reading EGHS',
            '71141107': 'Reading MDBK',
            '71141108': 'Reading Hanaford',
            '71150102': 'Grade 5 Eldredge',
            '71150108': 'Grade 5 Hanaford',
            '71180102': 'SPED Eldredge',
            # NOTE(review): codes ending in 03 are 'Cole' everywhere else in this
            # table; 'SPED EGHS' here may be a data-entry slip -- confirm.
            '71180103': 'SPED EGHS',
            '71180105': 'SPED Frenchtown',
            '71180106': 'SPED EGHS',
            '71180107': 'SPED MDBK',
            '71180108': 'SPED Hanaford',
            '71181102': 'SPED Life Skills Eldredge',
            '71181103': 'SPED Life Skills Cole',
            '71181105': 'SPED Life Skills Frenchtown',
            '71181106': 'SPED Life Skills EGHS',
            '71181107': 'SPED Life Skills MDBK',
            '71182107': 'SPED EGHS',
            '71191302': 'ESL Eldredge',
            '71191303': 'ESL Cole',
            '71191305': 'ESL Frenchtown',
            '71191306': 'ESL EGHS',
            '71191307': 'ESL MDBK',
            '71191308': 'ESL Hanaford',
            '71210202': 'Teacher Subs Eldredge',
            '71210203': 'Teacher Subs Cole',
            '71210205': 'Teacher Subs Frenchtown',
            '71210206': 'Teacher Subs EGHS',
            '71210207': 'Teacher Subs MDBK',
            '71210208': 'Teacher Subs Hanaford',
            '71210402': 'Long Term Subs Eldredge',
            '71210403': 'Long Term Subs Cole',
            '71210405': 'Long Term Subs Frenchtown',
            '71210406': 'Long Term Subs EGHS',
            '71210407': 'Long Term Subs MDBK',
            '71210408': 'Long Term Subs Hanaford',
            '71223102': 'Para Subs Eldredge',
            '71223103': 'Para Subs Cole',
            '71223105': 'Para Subs Frenchtown',
            '71223106': 'Para Subs EGHS',
            '71223107': 'Para Subs MDBK',
            '71223108': 'Para Subs Hanaford',
            '71231503': 'Guidance Cole',
            '71231506': 'Guidance EGHS',
            '71246702': 'Librarian Eldredge',
            '71246703': 'Librarian Cole',
            '71246705': 'Librarian Frenchtown',
            '71246706': 'Librarian EGHS',
            '71246707': 'Librarian MDBK',
            '71246708': 'Librarian Hanaford',
            '71269506': 'Nurse EGHS',
            '71270102': 'Nurse Subs Eldredge',
            '71270103': 'Nurse Subs Cole',
            '71270105': 'Nurse Subs Frenchtown',
            '71270106': 'Nurse Subs EGHS',
            '71270107': 'Nurse Subs MDBK',
            '71270108': 'Nurse Subs Hanaford',
            '71270302': 'Nurse Eldredge',
            '71270303': 'Nurse Cole',
            '71270305': 'Nurse Frenchtown',
            '71270306': 'Nurse EGHS',
            '71270307': 'Nurse MDBK',
            '71270308': 'Nurse Hanaford',
            '71301106': 'SPED EGHS',
            '71301602': 'Social Worker Eldredge',
            '71301603': 'Social Worker Cole',
            '71301605': 'Social Worker Frenchtown',
            '71301606': 'Social Worker EGHS',
            '71301607': 'Social Worker MDBK',
            '71301608': 'Social Worker Hanaford',
            '71302702': 'OT Eldredge',
            '71302703': 'OT Cole',
            '71302705': 'OT Frenchtown',
            '71302706': 'OT EGHS',
            '71302707': 'OT MDBK',
            '71302708': 'OT Hanaford',
            '71308102': 'Adaptive PE Eldredge',
            '71308103': 'Adaptive PE Cole',
            '71308105': 'Adaptive PE Frenchtown',
            '71308106': 'Adaptive PE EGHS',
            '71308107': 'Adaptive PE MDBK',
            '71308108': 'Adaptive PE Hanaford',
            '71310106': 'History EGHS',
            '71311702': 'Psychologist Eldredge',
            '71311703': 'Psychologist Cole',
            '71311705': 'Psychologist Frenchtown',
            '71311706': 'Psychologist EGHS',
            '71311707': 'Psychologist MDBK',                    #was misspelled 'Psycholotist'
            '71311708': 'Psychologist Hanaford',
            '71321802': 'Speech Eldredge',
            '71321803': 'Speech Cole',
            '71321805': 'Speech Frenchtown',
            '71321806': 'Speech EGHS',
            '71321807': 'Speech MDBK',
            '71321808': 'Speech Hanaford',
            '71347302': 'Custodian Subs Eldredge',
            '71347303': 'Custodian Subs Cole',
            '71347305': 'Custodian Subs Frenchtown',
            '71347306': 'Custodian Subs EGHS',
            '71347307': 'Custodian Subs MDBK',
            '71347308': 'Custodian Subs Hanaford'
        }
        self.local_to_ucoa = {
            '711001': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1,'JC':1100,'Sub Desc':'K'},
            '711101': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 3,'JC':1100, 'Sub Desc': 'Grade 1'},
            '711201': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 4,'JC':1100, 'Sub Desc': 'Grade 2'},
            '711301': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 5,'JC':1100, 'Sub Desc': 'Grade 3'},
            '711401': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 6,'JC':1100, 'Sub Desc': 'Grade 4'},
            '711501': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 7,'JC':1100, 'Sub Desc': 'Grade 5'},
            '711211': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 200,'JC':1100, 'Sub Desc': 'Art'},
            '711231': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 500,'JC':1100, 'Sub Desc': 'ELA'},
            '711241': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 700,'JC':1100, 'Sub Desc': 'Foreign Language'},
            '711251': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1200,'JC':1100, 'Sub Desc': 'PE/health'},
            '711271': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1500,'JC':1100, 'Sub Desc': 'Math'},
            '711281': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1600,'JC':1100, 'Sub Desc': 'Music'},
            '711291': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1700,'JC':1100, 'Sub Desc': 'Science'},
            '711311': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1900,'JC':1100, 'Sub Desc': 'Social Studies'},
            '711411': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 2400,'JC':1100, 'Sub Desc': 'Reading'},
            '711261': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 2000,'JC':1100, 'Sub Desc': 'Tech/computer'},
            '711404': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 2000,'JC':1100, 'Sub Desc': 'Tech/computer'},
            '712467': {'Fund':1000000,'Prog':10,'Func':212,'Sub': 2600,'JC':1600, 'Sub Desc': 'Librarian'},
            '713027': {'Fund':1000000,'Prog':20,'Func':232,'Sub': 2125,'JC':1700, 'Sub Desc': 'Occupational Therapy'},
            '713218': {'Fund':1000000,'Prog':20,'Func':232,'Sub': 2122,'JC':1700, 'Sub Desc': 'Speech Therapy'},
            '713117': {'Fund':1000000,'Prog':20,'Func':232,'Sub': 2121,'JC':1700, 'Sub Desc': 'Psychologist'},
            '713016': {'Fund':1000000,'Prog':20,'Func':232,'Sub': 2120,'JC':1700, 'Sub Desc': 'Social Worker'},
            # The ESL account codes above are 7119130x, whose six-character prefix is
            # '711913'. The original key '719130' matches no account prefix in this
            # class; it is kept for backward compatibility and '711913' is added.
            '719130': {'Fund':1000000,'Prog':40,'Func':111,'Sub': 600,'JC':1300, 'Sub Desc': 'ESL'},
            '711913': {'Fund':1000000,'Prog':40,'Func':111,'Sub': 600,'JC':1300, 'Sub Desc': 'ESL'},
            '711304': {'Fund':1000000,'Prog':10,'Func':111,'Sub': 1800,'JC':1100, 'Sub Desc': 'Business/comp'},
            '712315': {'Fund':1000000,'Prog':10,'Func':211,'Sub': 800,'JC':1500, 'Sub Desc': 'Guidance'}
        }

    def get_eg_acct_desc(self,acct):
        """Provides descriptions for accounting codes in EG MUNIS system.

        Args:
            acct: account code string (a key of EG_account_codes).

        Returns:
            The description string, or '(no description)' when the code is unknown.
        """
        return self.EG_account_codes.get(acct, '(no description)')

    def get_eg_acct_UCOA(self,acct):
        """Provides UCOA codes for accounting codes in EG MUNIS system.

        Args:
            acct: code string (a key of local_to_ucoa, i.e. six characters).

        Returns:
            The dictionary of UCOA fields stored for that code (the stored object
            itself, not a copy), or an empty dictionary when the code is unknown.
        """
        return self.local_to_ucoa.get(acct, {})

### RIDE Object labels class

provides a lookup table for RIDE UCOA Object Descriptions

In [None]:
class RIDE_Obj_labels():
    """Lookup table of RIDE UCOA Obj descriptions keyed by integer Obj code."""

    def __init__(self):
        """Build self.obj_labels from the 'obj' and 'obj_desc' columns of ../Obj.csv."""
        self.obj_labels = {}

        table = pd.read_csv("../Obj.csv").to_dict()     #{column name: {row index: value}}

        for row, raw_code in table['obj'].items():
            code = int(str(raw_code))                   #normalise the code to a plain int key
            self.obj_labels[code] = table['obj_desc'][row]

    def get_obj_desc(self,obj):
        """Return the description stored for Obj code `obj`, or '(none)' if there is none."""
        return self.obj_labels.get(obj, '(none)')

### pay_check class

Represents a line in the earnings report

In [None]:
class pay_check():
    """A single pay check: header fields plus a numbered collection of earnings items.

    The fiscal year and school year are obtained from the supplied pay-period
    object; the calendar year is taken from the check date itself.
    """

    def __init__(self,check_number,name,check_date,payperiod_object):
        self.check_number = check_number
        self.name = name
        self.check_date = check_date
        self.items = {}                                 #item number (1, 2, ...) -> item dictionary
        self.pay_period = payperiod_object
        self.fiscal_year = payperiod_object.get_fiscal_year(check_date)
        self.school_year = payperiod_object.get_school_year(check_date)
        self.calendar_year = check_date.year

    def get_name(self):
        """Return the payee name."""
        return self.name

    def get_date(self):
        """Return the check date."""
        return self.check_date

    def get_number(self):
        """Return the check number."""
        return self.check_number

    def get_fiscal_year(self):
        """Return the fiscal year looked up when the check was created."""
        return self.fiscal_year

    def get_school_year(self):
        """Return the school year looked up when the check was created."""
        return self.school_year

    def get_calendar_year(self):
        """Return the calendar year of the check date."""
        return self.calendar_year

    def get_items(self):
        """Return the dictionary of items, keyed by item number."""
        return self.items

    def add_item(self,fund,acct,obj,position,rate,earnings,acct_desc,obj_desc,acct_UCOA,stepinfo):
        """Append one earnings item; it is stored under the next item number."""
        field_names = ('fund', 'acct', 'obj', 'position', 'rate', 'earnings',
                       'acct_desc', 'obj_desc', 'acct_UCOA', 'step_info')
        field_values = (fund, acct, obj, position, rate, earnings,
                        acct_desc, obj_desc, acct_UCOA, stepinfo)
        next_number = len(self.items) + 1
        self.items[next_number] = dict(zip(field_names, field_values))

### Payperiod class

Represents a two-week pay period

In [None]:
class payperiod():                                      #class for dates at end of payperiods
    """Provides a dictionary of payroll periods.

    self.payperiods maps each payday (a datetime.date, generated every 14 days
    from 2009-07-03 up to but not including 2026-07-01) to a dictionary with keys:

    - 'fyear':         fiscal year (calendar year + 1 for July-December)
    - 'school_year':   string 'yyyy-yyyy'; the new school year starts on August 14
    - 'calendar_year': calendar year of the payday
    - 'spans_fyear':   the string 'True' or 'False' (not a bool): 'True' when the
                       payday is in June and the next payday is after July 1

    Pay period numbers within a fiscal year are 1-based.
    """

    def __init__(self):

        self.payperiods = {}                            #initialize payperiods dictionary

        delta = timedelta(days=14)                      #14 days per pay period
        first_payperiod = date(2009,7,3)                #first pay period of FY2010
        cutoff_date = date(2026,7,1)

        current_payperiod = first_payperiod
        while current_payperiod < cutoff_date:          #generate pay periods through cutoff date
            next_payperiod = current_payperiod + delta
            spans = ((current_payperiod.month == 6) and
                     (next_payperiod.month == 7) and
                     (next_payperiod.day > 1))
            self.payperiods[current_payperiod] = {
                'fyear': self.get_fiscal_year(current_payperiod),
                'school_year': self._school_year_label(current_payperiod),
                'calendar_year': current_payperiod.year,
                'spans_fyear': 'True' if spans else 'False'     #stored as strings, as before
            }
            current_payperiod = next_payperiod          #increment date by 14 days

        #data correction: the payday generated as 2016-11-11 is recorded as 2016-11-10
        self.payperiods[date(2016,11,10)] = self.payperiods.pop(date(2016,11,11))

    @staticmethod
    def _school_year_label(xdate):
        """Return the 'yyyy-yyyy' school year label for a date (new year starts Aug 14)."""
        in_new_year = (xdate.month > 8) or (xdate.month == 8 and xdate.day >= 14)
        start_year = xdate.year if in_new_year else xdate.year - 1
        return str(start_year) + '-' + str(start_year + 1)

    def _paydays_in_fiscal_year(self,fyear):
        """Return the sorted list of paydays whose 'fyear' entry equals fyear."""
        return sorted(d for d, info in self.payperiods.items() if info['fyear'] == fyear)

    def get_payperiods(self):
        """Return the payday dictionary itself (not a copy)."""
        return(self.payperiods)

    def get_payperiod_end(self,fyear,ppno):             #look up the date of the nth pay period
        """Lookup end date of payroll periods in a given fiscal year.

        Returns the date of pay period number ppno (1-based) in fiscal year fyear,
        or NaN when there is no such pay period.  (The previous implementation
        indexed the date-keyed dictionary by fiscal year and so always returned NaN.)
        """
        paydays = self._paydays_in_fiscal_year(fyear)
        if isinstance(ppno, (int, np.integer)) and 1 <= ppno <= len(paydays):
            return paydays[ppno - 1]
        return np.nan

    def get_fiscal_year(self,xdate):                    #look up the fiscal year given a date
        """Lookup fiscal year given date (July-December belong to the next year's FY)."""
        fyr = xdate.year
        if xdate.month > 6:
            fyr += 1
        return(fyr)

    def get_school_year(self,xdate):                    #look up the school year given a date
        """Lookup school year given date.

        xdate must be one of the paydays in the dictionary; any other date raises KeyError.
        """
        return(self.payperiods[xdate]['school_year'])

    def get_payperiod_no(self,fyear,xdate):             #look up the pay period number for a date
        """Lookup payroll period given a date.

        Returns the 1-based number of the first pay period in fiscal year fyear
        whose payday is on or after xdate, or NaN when xdate is later than every
        payday of that fiscal year.  (The previous implementation always raised
        KeyError because it indexed the date-keyed dictionary by fiscal year.)
        """
        for period, payday in enumerate(self._paydays_in_fiscal_year(fyear), start=1):
            if payday >= xdate:
                return period
        return np.nan

    def get_next_payday(self,y,m,d):                    #find the next payday on or after given date
        """Find the end of the current pay period given a date.

        Returns the earliest payday on or after date(y,m,d), or NaN when the date
        is later than the last generated payday.
        """
        target = date(y,m,d)                            #convert y,m,d to date value

        #sorted() is required: the corrected 2016-11-10 entry sits at the end of the dictionary
        for payday in sorted(self.payperiods):
            if payday >= target:
                return payday

        return np.nan

    def get_previous_payday(self,cdate):                #find the previous payday
        """Find the date of the latest payday on or before cdate (NaN if there is none)."""
        for payday in sorted(self.payperiods, reverse=True):
            if payday <= cdate:
                return payday

        return np.nan

In [None]:
class teacher_salary_matrix():
    """Teacher CBA salary schedule for FY2013-FY2022 and earnings decoding.

    self.cba is a 3-D numpy array indexed by [year index, step index, column index]:
    year index 0 is FY2013, step index is step number minus one, and the column
    index comes from self.cba_cols.  Only FY2016 is entered directly; the other
    years are derived from it with the percentage changes noted below.
    """

    def __init__(self):                                       #constructor

        self.cba_cols ={'B': 0,'B+30': 1,'M': 2,'M+30': 3,'M2/CAGS': 4, 'D': 5}

        self.cba = np.zeros((10, 10, 6))    #salary matrix is 3-D numpy array indexed by: fyear, step, column

                                                                #start with FY2016 (2015-2016) salary matrix
        # NOTE(review): in the step 8 row the M2/CAGS value (69806) is lower than
        # the M+30 value (69829), unlike every other row -- confirm against the contract.
        self.cba[3,:,:] = np.array([
            [41286, 42900, 43871, 44505, 44893, 45186],
            [44871, 46484, 47454, 48085, 48474, 48771],
            [48494, 50106, 51078, 51709, 52098, 52393],
            [52118, 53729, 54700, 55332, 55722, 56018],
            [55743, 57354, 58328, 58958, 59347, 59642],
            [59366, 60979, 61951, 62583, 62974, 63266],
            [62991, 64605, 65574, 66206, 66596, 66892],
            [66616, 68228, 69199, 69829, 69806, 70515],
            [71741, 73353, 74323, 74954, 75345, 75639],
            [78898, 80675, 81743, 82438, 82866, 83190]])

                                                                #FY2017 (2016-2017) is the same as FY2016
        self.cba[4,:,:] = self.cba[3,:,:]

                                                                #FY2018 (2017-2018) 2% increase
        self.cba[5,:,:] = np.around(1.02*self.cba[4,:,:],0)

                                                                #FY2019 (2018-2019) 2.25% increase
        self.cba[6,:,:] = np.around(1.0225*self.cba[5,:,:],0)

                                                                #FY2020 (2019-2020) same as FY2019
        self.cba[7,:,:] = self.cba[6,:,:]

                                                                #FY2021 (2020-2021) 2% increase
        self.cba[8,:,:] = np.around(1.02*self.cba[7,:,:],0)

                                                                #FY2022 (2021-2022) 2.25% increase
        self.cba[9,:,:] = np.around(1.0225*self.cba[8,:,:],0)


                                                                #FY2015: back out 2% increase from FY2016
        self.cba[2,:,:] = np.around(self.cba[3,:,:]/1.02,0)

                                                                #FY2014: back out 2.5% increase from FY2015
        self.cba[1,:,:] = np.around(self.cba[2,:,:]/1.025,0)

                                                                #FY2013: back out 1% from FY2014 for steps 1-9
        #slice 0:9 covers step indices 0..8 (steps 1-9); the former 0:8 left step 9 at zero
        self.cba[0,0:9,:] = np.around(self.cba[1,0:9,:]/1.01,0)
                                                                #FY2013: back out 2.25% from FY2014 for step 10
        self.cba[0,9,:]   = np.around(self.cba[1,9,:]/1.0225,0)

    def get_cba_matrix(self):
        """Return the full salary array (the stored object, not a copy)."""
        return(self.cba)

    def get_salary(self,fyear,step,col):                        #look up salary by year, column, step
        """Returns CBA salary given fiscal year, column, and step for FY2013-FY2022.

        Args:
            fyear: year of the salary schedule (2013-2022).
            step:  step number, 1-10.
            col:   integer column index (a value of self.cba_cols).

        Returns:
            The salary, or None (after printing a message) when any index is
            outside the matrix.  Negative indices are rejected explicitly because
            numpy would otherwise wrap them around and return another year's or
            step's salary.
        """
        yr = fyear - 2013                                       #year index 0 is 2013
        s  = step-1                                             #step index is one less than the step number
        c = col                                                 #column within the CBA salary matrix

        try:
            if min(yr,s,c) < 0:
                raise IndexError("negative index")
            return self.cba[yr,s,c]                             #return the value if it exists
        except (IndexError, TypeError):                         #otherwise report it and return None
            print("IndexError in get_salary: ",yr,s,c)
            return None

    def decode_earnings(self,check_date,rate,earnings,pp):  #compute step, FTE, # of payments for teachers
        """Estimate salary step, FTE fraction and number of payments for one earnings line.

        Args:
            check_date: date of the check; passed to pp.get_school_year().
            rate:       rate from the earnings report (treated as a daily rate; annual
                        salary is taken to be 184 times the rate).
            earnings:   earnings amount on the line.
            pp:         object providing get_school_year(check_date) -> 'yyyy-yyyy'.

        Returns:
            dict with keys 'step' (code 'yyyy-column-step', or '' if no match),
            'payments' (26.0, 21.0 or NaN), 'fte' (fraction or NaN),
            'mindiff' (smallest absolute difference found, rounded to 4 places)
            and 'salary' (rounded annual salary).
        """
        dct = {}

        f = [1.0,1/2.,1/10.,1/5.,4/5.,3/10.,\
            4/10.,6/10.,1/20.,1/3.,2/3.,1/4.,\
            3/4.,1/5.,2/5.,3/5.,4/5.,1/6.,5/6.,\
            1/7.,2/7.,3/7.,4/7.,5/7.,6/7.,1/8.,\
            3/8.,5/8.,7/8.,1/9.,2/9.,4/9.,5/9.,\
             7/9.,8/9.,7/10.,9/10.]                    #possible FTE fractions

        salary = round(184.0*rate,0)                            #salary is 184 times daily rate
        lower_bound = salary - 1.0                              #lower limit for tolerance
        upper_bound = salary + 1.0                              #upper limit for tolerance
        step_code = ''                                          #initialize step code
        n_payments = [26.0,21.0]                                #possible number of payments: 26 or 21
        fte = np.nan
        payments = np.nan
        min_abs_diff = 10000.

        school_year_string = pp.get_school_year(check_date)
        sy = int(school_year_string[5:])                        #ending year of the school year

        #Look up every (step, column) salary for this year once, in step-major order.
        #Entries get_salary() could not resolve (None) are dropped so that the
        #comparisons below never see None.
        year_salaries = []
        for step in range(1,11):                                #loop through steps
            for col, col_index in self.cba_cols.items():        #loop through columns
                cbasal = self.get_salary(sy,step,col_index)     #salary from CBA matrix
                if cbasal is not None:
                    year_salaries.append((step,col,cbasal))

                                                                #search salary matrix for a match
        for step, col, cbasal in year_salaries:
            if lower_bound <= cbasal <= upper_bound:            #computed salary within $1 of CBA
                step_code = str(sy) + '-' + col + '-' + str(step)    #code is:  yyyy-cat-step

        if (rate > 200.) or (earnings > 1000.):                 #determine fte and payments from rate
            for p in n_payments:                                #loop through payments 26 and 21
                for frac in f:                                  #loop through the FTE fractions in the list
                    diff = abs(earnings - 184.0*rate*frac/p)    #compute the difference from earnings
                    if (diff < min_abs_diff):                   #see if this is the smallest so far
                        min_abs_diff = diff                     #if it is, then save the min difference
                        payments = p                            #save the number of payments
                        fte = frac                              #save the FTE fraction
        elif (rate > 0.0) and (rate < 100.) and (earnings > 500.):   #rate is too low to be a daily rate
            for p in n_payments:                                #loop through possible payments 26 and 21
                for frac in f:                                  #loop through possible FTE fractions
                    for step, col, sal in year_salaries:        #loop through CBA steps and columns
                        diff = abs(earnings - frac*sal/p)       #compute delta from earnings
                        if (diff < min_abs_diff):               #if this is the smallest difference so far:
                            min_abs_diff = diff                 #save the smallest value
                            fte = frac                          #save the FTE fraction
                            salary = sal                        #use the CBA salary instead of 184*rate
                            payments = p                        #save the number of payments
                            step_code = str(sy) + '-' + col + '-' + str(step)  #construct the step code
        dct['step'] = step_code                                 #collect the results
        dct['payments'] = payments
        dct['fte'] = fte
        dct['mindiff'] = round(min_abs_diff,4)
        dct['salary'] = round(salary,0)

        return(dct)

### personnel classes

provides functionality related to HR 

In [None]:
class Person():                                                         #generic employee class
    """An employee and the checks issued to them, grouped by check date.

    self.payperiods maps a check date to a dictionary {sequence number: check},
    where sequence numbers start at 1 in the order the checks were added.
    """

    def __init__(self,name):                                            #constructor
        self.name = name
        self.payperiods = {}

    def add_check(self,check_date,check):
        """Store check under check_date with the next sequence number for that date."""
        checks_for_date = self.payperiods.setdefault(check_date, {})
        checks_for_date[len(checks_for_date) + 1] = check

    def get_name(self):                                                 #return name of person
        """Return the person's name."""
        return(self.name)

    def get_payperiods(self):
        """Return the dictionary of all checks, keyed by check date."""
        return(self.payperiods)

    def get_payperiod(self,check_date):
        """Return the checks for check_date, or an empty dictionary if there are none.

        A missing dictionary key raises KeyError, not IndexError, so the previous
        `except IndexError` never produced the intended empty-dictionary fallback.
        """
        return self.payperiods.get(check_date, {})

## Read the pdf and create a dictionary with the contents of each text box

### Function read_pdf() reads a PDF and returns a dictionary containing the contents

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the left edge of the text box
- x1 horizontal coordinate of the right edge of the text box
- y0 vertical coordinate of the bottom edge of the text box (pdfminer measures y upward from the bottom of the page)
- y1 vertical coordinate of the top edge of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all characters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object
    

In [None]:
def read_pdf(path):
    """Read a PDF with pdfminer and return the contents of its horizontal text boxes.

    Args:
        path: path of the PDF file to read.

    Returns:
        A nested dictionary {page number: {text box number: box}} where page
        numbers start at 0, text box numbers start at 1 within each page, and
        each box is a dictionary with keys 'x0', 'x1', 'y0', 'y1' (the text
        box's coordinates as reported by pdfminer, rounded to 2 decimals) and
        'text' (the box text with non-ASCII characters dropped).
    """
    rsrcmgr = PDFResourceManager()                                  #create a resource manager
    laparams = LAParams()                                           #set the parameters for analysis
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)          #create a PDF page aggregator object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pdf = {}                                                        #dictionary to hold the results

    with open(path, 'rb') as document:                              #file is closed even if parsing fails
        for pageno, page in enumerate(PDFPage.get_pages(document)): #loop through the pdf page by page (from 0)
            boxes = {}                                              #dictionary for this page
            pdf[pageno] = boxes
            interpreter.process_page(page)                          #lay out the page
            layout = device.get_result()                            #receive the LTPage object for the page
            tbox_no = 0                                             #index for text box number
            for element in layout:
                if not isinstance(element, LTTextBoxHorizontal):    #keep only horizontal text boxes
                    continue
                tbox_no += 1                                        #increment text box number
                ascii_bytes = element.get_text().encode('ascii', 'ignore')  #drop non-ascii characters
                boxes[tbox_no] = {
                    'x0': round(element.x0,2),
                    'x1': round(element.x1,2),
                    'y0': round(element.y0,2),
                    'y1': round(element.y1,2),
                    'text': ascii_bytes.decode('ascii')             #convert bytes back to string
                }
    return(pdf)

### Utility functions

In [None]:
#remove the commas from earnings and rate values

def remove_commas(st):
    """Return ``st`` with every comma removed.

    combdd() applies this to the RATE and ACCT-EARNINGS text so that values
    such as '1,234.56' can later be converted with float().
    """
    return st.replace(',', '')

In [None]:
#remove the headings fields 

def remove_headings(st):
    """Strip leading heading lines from a text-box string.

    The lines of ``st`` are examined in order.  As long as the line being
    examined starts with one of the heading prefixes, the first line of the
    remaining text (up to and including its newline) is dropped.  The first
    line that is not a heading ends the scan and the remaining text is
    returned.

    Returns '' when every line is a heading.  If a heading line is found but
    the remaining text has no newline left to cut at, the text is printed
    with the label 'Value Error' and scanning continues.
    """
    heading_prefixes = ('FUND ', 'POSITION', 'RATE', 'ACCT-', 'CHECK', '_')

    for line in st.split('\n'):
        if not line.startswith(heading_prefixes):
            return st                              # first non-heading line: done
        _, newline, remainder = st.partition('\n')
        if newline:
            st = remainder                         # drop the heading line
        else:
            print('Value Error', st)               # nothing left to cut at
    return ''

### Read the FY2017 earnings report

In [None]:
# Parse the FY2017 earnings report into a {page: {text box: {...}}} dictionary.
# Alternate input, kept for reference (presumably a redacted copy of the same report):
#p17 = read_pdf('../FY17 Gene_Redacted.pdf')
p17 = read_pdf('../FY17 Gene.pdf')

### Read the FY2018 earnings report

In [None]:
# Parse the FY2018 earnings report into a {page: {text box: {...}}} dictionary.
# Alternate input, kept for reference (presumably a redacted copy of the same report):
#p18 = read_pdf('../FY18 Gene_Redacted.pdf')
p18 = read_pdf('../FY18 Gene.pdf')

### Build a dictionary with only those text boxes containing names

Use the following algorithm to identify text boxes that contain names:

- x0, horizontal coordinate of the upper left corner of the text box, is less than 162
- the text string contains at least one comma

In [None]:
def get_names(dct):
    """Collect the text boxes that hold employee names.

    A text box qualifies when its left edge ``x0`` is below 162 and its text
    contains a comma.  Heading lines are stripped with remove_headings().
    Every all-digit word on the first remaining line removes
    ``len(word) + 1`` characters from the front of the text; words of 4, 8
    and 5 digits are additionally remembered as fund, acct and obj codes.
    The most recently seen fund/acct/obj values carry forward to later boxes
    and later pages until new codes appear.

    Parameters
    ----------
    dct : dict
        {page: {text box: {'x0', 'x1', 'y0', 'y1', 'text'}}}

    Returns
    -------
    dict
        {page: {text box: {'x0', 'x1', 'y0', 'y1', 'fund', 'acct', 'obj', 'text'}}}
        with an (possibly empty) entry for every page of ``dct``.
    """
    dnames = {}
    fund = acct = obj = ''                          # carried across boxes and pages

    for page in sorted(dct.keys()):
        page_names = dnames.setdefault(page, {})    # page number is the highest level key
        for tb in sorted(dct[page].keys()):
            box = dct[page][tb]
            if not (box['x0'] < 162.0):             # names start to the left of x0=162
                continue
            txt = str(box['text'])
            if ',' not in txt:                      # every name contains a comma
                continue

            txt = remove_headings(txt)
            first_line = txt.split('\n')[0]
            for word in first_line.split():
                if not word.isdigit():
                    continue
                ndigits = len(word)
                if ndigits == 4:                    # 4 digits means fund
                    fund = word
                elif ndigits == 8:                  # 8 digits means acct-code
                    acct = word
                elif ndigits == 5:                  # 5 digits means obj
                    obj = word
                txt = txt[ndigits + 1:]             # cut the code and its separator off the front

            page_names[tb] = {
                'x0': box['x0'],
                'x1': box['x1'],
                'y0': box['y0'],
                'y1': box['y1'],
                'fund': fund,
                'acct': acct,
                'obj': obj,
                'text': txt,
            }
    return dnames

### Consolidate text boxes that overlap on the vertical scale and contain names

In [None]:
def consolidate_name_boxes(names):
    """Merge name text boxes on the same page whose vertical ranges overlap.

    For every pair of boxes ``tb < tb2`` on a page whose [y0, y1] ranges
    overlap, a merged entry is stored under key ``tb2``.  It takes x0, y1,
    fund, acct and obj from ``tb``, x1 and y0 from ``tb2``, and the text of
    ``tb`` followed by the text of ``tb2``.  Boxes that took part in no merge
    are passed through unchanged (the same dictionary object, not a copy).

    Merging is pairwise: a box that overlaps several others contributes its
    text to one merged entry per overlapping partner.

    Parameters
    ----------
    names : dict
        {page: {text box: {'x0', 'x1', 'y0', 'y1', 'fund', 'acct', 'obj', 'text'}}}

    Returns
    -------
    dict
        Same layout as ``names``, with overlapping boxes consolidated.
    """
    newnames = {}

    for page in sorted(names.keys()):                       #loop through pages of pdf
        boxes = names[page]
        ordered = sorted(boxes.keys())                      #sort once per page, reused by both loops
        merged = newnames[page] = {}
        skip = set()                                        #boxes that were absorbed into a merge

        for tb in ordered:
            upper = boxes[tb]
            for tb2 in ordered:                             #compare this one to the later ones
                if tb2 <= tb:
                    continue
                lower = boxes[tb2]
                if upper['y0'] <= lower['y1'] and lower['y0'] <= upper['y1']:
                    merged[tb2] = {
                        'x0': upper['x0'],                  #keep x0
                        'x1': lower['x1'],                  #replace x1 with tb2 value
                        'y0': lower['y0'],                  #replace y0 with tb2 value
                        'y1': upper['y1'],                  #keep y1 value
                        'text': upper['text'] + lower['text'],
                        'fund': upper['fund'],              #copy fund, acct, and obj
                        'acct': upper['acct'],
                        'obj': upper['obj'],
                    }
                    skip.add(tb)
                    skip.add(tb2)
            if tb not in skip:                              #untouched boxes are copied through
                merged[tb] = upper

    return newnames

In [None]:
def combdd(cn,pdf):
    """Attach the column text that lines up vertically with each name box.

    For every consolidated name box, all other text boxes on the same page
    whose [y0, y1] range overlaps the name box are inspected.  Headings are
    stripped with remove_headings() and the text is stored on the result
    entry according to the box's horizontal offset ``x0``:

    * 312 < x0 < 316  -> 'numbers1'  (DATE/NUMBER column)
    * 383 < x0 < 395  -> 'numbers2'  (NUMBER column; text of several boxes is concatenated)
    * 437 < x0 < 440  -> 'positions' (POSITION column)
    * 509 < x0 < 533  -> 'rates'     (RATE column, commas removed)
    * 558 < x0 < 630  -> 'earnings'  (ACCT-EARNINGS column, commas removed)

    Each result entry is a shallow copy of the name box, so ``cn`` is left
    untouched and calling this function again on the same ``cn`` does not
    accumulate 'numbers2' text.

    Parameters
    ----------
    cn : dict
        {page: {text box: name-box dict}} as produced by consolidate_name_boxes().
    pdf : dict
        {page: {text box: {'x0', 'x1', 'y0', 'y1', 'text'}}} as produced by read_pdf().

    Returns
    -------
    dict
        {page: {text box: name-box fields plus any matched column fields}}
    """
    dd = {}

    for page in sorted(cn.keys()):
        if page not in dd.keys():
            dd[page] = {}
        page_boxes = sorted(pdf[page].keys())                   #same order for every name box on the page
        for tb in sorted(cn[page].keys()):                      #loop through consolidated name textboxes
            entry = dict(cn[page][tb])                          #copy: do not write back into cn
            dd[page][tb] = entry
            y0 = entry['y0']                                    #vertical extent of the name box
            y1 = entry['y1']
            for tb2 in page_boxes:                              #loop through the other boxes in pdf
                if tb == tb2:                                   #ignore if same box as names
                    continue
                other = pdf[page][tb2]
                tx0 = other['x0']                               #horizontal offset selects the column
                if not ((y0 <= other['y1']) and (other['y0'] <= y1)):
                    continue                                    #no vertical overlap with the name box
                txt = remove_headings(other['text'])
                if 312.0 < tx0 < 316.0:                         #match to DATE/NUMBER
                    entry['numbers1'] = txt
                if 383.0 < tx0 < 395.0:                         #match to NUMBER
                    if 'numbers2' not in entry:
                        entry['numbers2'] = txt
                    else:
                        entry['numbers2'] += txt
                if 437.0 < tx0 < 440.0:                         #match to POSITION
                    entry['positions'] = txt
                if 509.0 < tx0 < 533.0:                         #match to RATE
                    entry['rates'] = remove_commas(txt)
                if 558.0 < tx0 < 630.0:                         #match to ACCT-EARNINGS
                    entry['earnings'] = remove_commas(txt)

    return dd

In [None]:
def get_lines(nn):
    """Split the combined text of each name box into per-line lists.

    Parameters
    ----------
    nn : dict
        {page: {text box: {...}}} as returned by combdd().  'text' is
        required; 'numbers1', 'numbers2', 'rates', 'positions', 'earnings',
        'fund', 'acct' and 'obj' are used when present.

    Returns
    -------
    dict
        {page: {text box: {'names', 'checks', 'dates', 'rates', 'earnings',
        'positions', 'fund', 'acct', 'obj'}}} where the first six are lists
        and the last three are strings.

    Notes
    -----
    * All-digit lines of 'numbers1' and 'numbers2' are check numbers; lines
      of 'numbers1' containing '/' are check dates.
    * Earnings lines are read until the first line that contains '_' but no
      '.'; everything after it is ignored.
    * When there are fewer check numbers than dates, the check list is
      rebuilt by walking the earnings: positive amounts take the next real
      check number, the others get a generated 'gen<page>-<tb>-<index>' id.
    """
    lld = {}

    for page in sorted(nn.keys()):
        page_lines = lld.setdefault(page, {})
        for tb in sorted(nn[page].keys()):
            box = nn[page][tb]
            rec = page_lines.setdefault(tb, {
                'names': [], 'checks': [], 'dates': [], 'rates': [],
                'earnings': [], 'positions': [],
                'fund': '', 'acct': '', 'obj': '',
            })

            # names: every line longer than one character
            rec['names'].extend(ln for ln in box['text'].split('\n') if len(ln) > 1)

            # DATE/NUMBER column: digits are check numbers, slashes mark dates
            if 'numbers1' in box:
                for token in box['numbers1'].split('\n'):
                    if token.isdigit():
                        rec['checks'].append(token)
                    elif '/' in token:
                        rec['dates'].append(token)

            # NUMBER column: additional check numbers
            if 'numbers2' in box:
                rec['checks'].extend(tok for tok in box['numbers2'].split('\n')
                                     if tok.isdigit())

            if 'rates' in box:
                rec['rates'].extend(float(tok) for tok in box['rates'].split('\n')
                                    if '.' in tok)

            if 'positions' in box:
                rec['positions'].extend(tok for tok in box['positions'].split('\n')
                                        if len(tok) > 1)

            for code in ('fund', 'acct', 'obj'):
                if code in box:
                    rec[code] = box[code]

            if 'earnings' in box:
                for token in box['earnings'].split('\n'):
                    if '.' in token:
                        rec['earnings'].append(float(token))
                    elif '_' in token:
                        break                       # nothing after the underscore line is kept

            # fewer check numbers than dates: fill the gaps with generated ids
            if len(rec['checks']) < len(rec['dates']):
                patched = []
                next_real = 0
                for i, amount in enumerate(rec['earnings']):
                    if amount > 0.0:
                        patched.append(rec['checks'][next_real])
                        next_real += 1
                    else:
                        patched.append('gen' + str(page) + '-' + str(tb) + '-' + str(i))
                        print("inserting check number: ",page,tb,i)
                rec['checks'] = patched
    return lld

### Read the earnings reports and process them

In [None]:
def process_earnings(pdf):
    """Run the whole extraction pipeline on one parsed earnings report.

    ``pdf`` is the page/text-box dictionary returned by read_pdf().  The name
    boxes are located (get_names), overlapping ones merged
    (consolidate_name_boxes), the matching column text attached (combdd) and
    the result split into per-line lists (get_lines), which is returned.
    """
    name_boxes = consolidate_name_boxes(get_names(pdf))
    return get_lines(combdd(name_boxes, pdf))

# Parsed report lines keyed by fiscal year; each value is the
# {page: {text box: {...}}} dictionary returned by process_earnings().
ll={}

ll[2017] = process_earnings(p17)
ll[2018] = process_earnings(p18)

### Check earnings against totals

In [None]:
# Total of all parsed earnings per fiscal year, for comparison with the
# totals printed on the reports themselves.
totearn = {}

for year in ll:
    totearn.setdefault(year, 0.0)
    for page in ll[year]:
        for tb in ll[year][page]:
            for amt in ll[year][page][tb]['earnings']:
                totearn[year] += amt

print(round(totearn[2017],2))       #FY2017 earnings report total is $22,608,024.34
print(round(totearn[2018],2))       #FY2018 earnings report total is $22,409,915.41

In [None]:
# Build one pay_check object per check number from the parsed report lines.
# Every line of every text box becomes one item on the check it belongs to.
checks = {}

pp = payperiod()          #get end dates of payroll periods

egacct = EG_acct_codes()   #EG acct codes

rideobj = RIDE_Obj_labels()   #provides get_obj_desc() for object codes

sm = teacher_salary_matrix()   #provides decode_earnings() for the salary step lookup

paydays = pp.get_payperiods()   #its keys are the dates accepted as paydays below

for year in ll.keys():
    for page in ll[year].keys():
        for tb in ll[year][page].keys():
            # the lists below are parallel: entry i of each describes the same report line
            check_numbers = ll[year][page][tb]['checks']
            names         = ll[year][page][tb]['names']
            check_dates   = ll[year][page][tb]['dates']
            fund          = ll[year][page][tb]['fund']
            acct          = ll[year][page][tb]['acct']
            obj           = ll[year][page][tb]['obj']
            positions     = ll[year][page][tb]['positions']
            rates         = ll[year][page][tb]['rates']
            earnings      = ll[year][page][tb]['earnings']
            # int(obj) raises ValueError if no obj code was captured for this box
            obj_desc      = rideobj.get_obj_desc(int(obj))
            acct_desc     = egacct.get_eg_acct_desc(acct)
            acct_UCOA     = egacct.get_eg_acct_UCOA(acct)
            
            # indexed by check number; raises IndexError if any other list is shorter
            for i in np.arange(len(check_numbers)):
                
                check_number    = check_numbers[i]
                name            = names[i]
                date_str        = check_dates[i]
                position        = positions[i]
                rate            = rates[i]
                earned          = earnings[i]
                
                # date strings are month/day/year
                words = date_str.split('/')
                check_date   = date(int(words[2]),int(words[0]),int(words[1]))
                # a check dated on a non-payday is moved to the date returned by
                # get_previous_payday() (presumably the closest earlier payday - confirm)
                if (check_date not in paydays.keys()):
                    new_date = pp.get_previous_payday(check_date)
                    print('adjusting date',name,check_date,new_date)
                    check_date = new_date
                stepdata = sm.decode_earnings(check_date,rate,earned,pp)
                # an empty 'step' entry is treated as "no step information"
                if (len(stepdata['step']) < 1):
                    stepdata={}
                # first line seen for this check number creates the check;
                # later lines with the same number only add items to it
                if check_number not in checks.keys():
                    checks[check_number] = pay_check(check_number,name,check_date,pp)

                checks[check_number].add_item(fund,acct,obj,position,rate,earned, \
                            acct_desc,obj_desc,acct_UCOA,stepdata)

In [None]:
# Total earnings per check date, summed over the items of every check,
# printed in date order.
earns = {}

for check in checks:
    check_date = checks[check].get_date()
    earns.setdefault(check_date, 0.0)
    itms = checks[check].get_items()
    for item in itms.keys():
        earns[check_date] += itms[item]['earnings']

for ckdate in sorted(earns):
    print(ckdate,round(earns[ckdate],2))
    

In [None]:
# Group the checks by payee name: one Person per distinct name, each holding
# that person's checks by check date.
people = {}

for ckno, ck in checks.items():
    name = ck.get_name()
    ckdate = ck.get_date()
    if name not in people:
        people[name] = Person(name)
    people[name].add_check(ckdate,ck)

len(people)

In [None]:
# Dump every person's checks: name, then each check date, then one line per
# check with its sequence key, check number and items.
# NOTE: this rebinds the global `paydays`, which the checks-building cell
# assigns from pp.get_payperiods(); after this cell it holds the last
# person's pay periods instead.
for name in people.keys():
    paydays = people[name].get_payperiods()
    print(name)
    for ckdate in paydays.keys():
        print('    ',ckdate)
        for seq in paydays[ckdate].keys():
            check = paydays[ckdate][seq]
            cknum = check.get_number()
            items = check.get_items()
            print('        ',seq,cknum,items)

In [None]:
# Spot check: print item 1 of check 1 in the 5/25/2018 pay period for a
# hand-picked list of names.
retirees = ['MALLOZZI, JOANN S','HADFIELD, RENEE M','KOWAL, MAUREEN E','JOHNSON, TRESSA', \
           'DEPASQUALE, HELEN L','CAVANAUGH, JUDITH L']

for name in retirees:
    ck = people[name].get_payperiod(date(2018,5,25))   # raises KeyError if the name is not in people
    itms = ck[1].get_items()                           # key 1: presumably the first check of that pay period - confirm
    step = itms[1]                                     # the whole item under key 1, not only its step
    print(name, step)

### Determine distribution of steps for new hires

Anyone whose first paycheck is dated after 8/15/2016 is assumed to have been hired for FY2017 or FY2018.

In [None]:
# For each person, find the earliest TEACHER item that carries step
# information and remember its date, step, payment count and mindiff.
pdct = {}

for name in people.keys():
    if name not in pdct:
        pdct[name] = {'dt':date(2020,1,1),'step':'unk','payments':0,'mindiff':100.0}
    p = people[name].get_payperiods()
    for d in p.keys():
        for i in p[d].keys():
            itms = p[d][i].get_items()
            for j in itms.keys():
                if ('TEACHER' not in itms[j]['position']):
                    continue
                if not (len(itms[j]['step_info']) > 1):
                    continue
                step = itms[j]['step_info']['step']
                pmts = itms[j]['step_info']['payments']
                mindiff = itms[j]['step_info']['mindiff']
                if (d < pdct[name]['dt']):              # earlier than anything seen so far
                    pdct[name].update(dt=d, step=step, payments=pmts, mindiff=mindiff)

# Count steps for people whose earliest such item falls after 8/15/2016 and
# before 1/1/2019, with mindiff below 0.5 and more than 22 payments.
steps = {}

for name in pdct.keys():
    if ((pdct[name]['dt'] > date(2016,8,15)) and
        (pdct[name]['dt'] < date(2019,1,1)) and
        (pdct[name]['mindiff'] < 0.5) and
        (pdct[name]['payments'] > 22.0)):
        print(name,pdct[name]['dt'],pdct[name]['step'],pdct[name]['payments'],pdct[name]['mindiff'])
        step = pdct[name]['step']
        steps[step] = steps.get(step, 0) + 1

print(len(steps))

steps

In [None]:
p = {}  #positions

# First and last check date on which each person has an item whose position
# is exactly 'TEACHER'.
for check in checks:
    ck = checks[check]
    name = ck.get_name()
    cdate = ck.get_date()
    items = ck.get_items()
    for itm in items.keys():
        pos = items[itm]['position']
        if (pos != 'TEACHER'):
            continue
        if name not in p:
            p[name] = {'min': cdate, 'max': cdate, 'step': 'unk'}
            continue
        if (cdate < p[name]['min']):
            p[name]['min'] = cdate
        if (cdate > p[name]['max']):
            p[name]['max'] = cdate

# Print the step information of item 1 of check 1 in the first TEACHER pay
# period, for people whose first TEACHER check is dated after first_date.
first_date = date(2016,8,15)
last_date  = date(2018,6,22)
for name in p:
    mindate = p[name]['min']
    if (mindate > first_date):
        pc = people[name].get_payperiod(mindate)
        pci = pc[1].get_items()
        print(name,pci[1]['step_info'])
    