## Read earnings reports (OO version)

E. Quinn 12/22/2019

This notebook uses pdfminer to extract the information from the individual earnings report

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

## Import standard python datascience packages

In [None]:
import math
import re
import copy
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from datetime import *
from datascience import *

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
# IPython shell escape: the relative PDF paths used by the read_pdf() calls
# in this notebook are resolved against the directory printed here
!pwd

## Read the pdf and create a dictionary with the contents of each text box

### Function read_pdf() reads a PDF and returns a dictionary containing the contents

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the left edge of the text box
- x1 horizontal coordinate of the right edge of the text box
- y0 vertical coordinate of the bottom edge of the text box (pdfminer measures y upward from the bottom of the page)
- y1 vertical coordinate of the top edge of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all characters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object
    

In [None]:
def read_pdf(path):
    """Extract the horizontal text boxes of a PDF into a nested dictionary.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    dict
        ``{page_number: {textbox_number: {...}}}`` where page numbers start
        at 0, text box numbers start at 1 within each page, and each entry
        holds the keys ``'x0'``, ``'x1'``, ``'y0'``, ``'y1'`` (bounding box
        coordinates rounded to 2 decimals) and ``'text'`` (the box contents
        with every non-ASCII character dropped).
    """
    rsrcmgr = PDFResourceManager()                                  #create a resource manager
    laparams = LAParams()                                           #set the parameters for analysis
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)          #create a PDF page aggregator object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pdf = {}                                                        #dictionary to hold the results

    # PDFPage.get_pages() reads lazily, so the file must stay open for the
    # whole loop; the context manager closes it even if parsing raises.
    with open(path, 'rb') as document:
        for pageno, page in enumerate(PDFPage.get_pages(document)): #pages are numbered from zero
            boxes = pdf[pageno] = {}                                #dictionary for this page
            interpreter.process_page(page)                          #lay out the page
            layout = device.get_result()                            #receive the LTPage object for the page
            tbox_no = 0                                             #text boxes are numbered from one
            for element in layout:
                if type(element).__name__ != 'LTTextBoxHorizontal': #only horizontal text boxes are kept
                    continue
                tbox_no += 1
                raw = element.get_text().encode('ascii', 'ignore')  #drop anything that is not ascii
                boxes[tbox_no] = {
                    'x0': round(element.x0, 2),
                    'x1': round(element.x1, 2),
                    'y0': round(element.y0, 2),
                    'y1': round(element.y1, 2),
                    'text': raw.decode('ascii'),                    #back to str
                }
    return pdf

### Utility functions

In [None]:
#remove the commas from earnings and rate values

def remove_commas(st):
    """Return *st* with every comma deleted, e.g. '1,234.50' -> '1234.50'."""
    return ''.join(st.split(','))                   #splitting on ',' and re-joining drops the commas

In [None]:
#remove the headings fields 

def remove_headings(st):
    """Strip leading heading lines from a text box string.

    Lines are examined from the top.  While the current first line starts
    with one of the heading markers it is cut off (together with its
    newline).  The remaining text is returned as soon as a non-heading line
    is reached; if every line is a heading the empty string is returned.
    """
    heading_prefixes = ('FUND ', 'POSITION', 'RATE', 'ACCT-', '_')   #strings that appear only in headings

    for line in st.split('\n'):
        if not line.startswith(heading_prefixes):
            return st                              #first non-heading line: keep the rest as is
        _, newline, remainder = st.partition('\n')
        if newline:
            st = remainder                         #drop the heading line and its newline
        else:
            print('Value Error', st)               #heading without a trailing newline: report it
    return ''

### Build a dictionary with only those text boxes containing names

Use the following algorithm to identify text boxes that contain names:

- x0, the horizontal coordinate of the left edge of the text box, is less than 162
- the text string contains at least one comma

In [None]:
def get_names(dct):
    """Collect the text boxes that hold employee names.

    A box qualifies when its x0 is below 162 and its text contains a comma.
    Heading lines are stripped first; any all-digit words on the first
    remaining line are then read as fund (4 digits), acct (8 digits) or
    obj (5 digits) and cut from the front of the text.  The most recent
    fund/acct/obj values carry forward to later boxes and pages.

    Returns ``{page: {textbox: {...}}}`` with keys 'x0', 'x1', 'y0', 'y1',
    'fund', 'acct', 'obj' and 'text'.
    """
    fund = acct = obj = ''                                         #codes persist until replaced
    dnames = {}

    for page in sorted(dct):                                       #pages in ascending order
        page_names = dnames.setdefault(page, {})                   #one entry per page, even if empty
        for tb in sorted(dct[page]):                               #text boxes in ascending order
            box = dct[page][tb]
            if not (box['x0'] < 162.0):                            #name boxes start left of x0=162
                continue
            txt = str(box['text'])
            if ',' not in txt:                                     #every name contains a comma
                continue

            txt = remove_headings(txt)
            first_line = txt.split('\n')[0]
            for word in first_line.split():                        #look for fund/acct/obj codes
                if not word.isdigit():
                    continue
                width = len(word)
                if width == 4:                                     # 4 digits means fund
                    fund = word
                elif width == 8:                                   # 8 digits means acct-code
                    acct = word
                elif width == 5:                                   # 5 digits means obj
                    obj = word
                txt = txt[width + 1:]                              #cut the code and one separator

            page_names[tb] = {
                'x0': box['x0'],
                'x1': box['x1'],
                'y0': box['y0'],
                'y1': box['y1'],
                'fund': fund,
                'acct': acct,
                'obj': obj,
                'text': txt,
            }
    return dnames

### Consolidate text boxes that overlap on the vertical scale and contain names

In [None]:
def consolidate_name_boxes(names):
    """Merge name boxes on the same page whose vertical ranges overlap.

    Parameters
    ----------
    names : dict
        ``{page: {textbox: {...}}}`` as produced by get_names(); each entry
        needs 'x0', 'x1', 'y0', 'y1', 'text', 'fund', 'acct' and 'obj'.

    Returns
    -------
    dict
        A new ``{page: {textbox: {...}}}`` dictionary.  For every pair
        ``tb < tb2`` with overlapping [y0, y1] ranges a merged entry is
        stored under ``tb2``: x0, y1, fund, acct and obj come from ``tb``,
        x1 and y0 come from ``tb2``, and the texts are concatenated.  Boxes
        that take part in no merge are copied over unchanged (same object).
    """
    newnames = {}

    for page in sorted(names):                                  #loop through pages of pdf
        boxes = names[page]
        merged = newnames[page] = {}
        skip = set()                                            #boxes already absorbed into a merge
        ordered = sorted(boxes)

        for tb in ordered:
            first = boxes[tb]
            for tb2 in ordered:                                 #compare this box to the later ones
                second = boxes[tb2]
                if (tb2 > tb
                        and first['y0'] <= second['y1']
                        and second['y0'] <= first['y1']):       #vertical ranges overlap
                    merged[tb2] = {
                        'x0': first['x0'],                      #keep x0
                        'x1': second['x1'],                     #replace x1 with tb2 value
                        'y0': second['y0'],                     #replace y0 with tb2 value
                        'y1': first['y1'],                      #keep y1
                        'text': first['text'] + second['text'], #concatenate text strings
                        'fund': first['fund'],                  #copy fund, acct, and obj
                        'acct': first['acct'],
                        'obj': first['obj'],
                    }
                    skip.update((tb, tb2))
            if tb not in skip:                                  #untouched boxes are copied as is
                merged[tb] = first

    return newnames

### Match consolidated text boxes with names to text boxes for other columns

In [None]:
def get_matches(names,pdf):
    """Attach position, rate and earnings text to each name box, in place.

    For every entry in *names*, each other text box on the same page of
    *pdf* whose [y0, y1] range overlaps the name box is inspected.  Its
    heading lines are stripped and, depending on its x0, the text is stored
    on the name entry as:

    - 'positions' when 437 < x0 < 440
    - 'rates'     when 509 < x0 < 533 (commas removed)
    - 'earnings'  when 558 < x0 < 630 (commas removed)

    A later overlapping box in the same column replaces an earlier one.
    *names* is modified in place and nothing is returned.
    """
    for page in sorted(names):                                  #loop through pages
        for tb in sorted(names[page]):                          #loop through name boxes
            entry = names[page][tb]
            y0 = entry['y0']                                    #vertical extent of the name box
            y1 = entry['y1']

            for tb2 in sorted(pdf[page]):                       #loop through the boxes in pdf
                if tb == tb2:                                   #ignore the box with the same number
                    continue
                other = pdf[page][tb2]
                if not (y0 <= other['y1'] and other['y0'] <= y1):
                    continue                                    #no vertical overlap
                tx0 = other['x0']                               #horizontal offset picks the column
                txt = remove_headings(other['text'])
                if 437.0 < tx0 < 440.0:                         #match to POSITION
                    entry['positions'] = txt
                elif 509.0 < tx0 < 533.0:                       #match to RATE
                    entry['rates'] = remove_commas(txt)
                elif 558.0 < tx0 < 630.0:                       #match to ACCT-EARNINGS
                    entry['earnings'] = remove_commas(txt)
    return

### Get sum of earnings lines

Used for balancing to total on earnings report

In [None]:
def balancing_totals(dct):
    """Print the sum of all earnings lines, rounded to 2 decimals.

    Each text box entry must carry 'text' (names) and 'earnings' strings
    with one value per line.  An earnings line is added to the total only
    when the name line at the same position is longer than one character.
    Nothing is returned.
    """
    grand_total = 0.0                                               #running grand total

    for page_boxes in dct.values():                                 #pages of the earnings report
        for box in page_boxes.values():                             #text boxes on this page
            name_lines = box['text'].split('\n')
            earning_lines = box['earnings'].split('\n')

            for idx, name in enumerate(name_lines):
                if len(name) <= 1:                                  #blank or one-character line: skip
                    continue
                grand_total += float(earning_lines[idx])            #earnings on the matching line

    print(round(grand_total, 2))                                    #print the grand total

    return

### Read the FY2017 earnings report

In [None]:
# parse the FY2017 report into {page: {text box: {x0, x1, y0, y1, text}}}
p17 = read_pdf('../FY17 Gene_Redacted.pdf')

### Read the FY2018 earnings report

In [None]:
# parse the FY2018 report into {page: {text box: {x0, x1, y0, y1, text}}}
p18 = read_pdf('../FY18 Gene_Redacted.pdf')

### Process the FY17 Earnings report

In [None]:
# FY2017 pipeline; get_matches() returns nothing and adds the 'positions',
# 'rates' and 'earnings' entries to consolidated_names17 in place
dnames17 = get_names(p17)                                   #extract name records from pdf
consolidated_names17 = consolidate_name_boxes(dnames17)     #consolidate overlapping text boxes containing names
get_matches(consolidated_names17,p17)                       #match text boxes for other columns by vertical pos

### Process the FY18 Earnings report

In [None]:
# FY2018 pipeline; get_matches() returns nothing and adds the 'positions',
# 'rates' and 'earnings' entries to consolidated_names18 in place
dnames18 = get_names(p18)                                   #extract name records from pdf
consolidated_names18 = consolidate_name_boxes(dnames18)     #consolidate overlapping text boxes containing names
get_matches(consolidated_names18,p18)                       #match text boxes for other columns by vertical pos

### Check sum of earnings against total on earnings reports

In [None]:
# prints the sum of the parsed earnings lines; compare it with the report total noted below
balancing_totals(consolidated_names17)     #FY2017 earnings report total is $22,608,024.34

In [None]:
# prints the sum of the parsed earnings lines; compare it with the report total noted below
balancing_totals(consolidated_names18)     #FY2018 earnings report total is $22,409,915.41

## Define classes

In [None]:
class Person():
    """A named individual with optional employment dates (all initially None)."""
    def __init__(self,name):
        self.name = name
        self.start_date = None
        self.retirement_date = None
        self.resignation_date = None


class Teacher(Person):
    """A Person that additionally carries a ``roll`` value."""
    def __init__(self,name,roll):
        self.roll = roll
        # The original called Child.__init__, a name that is never defined,
        # so constructing a Teacher raised NameError.
        super().__init__(name)
        
class EG_acct_codes():
    """Lookup table from 8-digit account code strings to descriptions."""
    def __init__(self):
        # NOTE(review): in most entries the last two digits track the building
        # (02 Eldredge, 03 Cole, 05 Frenchtown, 06 EGHS, 07 MDBK, 08 Hanaford),
        # but '71180103' and '71182107' are both labelled 'SPED EGHS' --
        # confirm those two descriptions against the chart of accounts.
        self.EG_account_codes ={
            '71100105': 'K Frenchtown','71100107': 'K MDBK','71109105': 'Title 1  Frenchtown',
            '71109107': 'Title 1  MDBK','71110105': 'Grade 1 Frenchtown','71110107': 'Grade 1 MDBK',
            '71120105': 'Grade 2 Frenchtown','71120107': 'Grade 2 MDBK','71121102': 'Art Eldredge',
            '71121103': 'Art Cole','71121105': 'Art Frenchtown','71121106': 'Art EGHS',
            '71121107': 'Art MDBK','71121108': 'Art Hanaford','71123103': 'ELA Cole',
            '71123106': 'ELA EGHS','71124103': 'Foreign Language Cole','71124106': 'Foreign Language EGHS',
            '71125102': 'PE/health Eldredge','71125103': 'PE/health Cole','71125105': 'PE/health Frenchtown',
            '71125106': 'PE/health EGHS','71125107': 'PE/health MDBK','71125108': 'PE/health Hanaford',
            '71126103': 'Tech Cole','71126306': 'Tech EGHS','71127103': 'Math Cole',
            '71127106': 'Math EGHS','71128102': 'Music Eldredge','71128103': 'Music Cole',
            '71128105': 'Music Frenchtown','71128106': 'Music EGHS','71128107': 'Music MDBK',
            '71128108': 'Music Hanaford','71129103': 'Science Cole','71129106': 'Science EGHS',
            '71130102': 'Grade 3 Eldredge','71130105': 'Grade 3 Frenchtown','71130108': 'Grade 3 Hanaford',
            '71130406': 'Business/Computer  EGHS','71131103': 'Social Studies Cole',
            '71131106': 'Social Studies EGHS','71140102': 'Grade 4 Eldredge','71140108': 'Grade 4 Hanaford',
            '71140403': 'Computer Cole','71140406': 'Computer EGHS','71141102': 'Reading Eldredge',
            '71141103': 'Reading Cole','71141105': 'Reading Frenchtown','71141106': 'Reading EGHS',
            '71141107': 'Reading MDBK','71141108': 'Reading Hanaford','71150102': 'Grade 5 Eldredge',
            '71150108': 'Grade 5 Hanaford','71180102': 'SPED Eldredge','71180103': 'SPED EGHS',
            '71180105': 'SPED Frenchtown','71180106': 'SPED EGHS','71180107': 'SPED MDBK',
            '71180108': 'SPED Hanaford','71181102': 'SPED Life Skills Eldredge',
            '71181103': 'SPED Life Skills Cole','71181105': 'SPED Life Skills Frenchtown',
            '71181106': 'SPED Life Skills EGHS','71181107': 'SPED Life Skills MDBK','71182107': 'SPED EGHS',
            '71191302': 'ESL Eldredge','71191303': 'ESL Cole','71191305': 'ESL Frenchtown',
            '71191306': 'ESL EGHS','71191307': 'ESL MDBK','71191308': 'ESL Hanaford','71231503': 'Guidance Cole',
            '71231506': 'Guidance EGHS','71246702': 'Librarian Eldredge','71246703': 'Librarian Cole',
            '71246705': 'Librarian Frenchtown','71246706': 'Librarian EGHS','71246707': 'Librarian MDBK',
            '71246708': 'Librarian Hanaford','71269506': 'Nurse EGHS','71270302': 'Nurse Eldredge',
            '71270303': 'Nurse Cole','71270305': 'Nurse Frenchtown','71270306': 'Nurse EGHS',
            '71270307': 'Nurse MDBK','71270308': 'Nurse Hanaford','71301106': 'SPED EGHS',
            '71301602': 'Social Worker Eldredge','71301603': 'Social Worker Cole',
            '71301605': 'Social Worker Frenchtown','71301606': 'Social Worker EGHS',
            '71301607': 'Social Worker MDBK','71301608': 'Social Worker Hanaford','71302702': 'OT Eldredge',
            '71302703': 'OT Cole','71302705': 'OT Frenchtown','71302706': 'OT EGHS','71302707': 'OT MDBK',
            '71302708': 'OT Hanaford','71308102': 'Adaptive PE Eldredge','71308103': 'Adaptive PE Cole',
            '71308105': 'Adaptive PE Frenchtown','71308106': 'Adaptive PE EGHS','71308107': 'Adaptive PE MDBK',
            '71308108': 'Adaptive PE Hanaford','71310106': 'History EGHS','71311702': 'Psychologist Eldredge',
            '71311703': 'Psychologist Cole','71311705': 'Psychologist Frenchtown','71311706': 'Psychologist EGHS',
            '71311707': 'Psychologist MDBK','71311708': 'Psychologist Hanaford','71321802': 'Speech Eldredge',
            '71321803': 'Speech Cole','71321805': 'Speech Frenchtown','71321806': 'Speech EGHS',
            '71321807': 'Speech MDBK','71321808': 'Speech Hanaford'}

    def get_eg_acct_desc(self,acct):
        """Provides descriptions for accounting codes in EG MUNIS system."""
        # unknown codes fall back to a placeholder instead of raising
        return self.EG_account_codes.get(acct, '(no description)')
        
class EG_Salaries():
    """Provides salary matrix lookup function by year, column, and step for FY2013-FY2022."""
    def __init__(self):
        """Build the 3-D salary matrix indexed by (fiscal year - 2013, step - 1, column)."""

        self.cba_cols ={'B': 0,'B+30': 1,'M': 2,'M+30': 3,'M2/CAGS': 4, 'D': 5}

        self.cba = np.zeros((10, 10, 6))    #salary matrix is 3-D numpy array indexed by: fyear, step, column

                                                          #start with FY2016 (2015-2016) salary matrix
        # NOTE(review): in the step 8 row the M2/CAGS value (69806) is lower
        # than the M+30 value (69829), unlike every other row -- confirm
        # against the CBA.
        self.cba[3,:,:] = np.array([
            [41286, 42900, 43871, 44505, 44893, 45186],
            [44871, 46484, 47454, 48085, 48474, 48771],
            [48494, 50106, 51078, 51709, 52098, 52393],
            [52118, 53729, 54700, 55332, 55722, 56018],
            [55743, 57354, 58328, 58958, 59347, 59642],
            [59366, 60979, 61951, 62583, 62974, 63266],
            [62991, 64605, 65574, 66206, 66596, 66892],
            [66616, 68228, 69199, 69829, 69806, 70515],
            [71741, 73353, 74323, 74954, 75345, 75639],
            [78898, 80675, 81743, 82438, 82866, 83190]])

                                                        #FY2017 is the same as FY2016
        self.cba[4,:,:] = self.cba[3,:,:]

                                                        #FY2018 2% increase
        self.cba[5,:,:] = np.around(1.02*self.cba[4,:,:],0)

                                                        #FY2019 2.25% increase
        self.cba[6,:,:] = np.around(1.0225*self.cba[5,:,:],0)

                                                        #FY2020 same as FY2019
        self.cba[7,:,:] = self.cba[6,:,:]

                                                        #FY2021 2% increase
        self.cba[8,:,:] = np.around(1.02*self.cba[7,:,:],0)

                                                        #FY2022 2.25% increase
        self.cba[9,:,:] = np.around(1.0225*self.cba[8,:,:],0)


                                                        #FY2015: back out 2% increase from FY2016
        self.cba[2,:,:] = np.around(self.cba[3,:,:]/1.02,0)

                                                        #FY2014: back out 2.5% increase from FY2015
        self.cba[1,:,:] = np.around(self.cba[2,:,:]/1.025,0)

                                                        #FY2013: back out 1% from FY2014 for steps 1-9
        # rows 0..8 are steps 1-9; the previous slice 0:8 stopped at step 8
        # and left the FY2013 step 9 salaries at zero
        self.cba[0,0:9,:] = np.around(self.cba[1,0:9,:]/1.01,0)
                                                        #FY2013: back out 2.25% from FY2014 for step 10
        self.cba[0,9,:]   = np.around(self.cba[1,9,:]/1.0225,0)

    def get_salary(self,fyear,col,step):
        """Returns CBA salary given fiscal year, column, and step for FY2013-FY2022."""
        # NOTE(review): no range check -- a fiscal year before 2013 (or a step
        # below 1) becomes a negative index and silently wraps around to the
        # end of the matrix.
        return self.cba[fyear-2013,step-1,self.cba_cols[col]]

    def get_step(self,rate,fyear):                                          #determine step value
        """Returns step code given daily rate and fiscal year for FY2013-FY2022.

        The code has the form 'yyyy-column-step'.  The first matrix entry
        (searched by step, then by column) within $1 of 184 times the daily
        rate wins; an empty string is returned when nothing matches.
        """
        salary = round(184.0*rate,0)        #salary is 184 times daily rate
        lower_bound = salary - 1.0          #lower limit for tolerance
        upper_bound = salary + 1.0          #upper limit for tolerance
                                                                            #search salary matrix for a match
        for step in range(1, 11):
            for col in self.cba_cols:
                if lower_bound <= self.get_salary(fyear,col,step) <= upper_bound:
                    return str(fyear) + '-' + col + '-' + str(step)         #code is:  yyyy-cat-step
        return ''

    def get_fte_and_payments(self,rate,earnings):
        """Returns FTE,number of annual payments, and min error given rate and earnings.

        Every combination of candidate FTE fraction and payment count (21 or
        26) is tried, and the one whose implied per-payment amount
        ``fte * rate * 184 / payments`` is closest to *earnings* is kept.
        The result is ``[fte, payments, error]`` when the error is under
        0.2, otherwise ``[nan, nan, nan]``.  Earnings outside the open range
        (150, 4500) are never matched.
        """
        fractions = [1/2.,1/3.,2/3.,1/4.,3/4.,1/5.,2/5.,3/5.,4/5.,1/6.,5/6.,\
            1/7.,2/7.,3/7.,4/7.,5/7.,6/7.,1/8.,3/8.,5/8.,7/8.,\
            1/9.,2/9.,4/9.,5/9.,7/9.,8/9.,1/10.,3/10.,7/10.,9/10.,\
            1/20.,1.0,-1.0,2.0]                                         #possible FTE fractions
        payments  = [21.,26.]                                           #number of payments is 26 or 21
        min_abs_diff = 100000.                                          #initialize difference
        fte = number_of_payments = np.nan                               #nothing matched yet

        if 150. < earnings < 4500.:                                     #assume earnings in this range
            for paymts in payments:                                     #loop through possible payments
                for frac in fractions:                                  #loop through possible FTE fractions
                    diff = abs(earnings - frac*rate*184/paymts)         #compute earnings difference
                    if diff < min_abs_diff:                             #if smallest difference so far:
                        min_abs_diff = diff                             #save difference, fte, payments
                        fte = frac
                        number_of_payments = paymts

        if min_abs_diff < 0.2:
            return [fte,number_of_payments,round(min_abs_diff,4)]
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
        return [np.nan,np.nan,np.nan]
    
class RIDE_Obj_labels():
    """Provides a dictionary of RIDE Obj descriptions"""

    def __init__(self, paths=None):
        """Load Obj -> Object Description pairs from the RIDE CSV files.

        Parameters
        ----------
        paths : list of str, optional
            CSV files to read, each with 'Obj' and 'Object Description'
            columns.  Defaults to the FY18..FY15 RIDE files.  Files are read
            in order and the first description seen for an Obj is kept.
        """
        self.obj_labels = {}

        if paths is None:
            paths = ["../RIDE/93-All-Expenditure-Account-Strings-with-Descriptions-no-421-422-FY18.csv",\
                "../RIDE/93-All-Expenditure-Account-Strings-with-Descriptions-no-421-422-FY17.csv",\
                "../RIDE/93-All-Expenditure-Account-Strings-with-Descriptions-no-421-422-FY16.csv",\
                "../RIDE/93-All-Expenditure-Account-Strings-with-Descriptions-no-421-422-FY15.csv"]
        self.paths = list(paths)

        for path in self.paths:                                             #loop through the RIDE files
            df = pd.read_csv(path, usecols=['Obj','Object Description'])    #read Obj and Object Description
            for obj, obj_desc in zip(df['Obj'], df['Object Description']):  #build lookup dictionary
                if obj != obj:                                              #NaN (missing Obj): skip
                    continue
                key = float(obj)
                # test the same float key that is stored; the original tested
                # the raw value, which can disagree with the stored key
                if key not in self.obj_labels:
                    self.obj_labels[key] = obj_desc

    def get_obj_desc(self,obj):                                            #look up RIDE Obj description
        """Provides a lookup function for Obj description"""
        return self.obj_labels.get(obj, '(none)')                          #'(none)' when obj is unknown

class payperiod():
    """Provides a dictionary of payroll periods"""

    def __init__(self):
        """Generate biweekly pay period end dates from 2009-07-03 up to 2026-06-30.

        ``self.payperiods`` maps fiscal year -> {sequence number: date}.  A
        new fiscal year begins with the first date that falls in July, and
        sequence numbers restart at 1 there.
        """
        self.payperiods = {}                            #initialize payperiods dictionary

        delta = timedelta(days=14)                      #14 days per pay period
        x = date(2009,7,3)                              #first pay period of FY2010
        fiscal_year = 2010-1                            #initialize fiscal year
        last_month = 6                                  #so that it will update to 2010 at the start
        count = 0                                       #pay periods seen so far in this fiscal year

        while x < date(2026,7,1):                       #generate pay periods through the end of FY2026
            if x.month == 7 and last_month == 6:        #test for new fiscal year
                fiscal_year += 1                        #  increment fiscal year
                count = 0                               #  restart the payperiod count
                self.payperiods.setdefault(fiscal_year, {})
            count += 1
            # Some fiscal years hold 27 biweekly dates (e.g. FY2012 ends with
            # 2012-06-29).  Numbering with "1 + count % 26" made that 27th
            # date overwrite period 1, so periods are numbered without the
            # modulo.
            self.payperiods[fiscal_year][count] = x     #add payperiod to dictionary
            last_month = x.month                        #save current month to detect fiscal year end
            x += delta                                  #increment date by 14 days

    def get_payperiod_end(self,fyear,ppno):
        """Lookup end date of payroll periods in a given fiscal year

        Returns NaN when the fiscal year or period number is unknown.
        """
        try:
            return self.payperiods[fyear][ppno]
        except KeyError:
            return np.nan                               #np.NaN alias was removed in NumPy 2.0

    def get_next_payday(self,y,m,d):
        """Find the end of the current pay period given a date

        Returns the first generated pay period date on or after the given
        year/month/day, or NaN when the date lies after the last one.
        """
        target = date(y,m,d)

        for periods in self.payperiods.values():        #fiscal years are stored chronologically
            for payday in periods.values():             #so are the periods within each year
                if payday >= target:
                    return payday

        return np.nan
    

### Instantiate classes

Instantiates classes that provide various functions

In [None]:
# Create one instance of each helper class and print the docstrings of the
# lookup methods as a quick reference.
sm = EG_Salaries()      #functionality associated with salaries
print('get_salary(fyear,col,step): ',sm.get_salary.__doc__)
print('get_step(rate,fyear): ',sm.get_step.__doc__)
print('get_fte_and_payments(self,rate,earnings): ',sm.get_fte_and_payments.__doc__)
# EG_Salaries has no get_payperiods method (printing its docstring raised
# AttributeError); pay period lookups live on the payperiod class below

ac = EG_acct_codes()    #EG accounting codes class
print('get_eg_acct_desc(acct): ',ac.get_eg_acct_desc.__doc__)

od = RIDE_Obj_labels()    #lookup function for RIDE Obj descriptions
print('get_obj_desc(obj): ',od.get_obj_desc.__doc__)

pp = payperiod()          #get end dates of payroll periods
print('get_payperiod_end(fyear,ppno): ',pp.get_payperiod_end.__doc__)
print('get_next_payday(y,m,d): ',pp.get_next_payday.__doc__)

### Assign dates where possible (this is a work in progress)

In [None]:
def assign_dates(rsal, pay_calendar=None):
    """Assign pay-period end dates to salary entries where possible (work in progress).

    Entries are grouped by fiscal year and account code.  A group receives
    dates only when every usable 'payments' value in it is the same and the
    number of entries in the group is one of the recognised counts below;
    dates are then handed out in entry order, one consecutive pay period per
    entry.

    Parameters:
        rsal:         dictionary of entries keyed by sequence number; each entry
                      must have 'fyear' and 'acct' and may have 'payments'
        pay_calendar: object providing get_payperiod_end(fyear, ppno); when
                      omitted, the notebook-level `pp` instance is used

    Returns:
        Dictionary keyed by fiscal year, then account code, holding
        'min_payments', 'max_payments', 'n_payments' and a 'payments'
        dictionary keyed like `rsal`.  Each 'payments' entry contains a
        'date' key only if a date could be assigned.
    """
    calendar = pp if pay_calendar is None else pay_calendar

    # Number of entries in a group -> sequence number of the first pay period
    # assigned.  Counts not listed here are left undated.
    # NOTE(review): 23- and 21-entry groups both start at period 4, as before
    # this refactor -- confirm that is intended for the 21-entry case.
    first_period = {26: 1, 23: 4, 21: 4, 3: 1}

    fyears = {}
    for ix, entry in rsal.items():        #build a dictionary with entries for each fiscal year
        by_acct = fyears.setdefault(entry['fyear'], {})
        acct = entry['acct']
        if acct not in by_acct:
            by_acct[acct] = {
                'min_payments': 26,
                'max_payments': 0,
                'n_payments': 0,
                'payments': {},
            }
        group = by_acct[acct]

        payments = entry.get('payments')
        # Ignore a missing value and NaN (NaN is the only value unequal to itself)
        if payments is not None and payments == payments:
            if payments < group['min_payments']:
                group['min_payments'] = payments
            if payments > group['max_payments']:
                group['max_payments'] = payments

        group['n_payments'] += 1
        group['payments'][ix] = {}

    for fyear, by_acct in fyears.items():
        for group in by_acct.values():
            if group['min_payments'] != group['max_payments']:   #payments changed: cannot date
                continue
            start = first_period.get(group['n_payments'])
            if start is None:                                      #unrecognised entry count
                continue
            for ppno, slot in enumerate(group['payments'].values(), start=start):
                slot['date'] = calendar.get_payperiod_end(fyear, ppno)

    return fyears

### Get dictionary of people

In [None]:
people = {}

# Earnings-report text boxes for FY17 and FY18, keyed by fiscal year
ay = {
    2017: consolidated_names17,
    2018: consolidated_names18,
}

# Build the people dictionary for all years:
#   people[name][fund][acct][obj][seq] -> one earnings entry
for year, pages in ay.items():                      # fiscal years
    for page, boxes in pages.items():               # pages of the earnings report
        for tb, box in boxes.items():               # text boxes on the page
            fund = box['fund']
            acct = box['acct']
            obj = box['obj']

            txt = box['text']                       # names, one per line
            positions = box['positions']
            rates = box['rates']
            earnings = box['earnings']

            # Parallel columns: line i of each string describes the same person
            lines = txt.split('\n')
            plines = positions.split('\n')
            rlines = rates.split('\n')
            elines = earnings.split('\n')

            for i, line in enumerate(lines):
                if len(line) <= 1:                  # skip blank / one-character lines
                    continue
                name = line

                # Create the nested levels on first sight of each key
                entries = (
                    people.setdefault(name, {})
                          .setdefault(fund, {})
                          .setdefault(acct, {})
                          .setdefault(obj, {})
                )

                seq = 1 + len(entries)              # next sequence number for this name/fund/acct/obj
                entry = entries[seq] = {}

                entry['position'] = plines[i]
                rate = float(rlines[i])
                entry['rate'] = rate
                earnings = float(elines[i])
                entry['earnings'] = earnings
                entry['fyear'] = year

                if rate <= 200.0:                   # step/FTE lookup is attempted only for rates above 200
                    continue

                step = sm.get_step(rate, year)
                entry['step'] = step
                rval = sm.get_fte_and_payments(rate, earnings)
                if rval[0] == rval[0]:              # FTE is not NaN
                    entry['fte'] = rval[0]
                    entry['payments'] = rval[1]
                    entry['mindiff'] = rval[2]

### Summarize regular salary for entries on CBA salary matrix

In [None]:
regsal = {}            # summary by name: regsal[name][rseq] -> one regular-salary entry

for name, funds in people.items():
    for fund, accts in funds.items():
        for acct, objs in accts.items():
            for obj, entries in objs.items():
                if obj != '51110':                  # keep only obj=51110 regular salary
                    continue
                for seq, entry in entries.items():
                    if 'step' not in entry:         # keep only entries with a step coded
                        continue

                    summary = regsal.setdefault(name, {})
                    rseq = 1 + len(summary)         # next sequence number for this name
                    row = summary[rseq] = {}

                    row['fyear'] = entry['fyear']
                    row['acct'] = acct
                    row['step'] = entry['step']
                    if 'fte' in entry:              # FTE details are present only when they were computed
                        row['fte'] = entry['fte']
                        row['payments'] = entry['payments']
                        row['mindiff'] = entry['mindiff']
                    row['rate'] = entry['rate']
                    row['earnings'] = entry['earnings']

### Add dates

In [None]:
# Copy the dates worked out by assign_dates() back onto each person's
# regular-salary entries; entries that could not be dated get NaN.
for name in regsal.keys():
    pdates = assign_dates(regsal[name])
    for fyear in pdates:
        for acct in pdates[fyear]:
            dated = pdates[fyear][acct].get('payments', {})
            for ix, slot in dated.items():
                # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0
                regsal[name][ix]['date'] = slot.get('date', np.nan)

### Print results for those with step values

In [None]:
print("Names with step values: ",len(regsal))         #show the number of distinct names

for name in regsal.keys():                            #compute totals by fiscal year and percent increase
    totsal = {}                                       #total earnings keyed by fiscal year
    print('\n',name)                                  #start of data for this employee
    for seq, rec in regsal[name].items():             #loop through earnings entries and add them up by fyear
        fyear = rec['fyear']
        totsal.setdefault(fyear, 0.0)
        acct = rec['acct']
        try:
            acct_desc = ac.get_eg_acct_desc(acct)
        except KeyError:
            acct_desc = '(no label)'
        step = rec['step']
        try:                                          #FTE details are all-or-nothing
            fte = round(rec['fte'],3)
            payments = rec['payments']
            mindiff = rec['mindiff']
        except KeyError:
            fte = payments = mindiff = ' '
        rate = rec['rate']
        earnings = rec['earnings']
        pdate = rec.get('date', '(no date)')
        totsal[fyear] += earnings
        print(seq,fyear,acct,acct_desc,pdate,fte,step, payments,mindiff,rate,earnings)

    totstr = ''.join(str(fy) + ' ' + str(round(total,2)) + ' ' for fy, total in totsal.items())

    pct_increase = ' '
    if len(totsal) == 2:                              #exactly two years: compute percent change
        # Compare the later fiscal year with the earlier one instead of
        # hard-coding 2017/2018, and skip the ratio when the base is zero.
        first_fy, last_fy = sorted(totsal)
        if totsal[first_fy] != 0:
            pct_increase = '  percent change: ' +\
            str(round(100*((totsal[last_fy]/totsal[first_fy])-1.0),2))
    totstr = totstr + pct_increase
    print('\n',name,' total regular salary: ',totstr)