## Read earnings reports (OO version)

E. Quinn 12/22/2019

This notebook uses pdfminer to extract the information from the individual earnings report

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

Maintenance:

* 3/6/2020  
  * Add check date and number
* 3/7/2020  
  * Align personnel classes with support professionals structure
  * Implement salary step capture for support professionals
* 4/8/2020
  * Rewrite logic to base data structure on check number and check date
  * Simplify payment decoding logic to take advantage of having check date
  * Data corrections for check dates and numbers:
    * Adjust 5 check dates to align with nearest payday
    * Generate 4 artificial check numbers for zero earnings lines
* 4/18/2020
  * Add FY2016, FY2015 and FY2019+FY2020ytd (!)
  * Add code to move 12/25/2015 payroll to 12/24/2015 
    
* 7/8/2020
  * Write data to pickle files


## Import standard python datascience packages

In [None]:
import math
import re
import copy
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cloudpickle
import statistics as st
%matplotlib inline

In [None]:
from datetime import datetime, timedelta, date
from datascience import *
from scipy import stats
from statistics import mode, StatisticsError
from collections import Counter
import uuid
import random

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
!pwd

### Load RIDE UCOA labels 

In [None]:
#Load the RIDE UCOA labels object (deserialize)
#NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
with open('../../egsc/UCOA_labels.pkl', 'rb') as handle:
    UCOA_labels = cloudpickle.load(handle)
    
help(UCOA_labels)                                   #print the documentation of the loaded object

### EG accounting codes class

provides descriptions for EG accounting codes and mapping to UCOA codes

In [None]:
#Load the EG accounting codes object (deserialize)
#NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
with open('../../egsc/EG_acct_codes.pkl', 'rb') as handle:
    EG_acct_codes = cloudpickle.load(handle)
    
help(EG_acct_codes)                                 #print the documentation of the loaded object

### pay_check class

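Represents a check in the earnings report; each item added to the check is one report line (fund, account, object, position, rate, earnings)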

In [None]:
class pay_check():
    """A single pay check together with the earnings-report lines charged to it.

    check_number      identifier of the check
    name              payee name as it appears in the report
    check_date        date of the check; must expose a ``year`` attribute
    payperiod_object  object providing get_fiscal_year(date) and
                      get_school_year(date), used to classify the check date
    """

    def __init__(self,check_number,name,check_date,payperiod_object):
        self.check_number  = check_number
        self.name          = name
        self.check_date    = check_date
        self.items         = {}                      #{item number (1-based): line details}
        self.pay_period    = payperiod_object
        self.fiscal_year   = payperiod_object.get_fiscal_year(check_date)
        self.school_year   = payperiod_object.get_school_year(check_date)
        self.calendar_year = check_date.year

    def get_name(self):
        """Return the payee name."""
        return self.name

    def get_date(self):
        """Return the check date."""
        return self.check_date

    def get_number(self):
        """Return the check number."""
        return self.check_number

    def get_fiscal_year(self):
        """Return the fiscal year computed by the payperiod object at construction."""
        return self.fiscal_year

    def get_school_year(self):
        """Return the school year computed by the payperiod object at construction."""
        return self.school_year

    def get_calendar_year(self):
        """Return the calendar year of the check date."""
        return self.calendar_year

    def get_items(self):
        """Return the dictionary of items, keyed by 1-based item number."""
        return self.items

    def add_item(self,fund,acct,obj,position,rate,earnings,acct_desc,obj_desc,acct_UCOA,stepinfo):
        """Append one report line to this check under the next item number."""
        line = dict(fund=fund, acct=acct, obj=obj, position=position,
                    rate=rate, earnings=earnings, acct_desc=acct_desc,
                    obj_desc=obj_desc, acct_UCOA=acct_UCOA, step_info=stepinfo)
        self.items[len(self.items) + 1] = line       #numbering continues from the current count

### Payperiod class

Represents a two-week pay period

In [None]:
#Load the payperiod object (deserialize)
#NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
with open('../payperiod.pkl', 'rb') as handle:
    payperiod = cloudpickle.load(handle)
    
help(payperiod)                                     #print the documentation of the loaded object

### Teacher salary matrix

In [None]:
#Load the teacher salary matrix object (deserialize)
#NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
with open('../teacher_salary_matrix.pkl', 'rb') as handle:
    teacher_salary_matrix = cloudpickle.load(handle)
    
help(teacher_salary_matrix)                         #print the documentation of the loaded object

### personnel classes

provides functionality related to HR 

In [None]:
class Person():                                                         #generic employee class
    """An employee identified by name, holding pay checks grouped by check date.

    name  person's name
    ppo   payperiod class object
    ega   EG accounting class object
    ula   RIDE UCOA labels object
    """

    def __init__(self,name,ppo,ega,ula):                                #constructor
        self.name = name
        self.payperiods = {}                                            #{check_date: {sequence (1-based): check}}
        self.ppo = ppo                                                  #payperiod class object
        self.ega = ega                                                  #EG accounting class
        self.ula = ula                                                  #RIDE UCOA labels object
        self.retirement = np.nan                                        #retirement date (NaN until set); np.NaN no longer exists in NumPy 2.x

    def add_check(self,check_date,check):
        """Store a check under its date; checks sharing a date are numbered 1, 2, ..."""
        same_day = self.payperiods.setdefault(check_date, {})
        same_day[1 + len(same_day)] = check

    def get_name(self):                                                 #return name of person
        return self.name

    def get_position(self,syear,check_no):
        #NOTE(review): returns the person's name and ignores both arguments;
        #this looks like an unfinished placeholder - confirm the intended behavior
        return self.name

    def get_payperiods(self):
        """Return the full {check_date: {sequence: check}} dictionary."""
        return self.payperiods

    def set_retirement(self,retdate):
        """Record the retirement date."""
        self.retirement = retdate

    def get_retirement(self):
        """Return the retirement date, or NaN when none has been set."""
        return self.retirement

    def get_payperiod(self,check_date):
        """Return the {sequence: check} dictionary for a date, or {} when there is none.

        The original caught IndexError, which a dictionary lookup never raises
        (a missing key raises KeyError), so unknown dates propagated an
        exception instead of yielding the intended empty dictionary.
        """
        return self.payperiods.get(check_date, {})

## Read the pdf and create a dictionary with the contents of each text box

### Function read_pdf() reads a PDF and returns a dictionary containing the contents

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the left edge of the text box
- x1 horizontal coordinate of the right edge of the text box
- y0 vertical coordinate of the bottom edge of the text box (pdfminer measures vertical coordinates upward from the bottom of the page)
- y1 vertical coordinate of the top edge of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all characters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object
    

In [None]:
def read_pdf(path):
    """Read a PDF and return the horizontal text boxes found on each page.

    path  location of the PDF file

    Returns a dictionary {page number (0-based): {text box number (1-based,
    in layout order): {'x0','x1','y0','y1','text'}}} where the coordinates
    are rounded to two decimals and 'text' is the box text with non-ASCII
    characters dropped.
    """
    rsrcmgr = PDFResourceManager()                                  #create a resource manager
    laparams = LAParams()                                           #set the parameters for analysis
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)          #create a PDF page aggregator object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pdf = {}                                                        #dictionary to hold the results

    with open(path, 'rb') as document:                              #close the file even if parsing fails
        for pageno, page in enumerate(PDFPage.get_pages(document)): #page numbers start at zero
            pdf[pageno] = {}                                        #dictionary for this page
            interpreter.process_page(page)                          #run layout analysis on the page
            layout = device.get_result()                            #receive the LTPage object for the page
            tbox_no = 0                                             #text box counter within this page
            for element in layout:
                if type(element).__name__ != 'LTTextBoxHorizontal': #keep only horizontal text boxes
                    continue
                tbox_no += 1                                        #increment text box number
                ascii_bytes = element.get_text().encode('ascii', 'ignore')  #drop non-ascii characters
                pdf[pageno][tbox_no] = {
                    'x0': round(element.x0, 2),                     #x0 coordinate of textbox
                    'x1': round(element.x1, 2),                     #x1 coordinate of textbox
                    'y0': round(element.y0, 2),                     #y0 coordinate of textbox
                    'y1': round(element.y1, 2),                     #y1 coordinate of textbox
                    'text': ascii_bytes.decode('ascii'),            #convert bytes back to string
                }
    return pdf

### Utility functions

In [None]:
#remove the commas from earnings and rate values

def remove_commas(st):
    """Return ``st`` with every comma removed (e.g. thousands separators in amounts)."""
    return ''.join(st.split(','))                   #split at commas and glue the pieces back together

In [None]:
#remove the headings fields 

def remove_headings(st):
    """Strip leading heading lines from a text box string.

    Lines are examined in order. While the current line starts with one of
    the heading markers, the first line of the remaining text is dropped.
    The text is returned as soon as a non-heading line is met. If every line
    is a heading the result is ''. A heading line with no newline left to
    cut at is reported with a 'Value Error' print and left in place.
    """
    heading_prefixes = ('FUND ', 'POSITION', 'RATE', 'ACCT-', 'CHECK', '_')
    for line in st.split('\n'):                     #lines of the original text
        if not line.startswith(heading_prefixes):
            return st                               #first non-heading line: done
        _, newline, remainder = st.partition('\n')
        if newline:
            st = remainder                          #drop the first remaining line
        else:
            print('Value Error',st)                 #no newline left to cut at
    return ''

### Read the FY2015 earnings report

In [None]:
#text boxes of the 7/1/2014 - 6/30/2015 report, keyed by page number then text box number
p15 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2014_to_6-30-2015.pdf')

### Read the FY2016 earnings report

In [None]:
#text boxes of the 7/1/2015 - 6/30/2016 report, keyed by page number then text box number
p16 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2015_to_6-30-2016.pdf')

### Read the FY2017 earnings report

In [None]:
#text boxes of the 7/1/2016 - 6/30/2017 report, keyed by page number then text box number
#alternate input kept for reference: '../FY17 Gene_Redacted.pdf'
p17 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2016_to_6-30-2017.pdf')

### Read the FY2018 earnings report

In [None]:
#text boxes of the 7/1/2017 - 6/30/2018 report, keyed by page number then text box number
#alternate input kept for reference: '../FY18 Gene_Redacted.pdf'
p18 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2017_to_6-30-2018.pdf')

### Read the FY2019 earnings report

In [None]:
#text boxes of the FY2019 report, keyed by page number then text box number
#NOTE(review): this file name ends at 7-1-2019 while the other years end at 6-30 - confirm the date range
p19 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2018_to_7-1-2019.pdf')

### Read the FY2020 through current report

In [None]:
#text boxes of the report starting 7/1/2019 (year to date), keyed by page number then text box number
p20 = read_pdf('../../finance_subcommittee/earnings/Munis_7-1-2019_to_current.pdf')

### Build a dictionary with only those text boxes containing names

Use the following algorithm to identify text boxes that contain names:

- x0, the horizontal coordinate of the left edge of the text box, is less than 162
- the text string contains at least one comma

In [None]:
def get_names(dct):
    """Select the text boxes that hold names and peel off leading accounting codes.

    dct  dictionary produced by read_pdf(): {page: {text box: {'x0',...,'text'}}}

    A box qualifies when its x0 is below 162 and its text contains a comma.
    Headings are removed, then every all-digit word on the first remaining
    line is cut from the front of the text; words of 4, 8 and 5 digits
    update the current fund, account and object codes respectively. The
    codes carry forward from box to box and page to page until replaced.

    Returns {page: {text box: {'x0','x1','y0','y1','fund','acct','obj','text'}}}
    with an entry (possibly empty) for every page of the input.
    """
    field_for_length = {4: 'fund', 8: 'acct', 5: 'obj'}            #digit count identifies the code
    current_codes = {'fund': '', 'acct': '', 'obj': ''}            #most recently seen codes

    dnames = {}

    for page in sorted(dct):                                       #pages in ascending order
        page_names = dnames.setdefault(page, {})
        for tb in sorted(dct[page]):                               #text boxes in ascending order
            box = dct[page][tb]
            if box['x0'] < 162.0:                                  #name boxes start left of x0=162
                txt = str(box['text'])
                if ',' in txt:                                     #every name contains a comma
                    txt = remove_headings(txt)
                    first_line = txt.split('\n')[0]
                    for word in first_line.split():
                        if word.isdigit():
                            field = field_for_length.get(len(word))
                            if field is not None:
                                current_codes[field] = word
                            txt = txt[len(word)+1:]                #cut the code and its separator from the front
                    entry = {edge: box[edge] for edge in ('x0', 'x1', 'y0', 'y1')}
                    entry.update(current_codes)                    #fund, acct, obj
                    entry['text'] = txt
                    page_names[tb] = entry
    return dnames

### Consolidate text boxes that overlap on the vertical scale and contain names

In [None]:
def consolidate_name_boxes(names):
    """Merge name text boxes whose vertical ranges overlap.

    names  dictionary produced by get_names(): {page: {text box: {...}}}

    For every pair (tb, tb2) with tb2 > tb whose [y0, y1] ranges intersect,
    a merged entry is stored under tb2: x0, y1, fund, acct and obj come
    from tb, x1 and y0 come from tb2, and the texts are concatenated (tb
    first). Boxes that took part in no merge are carried over unchanged
    (the same dictionary object, not a copy).

    The skip list is a plain set. The original grew a numpy array with
    np.append and relied on make_array() from the wildcard
    ``from datascience import *`` for a simple membership test.

    NOTE(review): overlaps are always tested against the original boxes, so
    three mutually overlapping boxes produce several pairwise merges rather
    than one combined entry - confirm that this is intended.
    """
    newnames = {}

    for page in sorted(names.keys()):                               #loop through pages of pdf
        boxes = names[page]
        ordered = sorted(boxes.keys())
        merged = {}                                                 #consolidated boxes for this page
        skip = set()                                                #boxes absorbed into a merge

        for tb in ordered:                                          #loop through text boxes on this page
            first = boxes[tb]
            for tb2 in ordered:                                     #compare this one to the later ones
                second = boxes[tb2]
                if (tb2 > tb and
                        first['y0'] <= second['y1'] and
                        second['y0'] <= first['y1']):               #vertical ranges intersect
                    merged[tb2] = {
                        'x0': first['x0'],                          #keep x0
                        'x1': second['x1'],                         #replace x1 with tb2 value
                        'y0': second['y0'],                         #replace y0 with tb2 value
                        'y1': first['y1'],                          #keep y1 value
                        'text': first['text'] + second['text'],     #concatenate text strings
                        'fund': first['fund'],                      #copy fund, acct, and obj
                        'acct': first['acct'],
                        'obj': first['obj'],
                    }
                    skip.update((tb, tb2))                          #both boxes are now represented by the merge
            if tb not in skip:                                      #untouched boxes are copied through
                merged[tb] = first

        newnames[page] = merged

    return newnames

### Build a dictionary with earnings report items by text box

In [None]:
def combdd(cn,pdf):
    """Attach the column text that lines up vertically with each name box.

    cn   consolidated name boxes: {page: {text box: {'y0','y1',...}}}
    pdf  all text boxes from read_pdf(): {page: {text box: {'x0','y0','y1','text'}}}

    For each name box, every other box on the page whose [y0, y1] range
    intersects it has its headings removed and is then assigned to a column
    according to its x0 offset: 'numbers1', 'numbers2', 'positions',
    'rates' or 'earnings' (commas are removed from the last two).

    Returns {page: {text box: entry}}. The entries are the same dictionary
    objects as in ``cn``, so ``cn`` is updated in place as well.

    NOTE(review): 'numbers2' accumulates text from several boxes while the
    other columns keep only the last matching box - confirm this asymmetry.
    """
    #x0 windows (exclusive bounds) that identify each report column
    DATE_NUMBER_X0 = (312.0, 316.0)
    NUMBER_X0      = (383.0, 395.0)
    POSITION_X0    = (437.0, 440.0)
    RATE_X0        = (509.0, 533.0)
    EARNINGS_X0    = (558.0, 630.0)

    def in_window(value, window):
        return window[0] < value < window[1]

    dd = {}

    for page in sorted(cn.keys()):
        page_out = dd.setdefault(page, {})
        for tb in sorted(cn[page].keys()):                          #loop through consolidated name textboxes
            entry = cn[page][tb]
            page_out[tb] = entry
            y0 = entry['y0']                                        #extract vertical coordinates
            y1 = entry['y1']
            for tb2 in sorted(pdf[page].keys()):                    #loop through the other boxes in pdf
                if tb == tb2:                                       #ignore if same box as names
                    continue
                other = pdf[page][tb2]
                tx0 = other['x0']                                   #horizontal offset selects the column
                ty0 = other['y0']
                ty1 = other['y1']
                if not (y0 <= ty1 and ty0 <= y1):                   #no vertical overlap with the name box
                    continue
                txt = remove_headings(other['text'])
                if in_window(tx0, DATE_NUMBER_X0):                  #match to DATE/NUMBER
                    entry['numbers1'] = txt
                elif in_window(tx0, NUMBER_X0):                     #match to NUMBER
                    entry['numbers2'] = entry.get('numbers2', '') + txt
                elif in_window(tx0, POSITION_X0):                   #match to POSITION
                    entry['positions'] = txt
                elif in_window(tx0, RATE_X0):                       #match to RATE
                    entry['rates'] = remove_commas(txt)
                elif in_window(tx0, EARNINGS_X0):                   #match to ACCT-EARNINGS
                    entry['earnings'] = remove_commas(txt)

    return dd

### Assemble data elements across columns

In [None]:
def get_lines(nn):
    """Split the column text of every name box into per-line lists.

    nn  dictionary produced by combdd(): {page: {text box: {'text', and
        optionally 'numbers1','numbers2','rates','positions','earnings',
        'fund','acct','obj'}}}

    Returns {page: {text box: {'names','checks','dates','rates','earnings',
    'positions' (lists) and 'fund','acct','obj' (strings)}}}.

    When a box has fewer check numbers than dates, the check list is rebuilt
    from the earnings: each positive amount takes the next real check number
    and every other amount gets a generated 'gen<page>-<box>-<index>' number.
    """
    lld = {}

    for page in sorted(nn):
        page_out = lld.setdefault(page, {})
        for tb in sorted(nn[page]):
            src = nn[page][tb]
            if tb not in page_out:
                page_out[tb] = {'names': [], 'checks': [], 'dates': [],
                                'rates': [], 'earnings': [], 'positions': [],
                                'fund': '', 'acct': '', 'obj': ''}
            rec = page_out[tb]

            #names: every line longer than one character
            rec['names'].extend(piece for piece in src['text'].split('\n') if len(piece) > 1)

            #first numbers column mixes check numbers (all digits) and dates (contain '/')
            if 'numbers1' in src:
                for piece in src['numbers1'].split('\n'):
                    if piece.isdigit():
                        rec['checks'].append(piece)
                    elif '/' in piece:
                        rec['dates'].append(piece)

            #second numbers column only contributes check numbers
            if 'numbers2' in src:
                rec['checks'].extend(piece for piece in src['numbers2'].split('\n')
                                     if piece.isdigit())

            #rates: any line with a decimal point
            if 'rates' in src:
                rec['rates'].extend(float(piece) for piece in src['rates'].split('\n')
                                    if '.' in piece)

            #positions: every line longer than one character
            if 'positions' in src:
                rec['positions'].extend(piece for piece in src['positions'].split('\n')
                                        if len(piece) > 1)

            for code in ('fund', 'acct', 'obj'):
                if code in src:
                    rec[code] = src[code]

            #earnings: amounts are collected until the first underscore line;
            #NOTE(review): the original reset its underscore flag only in the
            #branch where it was already False, so nothing after an underscore
            #line is ever collected - confirm that is the intent
            if 'earnings' in src:
                seen_underscore = False
                for piece in src['earnings'].split('\n'):
                    if '.' in piece:
                        if not seen_underscore:
                            rec['earnings'].append(float(piece))
                    elif '_' in piece:
                        seen_underscore = True

            #fewer check numbers than dates: generate numbers for non-positive earnings
            if len(rec['checks']) < len(rec['dates']):
                real_checks = rec['checks']
                rebuilt = []
                used = 0
                for idx, amount in enumerate(rec['earnings']):
                    if amount > 0.0:
                        rebuilt.append(real_checks[used])
                        used += 1
                    else:
                        rebuilt.append('gen'+str(page) + '-' + str(tb) + '-' + str(idx))
                        print("inserting check number: ",page,tb,idx)
                rec['checks'] = rebuilt
    return lld

### Read the earnings report extracts and process them

In [None]:
def process_earnings(pdf):
    """Run the full extraction pipeline on one report.

    pdf  dictionary produced by read_pdf()

    Name boxes are located, vertically overlapping ones are consolidated,
    the matching column text is attached, and the result is split into
    per-line lists (see get_lines for the returned structure).
    """
    name_boxes = consolidate_name_boxes(get_names(pdf))
    return get_lines(combdd(name_boxes, pdf))

#processed earnings lines keyed by the year label of each report
ll = {
    2015: process_earnings(p15),
    2016: process_earnings(p16),
    2017: process_earnings(p17),
    2018: process_earnings(p18),
    2019: process_earnings(p19),
    2020: process_earnings(p20),
}

### Check earnings against totals

In [None]:
#add up every extracted earnings amount per year so the result can be
#compared by eye with the totals printed on the reports themselves
totearn = {}

gtot = 0.0                          #grand total across all years

for year, pages in ll.items():
    totearn.setdefault(year, 0.0)
    for page in pages:
        for tb in pages[page]:
            for amt in pages[page][tb]['earnings']:
                totearn[year] += amt    #accumulate one amount at a time, in report order
    gtot += totearn[year]

print(round(totearn[2015],2))       #FY2015 earnings report total is $20,527,796.79
print(round(totearn[2016],2))       #FY2016 earnings report total is $21,988,876.13
print(round(totearn[2017],2))       #FY2017 earnings report total is $22,608,024.34
print(round(totearn[2018],2))       #FY2018 earnings report total is $22,409,915.41
print(round(totearn[2019],2))       #FY2019 earnings report total is $23,372,079.04
print(round(totearn[2020],2))       #FY2020ytd earnings report total is $19,489,886.57
print(gtot)

### Build a dictionary of checks

In [None]:
#one pay_check object per check number; every report line becomes an item on its check
checks = {}

paydays = payperiod.get_payperiods()

for year, year_pages in ll.items():
    for page, page_boxes in year_pages.items():
        for tb, box in page_boxes.items():
            #column lists for this text box (parallel, one entry per report line)
            check_numbers = box['checks']
            names         = box['names']
            check_dates   = box['dates']
            positions     = box['positions']
            rates         = box['rates']
            earnings      = box['earnings']

            #accounting codes shared by every line of the box, with their descriptions
            fund          = box['fund']
            acct          = box['acct']
            obj           = box['obj']
            obj_desc      = UCOA_labels.get_label('Obj',obj)
            acct_desc     = EG_acct_codes.get_eg_acct_desc(acct)
            acct_UCOA     = EG_acct_codes.get_eg_acct_UCOA(acct)

            for i in range(len(check_numbers)):

                check_number    = check_numbers[i]
                name            = names[i]
                date_str        = check_dates[i]
                position        = positions[i]
                rate            = rates[i]
                earned          = earnings[i]

                #dates are written month/day/year
                parts = date_str.split('/')
                check_date = date(int(parts[2]),int(parts[0]),int(parts[1]))
                if (check_date not in paydays.keys()):
                    #not a known payday: move the check to the previous one
                    new_date = payperiod.get_previous_payday(check_date)
                    print('adjusting date',name,check_date,new_date)
                    check_date = new_date

                #salary step details are decoded for teachers only and kept
                #only when at least one step was found
                stepdata = {}
                if (position == 'TEACHER'):
                    decoded = teacher_salary_matrix.decode_earnings(check_date,rate,earned,payperiod)
                    if (len(decoded['step']) >= 1):
                        stepdata = decoded

                if check_number not in checks:
                    checks[check_number] = pay_check(check_number,name,check_date,payperiod)

                checks[check_number].add_item(fund,acct,obj,position,rate,earned, \
                            acct_desc,obj_desc,acct_UCOA,stepdata)

print('Number of checks: ',len(checks))

In [None]:
#Serialize the checks dictionary (keyed by check number) to a pickle file
with open('../../finance_subcommittee/earnings_checks.pkl', 'wb') as handle:
    cloudpickle.dump(checks, handle)

### Build a dictionary of people

In [None]:
#group the checks by payee: one Person per distinct name, each with a random identifier
people = {}

people_uuid = {}

for ckno, ck in checks.items():
    name = ck.get_name()
    ckdate = ck.get_date()
    if name not in people:
        people[name] = Person(name,payperiod,EG_acct_codes,UCOA_labels)
        #NOTE(review): uuid4 is random, so every run assigns new identifiers
        #and overwrites the saved mapping below - confirm that is acceptable
        people_uuid[name] = uuid.uuid4()

    people[name].add_check(ckdate,ck)

print(len(people))

with open('../../finance_subcommittee/earnings_uuid.pkl', 'wb') as handle:
    cloudpickle.dump(people_uuid, handle)

### Set retirement dates

In [None]:
#retirement dates, keyed by the name exactly as it appears in the earnings report;
#a name that is missing from people raises KeyError
retirements = [
    ('ELSON, SUSAN E',      date(2017,5,26)),
    ('KOWAL, MAUREEN E',    date(2020,5,22)),
    ('CAVANAUGH, JUDITH L', date(2020,5,22)),
    ('HOFFMANN, ILENE B',   date(2020,5,22)),
    ('LYONS, EILEEN C',     date(2020,5,22)),
    ('HADFIELD, RENEE M',   date(2020,5,22)),
    ('JOHNSON, TRESSA',     date(2020,5,22)),
    ('MCCOWAN, BARBARA C',  date(2020,5,22)),
    ('HOSTETLER, LYN A',    date(2020,5,22)),
    ('MACARUSO, JOANNE E',  date(2020,5,22)),
    ('MALLOZZI, JOANN S',   date(2020,5,22)),
    ('ISIBEL, DAVID R',     date(2020,5,22)),
    ('DEPASQUALE, HELEN L', date(2020,5,22)),
]

for retiree, retired_on in retirements:
    people[retiree].set_retirement(retired_on)


In [None]:
#Serialize the people dictionary (keyed by name) to a pickle file
with open('../../finance_subcommittee/earnings_people.pkl', 'wb') as handle:
    cloudpickle.dump(people, handle)

In [None]:
def most_frequent(List):
    """Return the most common element of ``List``, or NaN when it is empty.

    Ties are resolved by Counter.most_common, which keeps equally common
    elements in the order they were first encountered.
    """
    occurence_count = Counter(List)
    if not occurence_count:
        return np.nan                               #np.NaN was removed in NumPy 2.0
    return occurence_count.most_common(1)[0][0]

### Build dictionary of earnings history summaries by person with totals

In [None]:
#earnings history: ehist[name][check date][object code] holds the raw per-item
#lists plus summary values that are refreshed after every item is added
ehist = {}

#per-item lists kept for every object code, and the subset copied from step_info
list_fields = ('earnings', 'rate', 'acct', 'acct6', 'step', 'payments',
               'fte', 'mindiff', 'salary', 'position')
step_fields = ('step', 'mindiff', 'payments', 'salary', 'fte')

for name in people.keys():
    by_date = ehist.setdefault(name, {})
    ppds = people[name].get_payperiods()
    for ckdate in ppds.keys():
        by_obj = by_date.setdefault(ckdate, {})
        for seq in ppds[ckdate].keys():
            check = ppds[ckdate][seq]
            items = check.get_items()
            for key in items.keys():
                item = items[key]
                obj = item['obj']
                if obj not in by_obj:
                    by_obj[obj] = {field: [] for field in list_fields}
                rec = by_obj[obj]

                #raw values of this item
                rec['acct'].append(item['acct'])
                rec['acct6'].append(np.floor(int(item['acct'])/100))   #account code without its last two digits
                rec['rate'].append(item['rate'])
                rec['earnings'].append(item['earnings'])
                rec['position'].append(item['position'])
                step_info = item['step_info']
                for field in step_fields:                           #only present when step data was decoded
                    if field in step_info.keys():
                        rec[field].append(step_info[field])

                #summaries over everything collected so far
                rec['total_earnings'] = np.sum(rec['earnings'])
                rec['payments_mode'] = most_frequent(rec['payments'])
                rec['step_mode'] = most_frequent(rec['step'])
                rec['salary_mode'] = most_frequent(rec['salary'])
                rec['acct6_mode'] = most_frequent(rec['acct6'])
                acct_mode = most_frequent(rec['acct'])
                rec['acct_mode'] = acct_mode
                rec['fte_mode'] = most_frequent(rec['fte'])
                rec['acct6_desc'] = EG_acct_codes.get_eg_acct_desc6(str(int(rec['acct6_mode'])))
                rec['acct_desc'] = EG_acct_codes.get_eg_acct_desc(str(int(rec['acct_mode'])))
                rec['ucoa'] = EG_acct_codes.get_UCOA_from_acct(acct_mode,UCOA_labels)

                #total earnings divided by (salary / payments), replaced by the
                #single decoded fte when there is exactly one, capped at 1.0
                total_fte = rec['total_earnings']/(rec['salary_mode']/rec['payments_mode'])
                if (len(rec['fte'])==1):
                    total_fte = rec['fte'][0]
                if (total_fte > 1.0):
                    total_fte = 1.0
                rec['total_fte'] = total_fte

len(ehist)

In [None]:
#Serialize the earnings history dictionary (keyed by name) to a pickle file
with open('../../finance_subcommittee/earnings_ehist.pkl', 'wb') as handle:
    cloudpickle.dump(ehist, handle)