## Extract salary and step info from MUNIS Individual Earnings Record by Account report

E. Quinn 8/7/2019

This notebook uses pdfminer to extract the content from a MUNIS report showing individual earnings

The documentation for pdfminer is at:

https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf

## Import standard python datascience packages

In [None]:
import math
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import pdfminer packages

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal

### Show the directory we are running in

In [None]:
# IPython shell escape (not Python): print the working directory the notebook runs in
!pwd

## Perform layout analysis - see section 2.3 of the pdfminer documentation


### Read the pdf and create a document object

In [None]:
# Binary-mode file handle for the report; PDFPage.get_pages() reads the pages from it later.
# NOTE(review): nothing in the visible cells closes this handle - consider document.close()
# once extraction has finished.
document = open('../FY17 Gene_Redacted.pdf', 'rb')

### Create a resource manager object

In [None]:
# One resource manager, passed to both the page aggregator and the page interpreter
rsrcmgr = PDFResourceManager()

### Set the parameters for analysis

In [None]:
# Layout-analysis parameters; no arguments are passed, so the library defaults apply
laparams = LAParams()

### Create a PDF page aggregator object

In [None]:
# Aggregator device: device.get_result() is called after each page to fetch that page's layout
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Interpreter: interpreter.process_page(page) feeds each page into the device above
interpreter = PDFPageInterpreter(rsrcmgr, device)

### Store the information in a dictionary

Strategy for this document:  

Save information from each element in the LTTextBox objects in a dictionary including:

- x0 horizontal coordinate of the upper left corner of the text box
- x1 horizontal coordinate of the lower right corner of the text box
- page number 
- sequence number of text box within this page
- text contained in the text box, converted to ascii

Parsing the text is complicated by the fact that a text box may span multiple columns and/or rows, and the text box groupings vary quite a bit depending on the page contents and layout.

However, with a bit of luck the structure of the document will allow the contents to be deciphered with the following heuristics:

- Text boxes containing left justified columns will tend to have nearly the same x0 coordinates
- Text boxes containing right justified columns will tend to have nearly the same x1 coordinates
- The codes for fund, account code, and object code are numeric and have fixed lengths
- Extraneous information is often preceded or followed by a series of underscore and newline characters
- Last name can be distinguished because it is the only field that is all letters followed by a comma
- Last name may be preceded by between one and three numerical fields:  fund, account, object.  If it is, the x0 value is shifted to the left.
    - Three numerical fields precede the name:  assume they are fund, account, object
    - Two numerical fields precede the name: assume they are account, object
    - One numerical field precedes the name: assume it is object

In [None]:
pdf = {}                                   #results: page number -> text box number -> {'x0', 'x1', 'text'}

pageno = 0                                 #stays 0 when the document yields no pages

#number the pages from 1, in the order PDFPage.get_pages yields them
for pageno, page in enumerate(PDFPage.get_pages(document), 1):
    boxes = {}                             #text boxes found on this page, keyed by sequence number
    pdf[pageno] = boxes
    interpreter.process_page(page)         #feed the page to the aggregator device
    layout = device.get_result()           #layout object for the page just processed
    tbox_no = 0                            #sequence number of the text box within this page
    for element in layout:
        if not isinstance(element, LTTextBoxHorizontal):   #keep horizontal text boxes only
            continue
        tbox_no += 1
        #Drop every non-ascii character, then decode again so 'text' is a string.
        #Without the decode, Python 3 stores bytes here and the later
        #text.split('\n') fails with a TypeError.
        txt = element.get_text().encode('ascii', 'ignore').decode('ascii')
        boxes[tbox_no] = {
            'x0': round(element.x0, 2),    #x0 coordinate of the text box, 2 decimals
            'x1': round(element.x1, 2),    #x1 coordinate of the text box, 2 decimals
            'text': txt,                   #ascii-only text of the box
        }

### Show the results for the first two pages

In [None]:
pdf[1]     #display the text boxes captured from page 1, keyed by text box number

In [None]:
pdf[2]   #display the text boxes captured from page 2, keyed by text box number

### Salary steps FY2017

In [None]:
#category labels, in the same order as the rows of steps_2016_2017
step_cat=['B','B+30','M','M+30','M2/CAGS','D']

#FY2017 salary table: one row per entry of step_cat, one column per step (10 steps).
#find_step() compares computed salaries against these amounts.
steps_2016_2017      = np.array([
    [41286,44871,48494,52118,55743,59366,62991,66616,71741,78898],  #B
    [42900,46484,50106,53729,57354,60979,64605,68228,73353,80675],  #B+30
    [43871,47454,51078,54700,58328,61951,65574,69199,74323,81743],  #M
    [44505,48085,51709,55332,58958,62583,66206,69829,74954,82438],  #M+30
    [44893,48474,52098,55722,59347,62974,66596,69806,75345,82866],  #M2/CAGS  NOTE(review): 69806 is lower than the M+30 value (69829) in the same column, the only column where that happens - confirm against the published schedule
    [45186,48771,52393,56018,59642,63266,66892,70515,75639,83190]   #D
    ],dtype=float) 

### Subroutine builds a data dictionary with fund, acct-code, object, and name

In [None]:
def updcd(dd,page,fund,acct,obj,lst):
    """Append one row per name in lst to the data dictionary entry for page.

    Parameters:
        dd:   data dictionary, page -> row number -> field dict; modified in place.
        page: page key; an empty entry is created when the page is not in dd yet.
        fund, acct, obj: codes stored on every new row as 'fund', 'acctcode'
              and 'objcode'.
        lst:  iterable of names; each one becomes the 'name' of a new row.

    New rows are numbered consecutively after the rows the page already has.
    'position' starts as an empty string and 'rate' / 'earnings' start as NaN,
    which is how updstrcol and updnumcol recognise cells still to be filled.
    Returns None.
    """
    rows = dd.setdefault(page, {})
    for k, name in enumerate(lst, len(rows) + 1):
        rows[k] = {
            'fund': fund,
            'acctcode': acct,
            'objcode': obj,
            'name': name,
            'position': '',
            'rate': np.nan,        #np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0
            'earnings': np.nan,
        }

### Subroutine updates string columns (position) in the data dictionary

In [None]:
def updstrcol(dd,page,col,lst):
    """Fill the still-empty col cells of one page, in row order, from lst.

    Parameters:
        dd:   data dictionary, page -> row number -> field dict; modified in place.
        page: page whose rows are updated.
        col:  name of a string column (the notebook uses 'position').
        lst:  values to hand out, one per empty cell, in order.

    Rows whose col value is already non-empty are skipped.  Values left over
    once every empty cell is filled are ignored.

    A page with no entry in dd is reported with the same three diagnostic
    lines updnumcol prints, instead of raising KeyError, so the string and
    numeric updaters treat a missing page the same way.  Returns None.
    """
    if page not in dd:
        print('KeyError page ' + str(page))
        print(col)
        print(lst)
        return

    values = iter(lst)
    for row in dd[page].values():           #loop through rows
        if row[col]:                        #already filled
            continue
        try:
            row[col] = next(values)
        except StopIteration:               #nothing left to hand out
            break

### Subroutine updates numeric columns (rate, earnings) in the data dictionary

In [None]:
def updnumcol(dd,page,col,lst):
    """Fill the NaN col cells of one page, in row order, from the strings in lst.

    Parameters:
        dd:   data dictionary, page -> row number -> field dict; modified in place.
        page: page whose rows are updated.
        col:  name of a numeric column (the notebook uses 'rate' and 'earnings').
        lst:  numeric strings; commas are removed before conversion to float.

    Rows whose col value is not NaN are left untouched, and strings left over
    after every NaN cell is filled are ignored.  A KeyError raised while
    walking the rows (for instance, page missing from dd) is reported by
    printing the page, the column and the list rather than propagated.
    Returns None.
    """
    exhausted = object()                    #sentinel: no strings left in lst
    supply = iter(lst)
    try:
        for cells in dd[page].values():     #loop through rows
            if not math.isnan(cells[col]):
                continue                    #already has a number
            raw = next(supply, exhausted)
            if raw is exhausted:
                continue
            cells[col] = float(raw.replace(',', ''))
    except KeyError:
        print('KeyError page ' + str(page))
        print(col)
        print(lst)

### Parse the information extracted from the text boxes

In [None]:
cd = {}                               #parsed contents dictionary: page -> row number -> fields

#The most recent fund / acct-code / obj stay in force for the names that follow,
#so they persist across text boxes and pages.  They start empty so that a name
#box met before any code has been read cannot raise NameError.
fund = ''
acctcode = ''
objcode = ''


def _drop_trailing_blank(lines):
    """Remove the final entry of lines, in place, when it is an empty string."""
    if lines and not lines[-1]:
        lines.pop()                   #pop() takes the LAST entry; list.remove('') would take the first blank


for page_no in sorted(pdf.keys()):                                  #loop through every page, the last one included
    print('page ' + str(page_no))

    for tbox in sorted(pdf[page_no].keys()):                        #loop through textboxes
        text = pdf[page_no][tbox]['text']                           #text of the box
        if isinstance(text, bytes):                                 #ascii-encoded bytes: decode so str methods apply
            text = text.decode('ascii')
        x0 = pdf[page_no][tbox]['x0']                               #x0 coordinate of the box
        x1 = pdf[page_no][tbox]['x1']                               #x1 coordinate of the box
        words = text.split('\n')                                    #split text at newlines

        #NOTE: 'words' is deliberately shared by the four sections below; a box that
        #satisfies more than one test is seen by later sections as already trimmed.

        if x0 < 163.0:       #box that might have a name
            if 'FUND ACCT-CODE' in text and len(words) > 3:         #heading box: fund, acct-code, obj plus names
                if len(words[2]) > 0:                               #ignore boxes whose first data line is empty
                    w1 = words[2].split(' ')                        #split the first data line by spaces
                    words = words[2:]                               #remove the heading fields
                    if len(w1) > 4:                                 #more than 4 words: fund, acct-code and obj present
                        fund, acctcode, objcode = w1[0], w1[1], w1[2]
                        code_len = len(fund) + len(acctcode) + len(objcode) + 2
                        words[0] = words[0][1+code_len:]            #remove the codes preceding the first name
                    #otherwise no codes were recognised, so nothing is stripped
                    #(a prefix length left over from an earlier box is never reused)

            elif x0 > 161.0 and x0 < 162.0 and x1 > 200.0:          #box with just names
                print(' ')

            elif x0 > 123.0 and x0 < 124.0:                         #box with names and objcode
                objcode, _, words[0] = words[0].partition(' ')      #obj code, then the name after the first space

            elif x0 > 68.0 and x0 < 69.0:                           #box with names, acct-code, and objcode
                w1 = words[0].split(' ', 2)                         #acct-code, obj, remainder of the line
                if len(w1) >= 2:                                    #both codes present
                    acctcode = w1[0]
                    objcode = w1[1]
                    words[0] = w1[2] if len(w1) > 2 else ''         #keep only what follows the codes

            _drop_trailing_blank(words)

            #a name line contains a comma and does not start with a digit;
            #'and' short-circuits, so an empty first line is never indexed
            if words and ',' in words[0] and not words[0][0].isdigit():
                updcd(cd,page_no,fund,acctcode,objcode,words)       #update the data dictionary for this page

        if x0 > 439.0 and x0 < 440.0:                               #box that might have a position
            if words and words[0] == 'POSITION':                    #discard heading
                words = words[2:]
            _drop_trailing_blank(words)

            #skip pages with no rows yet: there is nothing to attach a position to
            if len(words) > 0 and page_no in cd:
                updstrcol(cd,page_no,'position',words)

        if x1 > 564.0 and x1 < 566.0:                               #box that might have a rate
            if words and words[0] == 'RATE':                        #remove heading
                words = words[2:]
            _drop_trailing_blank(words)

            if len(words) > 0:                                      #add rates to data dictionary
                updnumcol(cd,page_no,'rate',words)

        #NOTE(review): the rate test above bounds x1 on both sides, while this one
        #puts its upper bound on x0.  Left unchanged because the right bound cannot be
        #checked without the PDF - confirm whether 'x1 < 653.0' was intended.
        if x1 > 651.0 and x0 < 653.0:                               #box that might have earnings
            if words and words[0] == 'ACCT-EARNINGS':               #remove heading
                words = words[2:]
            _drop_trailing_blank(words)

            for ix, word in enumerate(words):                       #keep only the entries above the first underscore rule
                if '____' in word:
                    words = words[:ix]
                    break

            if len(words) > 0:                                      #add earnings to data dictionary
                updnumcol(cd,page_no,'earnings',words)

In [None]:
def find_step(salary,steps):
    """Locate salary in the 2-D steps table, allowing a difference of under 2.

    Parameters:
        salary: amount to look up.
        steps:  2-D numpy array of amounts, indexed [row, column].

    Returns a two-item list [row, column] for the matching cell.  When several
    cells match, the last one in row-major order is returned; when none match
    (including a NaN salary) the result is [-1, -1].
    """
    hits = np.argwhere(np.abs(salary - steps) < 2)   #matching cells, listed in row-major order
    if len(hits) == 0:
        return [-1, -1]
    row, col = hits[-1]                              #last match wins
    return [int(row), int(col)]

### Code salary and step values

In [None]:
#Add 'salary', 'stepcat' and 'step' to every parsed row.
for page in cd.keys():
    for indx in cd[page].keys():
        row = cd[page][indx]
        rate = row['rate']
        if math.isnan(rate):
            #no rate was found for this row: keep NaN, because round() raises
            #ValueError for NaN on Python 3
            salary = rate
        else:
            #presumably 184 is the number of paid days behind the rate - TODO confirm
            salary = round(184.0*rate)
        row['salary'] = salary

        #find_step returns [-1, -1] when the salary is not in the table
        cat_ix, step_ix = find_step(salary,steps_2016_2017)
        row['stepcat'] = step_cat[cat_ix] if cat_ix >= 0 else ''
        row['step'] = str(1+step_ix) if step_ix >= 0 else ''    #steps are reported 1-based

### Write earnings to a csv file

In [None]:
import csv    #standard library; quotes and escapes fields so commas or quotes in a value cannot shift columns

#row keys, in output order; the header below keeps its shorter 'acct' / 'obj' names
_csv_fields = ['fund','acctcode','objcode','name','position',
               'rate','earnings','salary','stepcat','step']

with open("../fy2017_earnings.csv", 'w') as out_file:
    #lineterminator='\n' keeps the single-newline record ending this cell has always written
    writer = csv.writer(out_file, lineterminator='\n')
    writer.writerow(['fund','acct','obj','name','position',
                     'rate','earnings','salary','stepcat','step'])

    for page in cd.keys():
        for indx in cd[page].keys():
            row = cd[page][indx]
            #a field is quoted only when it needs to be (for example 'LAST, FIRST' names)
            writer.writerow([str(row[field]) for field in _csv_fields])