In [24]:
%matplotlib inline
from __future__ import print_function, division


import matplotlib.pyplot as plt
import numpy as np
from os.path import expanduser
import pandas as pd
import re
import requests
from datetime import datetime

import os
import glob


In [3]:
# Relative folder for generated output; it is the same location the CSV
# exports later in this notebook are written to.
OUT_FOLDER = '../Output-data/'
# Relative folder that is globbed for the raw dynamic and status input files.
PATH = '../Data/'

### Exploratory Analysis of Federal Employment Data

There are two types of files: status and dynamic. Status files contain employment status at a given point in time. Dynamic files contain employment events (hires, transfers, etc.).

The Federal Employment Data dataset is very large. For the exploratory analysis, we decided to subset the data and analyze only the years 2007, 2008 and 2009 for dynamic files and Q1 of 2007 for status files.

### Process Dynamic Files

Dynamic files are relatively small. We will combine all files for the three years to create one dataframe and one csv file that can be used for further analysis.

In [34]:
def load_dynamic_file(spath, filetype):
    '''Purpose:  Load a fixed-width dynamic file into a dataframe.

    Parameters:
        spath    -- path of the fixed-width text file to read.
        filetype -- label stored in the 'file_type' column of every row
                    (for example 'DYN_DOD' or 'DYN_NDOD').

    Returns:
        A pandas DataFrame with one row per non-blank line. All columns hold
        the raw text slices of the line, except 'effective_dt', which is
        parsed into a datetime, and 'file_type', which repeats `filetype`.

    Raises:
        ValueError if the effective date (characters 38-46) of a non-blank
        line is not a valid YYYYMMDD date.

    Note:     Characters 9-32 (the name field) are deliberately not loaded;
              that field never made it into the resulting dataframe.
    '''

    # (column name, start, end) of every field that is kept as raw text.
    text_layout = (
        ('empid', 0, 9),
        ('agency', 32, 36),
        ('accession_ind', 36, 38),
        ('age', 46, 52),
        ('pay_plan', 52, 54),
        ('grade', 54, 56),
        ('los', 56, 62),
        ('duty_station', 62, 71),
        ('occupation', 71, 75),
        ('occupation_cat', 75, 76),
        ('adjusted_pay', 76, 82),
        ('type_appointment', 82, 84),
        ('work_schedule', 84, 85),
    )

    # Keys are created in the same order as the original dict literal so the
    # resulting column order does not change on any pandas version.
    column_order = ('empid', 'agency', 'accession_ind', 'effective_dt', 'age',
                    'pay_plan', 'grade', 'los', 'duty_station', 'occupation',
                    'occupation_cat', 'adjusted_pay', 'type_appointment',
                    'work_schedule', 'file_type')
    columns = {}
    for name in column_order:
        columns[name] = []

    # `with` guarantees the handle is closed even when a date fails to parse.
    with open(spath, 'r') as f:
        for line in f:
            # A blank line (typically a trailing newline at end of file)
            # carries no record and would otherwise break the date parsing.
            if not line.strip():
                continue

            # Parse the date first so a bad record cannot leave the column
            # lists with different lengths.
            effective_dt = datetime.strptime(line[38:46], '%Y%m%d')

            for name, start, end in text_layout:
                columns[name].append(line[start:end])
            columns['effective_dt'].append(effective_dt)
            columns['file_type'].append(filetype)

    return pd.DataFrame(columns)

In [35]:
# Combine every dynamic file into a single dataframe.
# Each entry is (glob pattern, label stored in the 'file_type' column).
dyn_sources = [
    ('*.DOD.*', 'DYN_DOD'),      # employees of the Department of Defense
    ('*.NONDOD.*', 'DYN_NDOD'),  # employees outside of DOD
]

# Collect the per-file frames and concatenate once at the end: appending to a
# dataframe inside the loop re-copies all accumulated rows on every iteration.
dyn_frames = []
for pattern, filetype in dyn_sources:
    # sorted() makes the load order (and therefore the row order) reproducible
    # regardless of the order in which the file system lists the files.
    for infile in sorted(glob.glob(os.path.join(PATH, pattern))):
        print("current file is: " + infile)
        dyn_frames.append(load_dynamic_file(infile, filetype))

# Fail loudly instead of silently reusing a stale `dyn_files` from a previous
# run (or hitting a NameError) when the data folder holds no matching files.
if not dyn_frames:
    raise IOError("no dynamic files found in " + PATH)

# Like the original append-based loop, each file keeps its own row index.
dyn_files = pd.concat(dyn_frames)

current file is: ../Data\DEC2007.DOD.FO05M4.TXT
current file is: ../Data\DEC2008.DOD.FO05M4.TXT
current file is: ../Data\DEC2009.DOD.FO05M4.TXT
current file is: ../Data\JUN2007.DOD.FO05M4.TXT
current file is: ../Data\JUN2008.DOD.FO05M4.TXT
current file is: ../Data\JUN2009.DOD.FO05M4.TXT
current file is: ../Data\MAR2007.DOD.FO05M4.TXT
current file is: ../Data\MAR2008.DOD.FO05M4.TXT
current file is: ../Data\MAR2009.DOD.FO05M4.TXT
current file is: ../Data\SEP2007.DOD.FO05M4.TXT
current file is: ../Data\SEP2008.DOD.FO05M4.TXT
current file is: ../Data\SEP2009.DOD.FO05M4.TXT
current file is: ../Data\DEC2007.NONDOD.FO05M3.TXT
current file is: ../Data\DEC2008.NONDOD.FO05M3.TXT
current file is: ../Data\DEC2009.NONDOD.FO05M3.TXT
current file is: ../Data\JUN2007.NONDOD.FO05M3.TXT
current file is: ../Data\JUN2008.NONDOD.FO05M3.TXT
current file is: ../Data\JUN2009.NONDOD.FO05M3.TXT
current file is: ../Data\MAR2007.NONDOD.FO05M3.TXT
current file is: ../Data\MAR2008.NONDOD.FO05M3.TXT
current file is:

In [36]:
# Save dataframe to csv file

# Build the path from OUT_FOLDER so the export follows the configured output
# folder instead of repeating the literal path.
dyn_files.to_csv(os.path.join(OUT_FOLDER, 'dyn_files_2007_2009.csv'))

In [37]:
# Per-column summary (count, unique, top, freq, ...) of the combined dynamic records.
dyn_files.describe()

Unnamed: 0,accession_ind,adjusted_pay,age,agency,duty_station,effective_dt,empid,file_type,grade,los,occupation,occupation_cat,pay_plan,type_appointment,work_schedule
count,1603339,1603339.0,1603339,1603339,1603339,1603339,1603339.0,1603339,1603339.0,1603339,1603339.0,1603339,1603339,1603339.0,1603339
unique,17,80557.0,14,556,11000,1557,1128829.0,2,159.0,11,733.0,7,191,17.0,9
top,AD,27504.0,20-24,VATA,#########,2007-04-29 00:00:00,1836554.0,DYN_NDOD,5.0,UNSP,303.0,T,GS,10.0,F
freq,499919,30060.0,234896,198500,805915,16813,16.0,1020127,208780.0,628293,117094.0,369403,972954,295346.0,1259667
first,,,,,,2005-04-03 00:00:00,,,,,,,,,
last,,,,,,2009-12-31 00:00:00,,,,,,,,,


### Process Status Files



In [27]:
def load_status_file(spath, filetype):
    '''Purpose:  Load a fixed-width status file into a dataframe.

    Parameters:
        spath    -- path of the fixed-width text file to read.
        filetype -- label stored in the 'file_type' column of every row
                    (for example 'STA_DOD' or 'STA_NDOD').

    Returns:
        A pandas DataFrame with one row per non-blank line. All columns hold
        the raw text slices of the line, except 'file_dt', which is parsed
        into a datetime, and 'file_type', which repeats `filetype`.

    Raises:
        ValueError if the file date (characters 32-40) of a non-blank line is
        not a valid YYYYMMDD date.

    Note1:    Position of data elements was obtained from
              2015-02-11-opm-foia-response.pdf
    Note2:    Name (characters 9-32) will not be loaded since it is empty in
              all instances and we will not use it due to privacy concerns.
    '''

    # (column name, start, end) of every field that is kept as raw text.
    text_layout = (
        ('empid', 0, 9),
        ('agency', 40, 44),
        ('duty_station', 44, 53),
        ('age', 53, 59),
        ('educ level', 59, 61),
        ('pay_plan', 61, 63),
        ('grade', 63, 65),
        ('los', 65, 71),
        ('occupation', 71, 75),
        ('occupation_cat', 75, 76),
        ('adjusted_pay', 76, 82),
        ('superv_status', 82, 83),
        ('type_appointment', 83, 85),
        ('work_schedule', 85, 86),
        ('nsftp_ind', 86, 87),
    )

    # Keys are created in the same order as the original dict literal so the
    # resulting column order does not change on any pandas version.
    column_order = ('empid', 'file_dt', 'agency', 'duty_station', 'age',
                    'educ level', 'pay_plan', 'grade', 'los', 'occupation',
                    'occupation_cat', 'adjusted_pay', 'superv_status',
                    'type_appointment', 'work_schedule', 'nsftp_ind',
                    'file_type')
    columns = {}
    for name in column_order:
        columns[name] = []

    # `with` guarantees the handle is closed even when a date fails to parse.
    with open(spath, 'r') as f:
        for line in f:
            # A blank line (typically a trailing newline at end of file)
            # carries no record and would otherwise break the date parsing.
            if not line.strip():
                continue

            # Parse the date first so a bad record cannot leave the column
            # lists with different lengths.
            file_dt = datetime.strptime(line[32:40], '%Y%m%d')

            for name, start, end in text_layout:
                columns[name].append(line[start:end])
            columns['file_dt'].append(file_dt)
            columns['file_type'].append(filetype)

    return pd.DataFrame(columns)

Status files are too large to handle in full on my laptop, so I will create a single file for Q1 2007.

In [38]:
# Combine the March 2007 status files (DoD and non-DoD) into one dataframe.
# Collect the per-file frames and concatenate once at the end: appending to a
# dataframe inside the loop re-copies all accumulated rows on every iteration.
status_frames = []

# sorted() makes the load order reproducible regardless of the order in which
# the file system lists the files.
for infile in sorted(glob.glob(os.path.join(PATH, 'Status_*2007_03*'))):
    print("current file is: " + infile)

    # Classify on the file name itself rather than on fixed character
    # positions of the full path, which only line up for one exact PATH value.
    if os.path.basename(infile).startswith('Status_DoD'):
        filetype = 'STA_DOD'
    else:
        filetype = 'STA_NDOD'

    status_frames.append(load_status_file(infile, filetype))

# Fail loudly instead of silently reusing a stale dataframe from a previous
# run (or hitting a NameError later) when no status file matches.
if not status_frames:
    raise IOError("no 2007_03 status files found in " + PATH)

# Like the original append-based loop, each file keeps its own row index.
dfstat_2007_03 = pd.concat(status_frames)
        
        

        


current file is: ../Data\Status_DoD_2007_03.txt
current file is: ../Data\Status_Non_DoD_2007_03.txt


In [39]:
# Per-column summary (count, unique, top, freq, ...) of the combined status records.
dfstat_2007_03.describe()

Unnamed: 0,adjusted_pay,age,agency,duty_station,educ level,empid,file_dt,file_type,grade,los,nsftp_ind,occupation,occupation_cat,pay_plan,superv_status,type_appointment,work_schedule
count,1844970.0,1844970,1844970,1844970,1844970.0,1844970.0,1844970,1844970,1844970.0,1844970,1844970.0,1844970.0,1844970,1844970,1844970.0,1844970.0,1844970
unique,88590.0,14,529,11828,24.0,1842387.0,1,2,154.0,11,2.0,752.0,7,184,7.0,17.0,9
top,28862.0,50-54,VATA,#########,4.0,4101442.0,2007-03-31 00:00:00,STA_NDOD,12.0,5-9,1.0,301.0,A,GS,8.0,10.0,F
freq,12410.0,323634,220064,1018462,522695.0,4.0,1844970,1182424,212160.0,292531,1600145.0,81427.0,636718,1198573,1576128.0,1122683.0,1679241
first,,,,,,,2007-03-31 00:00:00,,,,,,,,,,
last,,,,,,,2007-03-31 00:00:00,,,,,,,,,,


In [40]:
# Save the combined status dataframe to a csv file. The path is built from
# OUT_FOLDER so the export follows the configured output folder.
dfstat_2007_03.to_csv(os.path.join(OUT_FOLDER, 'stat_2007_03.csv'))